diff --git a/.github/workflows/code-check.yml b/.github/workflows/code-check.yml new file mode 100644 index 00000000..a7e43d9f --- /dev/null +++ b/.github/workflows/code-check.yml @@ -0,0 +1,34 @@ +name: CI + +on: + push: + pull_request: + workflow_dispatch: + repository_dispatch: + types: [my_event] +jobs: + format-check: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pre-commit pytest pytest-cov + pip install -r requirements.txt + pip install -e . + pip install black==24.10.0 + - name: Run pre-commit + run: pre-commit run --all-files + + # - name: Run unit tests + # run: pushd tests/unit && pytest && popd + diff --git a/.github/workflows/pr-title-check.yml b/.github/workflows/pr-title-check.yml new file mode 100644 index 00000000..ae7befd4 --- /dev/null +++ b/.github/workflows/pr-title-check.yml @@ -0,0 +1,28 @@ +name: "Lint PR" + +on: + pull_request_target: + types: + - opened + - edited + - synchronize + +jobs: + main: + name: Validate PR title + runs-on: ubuntu-latest + steps: + # https://www.conventionalcommits.org/en/v1.0.0/#summary + - uses: amannn/action-semantic-pull-request@v5 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + requireScope: true + subjectPattern: ^(?![A-Z]).+$ + # If `subjectPattern` is configured, you can use this property to override + # the default error message that is shown when the pattern doesn't match. + # The variables `subject` and `title` can be used within the message. + subjectPatternError: | + The subject "{subject}" found in the pull request title "{title}" + didn't match the configured pattern. Please ensure that the subject + doesn't start with an uppercase character. 
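The `subjectPattern` above is a negative lookahead that rejects PR subjects beginning with an uppercase character. A quick, purely illustrative way to sanity-check the regex locally (the action itself evaluates it in JavaScript):

import re

subject_pattern = re.compile(r"^(?![A-Z]).+$")

assert subject_pattern.match("add schema-free extractor")        # lowercase subject: accepted
assert not subject_pattern.match("Add schema-free extractor")    # uppercase subject: rejected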
\ No newline at end of file diff --git a/.gitignore b/.gitignore index 55a71ae2..3dfa7d36 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,6 @@ *.pyc /dist .vscode/ +.idea/ +.venv/ __pycache__/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..26ba54fb --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +repos: + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + files: ^kag/.*\.py$ + exclude: | + (?x)^( + kag/solver/logic/core_modules/rule_runner/rule_runner.py | + kag/solver/logic/core_modules/parser/logic_node_parser.py + )$ + + - repo: https://github.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + files: ^kag/.*\.py$ diff --git a/KAG_VERSION b/KAG_VERSION index d7cdee28..5a2a5806 100644 --- a/KAG_VERSION +++ b/KAG_VERSION @@ -1 +1 @@ -0.5.2-beta1 +0.6 diff --git a/MANIFEST.in b/MANIFEST.in index a922307c..9a655d7d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,6 @@ recursive-include kag * recursive-exclude kag/examples * +global-exclude *.pyc +global-exclude *.pyo +global-exclude *.pyd +global-exclude __pycache__ \ No newline at end of file diff --git a/build.sh b/build.sh new file mode 100644 index 00000000..a45cb572 --- /dev/null +++ b/build.sh @@ -0,0 +1,5 @@ +rm -rf build + +rm -rf dist + +python setup.py sdist bdist_wheel diff --git a/kag/__init__.py b/kag/__init__.py index 72bfda5c..00456a7b 100644 --- a/kag/__init__.py +++ b/kag/__init__.py @@ -1,3 +1,4 @@ +# flake8: noqa # Apache License # Version 2.0, January 2004 # http://www.apache.org/licenses/ @@ -202,8 +203,27 @@ __package_name__ = "openspg-kag" -__version__ = "0.5.2-beta1" +__version__ = "0.6" -from kag.common.env import init_env +# Register Built-in Components +from kag.common.conf import init_env init_env() + +import kag.interface +import kag.interface.solver.execute +import kag.interface.solver.plan +import kag.solver.execute +import kag.solver.plan +import kag.solver.retriever +import kag.solver.tools +import kag.builder.component +import kag.builder.default_chain +import kag.builder.runner +import kag.builder.prompt +import kag.solver.prompt +import kag.common.vectorize_model +import kag.common.llm +import kag.common.checkpointer +import kag.solver +import kag.bin.commands diff --git a/kag/interface/retriever/__init__.py b/kag/bin/__init__.py similarity index 100% rename from kag/interface/retriever/__init__.py rename to kag/bin/__init__.py diff --git a/kag/bin/base.py b/kag/bin/base.py new file mode 100644 index 00000000..3a41d743 --- /dev/null +++ b/kag/bin/base.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
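The import block added to `kag/__init__.py` above is what makes name-based construction work: importing a module executes its `@Xxx.register(...)` decorators and fills the registry, so components can later be built from plain config. A minimal sketch of the pattern, mirroring how `Command.from_config(cmd)` is used in `kag/bin/base.py` below (the class and register names here are illustrative):

from kag.common.registry import Registrable

class Greeter(Registrable):
    def greet(self) -> str:
        raise NotImplementedError

@Greeter.register("hello")
class HelloGreeter(Greeter):
    def greet(self) -> str:
        return "hello"

# from_config accepts a bare registered name for classes without required
# init args, which is what kag/bin/base.py relies on.
assert Greeter.from_config("hello").greet() == "hello"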
+ +import argparse +import logging +import typing +from kag.common.registry import Registrable + +logger = logging.getLogger() + + +def add_commands( + subparsers: argparse._SubParsersAction, command_names: typing.List[str] = None +): + """add commands to subparsers""" + all_cmds = Command.list_available() + if command_names is None: + logger.warning("no command_names given, will add all available commands.") + command_names = all_cmds + for cmd in command_names: + if cmd not in all_cmds: + raise ValueError(f"command {cmd} not in available commands {all_cmds}") + # Command subclasses don't accept init args, so passing just the subclass name is enough. + cls = Command.from_config(cmd) + cls.add_to_parser(subparsers) + + +class Command(Registrable): + def get_handler(self): + """return handler of current command""" + return self.handler + + def add_to_parser(self, subparsers: argparse._SubParsersAction): + """set up accepted arguments""" + raise NotImplementedError("add_to_parser not implemented yet.") + + @staticmethod + def handler(args: argparse.Namespace): + """function to process the request.""" + raise NotImplementedError("handler not implemented yet.") diff --git a/kag/common/retriever/__init__.py b/kag/bin/commands/__init__.py similarity index 74% rename from kag/common/retriever/__init__.py rename to kag/bin/commands/__init__.py index 05156aa5..40427f11 100644 --- a/kag/common/retriever/__init__.py +++ b/kag/bin/commands/__init__.py @@ -9,10 +9,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. +from kag.bin.commands.info import ListRegisterInfo -from kag.common.retriever.kag_retriever import DefaultRetriever -from kag.common.retriever.retriever import Retriever -__all__ = [ - "DefaultRetriever", - "Retriever" -] + + +__all__ = ["ListRegisterInfo"] diff --git a/kag/bin/commands/info.py b/kag/bin/commands/info.py new file mode 100644 index 00000000..a818647a --- /dev/null +++ b/kag/bin/commands/info.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import argparse +from tabulate import tabulate +from kag.bin.base import Command +from kag.common.registry import Registrable +from kag.common.utils import reset, bold, red, green, blue + + +@Command.register("register_info") +class ListRegisterInfo(Command): + def add_to_parser(self, subparsers: argparse._SubParsersAction): + parser = subparsers.add_parser( + "interface", help="Show the interface info of the KAG components."
) + parser.add_argument("--cls", help="class name to query") + parser.add_argument( + "--list", help="list all component interfaces in KAG", action="store_true" + ) + parser.set_defaults(func=self.get_handler()) + + @staticmethod + def get_cls(cls_name): + interface_classes = Registrable.list_all_registered(with_leaf_classes=False) + for item in interface_classes: + if item.__name__ == cls_name: + return item + raise ValueError(f"class {cls_name} is not a valid kag configurable class") + + @staticmethod + def handle_list(args: argparse.Namespace): + interface_classes = Registrable.list_all_registered(with_leaf_classes=False) + data = [] + for cls in interface_classes: + data.append([cls.__name__, cls.__module__]) + headers = [f"{bold}{red}class{reset}", f"{bold}{red}module{reset}"] + msg = ( + f"{bold}{red}Below are the interfaces provided by KAG. " + f"For detailed information on each class, please use the command `kag interface --cls $class_name`{reset}" + ) + print(msg) + print(tabulate(data, headers, tablefmt="grid")) + + @staticmethod + def handle_cls(args: argparse.Namespace): + cls_obj = ListRegisterInfo.get_cls(args.cls) + if not issubclass(cls_obj, Registrable): + raise ValueError(f"class {args.cls} is not a valid kag configurable class") + availables = cls_obj.list_available_with_detail() + seg = " " * 20 + + deduped_availables = {} + for register_name, cls_info in availables.items(): + cls = cls_info["class"] + if cls not in deduped_availables: + deduped_availables[cls] = [register_name] + else: + deduped_availables[cls].append(register_name) + + print(f"{bold}{red}{seg}Documentation of {args.cls}{seg}{reset}") + import inspect + + print(inspect.getdoc(cls_obj)) + print(f"{bold}{red}{seg}Registered subclasses of {args.cls}{seg}{reset}") + visited = set() + for register_name, cls_info in availables.items(): + cls = cls_info["class"] + if cls in visited: + continue + visited.add(cls) + print(f"{bold}{blue}[{cls}]{reset}") + register_names = " / ".join([f'"{x}"' for x in deduped_availables[cls]]) + print(f"{bold}{green}Register Name:{reset} {register_names}\n") + + # print(f"Class Name: {cls_info['class']}") + print(f"{bold}{green}Documentation:{reset}\n{cls_info['doc']}\n") + print(f"{bold}{green}Initializer:{reset}\n{cls_info['constructor']}\n") + + required_arguments = [] + for item in cls_info["params"]["required_params"]: + required_arguments.append(f" {item}") + if len(required_arguments) == 0: + required_arguments = " No Required Arguments found" + else: + required_arguments = "\n".join(required_arguments) + print(f"{bold}{green}Required Arguments:{reset}\n{required_arguments}\n") + + optional_arguments = [] + for item in cls_info["params"]["optional_params"]: + optional_arguments.append(f" {item}") + if len(optional_arguments) == 0: + optional_arguments = " No Optional Arguments found" + else: + optional_arguments = "\n".join(optional_arguments) + print(f"{bold}{green}Optional Arguments:{reset}\n{optional_arguments}\n") + print(f"{bold}{green}Sample Usage:{reset}\n {cls_info['sample_useage']}") + # for k, v in cls_info.items(): + # print(f"{k}: {v}") + print("\n") + + @staticmethod + def handler(args: argparse.Namespace): + if args.list: + ListRegisterInfo.handle_list(args) + else: + ListRegisterInfo.handle_cls(args) diff --git a/kag/bin/kag_cmds.py b/kag/bin/kag_cmds.py new file mode 100644 index 00000000..39f31270 --- /dev/null +++ b/kag/bin/kag_cmds.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License,
Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import argparse +from kag.bin.base import add_commands + + +def build_parser(): + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers( + dest="subcommand_name", + title="subcommands", + help="subcommands supported by kag", + ) + # add registered commands to parser + cmds = [ + "register_info", + ] + add_commands(subparsers, cmds) + return parser + + +def main(): + """entry point of script""" + parser = build_parser() + args = parser.parse_args() + args.func(args) diff --git a/kag/solver/logic/core_modules/op_executor/__init__.py b/kag/bridge/__init__.py similarity index 100% rename from kag/solver/logic/core_modules/op_executor/__init__.py rename to kag/bridge/__init__.py diff --git a/kag/bridge/spg_server_bridge.py b/kag/bridge/spg_server_bridge.py new file mode 100644 index 00000000..7fde8f72 --- /dev/null +++ b/kag/bridge/spg_server_bridge.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import os +import json +import kag.interface as interface +from kag.common.conf import KAGConstants, init_env + + +def init_kag_config(project_id: str, host_addr: str): + + os.environ[KAGConstants.ENV_KAG_PROJECT_ID] = project_id + os.environ[KAGConstants.ENV_KAG_PROJECT_HOST_ADDR] = host_addr + init_env() + + +class SPGServerBridge: + def __init__(self): + pass + + def run_reader(self, config, input_data): + if isinstance(config, str): + config = json.loads(config) + scanner_config = config["scanner"] + reader_config = config["reader"] + scanner = interface.ScannerABC.from_config(scanner_config) + reader = interface.ReaderABC.from_config(reader_config) + chunks = [] + for data in scanner.generate(input_data): + chunks += reader.invoke(data, write_ckpt=False) + return [x.to_dict() for x in chunks] + + def run_component(self, component_name, component_config, input_data): + if isinstance(component_config, str): + component_config = json.loads(component_config) + + cls = getattr(interface, component_name) + instance = cls.from_config(component_config) + if hasattr(instance.input_types, "from_dict"): + input_data = instance.input_types.from_dict(input_data) + return [x.to_dict() for x in instance.invoke(input_data, write_ckpt=False)] diff --git a/kag/builder/__init__.py b/kag/builder/__init__.py index 6f6914a4..e69de29b 100644 --- a/kag/builder/__init__.py +++ b/kag/builder/__init__.py @@ -1,10 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
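For context, a hedged sketch of how a caller might drive the `SPGServerBridge` defined in `kag/bridge/spg_server_bridge.py` above; the project id, host address and the registered scanner/reader type names are illustrative, not defined in this diff:

from kag.bridge.spg_server_bridge import SPGServerBridge, init_kag_config

init_kag_config(project_id="1", host_addr="http://127.0.0.1:8887")  # assumed values
bridge = SPGServerBridge()
chunks = bridge.run_reader(
    {"scanner": {"type": "file"}, "reader": {"type": "txt"}},  # assumed register names
    "/path/to/document.txt",
)
# run_reader returns plain dicts (Chunk.to_dict()), convenient for the Java server side.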
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. diff --git a/kag/builder/component/__init__.py b/kag/builder/component/__init__.py index 0dfd96e7..971b2826 100644 --- a/kag/builder/component/__init__.py +++ b/kag/builder/component/__init__.py @@ -10,13 +10,76 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. +from kag.builder.component.external_graph.external_graph import ( + DefaultExternalGraphLoader, +) +from kag.builder.component.extractor.schema_free_extractor import SchemaFreeExtractor +from kag.builder.component.extractor.schema_constraint_extractor import ( + SchemaConstraintExtractor, +) +from kag.builder.component.aligner.kag_aligner import KAGAligner +from kag.builder.component.aligner.spg_aligner import SPGAligner +from kag.builder.component.postprocessor.kag_postprocessor import KAGPostProcessor + from kag.builder.component.mapping.spg_type_mapping import SPGTypeMapping from kag.builder.component.mapping.relation_mapping import RelationMapping +from kag.builder.component.mapping.spo_mapping import SPOMapping +from kag.builder.component.scanner.csv_scanner import CSVScanner +from kag.builder.component.scanner.json_scanner import JSONScanner +from kag.builder.component.scanner.yuque_scanner import YuqueScanner +from kag.builder.component.scanner.dataset_scanner import ( + MusiqueCorpusScanner, + HotpotqaCorpusScanner, +) +from kag.builder.component.scanner.file_scanner import FileScanner +from kag.builder.component.scanner.directory_scanner import DirectoryScanner + + +from kag.builder.component.reader.pdf_reader import PDFReader +from kag.builder.component.reader.markdown_reader import MarkDownReader +from kag.builder.component.reader.docx_reader import DocxReader +from kag.builder.component.reader.txt_reader import TXTReader +from kag.builder.component.reader.mix_reader import MixReader + +from kag.builder.component.reader.dict_reader import DictReader + + +from kag.builder.component.splitter.length_splitter import LengthSplitter +from kag.builder.component.splitter.pattern_splitter import PatternSplitter +from kag.builder.component.splitter.outline_splitter import OutlineSplitter +from kag.builder.component.splitter.semantic_splitter import SemanticSplitter +from kag.builder.component.vectorizer.batch_vectorizer import BatchVectorizer from kag.builder.component.writer.kg_writer import KGWriter __all__ = [ + "DefaultExternalGraphLoader", + "SchemaFreeExtractor", + "SchemaConstraintExtractor", + "KAGAligner", + "SPGAligner", + "KAGPostProcessor", + "KGWriter", "SPGTypeMapping", "RelationMapping", + "SPOMapping", + "TXTReader", + "PDFReader", + "MarkDownReader", + "DocxReader", + "MixReader", + "DictReader", + "JSONScanner", + "HotpotqaCorpusScanner", + "MusiqueCorpusScanner", + "FileScanner", + "DirectoryScanner", + "YuqueScanner", + "CSVScanner", + "LengthSplitter", + "PatternSplitter", + "OutlineSplitter", + "SemanticSplitter", + "BatchVectorizer", "KGWriter", ] diff --git a/kag/builder/component/aligner/__init__.py b/kag/builder/component/aligner/__init__.py index 123acd8d..e69de29b 100644 --- a/kag/builder/component/aligner/__init__.py +++ b/kag/builder/component/aligner/__init__.py @@ -1,12 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG 
Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - diff --git a/kag/builder/component/aligner/kag_post_processor.py b/kag/builder/component/aligner/kag_aligner.py similarity index 56% rename from kag/builder/component/aligner/kag_post_processor.py rename to kag/builder/component/aligner/kag_aligner.py index 9722c0cd..9f1bd46f 100644 --- a/kag/builder/component/aligner/kag_post_processor.py +++ b/kag/builder/component/aligner/kag_aligner.py @@ -13,12 +13,25 @@ from typing import List, Sequence, Dict, Type from kag.builder.model.sub_graph import SubGraph -from kag.interface.builder import AlignerABC +from kag.interface import AlignerABC from knext.common.base.runnable import Input, Output -class KAGPostProcessorAligner(AlignerABC): +@AlignerABC.register("kag") +class KAGAligner(AlignerABC): + """ + A class that extends the AlignerABC base class. It is responsible for aligning and merging subgraphs. + + This class provides methods to handle the alignment and merging of subgraphs, as well as properties to define the input and output types. + """ + def __init__(self, **kwargs): + """ + Initializes the KAGAligner instance. + + Args: + **kwargs: Arbitrary keyword arguments passed to the parent class constructor. + """ super().__init__(**kwargs) @property @@ -30,6 +43,16 @@ def output_types(self) -> Type[Output]: return SubGraph def invoke(self, input: List[SubGraph], **kwargs) -> SubGraph: + """ + Merges a list of subgraphs into a single subgraph. + + Args: + input (List[SubGraph]): A list of subgraphs to be merged. + **kwargs: Additional keyword arguments. + + Returns: + SubGraph: The merged subgraph containing all nodes and edges from the input subgraphs. + """ merged_sub_graph = SubGraph(nodes=[], edges=[]) for sub_graph in input: for node in sub_graph.nodes: @@ -41,9 +64,15 @@ def invoke(self, input: List[SubGraph], **kwargs) -> SubGraph: return merged_sub_graph def _handle(self, input: Sequence[Dict]) -> Dict: + """ + Handles the input by converting it to the appropriate type, invoking the aligner, and converting the output back to a dictionary. + + Args: + input (Sequence[Dict]): A sequence of dictionaries representing subgraphs. + + Returns: + Dict: A dictionary representing the merged subgraph. 
+ """ _input = [self.input_types.from_dict(i) for i in input] _output = self.invoke(_input) return _output.to_dict() - - def batch(self, inputs: List[Input], **kwargs) -> List[Output]: - pass diff --git a/kag/builder/component/aligner/spg_post_processor.py b/kag/builder/component/aligner/spg_aligner.py similarity index 71% rename from kag/builder/component/aligner/spg_post_processor.py rename to kag/builder/component/aligner/spg_aligner.py index b446c15b..cca5b7c7 100644 --- a/kag/builder/component/aligner/spg_post_processor.py +++ b/kag/builder/component/aligner/spg_aligner.py @@ -12,8 +12,9 @@ from typing import List, Type, Dict -from kag.interface.builder import AlignerABC +from kag.interface import AlignerABC from knext.schema.client import BASIC_TYPES +from kag.common.conf import KAG_PROJECT_CONF from kag.builder.model.spg_record import SPGRecord from kag.builder.model.sub_graph import SubGraph from knext.common.base.runnable import Input, Output @@ -21,10 +22,17 @@ from knext.schema.model.base import ConstraintTypeEnum, BaseSpgType -class SPGPostProcessorAligner(AlignerABC): +@AlignerABC.register("spg") +class SPGAligner(AlignerABC): + """ + A class that extends the AlignerABC base class. It is responsible for aligning and merging SPG records into subgraphs. + + This class provides methods to handle the alignment and merging of SPG records, as well as properties to define the input and output types. + """ + def __init__(self, **kwargs): super().__init__(**kwargs) - self.spg_types = SchemaClient(project_id=self.project_id).load() + self.spg_types = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() @property def input_types(self) -> Type[Input]: @@ -35,6 +43,15 @@ def output_types(self) -> Type[Output]: return SubGraph def merge(self, spg_records: List[SPGRecord]): + """ + Merges a list of SPG records into a single set of records, combining properties as necessary. + + Args: + spg_records (List[SPGRecord]): A list of SPG records to be merged. + + Returns: + List[SPGRecord]: A list of merged SPG records. + """ merged_spg_records = {} for record in spg_records: key = f"{record.spg_type_name}#{record.get_property('name', '')}" @@ -75,6 +92,16 @@ def merge(self, spg_records: List[SPGRecord]): def from_spg_record( spg_types: Dict[str, BaseSpgType], spg_records: List[SPGRecord] ): + """ + Converts a list of SPG records into a subgraph. + + Args: + spg_types (Dict[str, BaseSpgType]): A dictionary mapping SPG type names to their corresponding types. + spg_records (List[SPGRecord]): A list of SPG records to be converted. + + Returns: + SubGraph: A subgraph representing the converted SPG records. + """ sub_graph = SubGraph([], []) for record in spg_records: s_id = record.id @@ -107,10 +134,30 @@ def from_spg_record( return sub_graph def invoke(self, input: Input, **kwargs) -> List[Output]: + """ + Processes a single input and returns a list of outputs. + + Args: + input (Input): The input to be processed. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list containing the processed output. + """ subgraph = SubGraph.from_spg_record(self.spg_types, [input]) return [subgraph] def batch(self, inputs: List[Input], **kwargs) -> List[Output]: + """ + Processes a batch of inputs and returns a list of outputs. + + Args: + inputs (List[Input]): A list of inputs to be processed. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list of outputs corresponding to the processed inputs. 
+ """ merged_records = self.merge(inputs) subgraph = SubGraph.from_spg_record(self.spg_types, merged_records) return [subgraph] diff --git a/kag/builder/component/base.py b/kag/builder/component/base.py deleted file mode 100644 index 0117478a..00000000 --- a/kag/builder/component/base.py +++ /dev/null @@ -1,81 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. -import os -from abc import ABC -from typing import List, Dict -import logging - -from knext.common.base.component import Component -from knext.common.base.runnable import Input, Output -from knext.project.client import ProjectClient -from kag.common.llm.client import LLMClient - - -class BuilderComponent(Component, ABC): - """ - Abstract base class for all builder component. - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.project_id = kwargs.get("project_id",None) or os.getenv("KAG_PROJECT_ID") - self.config = ProjectClient().get_config(self.project_id) - - - def _init_llm(self) -> LLMClient: - """ - Initializes the Large Language Model (LLM) client. - - This method retrieves the LLM configuration from environment variables and the project ID. - It then fetches the project configuration using the project ID and updates the LLM configuration - with any additional settings from the project. Finally, it creates and initializes the LLM client - using the updated configuration. - - Args: - None - - Returns: - LLMClient - """ - llm_config = eval(os.getenv("KAG_LLM", "{}")) - project_id = self.project_id or os.getenv("KAG_PROJECT_ID") - if project_id: - try: - config = ProjectClient().get_config(project_id) - llm_config.update(config.get("llm", {})) - except: - logging.warning( - f"Failed to get project config for project id: {project_id}" - ) - llm = LLMClient.from_config(llm_config) - return llm - - @property - def type(self): - """ - Get the type label of the object. - - Returns: - str: The type label of the object, fixed as "BUILDER". 
- """ - return "BUILDER" - - def batch(self, inputs: List[Input], **kwargs) -> List[Output]: - results = [] - for input in inputs: - results.extend(self.invoke(input, **kwargs)) - return results - - def _handle(self, input: Dict) -> List[Dict]: - _input = self.input_types.from_dict(input) if isinstance(input, dict) else input - _output = self.invoke(_input) - return [_o.to_dict() for _o in _output if _o] diff --git a/kag/solver/logic/core_modules/op_executor/op_deduce/__init__.py b/kag/builder/component/external_graph/__init__.py similarity index 100% rename from kag/solver/logic/core_modules/op_executor/op_deduce/__init__.py rename to kag/builder/component/external_graph/__init__.py diff --git a/kag/builder/component/external_graph/external_graph.py b/kag/builder/component/external_graph/external_graph.py new file mode 100644 index 00000000..9728c2cf --- /dev/null +++ b/kag/builder/component/external_graph/external_graph.py @@ -0,0 +1,212 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import json +import numpy as np +import logging +from typing import List, Union, Dict +from kag.interface import ExternalGraphLoaderABC, MatchConfig +from kag.common.conf import KAG_PROJECT_CONF +from kag.builder.model.sub_graph import Node, Edge, SubGraph +from knext.schema.client import SchemaClient + +from knext.search.client import SearchClient + + +logger = logging.getLogger() + + +@ExternalGraphLoaderABC.register("base", constructor="from_json_file", as_default=True) +class DefaultExternalGraphLoader(ExternalGraphLoaderABC): + """ + A default implementation of the ExternalGraphLoaderABC interface. + + This class is responsible for loading external graph data based on the provided nodes, edges, and match configuration. + """ + + def __init__( + self, + nodes: List[Node], + edges: List[Edge], + match_config: MatchConfig, + ): + """ + Initializes the DefaultExternalGraphLoader with the given nodes, edges, and match configuration. + + Args: + nodes (List[Node]): A list of Node objects representing the nodes in the graph. + edges (List[Edge]): A list of Edge objects representing the edges in the graph. + match_config (MatchConfig): The configuration for matching query str to graph nodes. + """ + super().__init__() + self.schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() + for node in nodes: + if node.label not in self.schema: + raise ValueError( + f"Type of node {node.to_dict()} is beyond the schema definition." + ) + for k in node.properties.keys(): + if k not in self.schema[node.label]: + raise ValueError( + f"Property of node {node.to_dict()} is beyond the schema definition." 
+ ) + self.nodes = nodes + self.edges = edges + + self.vocabulary = {} + self.node_labels = set() + for node in self.nodes: + self.vocabulary[node.name] = node + self.node_labels.add(node.label) + + import jieba + + for word in self.vocabulary.keys(): + jieba.add_word(word) + + self.match_config = match_config + self._init_search() + + def _init_search(self): + self._search_client = SearchClient( + KAG_PROJECT_CONF.host_addr, KAG_PROJECT_CONF.project_id + ) + + def _group_by_label(self, data: Union[List[Node], List[Edge]]): + groups = {} + + for item in data: + label = item.label + if label not in groups: + groups[label] = [item] + else: + groups[label].append(item) + return list(groups.values()) + + def _group_by_cnt(self, data, n): + return [data[i : i + n] for i in range(0, len(data), n)] + + def dump(self, max_num_nodes: int = 4096, max_num_edges: int = 4096): + graphs = [] + # process nodes + for item in self._group_by_label(self.nodes): + for grouped_nodes in self._group_by_cnt(item, max_num_nodes): + graphs.append(SubGraph(nodes=grouped_nodes, edges=[])) + + # process edges + for item in self._group_by_label(self.edges): + for grouped_edges in self._group_by_cnt(item, max_num_edges): + graphs.append(SubGraph(nodes=[], edges=grouped_edges)) + + return graphs + + def ner(self, content: str): + output = [] + import jieba + + for word in jieba.cut(content): + if word in self.vocabulary: + output.append(self.vocabulary[word]) + return output + + def get_allowed_labels(self, labels: List[str] = None): + allowed_labels = [] + + namespace = KAG_PROJECT_CONF.namespace + if labels is None: + allowed_labels = [f"{namespace}.{x}" for x in self.node_labels] + else: + for label in labels: + # remove namespace + if label.startswith(KAG_PROJECT_CONF.namespace): + label = label.split(".")[1] + if label in self.node_labels: + allowed_labels.append(f"{namespace}.{label}") + return allowed_labels + + def search_result_to_node(self, search_result: Dict): + output = [] + for label in search_result["__labels__"]: + node = { + "id": search_result["id"], + "name": search_result["name"], + "label": label, + } + output.append(Node.from_dict(node)) + return output + + def text_match(self, query: str, k: int = 1, labels: List[str] = None): + allowed_labels = self.get_allowed_labels(labels) + text_matched = self._search_client.search_text(query, allowed_labels, topk=k) + return text_matched + + def vector_match( + self, + query: Union[List[float], np.ndarray], + k: int = 1, + threshold: float = 0.9, + labels: List[str] = None, + ): + allowed_labels = self.get_allowed_labels(labels) + if isinstance(query, np.ndarray): + query = query.tolist() + matched_results = [] + for label in allowed_labels: + vector_matched = self._search_client.search_vector( + label=label, property_key="name", query_vector=query, topk=k + ) + matched_results.extend(vector_matched) + + filtered_results = [] + for item in matched_results: + score = item["score"] + if score >= threshold: + filtered_results.append(item) + return filtered_results + + def match_entity(self, query: Union[str, List[float], np.ndarray]): + if isinstance(query, str): + return self.text_match( + query, k=self.match_config.k, labels=self.match_config.labels + ) + else: + return self.vector_match( + query, + k=self.match_config.k, + labels=self.match_config.labels, + threshold=self.match_config.threshold, + ) + + @classmethod + def from_json_file( + cls, + node_file_path: str, + edge_file_path: str, + match_config: MatchConfig, + ): + """ + Creates an instance of 
DefaultExternalGraphLoader from JSON files containing node and edge data. + + Args: + node_file_path (str): The path to the JSON file containing node data. + edge_file_path (str): The path to the JSON file containing edge data. + match_config (MatchConfig): The configuration for matching query str to graph nodes. + + Returns: + DefaultExternalGraphLoader: An instance of DefaultExternalGraphLoader initialized with the data from the JSON files. + """ + nodes = [] + for item in json.load(open(node_file_path, "r")): + nodes.append(Node.from_dict(item)) + edges = [] + for item in json.load(open(edge_file_path, "r")): + edges.append(Edge.from_dict(item)) + return cls(nodes=nodes, edges=edges, match_config=match_config) diff --git a/kag/builder/component/extractor/__init__.py b/kag/builder/component/extractor/__init__.py index dbde8cd2..e69de29b 100644 --- a/kag/builder/component/extractor/__init__.py +++ b/kag/builder/component/extractor/__init__.py @@ -1,23 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -from kag.builder.component.extractor.kag_extractor import KAGExtractor -from kag.builder.component.extractor.spg_extractor import SPGExtractor -from kag.builder.component.extractor.user_defined_extractor import ( - UserDefinedExtractor, -) - -__all__ = [ - "KAGExtractor", - "SPGExtractor", - "UserDefinedExtractor", -] diff --git a/kag/builder/component/extractor/schema_constraint_extractor.py b/kag/builder/component/extractor/schema_constraint_extractor.py new file mode 100644 index 00000000..4dfbb2ac --- /dev/null +++ b/kag/builder/component/extractor/schema_constraint_extractor.py @@ -0,0 +1,429 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
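Because `DefaultExternalGraphLoader` is registered as "base" with `constructor="from_json_file"`, it can be built straight from a config dict. A hedged sketch (the file paths are placeholders, and the `MatchConfig` fields are inferred from the `k`/`labels`/`threshold` usage above):

from kag.interface import ExternalGraphLoaderABC

loader = ExternalGraphLoaderABC.from_config(
    {
        "type": "base",
        "node_file_path": "./nodes.json",  # JSON list consumed by Node.from_dict
        "edge_file_path": "./edges.json",  # JSON list consumed by Edge.from_dict
        "match_config": {"k": 1, "threshold": 0.9},
    }
)
subgraphs = loader.dump()                      # SubGraphs batched by label and size
mentions = loader.ner("patient took aspirin")  # vocabulary-based entity lookup via jieba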
+import copy +import logging +from typing import Dict, Type, List + +from kag.interface import LLMClient +from tenacity import stop_after_attempt, retry + +from kag.interface import ExtractorABC, PromptABC, ExternalGraphLoaderABC + +from kag.common.conf import KAG_PROJECT_CONF +from kag.common.utils import processing_phrases, to_camel_case +from kag.builder.model.chunk import Chunk +from kag.builder.model.sub_graph import SubGraph +from kag.builder.prompt.utils import init_prompt_with_fallback +from knext.schema.client import CHUNK_TYPE, BASIC_TYPES +from knext.common.base.runnable import Input, Output +from knext.schema.client import SchemaClient + +logger = logging.getLogger(__name__) + + +@ExtractorABC.register("schema_constraint") +@ExtractorABC.register("schema_constraint_extractor") +class SchemaConstraintExtractor(ExtractorABC): + """ + Performs knowledge extraction under schema constraints, covering entities, events and their edges. + The types of entities and events, along with their respective attributes, are automatically inherited from the project's schema. + """ + + def __init__( + self, + llm: LLMClient, + ner_prompt: PromptABC = None, + std_prompt: PromptABC = None, + relation_prompt: PromptABC = None, + event_prompt: PromptABC = None, + external_graph: ExternalGraphLoaderABC = None, + ): + """ + Initializes the SchemaConstraintExtractor instance. + + Args: + llm (LLMClient): The language model client used for extraction. + ner_prompt (PromptABC, optional): The prompt for named entity recognition. Defaults to None. + std_prompt (PromptABC, optional): The prompt for named entity standardization. Defaults to None. + relation_prompt (PromptABC, optional): The prompt for relation extraction. Defaults to None. + event_prompt (PromptABC, optional): The prompt for event extraction. Defaults to None. + external_graph (ExternalGraphLoaderABC, optional): The external graph loader for additional data. Defaults to None. + """ + super().__init__() + self.llm = llm + self.schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() + self.ner_prompt = ner_prompt + self.std_prompt = std_prompt + self.relation_prompt = relation_prompt + self.event_prompt = event_prompt + + biz_scene = KAG_PROJECT_CONF.biz_scene + if self.ner_prompt is None: + self.ner_prompt = init_prompt_with_fallback("ner", biz_scene) + if self.std_prompt is None: + self.std_prompt = init_prompt_with_fallback("std", biz_scene) + self.external_graph = external_graph + + @property + def input_types(self) -> Type[Input]: + return Chunk + + @property + def output_types(self) -> Type[Output]: + return SubGraph + + @retry(stop=stop_after_attempt(3)) + def named_entity_recognition(self, passage: str): + """ + Performs named entity recognition on a given text passage. + Args: + passage (str): The text to perform named entity recognition on. + Returns: + The result of the named entity recognition operation.
+ """ + ner_result = self.llm.invoke({"input": passage}, self.ner_prompt) + if self.external_graph: + extra_ner_result = self.external_graph.ner(passage) + else: + extra_ner_result = [] + output = [] + dedup = set() + for item in extra_ner_result: + name = item.name + if name not in dedup: + dedup.add(name) + output.append( + { + "name": name, + "category": item.label, + "properties": item.properties, + } + ) + for item in ner_result: + name = item.get("name", None) + category = item.get("category", None) + if name is None or category is None: + continue + if not isinstance(name, str): + continue + if name not in dedup: + dedup.add(name) + output.append(item) + return output + + @retry(stop=stop_after_attempt(3)) + def named_entity_standardization(self, passage: str, entities: List[Dict]): + """ + Performs named entity standardization on a given text passage and entities. + + Args: + passage (str): The text passage. + entities (List[Dict]): The list of entities to standardize. + + Returns: + The result of the named entity standardization operation. + """ + return self.llm.invoke( + {"input": passage, "named_entities": entities}, self.std_prompt + ) + + @retry(stop=stop_after_attempt(3)) + def relations_extraction(self, passage: str, entities: List[Dict]): + """ + Performs relation extraction on a given text passage and entities. + + Args: + passage (str): The text passage. + entities (List[Dict]): The list of entities. + + Returns: + The result of the relation extraction operation. + """ + if self.relation_prompt is None: + logger.debug("Relation extraction prompt not configured, skip.") + + return [] + return self.llm.invoke( + {"input": passage, "entity_list": entities}, self.relation_prompt + ) + + @retry(stop=stop_after_attempt(3)) + def event_extraction(self, passage: str): + """ + Performs event extraction on a given text passage. + + Args: + passage (str): The text passage. + + Returns: + The result of the event extraction operation. + """ + if self.event_prompt is None: + logger.debug("Event extraction prompt not configured, skip.") + return [] + return self.llm.invoke({"input": passage}, self.event_prompt) + + def parse_nodes_and_edges(self, entities: List[Dict], category: str = None): + """ + Parses nodes and edges from a list of entities. + + Args: + entities (List[Dict]): The list of entities. + + Returns: + Tuple[List[Node], List[Edge]]: The parsed nodes and edges. + """ + graph = SubGraph([], []) + entities = copy.deepcopy(entities) + root_nodes = [] + for record in entities: + if record is None: + continue + if isinstance(record, str): + record = {"name": record} + s_name = record.get("name", "") + s_label = record.get("category", category) + properties = record.get("properties", {}) + # At times, the name and/or label is placed in the properties. 
+ if not s_name: + s_name = properties.pop("name", "") + if not s_label: + s_label = properties.pop("category", "") + if not s_name or not s_label: + continue + s_name = processing_phrases(s_name) + root_nodes.append((s_name, s_label)) + tmp_properties = copy.deepcopy(properties) + spg_type = self.schema.get(s_label) + for prop_name, prop_value in properties.items(): + if prop_value is None: + tmp_properties.pop(prop_name) + continue + if prop_name in spg_type.properties: + prop_schema = spg_type.properties.get(prop_name) + o_label = prop_schema.object_type_name_en + if o_label not in BASIC_TYPES: + # pop and convert property to node and edge + if not isinstance(prop_value, list): + prop_value = [prop_value] + ( + new_root_nodes, + new_nodes, + new_edges, + ) = self.parse_nodes_and_edges(prop_value, o_label) + graph.nodes.extend(new_nodes) + graph.edges.extend(new_edges) + # connect current node to property generated nodes + for node in new_root_nodes: + graph.add_edge( + s_id=s_name, + s_label=s_label, + p=prop_name, + o_id=node[0], + o_label=node[1], + ) + tmp_properties.pop(prop_name) + record["properties"] = tmp_properties + # NOTE: For property converted to nodes/edges, we keep a copy of the original property values. + # Perhaps it is not necessary? + graph.add_node(id=s_name, name=s_name, label=s_label, properties=properties) + + if "official_name" in record: + official_name = processing_phrases(record["official_name"]) + if official_name != s_name: + graph.add_node( + id=official_name, + name=official_name, + label=s_label, + properties=properties, + ) + graph.add_edge( + s_id=s_name, + s_label=s_label, + p="OfficialName", + o_id=official_name, + o_label=s_label, + ) + + return root_nodes, graph.nodes, graph.edges + + @staticmethod + def add_relations_to_graph( + sub_graph: SubGraph, entities: List[Dict], relations: List[list] + ): + """ + Add edges to the subgraph based on a list of relations and entities. + Args: + sub_graph (SubGraph): The subgraph to add edges to. + entities (List[Dict]): A list of entities, for looking up category information. + relations (List[list]): A list of relations, each representing a relationship to be added to the subgraph. + Returns: + The constructed subgraph. + + """ + + for rel in relations: + if len(rel) != 5: + continue + s_name, s_category, predicate, o_name, o_category = rel + s_name = processing_phrases(s_name) + sub_graph.add_node(s_name, s_name, s_category) + o_name = processing_phrases(o_name) + sub_graph.add_node(o_name, o_name, o_category) + edge_type = to_camel_case(predicate) + if edge_type: + sub_graph.add_edge(s_name, s_category, edge_type, o_name, o_category) + return sub_graph + + @staticmethod + def add_chunk_to_graph(sub_graph: SubGraph, chunk: Chunk): + """ + Associates a Chunk object with the subgraph, adding it as a node and connecting it with existing nodes. + Args: + sub_graph (SubGraph): The subgraph to add the chunk information to. + chunk (Chunk): The chunk object containing the text and metadata. + Returns: + The constructed subgraph. 
+ """ + for node in sub_graph.nodes: + sub_graph.add_edge(node.id, node.label, "source", chunk.id, CHUNK_TYPE) + sub_graph.add_node( + id=chunk.id, + name=chunk.name, + label=CHUNK_TYPE, + properties={ + "id": chunk.id, + "name": chunk.name, + "content": f"{chunk.name}\n{chunk.content}", + **chunk.kwargs, + }, + ) + sub_graph.id = chunk.id + return sub_graph + + def assemble_subgraph( + self, + chunk: Chunk, + entities: List[Dict], + relations: List[list], + events: List[Dict], + ): + """ + Assembles a subgraph from the given chunk, entities, events, and relations. + + Args: + chunk (Chunk): The chunk object. + entities (List[Dict]): The list of entities. + events (List[Dict]): The list of events. + + Returns: + The constructed subgraph. + """ + graph = SubGraph([], []) + _, entity_nodes, entity_edges = self.parse_nodes_and_edges(entities) + graph.nodes.extend(entity_nodes) + graph.edges.extend(entity_edges) + _, event_nodes, event_edges = self.parse_nodes_and_edges(events) + graph.nodes.extend(event_nodes) + graph.edges.extend(event_edges) + self.add_relations_to_graph(graph, entities, relations) + self.add_chunk_to_graph(graph, chunk) + return graph + + def append_official_name( + self, source_entities: List[Dict], entities_with_official_name: List[Dict] + ): + """ + Appends official names to entities. + + Args: + source_entities (List[Dict]): A list of source entities. + entities_with_official_name (List[Dict]): A list of entities with official names. + """ + tmp_dict = {} + for tmp_entity in entities_with_official_name: + name = tmp_entity["name"] + category = tmp_entity["category"] + official_name = tmp_entity["official_name"] + key = f"{category}{name}" + tmp_dict[key] = official_name + + for tmp_entity in source_entities: + name = tmp_entity["name"] + category = tmp_entity["category"] + key = f"{category}{name}" + if key in tmp_dict: + official_name = tmp_dict[key] + tmp_entity["official_name"] = official_name + + def postprocess_graph(self, graph): + """ + Postprocesses the graph by merging nodes with the same name and label. + + Args: + graph (SubGraph): The graph to postprocess. + + Returns: + The postprocessed graph. + """ + try: + all_node_properties = {} + for node in graph.nodes: + id_ = node.id + name = node.name + label = node.label + key = (id_, name, label) + if key not in all_node_properties: + all_node_properties[key] = node.properties + else: + all_node_properties[key].update(node.properties) + new_graph = SubGraph([], []) + for key, node_properties in all_node_properties.items(): + id_, name, label = key + new_graph.add_node( + id=id_, name=name, label=label, properties=node_properties + ) + new_graph.edges = graph.edges + return new_graph + except: + return graph + + def _invoke(self, input: Input, **kwargs) -> List[Output]: + """ + Invokes the extractor on the given input. + + Args: + input (Input): The input data. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: The list of output results. 
+ """ + title = input.name + passage = title + "\n" + input.content + + out = [] + entities = self.named_entity_recognition(passage) + events = self.event_extraction(passage) + named_entities = [] + for entity in entities: + named_entities.append( + {"name": entity["name"], "category": entity["category"]} + ) + relations = self.relations_extraction(passage, named_entities) + std_entities = self.named_entity_standardization(passage, named_entities) + self.append_official_name(entities, std_entities) + subgraph = self.assemble_subgraph(input, entities, relations, events) + out.append(self.postprocess_graph(subgraph)) + logger.debug(f"input passage:\n{passage}") + logger.debug(f"output graphs:\n{out}") + return out diff --git a/kag/builder/component/extractor/kag_extractor.py b/kag/builder/component/extractor/schema_free_extractor.py similarity index 64% rename from kag/builder/component/extractor/kag_extractor.py rename to kag/builder/component/extractor/schema_free_extractor.py index fd6f9913..ccf29128 100644 --- a/kag/builder/component/extractor/kag_extractor.py +++ b/kag/builder/component/extractor/schema_free_extractor.py @@ -11,64 +11,75 @@ # or implied. import copy import logging -import os from typing import Dict, Type, List +from kag.interface import LLMClient from tenacity import stop_after_attempt, retry -from kag.builder.prompt.spg_prompt import SPG_KGPrompt -from kag.interface.builder import ExtractorABC -from kag.common.base.prompt_op import PromptOp -from knext.schema.client import OTHER_TYPE, CHUNK_TYPE, BASIC_TYPES +from kag.interface import ExtractorABC, PromptABC, ExternalGraphLoaderABC + +from kag.common.conf import KAG_PROJECT_CONF from kag.common.utils import processing_phrases, to_camel_case from kag.builder.model.chunk import Chunk from kag.builder.model.sub_graph import SubGraph +from kag.builder.prompt.utils import init_prompt_with_fallback +from knext.schema.client import OTHER_TYPE, CHUNK_TYPE, BASIC_TYPES from knext.common.base.runnable import Input, Output from knext.schema.client import SchemaClient -from knext.schema.model.base import SpgTypeEnum logger = logging.getLogger(__name__) -class KAGExtractor(ExtractorABC): +@ExtractorABC.register("schema_free") +@ExtractorABC.register("schema_free_extractor") +class SchemaFreeExtractor(ExtractorABC): """ A class for extracting knowledge graph subgraphs from text using a large language model (LLM). Inherits from the Extractor base class. + + Attributes: + llm (LLMClient): The large language model client used for text processing. + schema (SchemaClient): The schema client used to load the schema for the project. + ner_prompt (PromptABC): The prompt used for named entity recognition. + std_prompt (PromptABC): The prompt used for named entity standardization. + triple_prompt (PromptABC): The prompt used for triple extraction. + external_graph (ExternalGraphLoaderABC): The external graph loader used for additional NER. 
""" - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.llm = self._init_llm() - self.prompt_config = self.config.get("prompt", {}) - self.biz_scene = self.prompt_config.get("biz_scene") or os.getenv( - "KAG_PROMPT_BIZ_SCENE", "default" - ) - self.language = self.prompt_config.get("language") or os.getenv( - "KAG_PROMPT_LANGUAGE", "en" - ) - self.schema = SchemaClient(project_id=self.project_id).load() - self.ner_prompt = PromptOp.load(self.biz_scene, "ner")( - language=self.language, project_id=self.project_id - ) - self.std_prompt = PromptOp.load(self.biz_scene, "std")(language=self.language) - self.triple_prompt = PromptOp.load(self.biz_scene, "triple")( - language=self.language - ) - self.kg_types = [] - for type_name, spg_type in self.schema.items(): - if type_name in SPG_KGPrompt.ignored_types: - continue - if spg_type.spg_type_enum == SpgTypeEnum.Concept: - continue - properties = list(spg_type.properties.keys()) - for p in properties: - if p not in SPG_KGPrompt.ignored_properties: - self.kg_types.append(type_name) - break - if self.kg_types: - self.kg_prompt = SPG_KGPrompt( - self.kg_types, language=self.language, project_id=self.project_id - ) + def __init__( + self, + llm: LLMClient, + ner_prompt: PromptABC = None, + std_prompt: PromptABC = None, + triple_prompt: PromptABC = None, + external_graph: ExternalGraphLoaderABC = None, + ): + """ + Initializes the KAGExtractor with the specified parameters. + + Args: + llm (LLMClient): The large language model client. + ner_prompt (PromptABC, optional): The prompt for named entity recognition. Defaults to None. + std_prompt (PromptABC, optional): The prompt for named entity standardization. Defaults to None. + triple_prompt (PromptABC, optional): The prompt for triple extraction. Defaults to None. + external_graph (ExternalGraphLoaderABC, optional): The external graph loader. Defaults to None. + """ + super().__init__() + self.llm = llm + self.schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() + self.ner_prompt = ner_prompt + self.std_prompt = std_prompt + self.triple_prompt = triple_prompt + + biz_scene = KAG_PROJECT_CONF.biz_scene + if self.ner_prompt is None: + self.ner_prompt = init_prompt_with_fallback("ner", biz_scene) + if self.std_prompt is None: + self.std_prompt = init_prompt_with_fallback("std", biz_scene) + if self.triple_prompt is None: + self.triple_prompt = init_prompt_with_fallback("triple", biz_scene) + + self.external_graph = external_graph @property def input_types(self) -> Type[Input]: @@ -87,12 +98,34 @@ def named_entity_recognition(self, passage: str): Returns: The result of the named entity recognition operation. 
""" - if self.kg_types: - kg_result = self.llm.invoke({"input": passage}, self.kg_prompt) - else: - kg_result = [] ner_result = self.llm.invoke({"input": passage}, self.ner_prompt) - return kg_result + ner_result + if self.external_graph: + extra_ner_result = self.external_graph.ner(passage) + else: + extra_ner_result = [] + output = [] + dedup = set() + for item in extra_ner_result: + name = item.name + label = item.label + description = item.properties.get("desc", "") + semantic_type = item.properties.get("semanticType", label) + if name not in dedup: + dedup.add(name) + output.append( + { + "name": name, + "type": semantic_type, + "category": label, + "description": description, + } + ) + for item in ner_result: + name = item.get("name", None) + if name and name not in dedup: + dedup.add(name) + output.append(item) + return output @retry(stop=stop_after_attempt(3)) def named_entity_standardization(self, passage: str, entities: List[Dict]): @@ -125,20 +158,26 @@ def triples_extraction(self, passage: str, entities: List[Dict]): ) def assemble_sub_graph_with_spg_records(self, entities: List[Dict]): + """ + Assembles a subgraph using SPG records. + + Args: + entities (List[Dict]): A list of entities to be used for subgraph assembly. + + Returns: + The assembled subgraph and the updated list of entities. + """ sub_graph = SubGraph([], []) for record in entities: - s_name = record.get("entity", "") + s_name = record.get("name", "") s_label = record.get("category", "") properties = record.get("properties", {}) tmp_properties = copy.deepcopy(properties) spg_type = self.schema.get(s_label) - if not spg_type: - continue for prop_name, prop_value in properties.items(): if prop_value == "NAN": tmp_properties.pop(prop_name) continue - if prop_name in spg_type.properties: from knext.schema.model.property import Property @@ -173,11 +212,14 @@ def assemble_sub_graph_with_triples( sub_graph (SubGraph): The subgraph to add edges to. entities (List[Dict]): A list of entities, for looking up category information. triples (List[list]): A list of triples, each representing a relationship to be added to the subgraph. + Returns: + The constructed subgraph. + """ def get_category(entities_data, entity_name): for entity in entities_data: - if entity["entity"] == entity_name: + if entity["name"] == entity_name: return entity["category"] return None @@ -194,7 +236,6 @@ def get_category(entities_data, entity_name): if o_category is None: o_category = OTHER_TYPE sub_graph.add_node(tri[2], tri[2], o_category) - edge_type = to_camel_case(tri[1]) if edge_type: sub_graph.add_edge(tri[0], s_category, edge_type, tri[2], o_category) @@ -208,6 +249,8 @@ def assemble_sub_graph_with_chunk(sub_graph: SubGraph, chunk: Chunk): Args: sub_graph (SubGraph): The subgraph to add the chunk information to. chunk (Chunk): The chunk object containing the text and metadata. + Returns: + The constructed subgraph. """ for node in sub_graph.nodes: sub_graph.add_edge(node.id, node.label, "source", chunk.id, CHUNK_TYPE) @@ -240,7 +283,7 @@ def assemble_sub_graph( entities (List[Dict]): A list of entities identified in the chunk. triples (List[list]): A list of triples representing relationships between entities. Returns: - SubGraph: The constructed subgraph. + The constructed subgraph. 
""" self.assemble_sub_graph_with_entities(sub_graph, entities) self.assemble_sub_graph_with_triples(sub_graph, entities, triples) @@ -259,7 +302,7 @@ def assemble_sub_graph_with_entities( """ for ent in entities: - name = processing_phrases(ent["entity"]) + name = processing_phrases(ent["name"]) sub_graph.add_node( name, name, @@ -302,26 +345,31 @@ def append_official_name( source_entities (List[Dict]): A list of source entities. entities_with_official_name (List[Dict]): A list of entities with official names. """ - tmp_dict = {} - for tmp_entity in entities_with_official_name: - name = tmp_entity["entity"] - category = tmp_entity["category"] - official_name = tmp_entity["official_name"] - key = f"{category}{name}" - tmp_dict[key] = official_name - - for tmp_entity in source_entities: - name = tmp_entity["entity"] - category = tmp_entity["category"] - key = f"{category}{name}" - if key in tmp_dict: - official_name = tmp_dict[key] - tmp_entity["official_name"] = official_name - - def quoteStr(self, input: str) -> str: - return f"""{input}""" - - def invoke(self, input: Input, **kwargs) -> List[Output]: + try: + tmp_dict = {} + for tmp_entity in entities_with_official_name: + if "name" in tmp_entity: + name = tmp_entity["name"] + elif "entity" in tmp_entity: + name = tmp_entity["entity"] + else: + continue + category = tmp_entity["category"] + official_name = tmp_entity["official_name"] + key = f"{category}{name}" + tmp_dict[key] = official_name + + for tmp_entity in source_entities: + name = tmp_entity["name"] + category = tmp_entity["category"] + key = f"{category}{name}" + if key in tmp_dict: + official_name = tmp_dict[key] + tmp_entity["official_name"] = official_name + except Exception as e: + logger.warn(f"failed to process official name, info: {e}") + + def _invoke(self, input: Input, **kwargs) -> List[Output]: """ Invokes the semantic extractor to process input data. @@ -332,24 +380,19 @@ def invoke(self, input: Input, **kwargs) -> List[Output]: Returns: List[Output]: A list of processed results, containing subgraph information. 
""" - title = input.name - passage = self.quoteStr(title + "\n" + input.content) - try: - entities = self.named_entity_recognition(passage) - sub_graph, entities = self.assemble_sub_graph_with_spg_records(entities) - filtered_entities = [ - {k: v for k, v in ent.items() if k in ["entity", "category"]} - for ent in entities - ] - triples = self.triples_extraction(passage, filtered_entities) - std_entities = self.named_entity_standardization(passage, filtered_entities) - self.append_official_name(entities, std_entities) - self.assemble_sub_graph(sub_graph, input, entities, triples) - return [sub_graph] - except Exception as e: - import traceback - - traceback.print_exc() - logger.info(e) - return [] + title = input.name + passage = title + "\n" + input.content + out = [] + entities = self.named_entity_recognition(passage) + sub_graph, entities = self.assemble_sub_graph_with_spg_records(entities) + filtered_entities = [ + {k: v for k, v in ent.items() if k in ["name", "category"]} + for ent in entities + ] + triples = self.triples_extraction(passage, filtered_entities) + std_entities = self.named_entity_standardization(passage, filtered_entities) + self.append_official_name(entities, std_entities) + self.assemble_sub_graph(sub_graph, input, entities, triples) + out.append(sub_graph) + return out diff --git a/kag/builder/component/extractor/spg_extractor.py b/kag/builder/component/extractor/spg_extractor.py deleted file mode 100644 index b1c63930..00000000 --- a/kag/builder/component/extractor/spg_extractor.py +++ /dev/null @@ -1,116 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. -import copy -import logging -from typing import List, Dict - -from tenacity import retry, stop_after_attempt - -from kag.builder.component.extractor import KAGExtractor -from kag.builder.model.sub_graph import SubGraph -from kag.builder.prompt.spg_prompt import SPG_KGPrompt -from kag.common.base.prompt_op import PromptOp -from knext.common.base.runnable import Input, Output - -from knext.schema.client import BASIC_TYPES - -logger = logging.getLogger(__name__) - - -class SPGExtractor(KAGExtractor): - """ - A Builder Component that extracting structured data from long texts by invoking large language model. - - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.spg_ner_types, self.kag_ner_types = [], [] - for type_name, spg_type in self.schema.items(): - properties = list(spg_type.properties.keys()) - for p in properties: - if p not in SPG_KGPrompt.ignored_properties: - self.spg_ner_types.append(type_name) - continue - self.kag_ner_types.append(type_name) - self.kag_ner_prompt = PromptOp.load(self.biz_scene, "ner")(language=self.language, project_id=self.project_id) - self.spg_ner_prompt = SPG_KGPrompt(self.spg_ner_types, self.language, project_id=self.project_id) - - @retry(stop=stop_after_attempt(3)) - def named_entity_recognition(self, passage: str): - """ - Performs named entity recognition on a given text passage. - Args: - passage (str): The text to perform named entity recognition on. 
- Returns: - The result of the named entity recognition operation. - """ - spg_ner_result = self.llm.batch({"input": passage}, self.spg_ner_prompt) - kag_ner_result = self.llm.invoke({"input": passage}, self.kag_ner_prompt) - return spg_ner_result + kag_ner_result - - def assemble_sub_graph_with_spg_records(self, entities: List[Dict]): - sub_graph = SubGraph([], []) - for record in entities: - s_name = record.get("entity", "") - s_label = record.get("category", "") - properties = record.get("properties", {}) - tmp_properties = copy.deepcopy(properties) - spg_type = self.schema.get(s_label) - for prop_name, prop_value in properties.items(): - if prop_value == "NAN": - tmp_properties.pop(prop_name) - continue - if prop_name in spg_type.properties: - from knext.schema.model.property import Property - prop: Property = spg_type.properties.get(prop_name) - o_label = prop.object_type_name_en - if o_label not in BASIC_TYPES: - if isinstance(prop_value, str): - prop_value = [prop_value] - for o_name in prop_value: - sub_graph.add_node(id=o_name, name=o_name, label=o_label) - sub_graph.add_edge(s_id=s_name, s_label=s_label, p=prop_name, o_id=o_name, o_label=o_label) - tmp_properties.pop(prop_name) - record["properties"] = tmp_properties - sub_graph.add_node(id=s_name, name=s_name, label=s_label, properties=properties) - return sub_graph, entities - - def invoke(self, input: Input, **kwargs) -> List[Output]: - """ - Invokes the semantic extractor to process input data. - - Args: - input (Input): Input data containing name and content. - **kwargs: Additional keyword arguments. - - Returns: - List[Output]: A list of processed results, containing subgraph information. - """ - title = input.name - passage = title + "\n" + input.content - - try: - entities = self.named_entity_recognition(passage) - sub_graph, entities = self.assemble_sub_graph_with_spg_records(entities) - filtered_entities = [{k: v for k, v in ent.items() if k in ["entity", "category"]} for ent in entities] - triples = self.triples_extraction(passage, filtered_entities) - std_entities = self.named_entity_standardization(passage, filtered_entities) - self.append_official_name(entities, std_entities) - self.assemble_sub_graph(sub_graph, input, entities, triples) - return [sub_graph] - except Exception as e: - import traceback - - traceback.print_exc() - logger.info(e) - return [] diff --git a/kag/builder/component/mapping/__init__.py b/kag/builder/component/mapping/__init__.py index e0744009..e69de29b 100644 --- a/kag/builder/component/mapping/__init__.py +++ b/kag/builder/component/mapping/__init__.py @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
- -from kag.builder.component.mapping.spg_type_mapping import SPGTypeMapping -from kag.builder.component.mapping.relation_mapping import RelationMapping -from kag.builder.component.mapping.spo_mapping import SPOMapping - -__all__ = [ - "SPGTypeMapping", - "RelationMapping", - "SPOMapping", -] diff --git a/kag/builder/component/mapping/relation_mapping.py b/kag/builder/component/mapping/relation_mapping.py index 47fa9f64..d77d2db7 100644 --- a/kag/builder/component/mapping/relation_mapping.py +++ b/kag/builder/component/mapping/relation_mapping.py @@ -10,40 +10,46 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -from collections import defaultdict from typing import Dict, List from kag.builder.model.sub_graph import SubGraph from knext.common.base.runnable import Input, Output from knext.schema.client import SchemaClient - -from knext.schema.model.schema_helper import ( - SPGTypeName, - RelationName, -) -from kag.interface.builder.mapping_abc import MappingABC +from kag.common.conf import KAG_PROJECT_CONF +from kag.interface import MappingABC +@MappingABC.register("relation") class RelationMapping(MappingABC): """ - A class that handles relation mappings by assembling subgraphs based on given subject, predicate, and object names. - This class extends the Mapping class. - - Args: - subject_name (SPGTypeName): The name of the subject type. - predicate_name (RelationName): The name of the predicate. - object_name (SPGTypeName): The name of the object type. + A class that extends the MappingABC class. + It handles relation mappings by assembling subgraphs based on given subject, predicate, and object names. """ def __init__( self, - subject_name: SPGTypeName, - predicate_name: RelationName, - object_name: SPGTypeName, - **kwargs + subject_name: str, + predicate_name: str, + object_name: str, + src_id_field: str = None, + dst_id_field: str = None, + property_mapping: dict = {}, + **kwargs, ): + """ + Initializes the RelationMapping instance. + + Args: + subject_name (str): The name of the subject type. + predicate_name (str): The name of the predicate type. + object_name (str): The name of the object type. + src_id_field (str, optional): The field name for the source ID. Defaults to None. + dst_id_field (str, optional): The field name for the destination ID. Defaults to None. + property_mapping (dict, optional): A dictionary mapping properties. Defaults to {}. + **kwargs: Additional keyword arguments passed to the parent class constructor. 
+ """ super().__init__(**kwargs) - schema = SchemaClient(project_id=self.project_id).load() + schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() assert subject_name in schema, f"{subject_name} is not a valid SPG type name" assert object_name in schema, f"{object_name} is not a valid SPG type name" self.subject_type = schema.get(subject_name) @@ -54,10 +60,9 @@ def __init__( ), f"{predicate_name} is not a valid SPG property/relation name" self.predicate_name = predicate_name - self.src_id_field = None - self.dst_id_field = None - self.property_mapping: Dict = defaultdict(list) - self.linking_strategies: Dict = dict() + self.src_id_field = src_id_field + self.dst_id_field = dst_id_field + self.property_mapping = property_mapping def add_src_id_mapping(self, source_name: str): """ @@ -96,7 +101,11 @@ def add_sub_property_mapping(self, source_name: str, target_name: str): Returns: self """ - self.property_mapping[target_name].append(source_name) + + if target_name in self.property_mapping: + self.property_mapping[target_name].append(source_name) + else: + self.property_mapping[target_name] = [source_name] return self @property diff --git a/kag/builder/component/mapping/spg_type_mapping.py b/kag/builder/component/mapping/spg_type_mapping.py index 49400f70..3aa33487 100644 --- a/kag/builder/component/mapping/spg_type_mapping.py +++ b/kag/builder/component/mapping/spg_type_mapping.py @@ -15,33 +15,31 @@ import pandas from knext.schema.client import BASIC_TYPES -from kag.builder.model.sub_graph import SubGraph, Node +from kag.builder.model.sub_graph import SubGraph from knext.common.base.runnable import Input, Output from knext.schema.client import SchemaClient from knext.schema.model.base import SpgTypeEnum - from knext.schema.model.schema_helper import ( - SPGTypeName, PropertyName, ) +from kag.common.conf import KAG_PROJECT_CONF from kag.interface.builder.mapping_abc import MappingABC - -FuseFunc = Callable[[SubGraph], List[SubGraph]] -LinkFunc = Callable[[str, Node], List[Node]] +from kag.common.registry import Functor +@MappingABC.register("spg") +@MappingABC.register("spg_mapping") class SPGTypeMapping(MappingABC): """ - A class for mapping SPG (Simple Property Graph) types and handling their properties and strategies. + A class for mapping SPG(Semantic-enhanced Programmable Graph) types and handling their properties and strategies. Attributes: spg_type_name (SPGTypeName): The name of the SPG type. fuse_op (FuseOpABC, optional): The user-defined fuse operator. Defaults to None. """ - def __init__(self, spg_type_name: SPGTypeName, fuse_func: FuseFunc = None, **kwargs): - super().__init__(**kwargs) - self.schema = SchemaClient(project_id=self.project_id).load() + def __init__(self, spg_type_name: str, fuse_func: Functor = None): + self.schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() assert ( spg_type_name in self.schema ), f"SPG type [{spg_type_name}] does not exist." @@ -55,7 +53,7 @@ def add_property_mapping( self, source_name: str, target_name: PropertyName, - link_func: LinkFunc = None, + link_func: Callable = None, ): """ Adds a property mapping from a source name to a target name within the SPG type. 
diff --git a/kag/builder/component/mapping/spo_mapping.py b/kag/builder/component/mapping/spo_mapping.py index 2ab11c93..57b7e978 100644 --- a/kag/builder/component/mapping/spo_mapping.py +++ b/kag/builder/component/mapping/spo_mapping.py @@ -10,7 +10,6 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. import json -from collections import defaultdict from typing import List, Type, Dict from kag.interface.builder.mapping_abc import MappingABC @@ -19,17 +18,44 @@ from knext.schema.client import OTHER_TYPE +@MappingABC.register("spo") +@MappingABC.register("spo_mapping") class SPOMapping(MappingABC): + """ + A class that extends the MappingABC base class. + It is responsible for mapping structured dictionaries to a list of SubGraphs. + """ + + def __init__( + self, + s_type_col: str = None, + s_id_col: str = None, + p_type_col: str = None, + o_type_col: str = None, + o_id_col: str = None, + sub_property_col: str = None, + sub_property_mapping: dict = {}, + ): + """ + Initializes the SPOMapping instance. - def __init__(self): + Args: + s_type_col (str, optional): The column name for the subject type. Defaults to None. + s_id_col (str, optional): The column name for the subject ID. Defaults to None. + p_type_col (str, optional): The column name for the predicate type. Defaults to None. + o_type_col (str, optional): The column name for the object type. Defaults to None. + o_id_col (str, optional): The column name for the object ID. Defaults to None. + sub_property_col (str, optional): The column name for sub-properties. Defaults to None. + sub_property_mapping (dict, optional): A dictionary mapping sub-properties. Defaults to {}. + """ super().__init__() - self.s_type_col = None - self.s_id_col = None - self.p_type_col = None - self.o_type_col = None - self.o_id_col = None - self.sub_property_mapping = defaultdict(list) - self.sub_property_col = None + self.s_type_col = s_type_col + self.s_id_col = s_id_col + self.p_type_col = p_type_col + self.o_type_col = o_type_col + self.o_id_col = o_id_col + self.sub_property_col = sub_property_col + self.sub_property_mapping = sub_property_mapping @property def input_types(self) -> Type[Input]: @@ -39,7 +65,27 @@ def input_types(self) -> Type[Input]: def output_types(self) -> Type[Output]: return SubGraph - def add_field_mappings(self, s_id_col: str, p_type_col: str, o_id_col: str, s_type_col: str = None, o_type_col: str = None): + def add_field_mappings( + self, + s_id_col: str, + p_type_col: str, + o_id_col: str, + s_type_col: str = None, + o_type_col: str = None, + ): + """ + Adds field mappings for the subject, predicate, and object types and IDs. + + Args: + s_id_col (str): The column name for the subject ID. + p_type_col (str): The column name for the predicate type. + o_id_col (str): The column name for the object ID. + s_type_col (str, optional): The column name for the subject type. Defaults to None. + o_type_col (str, optional): The column name for the object type. Defaults to None. 
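The column-oriented constructor above maps one flat record per call, via the `invoke` shown at the end of this file's hunk. A hypothetical row-to-subgraph run (column names invented):

```python
from kag.builder.component.mapping.spo_mapping import SPOMapping

mapping = SPOMapping(s_type_col="s_type", s_id_col="s", p_type_col="p",
                     o_type_col="o_type", o_id_col="o")
row = {"s_type": "Person", "s": "alice", "p": "worksAt",
       "o_type": "Company", "o": "acme"}
graphs = mapping.invoke(row)
# expected: one SubGraph with two nodes ("alice", "acme") and a "worksAt" edge
```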
+ + Returns: + self + """ self.s_type_col = s_type_col self.s_id_col = s_id_col self.p_type_col = p_type_col @@ -63,7 +109,10 @@ def add_sub_property_mapping(self, source_name: str, target_name: str = None): if not target_name: self.sub_property_col = source_name else: - self.sub_property_mapping[target_name].append(source_name) + if target_name in self.sub_property_mapping: + self.sub_property_mapping[target_name].append(source_name) + else: + self.sub_property_mapping[target_name] = [source_name] return self def assemble_sub_graph(self, record: Dict[str, str]): @@ -86,14 +135,21 @@ def assemble_sub_graph(self, record: Dict[str, str]): sub_graph.add_node(id=o_id, name=o_id, label=o_type) sub_properties = {} if self.sub_property_col: - sub_properties = json.loads(record.get(self.sub_property_col, '{}')) + sub_properties = json.loads(record.get(self.sub_property_col, "{}")) sub_properties = {k: str(v) for k, v in sub_properties.items()} else: for target_name, source_names in self.sub_property_mapping.items(): for source_name in source_names: value = record.get(source_name) sub_properties[target_name] = value - sub_graph.add_edge(s_id=s_id, s_label=s_type, p=p, o_id=o_id, o_label=o_type, properties=sub_properties) + sub_graph.add_edge( + s_id=s_id, + s_label=s_type, + p=p, + o_id=o_id, + o_label=o_type, + properties=sub_properties, + ) return sub_graph def invoke(self, input: Input, **kwargs) -> List[Output]: @@ -105,7 +161,7 @@ def invoke(self, input: Input, **kwargs) -> List[Output]: **kwargs: Additional keyword arguments. Returns: - List[Output]: A list of resulting sub-graphs. + List[Output]: A list of resulting subgraphs. """ record: Dict[str, str] = input sub_graph = self.assemble_sub_graph(record) diff --git a/kag/solver/logic/core_modules/op_executor/op_deduce/module/__init__.py b/kag/builder/component/postprocessor/__init__.py similarity index 100% rename from kag/solver/logic/core_modules/op_executor/op_deduce/module/__init__.py rename to kag/builder/component/postprocessor/__init__.py diff --git a/kag/builder/component/postprocessor/kag_postprocessor.py b/kag/builder/component/postprocessor/kag_postprocessor.py new file mode 100644 index 00000000..8af36b06 --- /dev/null +++ b/kag/builder/component/postprocessor/kag_postprocessor.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import logging +from typing import List +from tenacity import stop_after_attempt, retry +from kag.interface import PostProcessorABC +from kag.interface import ExternalGraphLoaderABC +from kag.builder.model.sub_graph import SubGraph +from kag.common.conf import KAGConstants, KAG_PROJECT_CONF +from kag.common.utils import get_vector_field_name +from knext.search.client import SearchClient +from knext.schema.client import SchemaClient, OTHER_TYPE + + +logger = logging.getLogger() + + +@PostProcessorABC.register("base", as_default=True) +@PostProcessorABC.register("kag_post_processor") +class KAGPostProcessor(PostProcessorABC): + """ + A class that extends the PostProcessorABC base class. 
+ It provides methods to handle various post-processing tasks on subgraphs + including filtering, entity linking based on similarity, and linking based on an external graph. + """ + + def __init__( + self, + similarity_threshold: float = 0.9, + external_graph: ExternalGraphLoaderABC = None, + ): + """ + Initializes the KAGPostProcessor instance. + + Args: + similarity_threshold (float, optional): The similarity threshold for entity linking. Defaults to 0.9. + external_graph (ExternalGraphLoaderABC, optional): An instance of ExternalGraphLoaderABC for external graph-based linking. Defaults to None. + """ + super().__init__() + self.schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() + self.similarity_threshold = similarity_threshold + self.external_graph = external_graph + self._init_search() + + def format_label(self, label: str): + """ + Formats the label by adding the project namespace if it is not already present. + + Args: + label (str): The label to be formatted. + + Returns: + str: The formatted label. + """ + namespace = KAG_PROJECT_CONF.namespace + if label.split(".")[0] == namespace: + return label + return f"{namespace}.{label}" + + def _init_search(self): + """ + Initializes the search client for entity linking. + """ + self._search_client = SearchClient( + KAG_PROJECT_CONF.host_addr, KAG_PROJECT_CONF.project_id + ) + + def filter_invalid_data(self, graph: SubGraph): + """ + Filters out invalid nodes and edges from the subgraph. + + Args: + graph (SubGraph): The subgraph to be filtered. + + Returns: + SubGraph: The filtered subgraph. + """ + valid_nodes = [] + valid_edges = [] + for node in graph.nodes: + if not node.id or not node.label: + continue + if node.label not in self.schema: + node.label = self.format_label(OTHER_TYPE) + # for k in node.properties.keys(): + # if k not in self.schema[node.label]: + # continue + valid_nodes.append(node) + for edge in graph.edges: + if edge.label: + valid_edges.append(edge) + return SubGraph(nodes=valid_nodes, edges=valid_edges) + + @retry(stop=stop_after_attempt(3)) + def _entity_link( + self, graph: SubGraph, property_key: str = "name", labels: List[str] = None + ): + """ + Performs entity linking based on the given property key and labels. + + Args: + graph (SubGraph): The subgraph to perform entity linking on. + property_key (str, optional): The property key to use for linking. Defaults to "name". + labels (List[str], optional): The labels to consider for linking. Defaults to None. + """ + vector_field_name = get_vector_field_name(property_key) + for node in graph.nodes: + if labels is None: + link_labels = [self.format_label(node.label)] + else: + link_labels = [self.format_label(x) for x in labels] + vector = node.properties.get(vector_field_name) + if vector: + all_similar_nodes = [] + for label in link_labels: + similar_nodes = self._search_client.search_vector( + label=label, + property_key=property_key, + query_vector=[float(x) for x in vector], + topk=1, + params={}, + ) + all_similar_nodes.extend(similar_nodes) + for item in all_similar_nodes: + score = item["score"] + if ( + score >= self.similarity_threshold + and node.id != item["node"]["id"] + ): + graph.add_edge( + node.id, + node.label, + KAGConstants.KAG_SIMILAR_EDGE_NAME, + item["node"]["id"], + item["node"]["__labels__"][0], + ) + + def similarity_based_link(self, graph: SubGraph, property_key: str = "name"): + """ + Performs entity linking based on similarity. + + Args: + graph (SubGraph): The subgraph to perform entity linking on. 
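Putting the pieces above together, the post processor is a drop-in pipeline stage. A sketch, assuming the component base class wraps `_invoke` in a public `invoke` and that the registry exposes `from_config` (neither is shown in this diff):

```python
from kag.interface import PostProcessorABC

pp = PostProcessorABC.from_config({
    "type": "kag_post_processor",     # name registered above
    "similarity_threshold": 0.95,     # stricter than the 0.9 default
})
# For a SubGraph `g` whose nodes already carry name vectors:
#   [processed] = pp.invoke(g)
# runs filter_invalid_data -> similarity_based_link -> external_graph_based_link.
```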
+ property_key (str, optional): The property key to use for linking. Defaults to "name". + """ + self._entity_link(graph, property_key, None) + + def external_graph_based_link(self, graph: SubGraph, property_key: str = "name"): + """ + Performs entity linking based on the user provided external graph. + + Args: + graph (SubGraph): The subgraph to perform entity linking on. + property_key (str, optional): The property key to use for linking. Defaults to "name". + """ + if not self.external_graph: + return + labels = self.external_graph.get_allowed_labels() + self._entity_link(graph, property_key, labels) + + def _invoke(self, input, **kwargs): + """ + Invokes the post-processing pipeline on the input subgraph. + + Args: + input: The input subgraph to be processed. + + Returns: + List[SubGraph]: A list containing the processed subgraph. + """ + origin_num_nodes = len(input.nodes) + origin_num_edges = len(input.edges) + new_graph = self.filter_invalid_data(input) + self.similarity_based_link(new_graph) + self.external_graph_based_link(new_graph) + new_num_nodes = len(new_graph.nodes) + new_num_edges = len(new_graph.edges) + logger.debug( + f"origin: {origin_num_nodes}/{origin_num_edges}, processed: {new_num_nodes}/{new_num_edges}" + ) + return [new_graph] diff --git a/kag/builder/component/reader/__init__.py b/kag/builder/component/reader/__init__.py index df6c45b5..e69de29b 100644 --- a/kag/builder/component/reader/__init__.py +++ b/kag/builder/component/reader/__init__.py @@ -1,33 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -from kag.builder.component.reader.csv_reader import CSVReader -from kag.builder.component.reader.pdf_reader import PDFReader -from kag.builder.component.reader.json_reader import JSONReader -from kag.builder.component.reader.markdown_reader import MarkDownReader -from kag.builder.component.reader.docx_reader import DocxReader -from kag.builder.component.reader.txt_reader import TXTReader -from kag.builder.component.reader.dataset_reader import HotpotqaCorpusReader, TwowikiCorpusReader, MusiqueCorpusReader -from kag.builder.component.reader.yuque_reader import YuqueReader - -__all__ = [ - "TXTReader", - "PDFReader", - "MarkDownReader", - "JSONReader", - "HotpotqaCorpusReader", - "MusiqueCorpusReader", - "TwowikiCorpusReader", - "YuqueReader", - "CSVReader", - "DocxReader", -] diff --git a/kag/builder/component/reader/csv_reader.py b/kag/builder/component/reader/csv_reader.py deleted file mode 100644 index 9c7c157d..00000000 --- a/kag/builder/component/reader/csv_reader.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
-import os -from typing import List, Type, Dict - -import pandas as pd - -from kag.builder.model.chunk import Chunk -from kag.interface.builder.reader_abc import SourceReaderABC -from knext.common.base.runnable import Input, Output - - -class CSVReader(SourceReaderABC): - """ - A class for reading CSV files, inheriting from `SourceReader`. - Supports converting CSV data into either a list of dictionaries or a list of Chunk objects. - - Args: - output_type (Output): Specifies the output type, which can be "Dict" or "Chunk". - **kwargs: Additional keyword arguments passed to the parent class constructor. - """ - - def __init__(self, output_type="Chunk", **kwargs): - super().__init__(**kwargs) - if output_type == "Dict": - self.output_types = Dict[str, str] - else: - self.output_types = Chunk - self.id_col = kwargs.get("id_col", "id") - self.name_col = kwargs.get("name_col", "name") - self.content_col = kwargs.get("content_col", "content") - - @property - def input_types(self) -> Type[Input]: - return str - - @property - def output_types(self) -> Type[Output]: - return self._output_types - - @output_types.setter - def output_types(self, output_types): - self._output_types = output_types - - def invoke(self, input: Input, **kwargs) -> List[Output]: - """ - Reads a CSV file and converts the data format based on the output type. - - Args: - input (Input): Input parameter, expected to be a string representing the path to the CSV file. - **kwargs: Additional keyword arguments, which may include `id_column`, `name_column`, `content_column`, etc. - - Returns: - List[Output]: - - If `output_types` is `Chunk`, returns a list of Chunk objects. - - If `output_types` is `Dict`, returns a list of dictionaries. - """ - - try: - data = pd.read_csv(input) - data = data.astype(str) - except Exception as e: - raise IOError(f"Failed to read the file: {e}") - - if self.output_types == Chunk: - chunks = [] - basename, _ = os.path.splitext(os.path.basename(input)) - for idx, row in enumerate(data.to_dict(orient="records")): - kwargs = {k: v for k, v in row.items() if k not in [self.id_col, self.name_col, self.content_col]} - chunks.append( - Chunk( - id=row.get(self.id_col) or Chunk.generate_hash_id(f"{input}#{idx}"), - name=row.get(self.name_col) or f"{basename}#{idx}", - content=row[self.content_col], - **kwargs - ) - ) - return chunks - else: - return data.to_dict(orient="records") diff --git a/kag/builder/component/reader/dataset_reader.py b/kag/builder/component/reader/dataset_reader.py deleted file mode 100644 index 850b87ef..00000000 --- a/kag/builder/component/reader/dataset_reader.py +++ /dev/null @@ -1,97 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
- -import json -import os -from typing import List, Type - -from kag.builder.model.chunk import Chunk -from kag.interface.builder import SourceReaderABC -from knext.common.base.runnable import Input, Output - - -class HotpotqaCorpusReader(SourceReaderABC): - @property - def input_types(self) -> Type[Input]: - """The type of input this Runnable object accepts specified as a type annotation.""" - return str - - @property - def output_types(self) -> Type[Output]: - """The type of output this Runnable object produces specified as a type annotation.""" - return Chunk - - def invoke(self, input: str, **kwargs) -> List[Output]: - if os.path.exists(str(input)): - with open(input, "r") as f: - corpus = json.load(f) - else: - corpus = json.loads(input) - chunks = [] - - for item_key, item_value in corpus.items(): - chunk = Chunk( - id=item_key, - name=item_key, - content="\n".join(item_value), - ) - chunks.append(chunk) - return chunks - - -class MusiqueCorpusReader(SourceReaderABC): - @property - def input_types(self) -> Type[Input]: - """The type of input this Runnable object accepts specified as a type annotation.""" - return str - - @property - def output_types(self) -> Type[Output]: - """The type of output this Runnable object produces specified as a type annotation.""" - return Chunk - - def get_basename(self, file_name: str): - base, ext = os.path.splitext(os.path.basename(file_name)) - return base - - def invoke(self, input: str, **kwargs) -> List[Output]: - id_column = kwargs.get("id_column", "title") - name_column = kwargs.get("name_column", "title") - content_column = kwargs.get("content_column", "text") - - if os.path.exists(str(input)): - with open(input, "r") as f: - corpusList = json.load(f) - else: - corpusList = input - chunks = [] - - for item in corpusList: - chunk = Chunk( - id=item[id_column], - name=item[name_column], - content=item[content_column], - ) - chunks.append(chunk) - return chunks - - -class TwowikiCorpusReader(MusiqueCorpusReader): - @property - def input_types(self) -> Type[Input]: - """The type of input this Runnable object accepts specified as a type annotation.""" - return str - - @property - def output_types(self) -> Type[Output]: - """The type of output this Runnable object produces specified as a type annotation.""" - return Chunk diff --git a/kag/builder/component/reader/dict_reader.py b/kag/builder/component/reader/dict_reader.py new file mode 100644 index 00000000..bf90b24d --- /dev/null +++ b/kag/builder/component/reader/dict_reader.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +from typing import Dict, List +from kag.interface import ReaderABC +from knext.common.base.runnable import Output, Input +from kag.builder.model.chunk import Chunk + + +@ReaderABC.register("dict") +@ReaderABC.register("dict_reader") +class DictReader(ReaderABC): + """ + A class for reading dictionaries into Chunk objects. + + This class inherits from ReaderABC and provides the functionality to convert dictionary inputs + into a list of Chunk objects. 
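Anticipating the `_invoke` implementation shown just below, a hypothetical run of this reader (the key names are the defaults; `invoke` wrapping `_invoke` is assumed from the component base class):

```python
from kag.builder.component.reader.dict_reader import DictReader

reader = DictReader()
[chunk] = reader.invoke({"id": "c1", "name": "intro",
                         "content": "hello", "lang": "en"})
# id/name/content are popped into the Chunk fields; leftover keys such as
# "lang" ride along as extra Chunk kwargs.
```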
+ + Attributes: + id_col (str): The key in the input dictionary that corresponds to the chunk's ID. + name_col (str): The key in the input dictionary that corresponds to the chunk's name. + content_col (str): The key in the input dictionary that corresponds to the chunk's content. + """ + + def __init__( + self, id_col: str = "id", name_col: str = "name", content_col: str = "content" + ): + """ + Initializes the DictReader with the specified column names. + + Args: + id_col (str): The key in the input dictionary that corresponds to the chunk's ID. Defaults to "id". + name_col (str): The key in the input dictionary that corresponds to the chunk's name. Defaults to "name". + content_col (str): The key in the input dictionary that corresponds to the chunk's content. Defaults to "content". + """ + super().__init__() + self.id_col = id_col + self.name_col = name_col + self.content_col = content_col + + @property + def input_types(self) -> Input: + return Dict + + def _invoke(self, input: Input, **kwargs) -> List[Output]: + """ + Converts the input dictionary into a list of Chunk objects. + + Args: + input (Input): The input dictionary containing the data to be parsed. + **kwargs: Additional keyword arguments, currently unused but kept for potential future expansion. + + Returns: + List[Output]: A list containing a single Chunk object created from the input dictionary. + """ + chunk_id = input.get(self.id_col) + chunk_name = input.get(self.name_col) + chunk_content = input.get(self.content_col) + if self.id_col in input: + input.pop(self.id_col) + if self.name_col in input: + input.pop(self.name_col) + if self.content_col in input: + input.pop(self.content_col) + + return [Chunk(id=chunk_id, name=chunk_name, content=chunk_content, **input)] diff --git a/kag/builder/component/reader/docx_reader.py b/kag/builder/component/reader/docx_reader.py index d9208f62..06464301 100644 --- a/kag/builder/component/reader/docx_reader.py +++ b/kag/builder/component/reader/docx_reader.py @@ -11,17 +11,17 @@ # or implied. import os -from typing import List, Type,Union +from typing import List, Union from docx import Document - -from kag.builder.component.reader import MarkDownReader +from kag.interface import LLMClient from kag.builder.model.chunk import Chunk -from kag.interface.builder import SourceReaderABC +from kag.interface import ReaderABC +from kag.builder.prompt.outline_prompt import OutlinePrompt +from kag.common.conf import KAG_PROJECT_CONF +from kag.common.utils import generate_hash_id from knext.common.base.runnable import Input, Output -from kag.common.llm.client import LLMClient -from kag.builder.prompt.outline_prompt import OutlinePrompt def split_txt(content): from modelscope.outputs import OutputKeys @@ -30,40 +30,49 @@ def split_txt(content): p = pipeline( task=Tasks.document_segmentation, - model='damo/nlp_bert_document-segmentation_chinese-base') + model="damo/nlp_bert_document-segmentation_chinese-base", + ) result = p(documents=content) result = result[OutputKeys.TEXT] - - res = [r for r in result.split('\n\t') if len(r) > 0] - + + res = [r for r in result.split("\n\t") if len(r) > 0] + return res - -class DocxReader(SourceReaderABC): +@ReaderABC.register("docx") +@ReaderABC.register("docx_reader") +class DocxReader(ReaderABC): """ - A class for reading Docx files, inheriting from SourceReader. - This class is specifically designed to extract text content from Docx files and generate Chunk objects based on the extracted content. + A class for reading Docx files into Chunk objects. 
+ + This class inherits from ReaderABC and provides the functionality to process Docx files, + extract their text content, and convert it into a list of Chunk objects. """ - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.split_level = kwargs.get("split_level", 3) - self.split_using_outline = kwargs.get("split_using_outline", True) - self.outline_flag = True - self.llm = self._init_llm() - language = os.getenv("KAG_PROMPT_LANGUAGE", "zh") - self.prompt = OutlinePrompt(language) - - @property - def input_types(self) -> Type[Input]: - return str - - @property - def output_types(self) -> Type[Output]: - return Chunk - - def outline_chunk(self, chunk: Union[Chunk, List[Chunk]],basename) -> List[Chunk]: + + def __init__(self, llm: LLMClient = None): + """ + Initializes the DocxReader with an optional LLMClient instance. + + Args: + llm (LLMClient): An optional LLMClient instance used for generating outlines. Defaults to None. + """ + super().__init__() + self.llm = llm + self.prompt = OutlinePrompt(KAG_PROJECT_CONF.language) + + def outline_chunk(self, chunk: Union[Chunk, List[Chunk]], basename) -> List[Chunk]: + """ + Generates outlines for the given chunk(s) and separates the content based on these outlines. + + Args: + chunk (Union[Chunk, List[Chunk]]): A single Chunk object or a list of Chunk objects. + basename: The base name used for generating chunk IDs and names. + + Returns: + List[Chunk]: A list of Chunk objects separated by the generated outlines. + """ if isinstance(chunk, Chunk): chunk = [chunk] outlines = [] @@ -71,20 +80,35 @@ def outline_chunk(self, chunk: Union[Chunk, List[Chunk]],basename) -> List[Chunk outline = self.llm.invoke({"input": c.content}, self.prompt) outlines.extend(outline) content = "\n".join([c.content for c in chunk]) - chunks = self.sep_by_outline(content, outlines,basename) + chunks = self.sep_by_outline(content, outlines, basename) return chunks - - def sep_by_outline(self,content,outlines,basename): + + def sep_by_outline(self, content, outlines, basename): + """ + Separates the content based on the provided outlines. + + Args: + content (str): The content to be separated. + outlines (List[str]): A list of outlines used to separate the content. + basename: The base name used for generating chunk IDs and names. + + Returns: + List[Chunk]: A list of Chunk objects separated by the provided outlines. + """ position_check = [] for outline in outlines: start = content.find(outline) - position_check.append((outline,start)) + position_check.append((outline, start)) chunks = [] - for idx,pc in enumerate(position_check): + for idx, pc in enumerate(position_check): chunk = Chunk( - id = Chunk.generate_hash_id(f"{basename}#{pc[0]}"), + id=generate_hash_id(f"{basename}#{pc[0]}"), name=f"{basename}#{pc[0]}", - content=content[pc[1]:position_check[idx+1][1] if idx+1 < len(position_check) else len(position_check)], + content=content[ + pc[1] : position_check[idx + 1][1] + if idx + 1 < len(position_check) + else len(position_check) + ], ) chunks.append(chunk) return chunks @@ -111,16 +135,25 @@ def _extract_text_from_docx(doc: Document) -> str: for para in doc.paragraphs: full_text.append(para.text) return full_text - + def _get_title_from_text(self, text: str) -> str: + """ + Extracts the title from the provided text. + + Args: + text (str): The text from which to extract the title. + + Returns: + str: The extracted title and the remaining text. 
+ """ text = text.strip() - title = text.split('\n')[0] - text = "\n".join(text.split('\n')) - return title,text + title = text.split("\n")[0] + text = "\n".join(text.split("\n")) + return title, text - def invoke(self, input: Input, **kwargs) -> List[Output]: + def _invoke(self, input: Input, **kwargs) -> List[Output]: """ - Processes the input Docx file, extracts its text content, and generates a Chunk object. + Processes the input Docx file, extracts its text content, and generates Chunk objects. Args: input (Input): The file path of the Docx file to be processed. @@ -136,9 +169,9 @@ def invoke(self, input: Input, **kwargs) -> List[Output]: if not input: raise ValueError("Input cannot be empty") - + chunks = [] - + try: doc = Document(input) full_text = self._extract_text_from_docx(doc) @@ -148,32 +181,12 @@ def invoke(self, input: Input, **kwargs) -> List[Output]: basename, _ = os.path.splitext(os.path.basename(input)) - for text in full_text: - title,text = self._get_title_from_text(text) - chunk = Chunk( - id=Chunk.generate_hash_id(f"{basename}#{title}"), - name=f"{basename}#{title}", - content=text, - ) - chunks.append(chunk) - - if len(chunks) < 2: - chunks = self.outline_chunk(chunks,basename) - - if len(chunks) < 2: - semantic_res = split_txt(content) - chunks = [Chunk( - id=Chunk.generate_hash_id(input+"#"+r[:10]), - name=basename+"#"+r[:10], - content=r, - ) for r in semantic_res] + chunk = Chunk( + id=generate_hash_id(input), + name=basename, + content=content, + **{"documentId": basename, "documentName": basename}, + ) + chunks.append(chunk) return chunks - - -if __name__== "__main__": - reader = DocxReader() - print(reader.output_types) - file_path = os.path.dirname(__file__) - res = reader.invoke(os.path.join(file_path,"../../../../tests/builder/data/test_docx.docx")) - print(res) \ No newline at end of file diff --git a/kag/builder/component/reader/json_reader.py b/kag/builder/component/reader/json_reader.py deleted file mode 100644 index 9ee27f54..00000000 --- a/kag/builder/component/reader/json_reader.py +++ /dev/null @@ -1,164 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import json -import os -from typing import List, Type, Dict, Union - -from kag.builder.component.reader.markdown_reader import MarkDownReader -from kag.builder.model.chunk import Chunk -from kag.interface.builder.reader_abc import SourceReaderABC -from knext.common.base.runnable import Input, Output - -from kag.common.llm.client import LLMClient - - -class JSONReader(SourceReaderABC): - """ - A class for reading JSON files, inheriting from `SourceReader`. - Supports converting JSON data into either a list of dictionaries or a list of Chunk objects. - - Args: - output_types (Output): Specifies the output type, which can be "Dict" or "Chunk". - **kwargs: Additional keyword arguments passed to the parent class constructor. 
- """ - - def __init__(self, output_type="Chunk", **kwargs): - super().__init__(**kwargs) - if output_type == "Dict": - self.output_types = Dict[str, str] - else: - self.output_types = Chunk - self.id_col = kwargs.get("id_col", "id") - self.name_col = kwargs.get("name_col", "name") - self.content_col = kwargs.get("content_col", "content") - - @property - def input_types(self) -> Type[Input]: - return str - - @property - def output_types(self) -> Type[Output]: - return self._output_types - - @output_types.setter - def output_types(self, output_types): - self._output_types = output_types - - @staticmethod - def _read_from_file(file_path: str) -> Union[dict, list]: - """ - Safely reads JSON from a file and returns its content. - - Args: - file_path (str): The path to the JSON file. - - Returns: - Union[dict, list]: The parsed JSON content. - - Raises: - ValueError: If there is an error reading the JSON file. - """ - try: - with open(file_path, "r") as file: - return json.load(file) - except json.JSONDecodeError as e: - raise ValueError(f"Error reading JSON from file: {e}") - except FileNotFoundError as e: - raise ValueError(f"File not found: {e}") - - @staticmethod - def _parse_json_string(json_string: str) -> Union[dict, list]: - """ - Parses a JSON string and returns its content. - - Args: - json_string (str): The JSON string to parse. - - Returns: - Union[dict, list]: The parsed JSON content. - - Raises: - ValueError: If there is an error parsing the JSON string. - """ - try: - return json.loads(json_string) - except json.JSONDecodeError as e: - raise ValueError(f"Error parsing JSON string: {e}") - - def invoke(self, input: str, **kwargs) -> List[Output]: - """ - Parses the input string data and generates a list of Chunk objects or returns the original data. - - This method supports receiving JSON-formatted strings. It extracts specific fields based on provided keyword arguments. - It can read from a file or directly parse a string. If the input data is in the expected format, it generates a list of Chunk objects; - otherwise, it throws a ValueError if the input is not a JSON array or object. - - Args: - input (str): The input data, which can be a JSON string or a file path. - **kwargs: Keyword arguments used to specify the field names for ID, name, and content. - - Returns: - List[Output]: A list of Chunk objects or the original data. - - Raises: - ValueError: If the input data format is incorrect or parsing fails. 
- """ - - id_col = kwargs.get("id_col", "id") - name_col = kwargs.get("name_col", "name") - content_col = kwargs.get("content_col", "content") - self.id_col = id_col - self.name_col = name_col - self.content_col = content_col - try: - if os.path.exists(input): - corpus = self._read_from_file(input) - else: - corpus = self._parse_json_string(input) - except ValueError as e: - raise e - - if not isinstance(corpus, (list, dict)): - raise ValueError("Expected input to be a JSON array or object") - - if isinstance(corpus, dict): - corpus = [corpus] - - if self.output_types == Chunk: - chunks = [] - basename, _ = os.path.splitext(os.path.basename(input)) - for idx, item in enumerate(corpus): - if not isinstance(item, dict): - continue - - chunk = Chunk( - id=item.get(self.id_col) or Chunk.generate_hash_id(f"{input}#{idx}"), - name=item.get(self.name_col) or f"{basename}#{idx}", - content=item.get(self.content_col), - ) - chunks.append(chunk) - - return chunks - else: - return corpus - -if __name__ == "__main__": - reader = JSONReader() - json_string = '''[ - { - "title": "test_json", - "text": "Test content" - } - ]''' - chunks = reader.invoke(json_string,name_column="title",content_col = "text") - res = 1 \ No newline at end of file diff --git a/kag/builder/component/reader/markdown_reader.py b/kag/builder/component/reader/markdown_reader.py index adfcffbd..ba212c8e 100644 --- a/kag/builder/component/reader/markdown_reader.py +++ b/kag/builder/component/reader/markdown_reader.py @@ -12,24 +12,37 @@ import os -import bs4.element import markdown from bs4 import BeautifulSoup, Tag -from typing import List, Type + import logging import re import requests -import pandas as pd -from io import StringIO -from tenacity import stop_after_attempt, retry +from typing import List, Dict -from kag.interface.builder import SourceReaderABC -from kag.builder.model.chunk import Chunk, ChunkTypeEnum -from knext.common.base.runnable import Output, Input + +from kag.interface import ReaderABC +from kag.builder.model.chunk import Chunk +from kag.interface import LLMClient from kag.builder.prompt.analyze_table_prompt import AnalyzeTablePrompt +from knext.common.base.runnable import Output, Input + +logger = logging.getLogger(__name__) -class MarkDownReader(SourceReaderABC): + +class MarkdownNode: + def __init__(self, title: str, level: int, content: str = ""): + self.title = title + self.level = level + self.content = content + self.children: List[MarkdownNode] = [] + self.tables: List[Dict] = [] # 存储表格数据 + + +@ReaderABC.register("md") +@ReaderABC.register("md_reader") +class MarkDownReader(ReaderABC): """ A class for reading MarkDown files, inheriting from `SourceReader`. Supports converting MarkDown data into a list of Chunk objects. @@ -41,352 +54,344 @@ class MarkDownReader(SourceReaderABC): ALL_LEVELS = [f"h{x}" for x in range(1, 7)] TABLE_CHUCK_FLAG = "<<>>" - def __init__(self, cut_depth: int = 1, **kwargs): + def __init__(self, cut_depth: int = 3, llm: LLMClient = None, **kwargs): super().__init__(**kwargs) self.cut_depth = int(cut_depth) - self.llm_module = kwargs.get("llm_module", None) + self.llm = llm self.analyze_table_prompt = AnalyzeTablePrompt(language="zh") self.analyze_img_prompt = AnalyzeTablePrompt(language="zh") @property - def input_types(self) -> Type[Input]: + def input_types(self): return str @property - def output_types(self) -> Type[Output]: + def output_types(self): return Chunk - def to_text(self, level_tags): - """ - Converts parsed hierarchical tags into text content. 
+ def solve_content( + self, id: str, title: str, content: str, **kwargs + ) -> List[Output]: + # Convert Markdown to HTML with additional extensions for lists + html = markdown.markdown( + content, extensions=["tables", "nl2br", "sane_lists", "fenced_code"] + ) + soup = BeautifulSoup(html, "html.parser") + + def is_in_code_block(element): + """Check if an element is inside a code block""" + parent = element.parent + while parent: + if parent.name in ["pre", "code"]: + return True + parent = parent.parent + return False + + def process_text_with_links(element): + """Process text containing links, preserving original markdown format""" + result = [] + current_text = "" + + for child in element.children: + if isinstance(child, Tag): + if child.name == "a": + # If there's previous text, add it first + if current_text: + result.append(current_text.strip()) + current_text = "" + + # Rebuild markdown format link + link_text = child.get_text().strip() + href = child.get("href", "") + title = child.get("title", "") + + if title: + result.append(f'[{link_text}]({href} "{title}")') + else: + result.append(f"[{link_text}]({href})") + else: + current_text += child.get_text() + else: + current_text += str(child) + + if current_text: + result.append(current_text.strip()) + + return " ".join(result) + + # Initialize root node + root = MarkdownNode("root", 0) + stack = [root] + current_content = [] + + # Traverse all elements + all_elements = soup.find_all( + [ + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "p", + "table", + "ul", + "ol", + "li", + "pre", + "code", + ] + ) + for element in all_elements: + if element.name.startswith("h") and not is_in_code_block(element): + # Only process headers that are not in code blocks + # Handle title logic + if current_content and stack[-1].title != "root": + stack[-1].content = "\n".join(current_content) + current_content = [] + + level = int(element.name[1]) + title_text = process_text_with_links(element) # Process links in title + new_node = MarkdownNode(title_text, level) + + while stack and stack[-1].level >= level: + stack.pop() + + if stack: + stack[-1].children.append(new_node) + stack.append(new_node) + + elif element.name in ["code"]: + # Preserve code blocks as is + text = element.get_text() + if text: + current_content.append(text) + + elif element.name in ["ul", "ol"]: + continue + + elif element.name == "li": + text = process_text_with_links(element) # Process links in list items + if text: + if element.find_parent("ol"): + index = len(element.find_previous_siblings("li")) + 1 + current_content.append(f"{index}. {text}") + else: + current_content.append(f"* {text}") + + elif element.name == "table": + # Process table + table_data = [] + headers = [] + + if element.find("thead"): + for th in element.find("thead").find_all("th"): + headers.append(th.get_text().strip()) + + if element.find("tbody"): + for row in element.find("tbody").find_all("tr"): + row_data = {} + for i, td in enumerate(row.find_all("td")): + if i < len(headers): + row_data[headers[i]] = td.get_text().strip() + table_data.append(row_data) + + # Add table to current node + if stack[-1].title != "root": + stack[-1].tables.append({"headers": headers, "data": table_data}) + + elif element.name == "p": + text = process_text_with_links(element) # Process links in paragraphs + if text: + if not text.startswith("* ") and not re.match(r"^\d+\. 
", text): + current_content.append(text) + + # Process content of the last node + if current_content and stack[-1].title != "root": + stack[-1].content = "\n".join(current_content) + + outputs = self._convert_to_outputs(root, id) + return outputs + + def _convert_to_outputs( + self, + node: MarkdownNode, + id: str, + parent_id: str = None, + parent_titles: List[str] = None, + parent_contents: List[str] = None, + ) -> List[Output]: + def convert_table_to_markdown(headers, data): + """Convert table data to markdown format""" + if not headers or not data: + return "" + + # Build header row + header_row = " | ".join(headers) + # Build separator row + separator = " | ".join(["---"] * len(headers)) + # Build data rows + data_rows = [] + for row in data: + row_values = [str(row.get(header, "")) for header in headers] + data_rows.append(" | ".join(row_values)) + + # Combine all rows + table_md = f"\n| {header_row} |\n| {separator} |\n" + table_md += "\n".join(f"| {row} |" for row in data_rows) + return table_md + "\n" + + def collect_tables(n: MarkdownNode): + """Collect tables from node and its children""" + tables = [] + table_md = [] + if n.tables: + for table in n.tables: + tables.append(table) + table_md.append( + convert_table_to_markdown(table["headers"], table["data"]) + ) + for child in n.children: + child_tables, child_table_md = collect_tables(child) + tables.extend(child_tables) + table_md.extend(child_table_md) + return tables, table_md + + def collect_children_content(n: MarkdownNode): + """Collect content from node and its children""" + content = [] + if n.content: + content.append(n.content) + # Add current node's table content + for table in n.tables: + content.append( + convert_table_to_markdown(table["headers"], table["data"]) + ) + # Process child nodes recursively + for child in n.children: + content.extend(collect_children_content(child)) + return content - Args: - level_tags (list): Parsed tags organized by Markdown heading levels and other tags. + outputs = [] + if parent_titles is None: + parent_titles = [] + if parent_contents is None: + parent_contents = [] - Returns: - str: Text content derived from the parsed tags. 
- """ - content = [] - for item in level_tags: - if isinstance(item, list): - content.append(self.to_text(item)) - else: - header, tag = item - if not isinstance(tag, Tag): - continue - elif tag.name in self.ALL_LEVELS: - content.append( - f"{header}-{tag.text}" if len(header) > 0 else tag.text - ) - else: - content.append(self.tag_to_text(tag)) - return "\n".join(content) + current_titles = parent_titles + ([node.title] if node.title != "root" else []) - def tag_to_text(self, tag: bs4.element.Tag): - """ - 将html tag转换为text - 如果是table,输出markdown,添加表格标记,方便后续构建Chunk - :param tag: - :return: - """ - if tag.name == "table": - try: - html_table = str(tag) - table_df = pd.read_html(html_table)[0] - return f"{self.TABLE_CHUCK_FLAG}{table_df.to_markdown(index=False)}{self.TABLE_CHUCK_FLAG}" - except: - logging.warning("parse table tag to text error", exc_info=True) - return tag.text - - @retry(stop=stop_after_attempt(5)) - def analyze_table(self, table,analyze_mathod="human"): - if analyze_mathod == "llm": - if self.llm_module == None: - logging.INFO("llm_module is None, cannot use analyze_table") - return table - variables = { - "table": table - } - response = self.llm_module.invoke( - variables = variables, - prompt_op = self.analyze_table_prompt, - with_json_parse=False - ) - if response is None or response == "" or response == []: - raise Exception("llm_module return None") - return response - else: - from io import StringIO - import pandas as pd - try: - df = pd.read_html(StringIO(table))[0] - except Exception as e: - logging.warning(f"analyze_table error: {e}") - return table - content = "" - for index, row in df.iterrows(): - content+=f"第{index+1}行的数据如下:" - for col_name, value in row.items(): - content+=f"{col_name}的值为{value}," - content+='\n' - return content + # If current node level equals target level, create output + if node.level >= self.cut_depth: + full_title = " / ".join(current_titles) - - @retry(stop=stop_after_attempt(5)) - def analyze_img(self, img_url): - response = requests.get(img_url) - response.raise_for_status() - image_data = response.content - - pass - - def replace_table(self, content: str): - pattern = r"]*>([\s\S]*?)<\/table>" - for match in re.finditer(pattern, content): - table = match.group(0) - table = self.analyze_table(table) - content = content.replace(match.group(1), table) - return content - - def replace_img(self, content: str): - pattern = r"]*src=[\"\']([^\"\']*)[\"\']" - for match in re.finditer(pattern, content): - img_url = match.group(1) - img_msg = self.analyze_img(img_url) - content = content.replace(match.group(0), img_msg) - return content - - def extract_table(self, level_tags, header=""): - """ - Extracts tables from the parsed hierarchical tags along with their headers. + # Merge content: parent content + current content + all_content = parent_contents + ([node.content] if node.content else []) - Args: - level_tags (list): Parsed tags organized by Markdown heading levels and other tags. - header (str): Current header text being processed. + # Add current node's table content + for table in node.tables: + all_content.append( + convert_table_to_markdown(table["headers"], table["data"]) + ) - Returns: - list: A list of tuples, each containing the table's header, context text, and the table tag. 
- """ - tables = [] - for idx, item in enumerate(level_tags): - if isinstance(item, list): - tables += self.extract_table(item, header) - else: - tag = item[1] - if not isinstance(tag, Tag): - continue - if tag.name in self.ALL_LEVELS: - header = f"{header}-{tag.text}" if len(header) > 0 else tag.text - - if tag.name == "table": - if idx - 1 >= 0: - context = level_tags[idx - 1] - if isinstance(context, tuple): - tables.append((header, context[1].text, tag)) - else: - tables.append((header, "", tag)) - return tables - - def parse_level_tags( - self, - level_tags: list, - level: str, - parent_header: str = "", - cur_header: str = "", - ): - """ - Recursively parses level tags to organize them into a structured format. + # Add all child node content (including tables) + for child in node.children: + child_content = collect_children_content(child) + all_content.extend(child_content) - Args: - level_tags (list): A list of tags to be parsed. - level (str): The current level being processed. - parent_header (str): The header of the parent tag. - cur_header (str): The header of the current tag. + current_output = Chunk( + id=f"{id}_{len(outputs)}", + parent_id=parent_id, + name=full_title, + content="\n".join(filter(None, all_content)), + ) - Returns: - list: A structured representation of the parsed tags. - """ - if len(level_tags) == 0: - return [] - output = [] - prefix_tags = [] - while len(level_tags) > 0: - tag = level_tags[0] - if tag.name in self.ALL_LEVELS: - break - else: - prefix_tags.append((parent_header, level_tags.pop(0))) - if len(prefix_tags) > 0: - output.append(prefix_tags) - - cur = [] - while len(level_tags) > 0: - tag = level_tags[0] - if tag.name not in self.ALL_LEVELS: - cur.append((parent_header, level_tags.pop(0))) - else: - - if tag.name > level: - cur += self.parse_level_tags( - level_tags, - tag.name, - f"{parent_header}-{cur_header}" - if len(parent_header) > 0 - else cur_header, - tag.name, + # Collect table data and convert to markdown format + all_tables = [] + table_contents = [] + if node.tables: + for table in node.tables: + all_tables.append(table) + table_contents.append( + convert_table_to_markdown(table["headers"], table["data"]) ) - elif tag.name == level: - if len(cur) > 0: - output.append(cur) - cur = [(parent_header, level_tags.pop(0))] - cur_header = tag.text - else: - if len(cur) > 0: - output.append(cur) - return output - if len(cur) > 0: - output.append(cur) - return output - - def cut(self, level_tags, cur_level, final_level): - """ - Cuts the provided level tags into chunks based on the specified levels. - Args: - level_tags (list): A list of tags to be cut. - cur_level (int): The current level in the hierarchy. - final_level (int): The final level to which the tags should be cut. + for child in node.children: + child_tables, child_table_md = collect_tables(child) + all_tables.extend(child_tables) + table_contents.extend(child_table_md) + + if all_tables: + current_output.metadata = {"tables": all_tables} + current_output.table = "\n".join( + table_contents + ) # Save all tables in markdown format + + outputs.append(current_output) + + # If current node level is less than target level, continue traversing + elif node.level < self.cut_depth: + # Check if any subtree contains target level nodes + has_target_level = False + current_contents = parent_contents + ( + [node.content] if node.content else [] + ) - Returns: - list: A list of cut chunks. 
- """ - output = [] - if cur_level == final_level: - cur_prefix = [] - for sublevel_tags in level_tags: - if ( - isinstance(sublevel_tags, tuple) - ): - cur_prefix.append(self.to_text([sublevel_tags,])) - else: - break - cur_prefix = "\n".join(cur_prefix) - - if len(cur_prefix) > 0: - output.append(cur_prefix) - for sublevel_tags in level_tags: - if isinstance(sublevel_tags, list): - output.append(cur_prefix + "\n" + self.to_text(sublevel_tags)) - return output - else: - cur_prefix = [] - for sublevel_tags in level_tags: - if ( - isinstance(sublevel_tags, tuple) - ): - cur_prefix.append(sublevel_tags[1].text) - else: - break - cur_prefix = "\n".join(cur_prefix) - if len(cur_prefix) > 0: - output.append(cur_prefix) + # Add current node's tables to content + for table in node.tables: + current_contents.append( + convert_table_to_markdown(table["headers"], table["data"]) + ) - for sublevel_tags in level_tags: - if isinstance(sublevel_tags, list): - output += self.cut(sublevel_tags, cur_level + 1, final_level) - return output + for child in node.children: + child_outputs = self._convert_to_outputs( + child, id, parent_id, current_titles, current_contents + ) + if child_outputs: + has_target_level = True + outputs.extend(child_outputs) + + # If no target level nodes found and current node is not root, output current node + if not has_target_level and node.title != "root": + full_title = " / ".join(current_titles) + all_content = current_contents + + for child in node.children: + child_content = collect_children_content(child) + all_content.extend(child_content) + + current_output = Chunk( + id=f"{id}_{len(outputs)}", + parent_id=parent_id, + name=full_title, + content="\n".join(filter(None, all_content)), + ) - def solve_content(self, id: str, title: str, content: str, **kwargs) -> List[Output]: - """ - Converts Markdown content into structured chunks. + # Collect table data and convert to markdown format + all_tables = [] + table_contents = [] + if node.tables: + for table in node.tables: + all_tables.append(table) + table_contents.append( + convert_table_to_markdown(table["headers"], table["data"]) + ) - Args: - id (str): An identifier for the content. - title (str): The title of the content. - content (str): The Markdown formatted content to be processed. + for child in node.children: + child_tables, child_table_md = collect_tables(child) + all_tables.extend(child_tables) + table_contents.extend(child_table_md) - Returns: - List[Output]: A list of processed content chunks. 
-        """
-        html_content = markdown.markdown(
-            content, extensions=["markdown.extensions.tables"]
-        )
-        # html_content = self.replace_table(html_content)
-        soup = BeautifulSoup(html_content, "html.parser")
-        if soup is None:
-            raise ValueError("The MarkDown file appears to be empty or unreadable.")
-
-        top_level = None
-        for level in self.ALL_LEVELS:
-            tmp = soup.find_all(level)
-            if len(tmp) > 0:
-                top_level = level
-                break
-        if top_level is None:
-            chunk = Chunk(
-                id=Chunk.generate_hash_id(str(id)),
-                name=title,
-                content=soup.text,
-                ref=kwargs.get("ref", ""),
-            )
-            return [chunk]
-        tags = [tag for tag in soup.children if isinstance(tag, Tag)]
-
-        level_tags = self.parse_level_tags(tags, top_level)
-        cutted = self.cut(level_tags, 0, self.cut_depth)
-
-        chunks = []
-
-        for idx, content in enumerate(cutted):
-            chunk = None
-            if self.TABLE_CHUCK_FLAG in content:
-                chunk = self.get_table_chuck(content, title, id, idx)
-                chunk.ref = kwargs.get("ref", "")
-            else:
-                chunk = Chunk(
-                    id=Chunk.generate_hash_id(f"{id}#{idx}"),
-                    name=f"{title}#{idx}",
-                    content=content,
-                    ref=kwargs.get("ref", ""),
-                )
-            chunks.append(chunk)
-        return chunks
+            if all_tables:
+                current_output.metadata = {"tables": all_tables}
+                current_output.table = "\n".join(
+                    table_contents
+                )  # Save all tables in markdown format

-    def get_table_chuck(self, table_chunk_str: str, title: str, id: str, idx: int) -> Chunk:
-        """
-        convert table chunk
-        :param table_chunk_str:
-        :return:
-        """
-        table_chunk_str = table_chunk_str.replace("\\N", "")
-        pattern = f"{self.TABLE_CHUCK_FLAG}(.*){self.TABLE_CHUCK_FLAG}"
-        matches = re.findall(pattern, table_chunk_str, re.DOTALL)
-        if not matches or len(matches) <= 0:
-            # 找不到表格信息,按照Text Chunk处理
-            return Chunk(
-                id=Chunk.generate_hash_id(f"{id}#{idx}"),
-                name=f"{title}#{idx}",
-                content=table_chunk_str,
-            )
-        table_markdown_str = matches[0]
-        html_table_str = markdown.markdown(table_markdown_str, extensions=["markdown.extensions.tables"])
-        try:
-            df = pd.read_html(html_table_str)[0]
-        except Exception as e:
-            logging.warning(f"get_table_chuck error: {e}")
-            df = pd.DataFrame()
-
-        # 确认是表格Chunk,去除内容中的TABLE_CHUCK_FLAG
-        replaced_table_text = re.sub(pattern, f'\n{table_markdown_str}\n', table_chunk_str, flags=re.DOTALL)
-        return Chunk(
-            id=Chunk.generate_hash_id(f"{id}#{idx}"),
-            name=f"{title}#{idx}",
-            content=replaced_table_text,
-            type=ChunkTypeEnum.Table,
-            csv_data=df.to_csv(index=False),
-        )
+                outputs.append(current_output)
+
+        return outputs

-    def invoke(self, input: Input, **kwargs) -> List[Output]:
+    def _invoke(self, input: Input, **kwargs) -> List[Output]:
         """
         Processes a Markdown file and returns its content as structured chunks.
@@ -411,4 +416,55 @@ def invoke(self, input: Input, **kwargs) -> List[Output]:
         basename, _ = os.path.splitext(os.path.basename(file_path))
 
         chunks = self.solve_content(input, basename, content)
+        length_500_list = []
+        length_1000_list = []
+        length_5000_list = []
+        length_small_list = []
+        for chunk in chunks:
+            if chunk.content is not None:
+                if len(chunk.content) > 5000:
+                    length_5000_list.append(chunk)
+                elif len(chunk.content) > 1000:
+                    length_1000_list.append(chunk)
+                elif len(chunk.content) > 500:
+                    length_500_list.append(chunk)
+                elif len(chunk.content) <= 500:
+                    length_small_list.append(chunk)
         return chunks
+
+
+@ReaderABC.register("yuque")
+@ReaderABC.register("yuque_reader")
+class YuequeReader(MarkDownReader):
+    """
+    A class for parsing Yuque documents into Chunk objects.
+
+    This class inherits from MarkDownReader and provides the functionality to process Yuque documents,
+    extract their content, and convert it into a list of Chunk objects.
+    """
+
+    def _invoke(self, input: Input, **kwargs) -> List[Output]:
+        """
+        Processes the input Yuque document and converts it into a list of Chunk objects.
+
+        Args:
+            input (Input): The input string containing the Yuque token and URL, joined by "@".
+            **kwargs: Additional keyword arguments, currently unused but kept for potential future expansion.
+
+        Returns:
+            List[Output]: A list of Chunk objects representing the parsed content.
+
+        Raises:
+            HTTPError: If the request to the Yuque URL fails.
+        """
+        token, url = input.split("@", 1)
+        headers = {"X-Auth-Token": token}
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
+        data = response.json()["data"]
+        id = data.get("id", "")
+        title = data.get("title", "")
+        content = data.get("body", "")
+
+        chunks = self.solve_content(id, title, content)
         return chunks
diff --git a/kag/builder/component/reader/mix_reader.py b/kag/builder/component/reader/mix_reader.py
new file mode 100644
index 00000000..6af7380a
--- /dev/null
+++ b/kag/builder/component/reader/mix_reader.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+# Copyright 2023 OpenSPG Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied.
+
+import os
+from typing import List
+
+from kag.interface import ReaderABC
+from knext.common.base.runnable import Input, Output
+from kag.builder.component.reader.txt_reader import TXTReader
+from kag.builder.component.reader.pdf_reader import PDFReader
+from kag.builder.component.reader.docx_reader import DocxReader
+from kag.builder.component.reader.markdown_reader import MarkDownReader
+from kag.builder.component.reader.dict_reader import DictReader
+
+
+@ReaderABC.register("mix", as_default=True)
+@ReaderABC.register("mix_reader")
+class MixReader(ReaderABC):
+    """
+    A reader class that can handle multiple types of inputs by delegating to specific readers.
+
+    This class initializes with a mapping of file types to their respective readers.
+    It provides a method to invoke the appropriate reader based on the input type.
+    """
+
+    def __init__(
+        self,
+        txt_reader: TXTReader = None,
+        pdf_reader: PDFReader = None,
+        docx_reader: DocxReader = None,
+        md_reader: MarkDownReader = None,
+        dict_reader: DictReader = None,
+    ):
+        """
+        Initializes the MixReader with a mapping of file types to their respective readers.
+
+        Args:
+            txt_reader (TXTReader, optional): Reader for .txt files. Defaults to None.
+            pdf_reader (PDFReader, optional): Reader for .pdf files. Defaults to None.
+            docx_reader (DocxReader, optional): Reader for .docx files. Defaults to None.
+            md_reader (MarkDownReader, optional): Reader for .md files. Defaults to None.
+            dict_reader (DictReader, optional): Reader for dictionary inputs. Defaults to None.
+        """
+        super().__init__()
+        self.parse_map = {
+            "txt": txt_reader,
+            "pdf": pdf_reader,
+            "docx": docx_reader,
+            "md": md_reader,
+            "dict": dict_reader,
+        }
+
+    def _invoke(self, input: Input, **kwargs) -> List[Output]:
+        """
+        Invokes the appropriate reader based on the input type.
+
+        Args:
+            input (Input): The input to be parsed. This can be a file path or a dictionary.
+            **kwargs: Additional keyword arguments to be passed to the reader.
+
+        Returns:
+            List[Output]: A list of parsed outputs.
+
+        Raises:
+            ValueError: If the input is empty.
+            FileNotFoundError: If the input file does not exist.
+            NotImplementedError: If the file suffix is not supported.
+            KeyError: If the reader for the given file type is not correctly configured.
+        """
+        if not input:
+            raise ValueError("Input cannot be empty")
+        if isinstance(input, dict):
+            reader_type = "dict"
+        else:
+            if not os.path.exists(input):
+                raise FileNotFoundError(f"File {input} not found.")
+
+            file_suffix = input.split(".")[-1]
+            if file_suffix not in self.parse_map:
+                raise NotImplementedError(
+                    f"File suffix {file_suffix} not supported yet."
+                )
+            reader_type = file_suffix
+
+        reader = self.parse_map[reader_type]
+        if reader is None:
+            raise KeyError(f"{reader_type} reader not correctly configured.")
+        return reader._invoke(input, **kwargs)
diff --git a/kag/builder/component/reader/pdf_reader.py b/kag/builder/component/reader/pdf_reader.py
index c60020d8..682a5a19 100644
--- a/kag/builder/component/reader/pdf_reader.py
+++ b/kag/builder/component/reader/pdf_reader.py
@@ -12,28 +12,26 @@
 import os
 import re
-from typing import List, Sequence, Type, Union
+from typing import List, Sequence, Union
+
+import pdfminer.layout  # noqa
 
-from langchain_community.document_loaders import PyPDFLoader
-import pdfminer.layout
 from kag.builder.model.chunk import Chunk
-from kag.interface.builder import SourceReaderABC
-from knext.common.base.runnable import Input, Output
-from kag.builder.prompt.outline_prompt import OutlinePrompt
+from kag.interface import ReaderABC
+from kag.builder.prompt.outline_prompt import OutlinePrompt
+from kag.interface import LLMClient
+from kag.common.conf import KAG_PROJECT_CONF
+from kag.common.utils import generate_hash_id
+from knext.common.base.runnable import Output
 from pdfminer.high_level import extract_text
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer, LTPage
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
-from pdfminer.layout import LAParams,LTTextBox
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfminer.converter import PDFPageAggregator
-from pdfminer.pdfpage import PDFTextExtractionNotAllowed
-import pdfminer
+import pdfminer  # noqa
+import PyPDF2
 
 import logging
@@ -41,34 +39,207 @@
 logger = logging.getLogger(__name__)
 
 
-class PDFReader(SourceReaderABC):
+@ReaderABC.register("pdf")
+@ReaderABC.register("pdf_reader")
+class PDFReader(ReaderABC):
     """
-    A PDF reader class that inherits from SourceReader.
+    A class for reading PDF files into a list of text chunks, inheriting from `ReaderABC`.
 
-    Attributes:
-        if_split (bool): Whether to split the content by pages. Default is False.
-        use_pypdf (bool): Whether to use PyPDF2 for processing PDF files. Default is True.
+    This class is responsible for parsing PDF files and converting them into a list of Chunk objects.
+    It inherits from `ReaderABC` and overrides the necessary methods to handle PDF-specific operations.
     """
 
-    def __init__(self, **kwargs):
+    def __init__(
+        self,
+        cut_depth: int = 3,
+        outline_flag: bool = True,
+        is_ocr: bool = False,
+        llm: LLMClient = None,
+        **kwargs,
+    ):
         super().__init__(**kwargs)
-        self.split_level = kwargs.get("split_level", 3)
-        self.split_using_outline = kwargs.get("split_using_outline", True)
-        self.outline_flag = True
-        self.llm = self._init_llm()
-        language = os.getenv("KAG_PROMPT_LANGUAGE", "zh")
+        self.cut_depth = cut_depth
+        self.outline_flag = outline_flag
+        self.is_ocr = is_ocr
+        self.llm = llm
+        language = KAG_PROJECT_CONF.language
         self.prompt = OutlinePrompt(language)
 
-    @property
-    def input_types(self) -> Type[Input]:
+    @property
+    def input_types(self):
         return str
 
     @property
-    def output_types(self) -> Type[Output]:
+    def output_types(self):
         return Chunk
-
-    def outline_chunk(self, chunk: Union[Chunk, List[Chunk]],basename) -> List[Chunk]:
+
+    def _get_full_outlines(self):
+        outlines = self.pdf_reader.outline
+        level_outlines = []
+
+        def _extract_outline_page_numbers(outlines, level=0):
+            for outline in outlines:
+                if isinstance(outline, list):
+                    _extract_outline_page_numbers(outline, level + 1)
+                else:
+                    title = outline.title
+                    page_number = self.pdf_reader.get_destination_page_number(outline)
+                    level_outlines.append((title, level, page_number, 0))
+
+        _extract_outline_page_numbers(outlines)
+        for idx, outline in enumerate(level_outlines):
+            level_outlines[idx] = (
+                outline[0],
+                outline[1],
+                outline[2],
+                level_outlines[idx + 1][2] if idx + 1 < len(level_outlines) else -1,
+            )
+        return level_outlines
+
+    def extract_content_from_outline(
+        self, page_contents, level_outlines
+    ) -> List[Chunk]:
+        total_content = "".join(page_contents)
+
+        def get_content_start(outline, page_contents):
+            page_start = outline[2]
+            page_end = outline[3]
+
+            previous_pages_length = sum(
+                len(content) for content in page_contents[:page_start]
+            )
+
+            find_content = "".join(
+                page_contents[page_start : page_end + 1 if page_end != -1 else None]
+            )
+
+            # Normalize special characters in the title
+            def normalize_text(text):
+                # Convert the dash "—" to the Chinese numeral "一"
+                text = text.replace("—", "一")
+                # Map full-width punctuation to ASCII; more conversions can be added here
+                text = re.sub(r"［", "[", text)
+                text = re.sub(r"］", "]", text)
+                text = re.sub(r"（", "(", text)
+                text = re.sub(r"）", ")", text)
+                return text
+
+            outline = (normalize_text(outline[0]), outline[1], outline[2], outline[3])
+
+            def fuzzy_search(pattern, text, threshold=0.90):
+                from difflib import SequenceMatcher
+
+                pattern_len = len(pattern)
+                for i in range(len(text) - pattern_len + 1):
+                    substring = text[i : i + pattern_len]
+                    similarity = SequenceMatcher(None, pattern, substring).ratio()
+                    if similarity >= threshold:
+                        return i
+                return -1
+
+            # First, try a fuzzy match with the original title
+            title_with_spaces = outline[0].strip()
+            fuzzy_match_pos = fuzzy_search(title_with_spaces, find_content)
+            if fuzzy_match_pos != -1:
+                return previous_pages_length + fuzzy_match_pos
+
+            # If not found, try the title with all spaces removed
+            title_no_spaces = title_with_spaces.replace(" ", "")
+            find_content_no_spaces = find_content.replace(" ", "")
+            fuzzy_match_pos = fuzzy_search(title_no_spaces, find_content_no_spaces)
+
+            if fuzzy_match_pos != -1:
+                # Map the match back to its actual position in the original text
+                original_pos = 0
+                no_spaces_pos = 0
+                while no_spaces_pos < fuzzy_match_pos:
+                    if find_content[original_pos] != " ":
+                        no_spaces_pos += 1
+                    original_pos += 1
+                return previous_pages_length + original_pos
+
+            # Fuzzy match over an extended page range
+            extended_content = "".join(
+                page_contents[
+                    max(0, page_start - 1) : page_end if page_end != -1 else None
+                ]
+            )
+
+            fuzzy_match_pos = fuzzy_search(title_with_spaces, extended_content)
+            if fuzzy_match_pos != -1:
+                extended_previous_length = sum(
+                    len(content) for content in page_contents[: max(0, page_start - 1)]
+                )
+                return extended_previous_length + fuzzy_match_pos
+
+            # Finally, try the extended content with spaces removed
+            extended_content_no_spaces = extended_content.replace(" ", "")
+            fuzzy_match_pos = fuzzy_search(title_no_spaces, extended_content_no_spaces)
+            if fuzzy_match_pos != -1:
+                original_pos = 0
+                no_spaces_pos = 0
+                while no_spaces_pos < fuzzy_match_pos:
+                    if extended_content[original_pos] != " ":
+                        no_spaces_pos += 1
+                    original_pos += 1
+
+                extended_previous_length = sum(
+                    len(content) for content in page_contents[: max(0, page_start - 1)]
+                )
+                return extended_previous_length + original_pos
+
+            return -1
+
+        final_content = []
+        for idx, outline in enumerate(level_outlines):
+            start = get_content_start(outline, page_contents)
+            next_start = (
+                get_content_start(level_outlines[idx + 1], page_contents)
+                if idx + 1 < len(level_outlines)
+                else -1
+            )
+            if start >= 0 and next_start >= 0:
+                content = total_content[start:next_start]
+                final_content.append(
+                    (outline[0], outline[1], start, next_start, content)
+                )
+            elif start >= 0 and next_start < 0 and idx + 1 == len(level_outlines):
+                content = total_content[start:]
+                final_content.append((outline[0], outline[1], start, -1, content))
+        return final_content
+
+    def convert_final_content_to_chunks(self, final_content):
+        def create_chunk(title, content, basename):
+            return Chunk(
+                id=generate_hash_id(f"{basename}#{title}"),
+                name=f"{basename}#{title}",
+                content=content,
+                sub_chunks=[],
+            )
+
+        level_map = {}
+        chunks = []
+
+        for title, level, start, end, content in final_content:
+            chunk = create_chunk(
+                title, content, os.path.splitext(os.path.basename(self.fd.name))[0]
+            )
+            chunks.append(chunk)
+
+            if level == 0:
+                level_map[0] = chunk
+            else:
+                parent_level = level - 1
+                while parent_level >= 0:
+                    if parent_level in level_map:
+                        level_map[parent_level].sub_chunks.append(chunk)
+                        break
+                    parent_level -= 1
+                level_map[level] = chunk
+
+        return chunks
+
+    def outline_chunk(self, chunk: Union[Chunk, List[Chunk]], basename) -> List[Chunk]:
         if isinstance(chunk, Chunk):
             chunk = [chunk]
         outlines = []
@@ -76,26 +247,30 @@ def outline_chunk(self, chunk: Union[Chunk, List[Chunk]],basename) -> List[Chunk
             outline = self.llm.invoke({"input": c.content}, self.prompt)
             outlines.extend(outline)
         content = "\n".join([c.content for c in chunk])
-        chunks = self.sep_by_outline(content, outlines,basename)
+        chunks = self.sep_by_outline(content, outlines, basename)
         return chunks
-
-    def sep_by_outline(self,content,outlines,basename):
+
+    def sep_by_outline(self, content, outlines, basename):
         position_check = []
         for outline in outlines:
             start = content.find(outline)
-            position_check.append((outline,start))
+            position_check.append((outline, start))
         chunks = []
-        for idx,pc in enumerate(position_check):
+        for idx, pc in enumerate(position_check):
             chunk = Chunk(
-                id = Chunk.generate_hash_id(f"{basename}#{pc[0]}"),
+                id=generate_hash_id(f"{basename}#{pc[0]}"),
                 name=f"{basename}#{pc[0]}",
-                content=content[pc[1]:position_check[idx+1][1] if idx+1 < len(position_check) else len(position_check)],
+                content=content[
+                    pc[1] : (
+                        position_check[idx + 1][1]
+                        if idx + 1 < len(position_check)
+                        else len(position_check)
+                    )
+                ],
             )
             chunks.append(chunk)
         return chunks
-
-
     @staticmethod
     def _process_single_page(
         page: str,
@@ -149,7 +324,7 @@ def _extract_text_from_page(page_layout: LTPage) -> str:
                 text += element.get_text()
         return text
 
-    def invoke(self, input: str, **kwargs) -> Sequence[Output]:
+    def _invoke(self, input: str, **kwargs) -> Sequence[Output]:
         """
         Processes a PDF file, splitting or extracting content based on configuration.
@@ -170,85 +345,140 @@ def invoke(self, input: str, **kwargs) -> Sequence[Output]:
 
         if not os.path.isfile(input):
             raise FileNotFoundError(f"The file {input} does not exist.")
-
-        self.fd = open(input, "rb")
-        self.parser = PDFParser(self.fd)
-        self.document = PDFDocument(self.parser)
-        chunks = []
-        basename, _ = os.path.splitext(os.path.basename(input))
-
-
-        # get outline
+        self.fd = None
         try:
-            outlines = self.document.get_outlines()
-        except Exception as e:
-            logger.warning(f"loading PDF file: {e}")
-            self.outline_flag = False
-
-
-        if not self.outline_flag:
-
-            with open(input, "rb") as file:
-                for idx, page_layout in enumerate(extract_pages(file)):
-                    content = ""
-                    for element in page_layout:
-                        if hasattr(element, "get_text"):
-                            content = content + element.get_text()
+            self.fd = open(input, "rb")
+            self.pdf_reader = PyPDF2.PdfReader(self.fd)
+            self.level_outlines = self._get_full_outlines()
+            self.parser = PDFParser(self.fd)
+            self.document = PDFDocument(self.parser)
+            chunks = []
+            basename, _ = os.path.splitext(os.path.basename(input))
+
+            # get outline
+            try:
+                outlines = self.document.get_outlines()
+            except Exception as e:
+                logger.warning(f"loading PDF file: {e}")
+                self.outline_flag = False
+
+            if not self.outline_flag:
+
+                with open(input, "rb") as file:
+                    for idx, page_layout in enumerate(extract_pages(file)):
+                        content = ""
+                        for element in page_layout:
+                            if hasattr(element, "get_text"):
+                                content = content + element.get_text()
+                        chunk = Chunk(
+                            id=generate_hash_id(f"{basename}#{idx}"),
+                            name=f"{basename}#{idx}",
+                            content=content,
+                        )
+                        chunks.append(chunk)
+                # try:
+                #     outline_chunks = self.outline_chunk(chunks, basename)
+                # except Exception as e:
+                #     raise RuntimeError(f"Error loading PDF file: {e}")
+                # if len(outline_chunks) > 0:
+                #     chunks = outline_chunks
+
+            elif True:
+                split_words = []
+
+                page_contents = []
+
+                with open(input, "rb") as file:
+                    for idx, page_layout in enumerate(extract_pages(file)):
+                        content = ""
+                        for element in page_layout:
+                            if hasattr(element, "get_text"):
+                                content = content + element.get_text()
+                        content = content.replace("\n", "")
+                        page_contents.append(content)
+
+                # Strip all whitespace characters (spaces, tabs, newlines, etc.) with regular expressions
+                page_contents = [
+                    re.sub(r"\s+", "", content) for content in page_contents
+                ]
+                page_contents = [
+                    re.sub(r"[\s\u200b\u200c\u200d\ufeff]+", "", content)
+                    for content in page_contents
+                ]
+                page_contents = ["".join(content.split()) for content in page_contents]
+
+                final_content = self.extract_content_from_outline(
+                    page_contents, self.level_outlines
+                )
+                chunks = self.convert_final_content_to_chunks(final_content)
+
+            else:
+                for item in outlines:
+                    level, title, dest, a, se = item
+                    split_words.append(title.strip().replace(" ", ""))
+                # save the outline position in content
+                try:
+                    text = extract_text(input)
+
+                except Exception as e:
+                    raise RuntimeError(f"Error loading PDF file: {e}")
+
+                cleaned_pages = [
+                    self._process_single_page(x, "", False, False) for x in text
+                ]
+                sentences = []
+                for cleaned_page in cleaned_pages:
+                    sentences += cleaned_page
+
+                content = "".join(sentences)
+                positions = [(input, 0)]
+                for split_word in split_words:
+                    pattern = re.compile(split_word)
+                    start = 0
+                    for i, match in enumerate(re.finditer(pattern, content)):
+                        if i <= 1:
+                            start, end = match.span()
+                            if start > 0:
+                                positions.append((split_word, start))
+
+                for idx, position in enumerate(positions):
                     chunk = Chunk(
-                        id=Chunk.generate_hash_id(f"{basename}#{idx}"),
-                        name=f"{basename}#{idx}",
-                        content=content,
+                        id=generate_hash_id(f"{basename}#{position[0]}"),
+                        name=f"{basename}#{position[0]}",
+                        content=content[
+                            position[1] : (
+                                positions[idx + 1][1]
+                                if idx + 1 < len(positions)
+                                else None
+                            )
+                        ],
                    )
                     chunks.append(chunk)
-            try:
-                outline_chunks = self.outline_chunk(chunks, basename)
-            except Exception as e:
-                raise RuntimeError(f"Error loading PDF file: {e}")
-            if len(outline_chunks) > 0:
-                chunks = outline_chunks
-
-        else:
-            split_words = []
-
-            for item in outlines:
-                level, title, dest, a, se = item
-                split_words.append(title.strip().replace(" ",""))
-            # save the outline position in content
-            try:
-                text = extract_text(input)
-            except Exception as e:
-                raise RuntimeError(f"Error loading PDF file: {e}")
-
-            cleaned_pages = [
-                self._process_single_page(x, "", False, False) for x in text
-            ]
-            sentences = []
-            for cleaned_page in cleaned_pages:
-                sentences += cleaned_page
-
-            content = "".join(sentences)
-            positions = [(input,0)]
-            for split_word in split_words:
-                pattern = re.compile(split_word)
-                for i,match in enumerate(re.finditer(pattern, content)):
-                    if i == 1:
-                        start, end = match.span()
-                        positions.append((split_word,start))
-
-            for idx,position in enumerate(positions):
-                chunk = Chunk(
-                    id = Chunk.generate_hash_id(f"{basename}#{position[0]}"),
-                    name=f"{basename}#{position[0]}",
-                    content=content[position[1]:positions[idx+1][1] if idx+1 < len(positions) else None],
-                )
-                chunks.append(chunk)
+            # # Save intermediate results to a file (debug aid)
+            # import pickle
 
-            return chunks
+            # with open("debug_data.pkl", "wb") as f:
+            #     pickle.dump(
+            #         {"page_contents": page_contents, "level_outlines": self.level_outlines},
+            #         f,
+            #     )
+            return chunks
 
-if __name__ == '__main__':
-    reader = PDFReader(split_using_outline=True)
-    pdf_path = os.path.join(os.path.dirname(__file__),"../../../../tests/builder/data/aiwen.pdf")
-    chunk = reader.invoke(pdf_path)
-    print(chunk)
\ No newline at end of file
+        except Exception as e:
+            raise RuntimeError(f"Error loading PDF file: {e}")
+        finally:
+            if self.fd:
+                self.fd.close()
+
+
+if __name__ == "__main__":
+    pdf_reader = PDFReader()
+    pdf_path = os.path.join(
+        os.path.dirname(__file__), "../../../../tests/builder/data/aiwen.pdf"
+    )
+    chunks = pdf_reader.invoke(pdf_path)
+    print(chunks)
diff --git a/kag/builder/component/reader/txt_reader.py b/kag/builder/component/reader/txt_reader.py
index 6f9d7a08..dfc99000 100644
--- a/kag/builder/component/reader/txt_reader.py
+++ b/kag/builder/component/reader/txt_reader.py
@@ -11,29 +11,27 @@
 # or implied.
 
 import os
-from typing import List, Type
+from typing import List
 
 from kag.builder.model.chunk import Chunk
-from kag.interface.builder import SourceReaderABC
+from kag.interface import ReaderABC
+from kag.common.utils import generate_hash_id
 from knext.common.base.runnable import Input, Output
 
 
-class TXTReader(SourceReaderABC):
+@ReaderABC.register("txt")
+@ReaderABC.register("txt_reader")
+class TXTReader(ReaderABC):
     """
-    A PDF reader class that inherits from SourceReader.
-    """
-
-    @property
-    def input_types(self) -> Type[Input]:
-        return str
+    A class for parsing text files or text content into Chunk objects.
- @property - def output_types(self) -> Type[Output]: - return Chunk + This class inherits from ReaderABC and provides the functionality to read text content, + whether it is from a file or directly provided as a string, and convert it into a list of Chunk objects. + """ - def invoke(self, input: Input, **kwargs) -> List[Output]: + def _invoke(self, input: Input, **kwargs) -> List[Output]: """ - The main method for processing text reading. This method reads the content of the input (which can be a file path or text content) and converts it into a Chunk object. + The main method for processing text reading. This method reads the content of the input (which can be a file path or text content) and converts it into chunks. Args: input (Input): The input string, which can be the path to a text file or direct text content. @@ -51,7 +49,7 @@ def invoke(self, input: Input, **kwargs) -> List[Output]: try: if os.path.exists(input): - with open(input, "r", encoding='utf-8') as f: + with open(input, "r", encoding="utf-8") as f: content = f.read() else: content = input @@ -60,7 +58,7 @@ def invoke(self, input: Input, **kwargs) -> List[Output]: basename, _ = os.path.splitext(os.path.basename(input)) chunk = Chunk( - id=Chunk.generate_hash_id(input), + id=generate_hash_id(input), name=basename, content=content, ) diff --git a/kag/builder/component/reader/yuque_reader.py b/kag/builder/component/reader/yuque_reader.py deleted file mode 100644 index e585c097..00000000 --- a/kag/builder/component/reader/yuque_reader.py +++ /dev/null @@ -1,67 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
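As a quick orientation for reviewers, here is a minimal usage sketch of the refactored TXTReader above. The file path is a placeholder, and it assumes `ReaderABC.invoke` dispatches to `_invoke`, as the PDF reader's `__main__` block suggests:

```python
from kag.builder.component.reader.txt_reader import TXTReader

reader = TXTReader()
# Either a path to a text file or raw text content is accepted.
chunks = reader.invoke("docs/example.txt")  # placeholder path
for chunk in chunks:
    print(chunk.name, len(chunk.content))
```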
- -import requests -from typing import Type, List - -from kag.builder.component.reader import MarkDownReader -from kag.builder.model.chunk import Chunk -from kag.interface.builder import SourceReaderABC -from knext.common.base.runnable import Input, Output - -from kag.common.llm.client import LLMClient - - -class YuqueReader(SourceReaderABC): - def __init__(self, token: str, **kwargs): - super().__init__(**kwargs) - self.token = token - self.markdown_reader = MarkDownReader(**kwargs) - - @property - def input_types(self) -> Type[Input]: - """The type of input this Runnable object accepts specified as a type annotation.""" - return str - - @property - def output_types(self) -> Type[Output]: - """The type of output this Runnable object produces specified as a type annotation.""" - return Chunk - - @staticmethod - def get_yuque_api_data(token, url): - headers = {"X-Auth-Token": token} - - try: - response = requests.get(url, headers=headers) - response.raise_for_status() # Raise an HTTPError for bad responses (4xx and 5xx) - return response.json()["data"] # Assuming the API returns JSON data - except requests.exceptions.HTTPError as http_err: - print(f"HTTP error occurred: {http_err}") - except requests.exceptions.RequestException as err: - print(f"Error occurred: {err}") - except Exception as err: - print(f"An error occurred: {err}") - - def invoke(self, input: str, **kwargs) -> List[Output]: - if not input: - raise ValueError("Input cannot be empty") - - url: str = input - data = self.get_yuque_api_data(self.token, url) - id = data.get("id", "") - title = data.get("title", "") - content = data.get("body", "") - - chunks = self.markdown_reader.solve_content(id, title, content) - - return chunks \ No newline at end of file diff --git a/kag/solver/logic/core_modules/op_executor/op_math/__init__.py b/kag/builder/component/scanner/__init__.py similarity index 100% rename from kag/solver/logic/core_modules/op_executor/op_math/__init__.py rename to kag/builder/component/scanner/__init__.py diff --git a/kag/builder/component/scanner/csv_scanner.py b/kag/builder/component/scanner/csv_scanner.py new file mode 100644 index 00000000..687395b8 --- /dev/null +++ b/kag/builder/component/scanner/csv_scanner.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
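The standalone reader deleted above is superseded by the `YuequeReader` registered in `markdown_reader.py`, which now expects the token and document URL joined by `@`. A minimal sketch of the new calling convention (token and URL are placeholders):

```python
from kag.builder.component.reader.markdown_reader import YuequeReader

reader = YuequeReader()
# Input format is "<token>@<doc_api_url>", as emitted by the Yuque scanner added later in this PR.
chunks = reader.invoke(
    "my-token@https://www.yuque.com/api/v2/repos/group/repo/docs/slug"
)
```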
+from typing import Dict, List + +import pandas as pd +from kag.interface import ScannerABC +from kag.common.utils import generate_hash_id +from knext.common.base.runnable import Input, Output + + +@ScannerABC.register("csv") +@ScannerABC.register("csv_scanner") +class CSVScanner(ScannerABC): + def __init__( + self, + header: bool = True, + col_names: List[str] = None, + col_ids: List[int] = None, + rank: int = 0, + world_size: int = 1, + ): + super().__init__(rank=rank, world_size=world_size) + self.header = header + self.col_names = col_names + self.col_ids = col_ids + + @property + def input_types(self) -> Input: + return str + + @property + def output_types(self) -> Output: + return Dict + + def load_data(self, input: Input, **kwargs) -> List[Output]: + """ + Loads data from a CSV file and converts it into a list of dictionaries. + + Args: + input (Input): The input file path to the CSV file. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list of dictionaries containing the processed data. + """ + input = self.download_data(input) + if self.header: + data = pd.read_csv(input, dtype=str) + else: + data = pd.read_csv(input, dtype=str, header=None) + col_keys = self.col_names if self.col_names else self.col_ids + if col_keys is None: + return data.to_dict(orient="records") + + contents = [] + for _, row in data.iterrows(): + for k, v in row.items(): + if k in col_keys: + v = str(v) + name = v[:5] + "..." + v[-5:] + contents.append( + {"id": generate_hash_id(v), "name": name, "content": v} + ) + + return contents diff --git a/kag/builder/component/scanner/dataset_scanner.py b/kag/builder/component/scanner/dataset_scanner.py new file mode 100644 index 00000000..7313ebf8 --- /dev/null +++ b/kag/builder/component/scanner/dataset_scanner.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import json +import os +from typing import List, Type, Dict + + +from kag.interface import ScannerABC +from knext.common.base.runnable import Input, Output + + +@ScannerABC.register("hotpotqa") +@ScannerABC.register("hotpotqa_dataset_scanner") +class HotpotqaCorpusScanner(ScannerABC): + """ + A class for reading HotpotQA dataset and converting it into a list of dictionaries, inheriting from `ScannerABC`. + + This class is responsible for reading HotpotQA corpus and converting it into a list of dictionaries. + It inherits from `ScannerABC` and overrides the necessary methods to handle HotpotQA-specific operations. + """ + + @property + def input_types(self) -> Type[Input]: + return str + + @property + def output_types(self) -> Type[Output]: + return Dict + + def load_data(self, input: Input, **kwargs) -> List[Output]: + """ + Loads data from a HotpotQA corpus file or JSON string and returns it as a list of dictionaries. + + This method reads HotpotQA corpus data from a file or parses a JSON string and returns it as a list of dictionaries. + If the input is a file path, it reads the file; if the input is a JSON string, it parses the string. 
+ + Args: + input (Input): The HotpotQA corpus file path or JSON string to load. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list of dictionaries, where each dictionary represents a HotpotQA item. + """ + if os.path.exists(str(input)): + with open(input, "r") as f: + corpus = json.load(f) + else: + corpus = json.loads(input) + + data = [] + for item_key, item_value in corpus.items(): + data.append( + {"id": item_key, "name": item_key, "content": "\n".join(item_value)} + ) + return data + + +@ScannerABC.register("musique") +@ScannerABC.register("2wiki") +@ScannerABC.register("musique_dataset_scanner") +@ScannerABC.register("2wiki_dataset_scanner") +class MusiqueCorpusScanner(ScannerABC): + """ + A class for reading Musique/2Wiki dataset and converting it into a list of dictionaries, inheriting from `ScannerABC`. + + This class is responsible for reading Musique/2Wiki corpus and converting it into a list of dictionaries. + It inherits from `ScannerABC` and overrides the necessary methods to handle Musique/2Wiki-specific operations. + """ + + @property + def input_types(self) -> Type[Input]: + """The type of input this Runnable object accepts specified as a type annotation.""" + return str + + @property + def output_types(self) -> Type[Output]: + """The type of output this Runnable object produces specified as a type annotation.""" + return Dict + + def get_basename(self, file_name: str): + base, _ = os.path.splitext(os.path.basename(file_name)) + return base + + def load_data(self, input: Input, **kwargs) -> List[Output]: + """ + Loads data from a Musique/2Wiki corpus file or JSON string and returns it as a list of dictionaries. + + This method reads Musique/2Wiki corpus data from a file or parses a JSON string and returns it as a list of dictionaries. + If the input is a file path, it reads the file; if the input is a JSON string, it parses the string. + + Args: + input (Input): The Musique/2Wiki corpus file path or JSON string to load. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list of dictionaries, where each dictionary represents a Musique/2Wiki item. + """ + + if os.path.exists(input): + with open(input, "r") as f: + corpus = json.load(f) + else: + corpus = json.loads(input) + + data = [] + + for idx, item in enumerate(corpus): + title = item["title"] + content = item["text"] + data.append( + { + "id": f"{title}#{idx}", + "name": title, + "content": content, + } + ) + return data diff --git a/kag/builder/component/scanner/directory_scanner.py b/kag/builder/component/scanner/directory_scanner.py new file mode 100644 index 00000000..8a6cf2ca --- /dev/null +++ b/kag/builder/component/scanner/directory_scanner.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
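To make the expected corpus shape concrete, a small sketch of feeding an in-memory musique/2wiki-style corpus to the scanner above; the sample record is invented for illustration, and the default `ScannerABC` constructor is assumed:

```python
import json
from kag.builder.component.scanner.dataset_scanner import MusiqueCorpusScanner

corpus = [{"title": "Sample Title", "text": "Sample paragraph text."}]
scanner = MusiqueCorpusScanner()
items = scanner.load_data(json.dumps(corpus))
# Each item becomes {"id": "Sample Title#0", "name": "Sample Title", "content": ...}
```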
+import os +import re +from typing import List + +from kag.interface import ScannerABC + +from knext.common.base.runnable import Input, Output + + +@ScannerABC.register("dir") +@ScannerABC.register("dir_file_scanner") +class DirectoryScanner(ScannerABC): + """ + A class for reading files from a directory based on a specified file pattern or suffix, inheriting from `ScannerABC`. + It can be used in conjunction with the parsers such as PDF/MarkDown parser to convert files into Chunks. + + This class is responsible for reading files from a directory and returning a list of file paths that match the specified file pattern/suffix. + It inherits from `ScannerABC` and overrides the necessary methods to handle directory-specific operations. + + """ + + def __init__( + self, + file_pattern: str = None, + file_suffix: str = None, + rank: int = 0, + world_size: int = 1, + ): + """ + Initializes the DirectoryScanner with the specified file pattern, file suffix, rank, and world size. + + Args: + file_pattern (str, optional): The regex pattern to match file names. Defaults to None. + file_suffix (str, optional): The file suffix to match if `file_pattern` is not provided. Defaults to None. + rank (int, optional): The rank of the current worker. Defaults to 0. + world_size (int, optional): The total number of workers. Defaults to 1. + """ + super().__init__(rank=rank, world_size=world_size) + if file_pattern is None: + if file_suffix: + file_pattern = f".*{file_suffix}$" + else: + file_pattern = r".*txt$" + self.file_pattern = re.compile(file_pattern) + + @property + def input_types(self) -> Input: + return str + + @property + def output_types(self) -> Output: + return str + + def find_files_by_regex(self, directory): + """ + Finds files in the specified directory that match the file pattern. + + Args: + directory (str): The directory to search for files. + + Returns: + List[str]: A list of file paths that match the file pattern. + """ + matched_files = [] + for root, dirs, files in os.walk(directory): + for file in files: + if self.file_pattern.match(file): + file_path = os.path.join(root, file) + matched_files.append(file_path) + return matched_files + + def load_data(self, input: Input, **kwargs) -> List[Output]: + """ + Loads data by finding files in the specified directory that match the file pattern. + + This method searches the directory specified by the input and returns a list of file paths that match the file pattern. + + Args: + input (Input): The directory to search for files. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list of file paths that match the file pattern. + """ + return self.find_files_by_regex(input) diff --git a/kag/builder/component/scanner/file_scanner.py b/kag/builder/component/scanner/file_scanner.py new file mode 100644 index 00000000..653fa6c4 --- /dev/null +++ b/kag/builder/component/scanner/file_scanner.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
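A brief usage sketch for the DirectoryScanner above; the directory path is a placeholder:

```python
from kag.builder.component.scanner.directory_scanner import DirectoryScanner

# file_suffix="md" is expanded to the regex ".*md$" internally.
scanner = DirectoryScanner(file_suffix="md")
for path in scanner.load_data("./docs"):  # placeholder directory
    print(path)
```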
+import os +from typing import List + +from kag.interface import ScannerABC +from kag.common.conf import KAG_PROJECT_CONF +from knext.common.base.runnable import Input, Output + + +@ScannerABC.register("file") +@ScannerABC.register("file_scanner") +class FileScanner(ScannerABC): + """ + A class for reading single file and returning the path, inheriting from `ScannerABC`. + + This class is responsible for reading SINGLE file and returning the path as a list of strings. + It inherits from `ScannerABC` and overrides the necessary methods to handle file-specific operations. + """ + + @property + def input_types(self) -> Input: + return str + + @property + def output_types(self) -> Output: + return str + + def load_data(self, input: Input, **kwargs) -> List[Output]: + """ + Loads data by returning the input file path as a list of strings. + + This method takes the input file path and returns it as a list containing the file path. + + Args: + input (Input): The file path to load. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list containing the input file path. + """ + if input.startswith("http://") or input.startswith("https://"): + from kag.common.utils import download_from_http + + local_file_path = os.path.join(KAG_PROJECT_CONF.ckpt_dir, "file_scanner") + if not os.path.exists(local_file_path): + os.makedirs(local_file_path) + local_file = os.path.join(local_file_path, os.path.basename(input)) + local_file = download_from_http(input, local_file) + return [local_file] + return [input] diff --git a/kag/builder/component/scanner/json_scanner.py b/kag/builder/component/scanner/json_scanner.py new file mode 100644 index 00000000..53af7696 --- /dev/null +++ b/kag/builder/component/scanner/json_scanner.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import json +import os +from typing import Union, Dict, List + +from kag.interface import ScannerABC +from knext.common.base.runnable import Input, Output + + +@ScannerABC.register("json") +@ScannerABC.register("json_scanner") +class JSONScanner(ScannerABC): + """ + A class for reading JSON files or parsing JSON-formatted strings into a list of dictionaries, inheriting from `ScannerABC`. + + This class is responsible for reading JSON files or parsing JSON-formatted strings and converting them into a list of dictionaries. + It inherits from `ScannerABC` and overrides the necessary methods to handle JSON-specific operations. + + Note: The JSON data must be a list of dictionaries. + """ + + @property + def input_types(self) -> Input: + return str + + @property + def output_types(self) -> Output: + return Dict + + @staticmethod + def _read_from_file(file_path: str) -> Union[dict, list]: + """ + Reads JSON data from a file and returns it as a list of dictionaries. + + Args: + file_path (str): The path to the JSON file. + + Returns: + List[Dict]: The JSON data loaded from the file. + + Raises: + ValueError: If there is an error reading the JSON from the file or if the file is not found. 
+        """
+        try:
+            with open(file_path, "r") as file:
+                return json.load(file)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Error reading JSON from file: {e}")
+        except FileNotFoundError as e:
+            raise ValueError(f"File not found: {e}")
+
+    @staticmethod
+    def _parse_json_string(json_string: str) -> Union[dict, list]:
+        """
+        Parses a JSON string and returns it as a list of dictionaries.
+
+        Args:
+            json_string (str): The JSON string to parse.
+
+        Returns:
+            List[Dict]: The parsed JSON data.
+
+        Raises:
+            ValueError: If there is an error parsing the JSON string.
+        """
+        try:
+            return json.loads(json_string)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Error parsing JSON string: {e}")
+
+    def load_data(self, input: Input, **kwargs) -> List[Output]:
+        """
+        Loads data from a JSON file or JSON string and returns it as a list of dictionaries.
+
+        This method reads JSON data from a file or parses a JSON string and returns it as a list of dictionaries.
+        If the input is a file path, it reads the file; if the input is a JSON string, it parses the string.
+
+        Args:
+            input (Input): The JSON file path or JSON string to load.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            List[Output]: A list of dictionaries, where each dictionary represents a JSON object.
+
+        Raises:
+            ValueError: If there is an error reading the JSON data or if the input is not a valid JSON array or object.
+        """
+        input = self.download_data(input)
+        try:
+            if os.path.exists(input):
+                corpus = self._read_from_file(input)
+            else:
+                corpus = self._parse_json_string(input)
+        except ValueError as e:
+            raise e
+
+        if not isinstance(corpus, (list, dict)):
+            raise ValueError("Expected input to be a JSON array or object")
+
+        if isinstance(corpus, dict):
+            corpus = [corpus]
+        return corpus
diff --git a/kag/builder/component/scanner/yuque_scanner.py b/kag/builder/component/scanner/yuque_scanner.py
new file mode 100644
index 00000000..ef4bd5c3
--- /dev/null
+++ b/kag/builder/component/scanner/yuque_scanner.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+# Copyright 2023 OpenSPG Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied.
+import os
+import requests
+from typing import Type, List, Union
+
+# from kag.builder.component.reader.markdown_reader import MarkDownReader
+from kag.interface import ScannerABC
+from knext.common.base.runnable import Input, Output
+
+
+@ScannerABC.register("yuque")
+@ScannerABC.register("yuque_scanner")
+class YuqueScanner(ScannerABC):
+    """
+    A class for reading data from Yuque, a Chinese documentation platform, inheriting from `ScannerABC`.
+
+    This class is responsible for reading a Yuque knowledge base and returning the URLs of the documents it contains.
+    It can be used in conjunction with the Yuque parser to convert Yuque documents into Chunks.
+
+    It inherits from `ScannerABC` and overrides the necessary methods to handle Yuque-specific operations.
+
+    Args:
+        token (str): The authentication token for accessing the Yuque API.
+    """
+
+    def __init__(self, token: str):
+        """
+        Initializes the YuqueScanner with the specified token.
+
+        Args:
+            token (str): The authentication token for accessing the Yuque API.
+        """
+        super().__init__()
+        self.token = token
+
+    @property
+    def input_types(self) -> Type[Input]:
+        """The type of input this Runnable object accepts specified as a type annotation."""
+        return Union[str, List[str]]
+
+    @property
+    def output_types(self) -> Type[Output]:
+        """The type of output this Runnable object produces specified as a type annotation."""
+        return str
+
+    def get_yuque_api_data(self, url):
+        """
+        Fetches data from the Yuque API using the specified URL and authentication token.
+
+        Args:
+            url (str): The URL to fetch data from.
+
+        Returns:
+            dict: The JSON data returned by the Yuque API.
+
+        Raises:
+            HTTPError: If the API returns a bad response (4xx or 5xx).
+        """
+        headers = {"X-Auth-Token": self.token}
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
+        return response.json()["data"]  # Assuming the API returns JSON data
+
+    def load_data(self, input: Input, **kwargs) -> List[Output]:
+        """
+        Loads data from the Yuque API and returns it as a list of document URL strings.
+
+        This method fetches data from the Yuque API using the provided URL and converts it into a list of strings.
+        If the input is a single document URL, it returns a one-element list containing the token and URL.
+        If the input is a knowledge base, it returns one string per contained document, each combining the token and that document's URL.
+
+        Args:
+            input (Input): The URL to fetch data from.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            List[Output]: A list of strings, where each string contains the token and the URL of a document.
+        """
+        url = input
+        if isinstance(url, str):
+            data = self.get_yuque_api_data(url)
+            if isinstance(data, dict):
+                # for single yuque doc
+                return [f"{self.token}@{url}"]
+            output = []
+            for item in data:
+                slug = item["slug"]
+                output.append(os.path.join(url, slug))
+            return [f"{self.token}@{url}" for url in output]
+        else:
+            return [f"{self.token}@{x}" for x in url]
diff --git a/kag/builder/component/splitter/__init__.py b/kag/builder/component/splitter/__init__.py
index c91070a8..e69de29b 100644
--- a/kag/builder/component/splitter/__init__.py
+++ b/kag/builder/component/splitter/__init__.py
@@ -1,23 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2023 OpenSPG Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied.
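A hedged sketch of driving the YuqueScanner defined above; the token and knowledge-base URL are placeholders:

```python
from kag.builder.component.scanner.yuque_scanner import YuqueScanner

scanner = YuqueScanner(token="my-token")  # placeholder token
# For a knowledge-base URL, one "<token>@<doc_url>" string is returned per document;
# for a single document URL, a one-element list comes back.
doc_refs = scanner.load_data("https://www.yuque.com/api/v2/repos/group/repo/docs")
```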
- -from kag.builder.component.splitter.length_splitter import LengthSplitter -from kag.builder.component.splitter.semantic_splitter import SemanticSplitter -from kag.builder.component.splitter.pattern_splitter import PatternSplitter -from kag.builder.component.splitter.outline_splitter import OutlineSplitter - - -__all__ = [ - "LengthSplitter", - "SemanticSplitter", - "PatternSplitter", -] diff --git a/kag/builder/component/splitter/base_table_splitter.py b/kag/builder/component/splitter/base_table_splitter.py index 72a0b314..8af66c9e 100644 --- a/kag/builder/component/splitter/base_table_splitter.py +++ b/kag/builder/component/splitter/base_table_splitter.py @@ -10,28 +10,52 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -from abc import ABC -from typing import Type, List, Union - from kag.builder.model.chunk import Chunk -from kag.interface.builder import SplitterABC +from kag.interface import SplitterABC class BaseTableSplitter(SplitterABC): """ - A base class for splitting table, inheriting from Splitter. + A base class for splitting table data into smaller chunks. + + This class inherits from SplitterABC and provides the functionality to split table data + represented in markdown format into smaller chunks. """ + def __init__(self): + super().__init__() + def split_table(self, org_chunk: Chunk, chunk_size: int = 2000, sep: str = "\n"): """ - split markdown format table into smaller markdown table + Splits a markdown format table into smaller markdown tables. + + Args: + org_chunk (Chunk): The original chunk containing the table data. + chunk_size (int): The maximum size of each smaller chunk. Defaults to 2000. + sep (str): The separator used to join the table rows. Defaults to "\n". + + Returns: + List[Chunk]: A list of smaller chunks resulting from the split operation. """ try: - return self._split_table(org_chunk=org_chunk, chunk_size=chunk_size, sep=sep) + return self._split_table( + org_chunk=org_chunk, chunk_size=chunk_size, sep=sep + ) except Exception: return None def _split_table(self, org_chunk: Chunk, chunk_size: int = 2000, sep: str = "\n"): + """ + Internal method to split a markdown format table into smaller markdown tables. + + Args: + org_chunk (Chunk): The original chunk containing the table data. + chunk_size (int): The maximum size of each smaller chunk. Defaults to 2000. + sep (str): The separator used to join the table rows. Defaults to "\n". + + Returns: + List[Chunk]: A list of smaller chunks resulting from the split operation. + """ output = [] content = org_chunk.content table_start = content.find("|") @@ -56,6 +80,7 @@ def _split_table(self, org_chunk: Chunk, chunk_size: int = 2000, sep: str = "\n" cur.append(row) cur_len += len(row) + cur.append(content[table_end:]) if len(cur) > 0: splitted.append(cur) @@ -66,7 +91,7 @@ def _split_table(self, org_chunk: Chunk, chunk_size: int = 2000, sep: str = "\n" name=f"{org_chunk.name}#{idx}", content=sep.join(sentences), type=org_chunk.type, - **org_chunk.kwargs + **org_chunk.kwargs, ) output.append(chunk) return output diff --git a/kag/builder/component/splitter/length_splitter.py b/kag/builder/component/splitter/length_splitter.py index 2e9dcfcd..e86cafbc 100644 --- a/kag/builder/component/splitter/length_splitter.py +++ b/kag/builder/component/splitter/length_splitter.py @@ -10,26 +10,41 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
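Since `Chunk` objects of type `Table` are routed through `split_table` above by the LengthSplitter changed below, here is a sketch of the expected behavior on an oversized markdown table. It assumes `Chunk` accepts these fields and that the public `invoke` dispatches to `_invoke`:

```python
from kag.builder.model.chunk import Chunk, ChunkTypeEnum
from kag.builder.component.splitter.length_splitter import LengthSplitter

rows = "\n".join(f"| row{i} | value{i} |" for i in range(500))
table_chunk = Chunk(
    id="t0",
    name="big-table",
    content="| col_a | col_b |\n| --- | --- |\n" + rows,
    type=ChunkTypeEnum.Table,
)
# Table chunks bypass the sliding window and are split into smaller markdown tables.
pieces = LengthSplitter(split_length=500).invoke(table_chunk)
print(len(pieces))
```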
-from typing import Type, List, Union - +from typing import Type, List +from kag.interface import SplitterABC from kag.builder.model.chunk import Chunk, ChunkTypeEnum +from kag.interface.builder.base import KAG_PROJECT_CONF +from kag.common.utils import generate_hash_id from knext.common.base.runnable import Input, Output from kag.builder.component.splitter.base_table_splitter import BaseTableSplitter +@SplitterABC.register("length") +@SplitterABC.register("length_splitter") class LengthSplitter(BaseTableSplitter): """ - A class for splitting text based on length, inheriting from Splitter. + A class for splitting text based on length. + + This class inherits from BaseTableSplitter and provides the functionality to split text + into smaller chunks based on a specified length and window size. It also handles table data + by splitting it into smaller markdown tables. Attributes: - split_length (int): The maximum length of each split chunk. + split_length (int): The maximum length of each chunk. window_length (int): The length of the overlap between chunks. """ - def __init__(self, split_length: int = 500, window_length: int = 100, **kwargs): - super().__init__(**kwargs) - self.split_length = int(split_length) - self.window_length = int(window_length) + def __init__(self, split_length: int = 500, window_length: int = 100): + """ + Initializes the LengthSplitter with the specified split length and window length. + + Args: + split_length (int): The maximum length of each chunk. Defaults to 500. + window_length (int): The length of the overlap between chunks. Defaults to 100. + """ + super().__init__() + self.split_length = split_length + self.window_length = window_length @property def input_types(self) -> Type[Input]: @@ -39,37 +54,52 @@ def input_types(self) -> Type[Input]: def output_types(self) -> Type[Output]: return Chunk + def chunk_breakdown(self, chunk): + chunks = self.logic_break(chunk) + if chunks: + res_chunks = [] + for c in chunks: + res_chunks.extend(self.chunk_breakdown(c)) + else: + res_chunks = self.slide_window_chunk( + chunk, self.split_length, self.window_length + ) + return res_chunks + + def logic_break(self, chunk): + return None + def split_sentence(self, content): """ Splits the given content into sentences based on delimiters. Args: - content (str): The content to be split. + content (str): The content to be split into sentences. Returns: - list: A list of sentences. + List[str]: A list of sentences. """ - sentence_delimiters = ".。??!!" + sentence_delimiters = ".。??!!" if KAG_PROJECT_CONF.language == "en" else "。?!" output = [] start = 0 for idx, char in enumerate(content): if char in sentence_delimiters: end = idx - tmp = content[start: end + 1].strip() + tmp = content[start : end + 1].strip() if len(tmp) > 0: - output.append(tmp) + output.append(tmp.strip()) start = idx + 1 - res = content[start:] + res = content[start:].strip() if len(res) > 0: output.append(res) return output def slide_window_chunk( - self, - org_chunk: Chunk, - chunk_size: int = 2000, - window_length: int = 300, - sep: str = "\n", + self, + org_chunk: Chunk, + chunk_size: int = 2000, + window_length: int = 300, + sep: str = "\n", ) -> List[Chunk]: """ Splits the content into chunks using a sliding window approach. @@ -84,7 +114,9 @@ def slide_window_chunk( List[Chunk]: A list of Chunk objects. 
""" if org_chunk.type == ChunkTypeEnum.Table: - table_chunks = self.split_table(org_chunk=org_chunk, chunk_size=chunk_size, sep=sep) + table_chunks = self.split_table( + org_chunk=org_chunk, chunk_size=chunk_size, sep=sep + ) if table_chunks is not None: return table_chunks content = self.split_sentence(org_chunk.content) @@ -112,38 +144,36 @@ def slide_window_chunk( output = [] for idx, sentences in enumerate(splitted): chunk = Chunk( - id=f"{org_chunk.id}#{chunk_size}#{window_length}#{idx}#LEN", + id=generate_hash_id(f"{org_chunk.id}#{idx}"), name=f"{org_chunk.name}", content=sep.join(sentences), type=org_chunk.type, - **org_chunk.kwargs + chunk_size=chunk_size, + window_length=window_length, + **org_chunk.kwargs, ) output.append(chunk) return output - def invoke(self, input: Chunk, **kwargs) -> List[Output]: + def _invoke(self, input: Chunk, **kwargs) -> List[Output]: """ - Invokes the splitter on the given input chunk. + Invokes the splitting of the input chunk based on the specified length and window size. Args: - input (Chunk): The input chunk to be split. - **kwargs: Additional keyword arguments. + input (Chunk): The chunk(s) to be split. + **kwargs: Additional keyword arguments, currently unused but kept for potential future expansion. Returns: - List[Output]: A list of split chunks. + List[Output]: A list of Chunk objects resulting from the split operation. """ cutted = [] - if isinstance(input,list): + if isinstance(input, list): for item in input: cutted.extend( - self.slide_window_chunk( - item, self.split_length, self.window_length - ) + self.slide_window_chunk(item, self.split_length, self.window_length) ) else: cutted.extend( - self.slide_window_chunk( - input, self.split_length, self.window_length - ) + self.slide_window_chunk(input, self.split_length, self.window_length) ) return cutted diff --git a/kag/builder/component/splitter/outline_splitter.py b/kag/builder/component/splitter/outline_splitter.py index c0f6f6d7..510ad829 100644 --- a/kag/builder/component/splitter/outline_splitter.py +++ b/kag/builder/component/splitter/outline_splitter.py @@ -9,27 +9,51 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
-import logging
+import collections
+import logging
 import os
 import re
-from typing import List, Type,Union
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import List, Type, Union, Tuple
 
-from kag.interface.builder import SplitterABC
-from kag.builder.prompt.outline_prompt import OutlinePrompt
-from kag.builder.model.chunk import Chunk
+import matplotlib.pyplot as plt
+from kag.interface.common.prompt import PromptABC
 from knext.common.base.runnable import Input, Output
-from kag.common.llm.client.llm_client import LLMClient
+from kag.common.conf import KAG_PROJECT_CONF, KAG_CONFIG
+from kag.common.utils import generate_hash_id
+from kag.builder.model.chunk import Chunk, dump_chunks
+from kag.builder.model.chunk import ChunkTypeEnum
+from kag.builder.prompt.outline_align_prompt import OutlineAlignPrompt
+from kag.interface import SplitterABC
+from kag.interface import LLMClient
 
 logger = logging.getLogger(__name__)
 
+
+@SplitterABC.register("outline")
+@SplitterABC.register("outline_splitter")
 class OutlineSplitter(SplitterABC):
-
-    def __init__(self,**kwargs):
+    def __init__(
+        self,
+        llm: LLMClient,
+        min_length: int = 100,
+        workers: int = 10,
+        chunk_size: int = 500,
+        llm_max_tokens: int = 8000,
+        align_parallel: bool = False,
+        **kwargs,
+    ):
         super().__init__(**kwargs)
-        self.llm = self._init_llm()
-        language = os.getenv("KAG_PROMPT_LANGUAGE", "zh")
-        self.prompt = OutlinePrompt(language)
-
+        self.llm = llm
+        self.prompt = PromptABC.from_config(
+            {"type": "outline", "language": KAG_PROJECT_CONF.language}
+        )
+        self.min_length = min_length
+        self.workers = workers
+        self.chunk_size = chunk_size
+        self.llm_max_tokens = llm_max_tokens
+        self.align_parallel = align_parallel
+
     @property
     def input_types(self) -> Type[Input]:
         return Chunk
@@ -37,49 +61,1096 @@ def input_types(self) -> Type[Input]:
     @property
     def output_types(self) -> Type[Output]:
         return Chunk
-    
+
+    def build_catalog_tree(self, outlines_with_content):
+        catalog_tree = []
+        stack = []  # tracks the current node hierarchy as [(title, level, node), ...]
+
+        for title, content, sd_content, level in outlines_with_content:
+            # Find the correct parent node
+            while stack and stack[-1][1] >= level:  # a parent must sit at a higher level (smaller number)
+                stack.pop()
+
+            # # Create the new node
+            # # The title should be prefixed with the titles of all parent nodes
+            # if stack:
+            #     # only add title if stack level
+            #     title = "/".join([item[0] for item in stack] + [title])
+            node = {
+                "title": title,
+                "content": content,
+                "children": [],
+                "start": sd_content[0],
+                "end": sd_content[1],
+            }
+
+            # If the stack is empty, or the current node outranks the node on top of
+            # the stack, the current node is a root or the start of a new branch
+            if not stack or stack[-1][1] >= level:
+                if stack:
+                    stack[-1][2]["children"].append(node)  # attach to the nearest parent's children
+                else:
+                    catalog_tree.append(node)  # empty stack: this is a root node
+            else:
+                # Attach the new node to the children of the parent found above
+                stack[-1][2]["children"].append(node)
+
+            # Push the new node with its level and title so later nodes can attach children
+            stack.append((title, level, node))
+
+        return catalog_tree
+
+    def simplify_catalog_tree(self, node, parent=None, parent_content_length=0):
+        # Recursively process all children first
+        for child in list(node["children"]):  # copy the list so it can be mutated while iterating
+            self.simplify_catalog_tree(child, node, len(node["content"]))
+
+        # Then check whether the current node can be merged into its parent
+        content_length = len(node["content"])
+        if content_length + parent_content_length <= self.chunk_size and parent:
+            # Merge if the combined content length stays within the threshold
+            parent["content"] += " " + node["content"]
+            # Move the current node's children up to the parent
+            parent["children"].extend(node["children"])
+            # Remove the current node from the parent's children
+            parent["children"].remove(node)
+            return  # stop further processing
+
     def outline_chunk(self, chunk: Union[Chunk, List[Chunk]]) -> List[Chunk]:
         if isinstance(chunk, Chunk):
             chunk = [chunk]
         outlines = []
         for c in chunk:
-            outline = self.llm.invoke({"input": c.content}, self.prompt)
+            outline = self.llm.invoke(
+                {"input": c.content, "current_outline": outlines}, self.prompt
+            )
+            # Filter out invalid outlines
+            outline = self.filter_outlines(outline)
             outlines.extend(outline)
         content = "\n".join([c.content for c in chunk])
-        chunks = self.sep_by_outline(content, outlines)
+        # chunks = self.sep_by_outline_ignore_duplicates(
+        #     content, outlines, org_chunk=chunk
+        # )
+        chunks = self.sep_by_outline_with_outline_tree(
+            content, outlines, org_chunk=chunk
+        )
+        return chunks
+
+    def process_batch(self, batch: List[Chunk]) -> List[Tuple[str, int]]:
+        """
+        Processes a single batch of document chunks.
+
+        Args:
+            batch: List[Chunk] the document chunks to process
+
+        Returns:
+            List[Tuple[str, int]] the extracted outlines
+        """
+        outlines = []
+        current_outlines = []
+
+        for c in batch:
+            # Pass the outlines extracted so far as context
+            outline = self.llm.invoke(
+                {"input": c.content, "current_outline": current_outlines}, self.prompt
+            )
+
+            # Filter out invalid outlines
+            # Parallel mode could use a larger outline set; no good approach yet: TODO
+            if self.align_parallel:
+                valid_outlines = self.filter_outlines_parallel(outline)
+            else:
+                valid_outlines = self.filter_outlines(outline)
+            outlines.extend(valid_outlines)
+            current_outlines.extend(valid_outlines)
+
+        return outlines
+
+    def align_outlines(self, outlines):
+        """
+        Aligns the extracted outline levels with the LLM, using the trailing 30% of
+        the previously aligned batch as the overlap between batches.
+
+        Args:
+            outlines: List[Tuple[str, int]] the raw outline list
+
+        Returns:
+            List[Tuple[str, int]] the aligned outline list
+        """
+        if not outlines:
+            return []
+
+        # Initialize the align prompt
+        align_prompt = PromptABC.from_config(
+            {"type": "outline_align", "language": KAG_PROJECT_CONF.language}
+        )
+
+        max_length = 4000
+
+        try:
+            # Process the first batch
+            current_batch = []
+            aligned_outlines = []
+
+            for outline in outlines:
+                # Compute the total stringified length after adding the current outline
+                test_batch = current_batch + [outline]
+                batch_str = str(test_batch)  # stringify the whole batch to measure its length
+
+                if len(batch_str) <= max_length:
+                    current_batch.append(outline)
+                else:
+                    break
+
+            # Align the first batch
+            if current_batch:
+                aligned_batch = self.llm.invoke(
+                    {"outlines": current_batch}, align_prompt
+                )
+                aligned_outlines.extend(aligned_batch)
+                last_aligned = aligned_batch
+
+            # Process the remaining outlines
+            remaining_outlines = outlines[len(current_batch) :]
+
+            while remaining_outlines:
+                # Use the trailing 30% of the previous batch as the overlap
+                overlap_count = max(1, len(last_aligned) * 30 // 100)
+                overlap_part = last_aligned[-overlap_count:]
+
+                # Build the new batch
+                current_batch = []
+
+                # Add new outlines until the length limit is reached
+                for outline in remaining_outlines:
+                    test_batch = overlap_part + current_batch + [outline]
+                    batch_str = str(test_batch)
+
+                    if len(batch_str) <= max_length:
+                        current_batch.append(outline)
+                    else:
+                        break
+
+                if not current_batch:
+                    # If no new outline fits, a single outline is too long and
+                    # needs special handling
+                    logger.warning(
+                        "Single outline too long, processing individually"
+                    )
+                    current_batch = [remaining_outlines[0]]
+
+                # Align the current batch (including the overlap part)
+                full_batch = overlap_part + current_batch
+                aligned_batch = self.llm.invoke(
+                    {"outlines": full_batch}, align_prompt
+                )
+
+                # Keep only the non-overlap part of the result
+                aligned_outlines.extend(aligned_batch[overlap_count:])
+                last_aligned = aligned_batch
+
+                # Update remaining_outlines
+                remaining_outlines = remaining_outlines[len(current_batch) :]
+
+            return aligned_outlines
+
+        except Exception as e:
+            logger.error(f"Error aligning outlines with LLM: {str(e)}")
+            return self._rule_based_align(outlines)
+
+    def align_outlines_parallel(self, outlines):
+        """
+        Aligns outlines in parallel; each batch shares a 30% overlap with its neighbors.
+
+        Args:
+            outlines: List[Tuple[str, int]] the raw outline list
+
+        Returns:
+            List[Tuple[str, int]] the aligned outline list
+        """
+        if not outlines:
+            return []
+
+        # Initialize the align prompt
+        language = os.getenv("KAG_PROMPT_LANGUAGE", "zh")
+        align_prompt = OutlineAlignPrompt(language)
+        max_length = 8000
+
+        try:
+            # Split the outlines into batches, each under max_length when stringified
+            batches = []
+            current_batch = []
+
+            for outline in outlines:
+                test_batch = current_batch + [outline]
+                batch_str = str(test_batch)
+
+                if len(batch_str) <= max_length:
+                    current_batch.append(outline)
+                else:
+                    if current_batch:
+                        batches.append(current_batch)
+                    current_batch = [outline]
+
+            if current_batch:
+                batches.append(current_batch)
+
+            # Process each batch in parallel
+            futures = []
+            with ThreadPoolExecutor(max_workers=self.workers) as executor:
+                for i, batch in enumerate(batches):
+                    # Overlap with the previous batch
+                    prev_overlap = []
+                    if i > 0:
+                        prev_batch = batches[i - 1]
+                        overlap_count = max(1, len(prev_batch) * 30 // 100)
+                        prev_overlap = prev_batch[-overlap_count:]
+
+                    # Overlap with the next batch
+                    next_overlap = []
+                    if i < len(batches) - 1:
+                        next_batch = batches[i + 1]
+                        overlap_count = max(1, len(next_batch) * 30 // 100)
+                        next_overlap = next_batch[:overlap_count]
+
+                    # Build the full batch (including the overlaps)
+                    full_batch = prev_overlap + batch + next_overlap
+
+                    # Submit the task to the thread pool
+                    future = executor.submit(
+                        self.llm.invoke, {"outlines": full_batch}, align_prompt
+                    )
+                    futures.append((i, future, len(prev_overlap), len(next_overlap)))
+
+            # Collect the results and merge them in the original order
+            results = [None] * len(batches)
+            for i, future, prev_len, next_len in futures:
+                try:
+                    aligned_batch = future.result()
+                    # Keep only the non-overlap part
+                    results[i] = aligned_batch[prev_len : len(aligned_batch) - next_len]
+                except Exception as e:
+                    logger.error(f"Error processing batch {i}: {str(e)}")
+                    # On failure, fall back to rule-based alignment for this batch
+                    results[i] = self._rule_based_align(batches[i])
+
+            # Merge all results
+            aligned_outlines = []
+            for batch_result in results:
+                aligned_outlines.extend(batch_result)
+
+            return aligned_outlines
+
+        except Exception as e:
+            logger.error(f"Error aligning outlines with LLM: {str(e)}")
+            return self._rule_based_align(outlines)
+
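Both alignment paths above cap each LLM batch by the stringified length of the outline list and carry roughly 30% of the previous batch forward so levels stay consistent across batch boundaries. The bookkeeping in isolation, with the LLM call stubbed out as an identity function (a sketch for checking the slicing, not the PR's API):

# Standalone sketch of the 30% overlap batching used by align_outlines above.
def align_in_batches(outlines, max_length=4000, align=lambda batch: batch):
    aligned, remaining, last = [], list(outlines), []
    while remaining:
        overlap = last[-max(1, len(last) * 30 // 100):] if last else []
        batch = []
        for outline in remaining:
            if len(str(overlap + batch + [outline])) <= max_length:
                batch.append(outline)
            else:
                break
        if not batch:  # a single outline exceeds max_length: process it alone
            batch = [remaining[0]]
        result = align(overlap + batch)
        aligned.extend(result[len(overlap):])  # keep only the non-overlap part
        last = result
        remaining = remaining[len(batch):]
    return aligned

# With the identity stub, alignment must return the input unchanged.
assert align_in_batches([("第一章", 1), ("第二章", 1)]) == [("第一章", 1), ("第二章", 1)]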
+    def _rule_based_align(self, outlines):
+        """
+        Rule-based outline alignment (fallback strategy).
+        """
+        # Keep the original rule-based alignment logic as a fallback
+        title_patterns = {
+            "chapter": r"第[一二三四五六七八九十\d]+章",
+            "section": r"第[一二三四五六七八九十\d]+节",
+            "part": r"第[一二三四五六七八九十\d]+部分",
+            "article": r"第[一二三四五六七八九十\d]+条",
+        }
+
+        pattern_levels = {"chapter": 1, "section": 2, "part": 1, "article": 3}
+
+        aligned_outlines = []
+        for title, level in outlines:
+            matched_pattern = None
+            for pattern_type, pattern in title_patterns.items():
+                if re.search(pattern, title):
+                    matched_pattern = pattern_type
+                    break
+
+            if matched_pattern:
+                aligned_level = pattern_levels[matched_pattern]
+            else:
+                aligned_level = level
+
+            aligned_outlines.append((title, aligned_level))
+
+        return aligned_outlines
+
+    def outline_chunk_batch(self, chunk: List[Chunk]) -> List[Chunk]:
+        """
+        Processes document chunks in batches and extracts their outlines.
+
+        Args:
+            chunk: List[Chunk] the input document chunks
+
+        Returns:
+            List[Chunk] the processed document chunks
+        """
+        assert isinstance(chunk, list)
+        self.batch_size = len(chunk) // self.workers if len(chunk) > self.workers else 1
+
+        outlines = []
+        # Split the chunks into batches; to keep outline extraction continuous,
+        # each batch must consist of consecutive chunks
+        batches = [
+            chunk[i : i + self.batch_size]
+            for i in range(0, len(chunk), self.batch_size)
+        ]
+
+        mapping = {}
+        futures = []
+        with ThreadPoolExecutor(max_workers=self.workers) as executor:
+            # Submit each batch to the thread pool
+            for idx, batch in enumerate(batches):
+                future = executor.submit(self.process_batch, batch)
+                mapping[future] = idx
+                futures.append(future)
+
+            results = [0] * len(batches)
+            # Wait for all batches to finish and collect the results
+            for future in as_completed(futures):
+                results[mapping[future]] = future.result()
+                # logger.info(f"outline batch{mapping[future]} done")
+
+        for result in results:
+            outlines.extend(result)
+
+        content = "\n".join([c.content for c in chunk])
+
+        if self.align_parallel:
+            aligned_outlines = self.align_outlines_parallel(outlines)
+        else:
+            aligned_outlines = self.align_outlines(outlines)
+        # Split the content using the aligned outlines
+        chunks = self.sep_by_outline_with_outline_tree(
+            content, aligned_outlines, org_chunk=chunk
+        )
+        return chunks
+
+    def filter_outlines_parallel(self, raw_outlines):
+        """
+        Filters out invalid titles, keeping only titles that carry a numeric feature.
+        Numeric features include:
+        1. Arabic numerals (0-9)
+        2. Chinese numerals (一二三...百千万亿)
+        3. Roman numerals (I, II, III, IV...)
+        4. Enumeration marks (①, ②, ③...)
+        5. Common markers that combine a number with a label (第x章, x.x, (x)), etc.
+        """
+        # Pattern matching invalid titles made of digits and punctuation only
+        invalid_pattern = r"""
+            ^                                                   # start of string
+            [0-9一二三四五六七八九十零IIVXLCDM\-.\(\)\[\]\s]*    # digits and punctuation
+            $                                                   # end of string
+        """
+
+        # Patterns matching numeric features
+        number_pattern = r"""
+            \d+ |                                          # Arabic numerals
+            [一二三四五六七八九十百千万亿]+ |                 # Chinese numerals
+            [ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+ |                             # full-width Roman numerals
+            [IVXLCDMivxlcdm]+ |                            # ASCII Roman numerals
+            [①②③④⑤⑥⑦⑧⑨⑩]+ |                                # circled numbers
+            [⑴⑵⑶⑷⑸⑹⑺⑻⑼⑽]+ |                               # parenthesized numbers
+            第[一二三四五六七八九十百千万\d]+[章节篇部] |      # 第x章/节/篇/部
+            [第]?[0-9一二三四五六七八九十百千万]+[条] |        # (第)x条
+            \d+\.\d+ |                                     # hierarchical numbers (e.g. 1.1)
+            [(]\d+[)]                                     # bracketed numbers
+        """
+
+        valid_outlines = []
+        for title, level in raw_outlines:
+            title = title.strip()
+            # Drop titles made of digits and punctuation only
+            if re.fullmatch(invalid_pattern, title, re.VERBOSE):
+                continue
+            # Require at least one numeric feature
+            if not re.search(number_pattern, title, re.VERBOSE):
+                continue
+            valid_outlines.append((title, level))
+
+        return valid_outlines
+
+    def filter_outlines(self, raw_outlines):
+        """
+        Filters titles, keeping only those with an explicit section level.
+
+        Section levels form four tiers:
+        Level 1 (highest): 篇, 卷, 部, 编
+        Level 2: 章
+        Level 3: 节
+        Level 4: 小节, 款, 项, 目
+
+        Several common writing styles are supported:
+        - with "第": 第一章, 第1章
+        - without "第": 一, 1, (一), (1)
+        - number types: Arabic, Chinese, and Roman numerals
+        """
+        # Number patterns
+        numbers = r"""
+            (?:
+                (?:[一二三四五六七八九十百千万]+) |            # Chinese numerals
+                (?:\d+) |                                    # Arabic numerals
+                (?:[IVXLCDMivxlcdm]+) |                      # Roman numerals
+                (?:①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳) |                # circled numbers
+                (?:\(\d+\)) |                                # (1)
+                (?:\((?:[一二三四五六七八九十]+)\))           # (一)
+            )
+        """
+
+        # Section keywords
+        level1_words = r"(?:篇|卷|部|编)"
+        level2_words = r"(?:章)"
+        level3_words = r"(?:节)"
+        level4_words = r"(?:小节|款|项|目)"
+
+        # Full section-title pattern
+        section_pattern = rf"""
+            ^                                                    # start of string
+            (?:
+                (?:第\s*{numbers}\s*(?:{level1_words})) |        # 第x篇/卷/部/编
+                (?:第\s*{numbers}\s*(?:{level2_words})) |        # 第x章
+                (?:第\s*{numbers}\s*(?:{level3_words})) |        # 第x节
+                (?:第\s*{numbers}\s*(?:{level4_words})) |        # 第x小节/款/项/目
+                (?:{numbers}\s*[、.\s]\s*(?:{level1_words})) |   # x、篇/卷/部/编
+                (?:{numbers}\s*[、.\s]\s*(?:{level2_words})) |   # x、章
+                (?:{numbers}\s*[、.\s]\s*(?:{level3_words})) |   # x、节
+                (?:{numbers}\s*[、.\s]\s*(?:{level4_words}))     # x、小节/款/项/目
+            )
+            [\s\S]*                                              # rest of the title
+            $                                                    # end of string
+        """
+
+        def determine_level(title: str) -> int:
+            """Determines the level from the title content."""
+            if any(word in title for word in level1_words.strip("(?:)").split("|")):
+                return 1
+            elif any(word in title for word in level2_words.strip("(?:)").split("|")):
+                return 2
+            elif any(word in title for word in level3_words.strip("(?:)").split("|")):
+                return 3
+            elif any(word in title for word in level4_words.strip("(?:)").split("|")):
+                return 4
+            return 0  # no level matched
+
+        valid_outlines = []
+        for title, level in raw_outlines:
+            title = title.strip()
+            # Check whether this is a valid section title
+            if re.match(section_pattern, title, re.VERBOSE):
+                # Determine the actual level from the title content
+                actual_level = determine_level(title)
+                if actual_level > 0:  # keep only titles whose level could be determined
+                    valid_outlines.append((title, actual_level))
+
+        return valid_outlines
+
+    def unify_outline_levels(self, outlines):
+        """
+        Unifies the levels of titles of the same type, e.g. "第一节" and "第二节"
+        should share the same level.
+
+        Args:
+            outlines (list): The extracted titles as [(title text, level), ...].
+
+        Returns:
+            list: The adjusted titles in the same format as the input.
+        """
+        if not outlines:
+            return []
+
+        # Helper: check whether two titles belong to the same type
+        def is_same_type(title1, title2):
+            """
+            Checks whether two titles belong to the same type.
+            """
+            # Look for "章" or "节" (etc.) in both titles and treat similar numbering
+            # as the same type
+            keywords = ["章", "节", "部分", "篇"]
+            for keyword in keywords:
+                if keyword in title1 and keyword in title2:
+                    return True
+            return False
+
+        # Map each type to a level
+        type_to_level = {}
+        for title, level in outlines:
+            for keyword in ["章", "节", "部分", "篇"]:
+                if keyword in title:
+                    type_to_level.setdefault(keyword, level)
+
+        # Adjust the levels
+        unified_outlines = []
+        for title, level in outlines:
+            for keyword in ["章", "节", "部分", "篇"]:
+                if keyword in title and keyword in type_to_level:
+                    level = type_to_level[keyword]
+                    break
+            unified_outlines.append((title, level))
+
+        return unified_outlines
+
+    def sep_by_outline(self, content, outlines):
+        """
+        Splits the content into chunks by outline level, dropping invalid titles.
+        """
+        # Filter out invalid outlines
+        outlines = self.filter_outlines(outlines)
+
+        position_check = []
+        for outline in outlines:
+            start = content.find(outline[0])
+            if start != -1:
+                position_check.append((outline, start))
+
+        if not position_check:
+            return []  # no titles were found in the content
+
+        chunks = []
+        father_stack = []
+
+        for idx, (outline, start) in enumerate(position_check):
+            title, level = outline
+            end = (
+                position_check[idx + 1][1]
+                if idx + 1 < len(position_check)
+                else len(content)
+            )
+            while father_stack and father_stack[-1][1] >= level:
+                father_stack.pop()
+            full_path = "/".join([item[0] for item in father_stack] + [title])
+            chunk_content = content[start:end]
+            chunk = Chunk(
+                id=generate_hash_id(f"{full_path}#{idx}"),
+                name=full_path,
+                content=chunk_content,
+            )
+            chunks.append(chunk)
+            father_stack.append((title, level))
+
+        return chunks
+
+    def sep_by_outline_with_merge(
+        self, content, outlines, min_length=200, max_length=5000
+    ):
+        """
+        Splits the content into chunks by outline level and merges chunks that are
+        too short, keeping the merged length under control.
+
+        Args:
+        - content: str, the full content.
+        - outlines: List[Tuple[str, int]], each title with its level.
+        - min_length: int, the minimum chunk length; shorter chunks are merged.
+        - max_length: int, the maximum chunk length after merging.
+
+        Returns:
+        - List[Chunk], the resulting chunks.
+        """
+        # Filter out invalid outlines
+        outlines = self.filter_outlines(outlines)
+
+        position_check = []
+        for outline in outlines:
+            start = content.find(outline[0])
+            if start != -1:
+                position_check.append((outline, start))
+
+        if not position_check:
+            return []  # no titles were found in the content
+
+        chunks = []
+        father_stack = []
+
+        for idx, (outline, start) in enumerate(position_check):
+            title, level = outline
+            end = (
+                position_check[idx + 1][1]
+                if idx + 1 < len(position_check)
+                else len(content)
+            )
+            while father_stack and father_stack[-1][1] >= level:
+                father_stack.pop()
+            full_path = "/".join([item[0] for item in father_stack] + [title])
+            chunk_content = content[start:end]
+            chunk = Chunk(
+                id=generate_hash_id(f"{full_path}#{idx}"),
+                name=full_path,
+                content=chunk_content,
+            )
+            chunks.append(chunk)
+            father_stack.append((title, level))
+
+        # Merge chunks that are too short
+        merged_chunks = []
+        buffer = None
+
+        for chunk in chunks:
+            if buffer:
+                # Try to merge the current chunk into the buffer
+                if (
+                    chunk.name.startswith(buffer.name)  # same parent path
+                    and len(buffer.content) + len(chunk.content) <= max_length
+                ):
+                    buffer.content += chunk.content
+                    buffer.name = buffer.name  # keep the parent path as the name
+                    continue
+                else:
+                    merged_chunks.append(buffer)
+                    buffer = None
+
+            if len(chunk.content) < min_length:
+                # Buffer the short chunk
+                buffer = chunk
+            else:
+                # Long enough: add it to the result directly
+                merged_chunks.append(chunk)
+
+        # If the last chunk is still buffered, add it to the result
+        if buffer:
+            merged_chunks.append(buffer)
+
+        return merged_chunks
+
+    def split_sentence(self, content):
+        """
+        Splits the given content into sentences based on delimiters.
+
+        Args:
+            content (str): The content to be split.
+
+        Returns:
+            list: A list of sentences.
+        """
+        sentence_delimiters = ".。??!!"
+        output = []
+        start = 0
+        for idx, char in enumerate(content):
+            if char in sentence_delimiters:
+                end = idx
+                tmp = content[start : end + 1].strip()
+                if len(tmp) > 0:
+                    output.append(tmp)
+                start = idx + 1
+        res = content[start:]
+        if len(res) > 0:
+            output.append(res)
+        return output
+
+    def slide_window_chunk(
+        self,
+        org_chunk: Chunk,
+        chunk_size: int = 2000,
+        window_length: int = 300,
+        sep: str = "\n",
+    ) -> List[Chunk]:
+        """
+        Splits the content into chunks using a sliding window approach.
+
+        Args:
+            org_chunk (Chunk): The original chunk to be split.
+            chunk_size (int, optional): The maximum size of each chunk. Defaults to 2000.
+            window_length (int, optional): The length of the overlap between chunks. Defaults to 300.
+            sep (str, optional): The separator used to join sentences. Defaults to "\n".
+
+        Returns:
+            List[Chunk]: A list of Chunk objects.
+        """
+        if org_chunk.type == ChunkTypeEnum.Table:
+            table_chunks = self.split_table(
+                org_chunk=org_chunk, chunk_size=chunk_size, sep=sep
+            )
+            if table_chunks is not None:
+                return table_chunks
+        content = self.split_sentence(org_chunk.content)
+        splitted = []
+        cur = []
+        cur_len = 0
+        for sentence in content:
+            if cur_len + len(sentence) > chunk_size:
+                if cur:
+                    splitted.append(cur)
+                tmp = []
+                cur_len = 0
+                for item in cur[::-1]:
+                    if cur_len >= window_length:
+                        break
+                    tmp.append(item)
+                    cur_len += len(item)
+                cur = tmp[::-1]
+
+            cur.append(sentence)
+            cur_len += len(sentence)
+        if len(cur) > 0:
+            splitted.append(cur)
+
+        output = []
+        for idx, sentences in enumerate(splitted):
+            chunk = Chunk(
+                id=generate_hash_id(f"{org_chunk.id}#{idx}"),
+                name=f"{org_chunk.name}#{idx}",
+                content=sep.join(sentences),
+                type=org_chunk.type,
+                **org_chunk.kwargs,
+            )
+            output.append(chunk)
+        return output
+
+    def sep_by_outline_ignore_duplicates(
+        self, content, outlines, min_length=50, max_length=500, org_chunk=None
+    ):
+        """
+        Splits the content into chunks by outline level, dropping invalid titles and
+        ignoring duplicated titles.
+
+        Args:
+        - content: str, the full content.
+        - outlines: List[Tuple[str, int]], each title with its level.
+        - min_length: int, the minimum chunk length; shorter chunks are merged.
+        - max_length: int, the maximum chunk length after merging.
+
+        Returns:
+        - List[Chunk], the resulting chunks.
+        """
+
+        if not outlines or len(outlines) == 0:
+            cutted = []
+            if isinstance(org_chunk, list):
+                for item in org_chunk:
+                    cutted.extend(self.slide_window_chunk(item))
+            return cutted
+
+        position_check = []
+        seen_titles = set()
+        for outline in outlines:
+            title, level = outline
+            start = content.find(title)
+            if start != -1 and title not in seen_titles:
+                # Keep this title only if position_check is empty or its start comes
+                # after the previous title's start
+                if not position_check or start > position_check[-1][1]:
+                    position_check.append((outline, start))
+                else:
+                    # Otherwise skip this title
+                    continue
+                seen_titles.add(title)
+
+        if not position_check:
+            return []
+
+        chunks = []
+        father_stack = []
+
+        for idx, (outline, start) in enumerate(position_check):
+            title, level = outline
+            end = (
+                position_check[idx + 1][1]
+                if idx + 1 < len(position_check)
+                else len(content)
+            )
+            while father_stack and father_stack[-1][1] >= level:
+                father_stack.pop()
+            full_path = "/".join([item[0] for item in father_stack] + [title])
+            chunk_content = content[start:end]
+
+            # add origin kwargs
+            origin_properties = {}
+            for key, value in org_chunk[0].kwargs.items():
+                origin_properties[key] = value
+
+            chunk = Chunk(
+                id=generate_hash_id(f"{full_path}#{idx}"),
+                name=full_path,
+                content=chunk_content,
+                **origin_properties,
+                start=start,
+                end=end,
+            )
+            chunks.append(chunk)
+            father_stack.append((title, level))
+
+        # Dump the chunks with their start/end offsets
+        # dump_chunks_with_start_end(chunks, output_path="./start_end_chunk.json")
+
+        # Merge chunks that are too short
+        merged_chunks = []
+        buffer = None
+
+        for chunk in chunks:
+            if buffer:
+                # Try to merge the current chunk into the buffer
+                if (
+                    chunk.name.startswith(buffer.name)  # same parent path
+                    and len(buffer.content) + len(chunk.content) <= max_length
+                ):
+                    buffer.content += chunk.content
+                    continue
+                else:
+                    merged_chunks.append(buffer)
+                    buffer = None
+
+            if len(chunk.content) < min_length:
+                # Buffer the short chunk
+                buffer = chunk
+            else:
+                # Long enough: add it to the result directly
+                merged_chunks.append(chunk)
+
+        # If the last chunk is still buffered, add it to the result
+        if buffer:
+            merged_chunks.append(buffer)
+
+        for idx, chunk in enumerate(merged_chunks):
+            chunk.prev_content = merged_chunks[idx - 1].content if idx > 0 else None
+            chunk.next_content = (
+                merged_chunks[idx + 1].content if idx < len(merged_chunks) - 1 else None
+            )
+
+        return merged_chunks
+
+    def sep_by_outline_with_outline_tree(self, content, outlines, org_chunk=None):
+        """
+        Splits the content into chunks by outline level via a catalog tree, dropping
+        invalid titles and ignoring duplicated titles.
+
+        Args:
+        - content: str, the full content.
+        - outlines: List[Tuple[str, int]], each title with its level.
+
+        Returns:
+        - List[Chunk], the resulting chunks.
+        """
+
+        if not outlines or len(outlines) == 0:
+            cutted = []
+            if isinstance(org_chunk, list):
+                for item in org_chunk:
+                    cutted.extend(self.slide_window_chunk(item))
+            return cutted
+
+        position_check = []
+        seen_titles = set()
+        for outline in outlines:
+            title, level = outline
+            start = content.find(title)
+            if start != -1 and title not in seen_titles:
+                # Keep this title only if position_check is empty or its start comes
+                # after the previous title's start
+                if not position_check or start > position_check[-1][1]:
+                    position_check.append((outline, start))
+                else:
+                    # Otherwise skip this title
+                    continue
+                seen_titles.add(title)
+
+        for idx, (outline, start) in enumerate(position_check):
+            title, level = outline
+            end = (
+                position_check[idx + 1][1]
+                if idx + 1 < len(position_check)
+                else len(content)
+            )
+            position_check[idx] = (outline, start, end)
+
+        outlines_with_content = []
+        for outline, start, end in position_check:
+            title, level = outline
+            t_content = content[start:end]
+            sd_content = (start, end)
+            outlines_with_content.append((title, t_content, sd_content, level))
+
+        # Build the catalog tree
+        catalog_tree = self.build_catalog_tree(outlines_with_content)
+
+        # Simplify the catalog tree
+        # if catalog_tree:
+        #     for node in catalog_tree:
+        #         self.simplify_catalog_tree(node)
+
+        # add origin kwargs
+        origin_properties = {}
+        for key, value in org_chunk[0].kwargs.items():
+            origin_properties[key] = value
+
+        def generate_chunks(node, chunks=None, parent_title=""):
+            if chunks is None:
+                chunks = []
+
+            # Build the full title of the current node
+            full_title = (
+                "/".join([parent_title, node["title"]])
+                if parent_title
+                else node["title"]
+            )
+
+            # Generate a chunk for the current node
+            chunk_id = generate_hash_id(full_title)  # derive the ID from the full title
+            chunk = Chunk(
+                id=chunk_id,
+                name=full_title,  # use the full title as the name
+                content=node["content"],
+                # origin_properties is captured from the enclosing scope and carries
+                # the remaining chunk attributes
+                **origin_properties,
+                start=node["start"],
+                end=node["end"],
+            )
+            chunks.append(chunk)
+
+            # Recursively generate chunks for the children
+            for child in node.get("children", []):
+                generate_chunks(child, chunks, full_title)  # pass the full title down
+
+            return chunks
+
+        chunks = []
+        for node in catalog_tree:
+            chunks.extend(generate_chunks(node))
+
+        # Dump the chunks with their start/end offsets
+        # dump_chunks_with_start_end(chunks, output_path="./start_end_chunk.json")
+
+        # Merge chunks that are too short
+        merged_chunks = []
+        buffer = None
+
+        for chunk in chunks:
+            if buffer:
+                # Try to merge the current chunk into the buffer
+                if (
+                    chunk.name.startswith(buffer.name)  # same parent path
+                    and len(buffer.content) + len(chunk.content) <= self.chunk_size
+                ):
+                    buffer.content += chunk.content
+                    continue
+                else:
+                    merged_chunks.append(buffer)
+                    buffer = None
+
+            if len(chunk.content) < self.min_length:
+                # Buffer the short chunk
+                buffer = chunk
+            else:
+                # Long enough: add it to the result directly
+                merged_chunks.append(chunk)
+
+        # If the last chunk is still buffered, add it to the result
+        if buffer:
+            merged_chunks.append(buffer)
+
+        for i in range(len(merged_chunks) - 1, -1, -1):
+            chunk = merged_chunks[i]
+            if len(chunk.content) < (self.min_length * 0.5):
+                del merged_chunks[i]
+
+        for idx, chunk in enumerate(merged_chunks):
+            chunk.prev_content = merged_chunks[idx - 1].content if idx > 0 else None
+            chunk.next_content = (
+                merged_chunks[idx + 1].content if idx < len(merged_chunks) - 1 else None
+            )
+
+        return merged_chunks
+
+    def log(self, chunks, log_path="./chunk_log.txt"):
+        length_counts = collections.defaultdict(int)
+
+        for chunk in chunks:
+            length = len(chunk.content)
+            length_segment = length // 10
+            length_counts[length_segment] += 1
+
+        with open(log_path, "a") as f:
+            for length_segment, count in length_counts.items():
+                f.write(
+                    f"Length segment {length_segment*10}-{(length_segment+1)*10} chunks: {count}\n"
+                )
+
+        # Plot the length distribution
+        self.plot_length_distribution(length_counts)
+
+    def plot_length_distribution(self, length_counts):
+        segments = list(length_counts.keys())
+        counts = list(length_counts.values())
+
+        plt.figure(figsize=(10, 6))
+        plt.bar(segments, counts, color="blue")
+        plt.xlabel("Length Segment")
+        plt.ylabel("Number of Chunks")
+        plt.title("Chunk Length Distribution")
+        plt.xticks(segments)
+        plt.savefig("chunk_length_distribution.png")
+
+    def splitter_chunk(self, input: Input, **kwargs) -> List[Chunk]:
+        cutted = []
+        chunk_size = kwargs.get("chunk_size")
+        if isinstance(input, list):
+            for item in input:
+                cutted.extend(self.slide_window_chunk(item, chunk_size=chunk_size))
+        else:
+            cutted.extend(self.slide_window_chunk(input, chunk_size=chunk_size))
+        return cutted
+
+    def invoke(self, input: Input, **kwargs) -> List[Chunk]:
+        chunks = self.splitter_chunk(input, chunk_size=self.llm_max_tokens // 2)
+        chunks = self.outline_chunk_batch(chunks)
+        # chunks = self.splitter_chunk(chunks, chunk_size=self.chunk_size)
+        # self.log(chunks)
+        return chunks
+
 
 if __name__ == "__main__":
     from kag.builder.component.splitter.length_splitter import LengthSplitter
-    from kag.builder.component.splitter.outline_splitter import OutlineSplitter
     from kag.builder.component.reader.docx_reader import DocxReader
-    from kag.common.env import init_kag_config
 
-    init_kag_config(os.path.join(os.path.dirname(__file__),"../../../../tests/builder/component/test_config.cfg"))
+    from kag.builder.component.reader.txt_reader import TXTReader
+    from kag.builder.component.reader.pdf_reader import PDFReader
+
+    pdf_reader = PDFReader()
     docx_reader = DocxReader()
-    length_splitter = LengthSplitter(split_length=8000)
-    outline_splitter = OutlineSplitter()
-    docx_path = os.path.join(os.path.dirname(__file__),"../../../../tests/builder/data/test_docx.docx")
-    # chain = docx_reader >> length_splitter >> outline_splitter
-    chunk = docx_reader.invoke(docx_path)
-    chunks = length_splitter.invoke(chunk)
-    chunks = outline_splitter.invoke(chunks)
-    print(chunks)
\ No newline at end of file
+    txt_reader = TXTReader()
+    length_splitter = LengthSplitter(split_length=5000)
+
+    llm = LLMClient.from_config(KAG_CONFIG.all_config["llm"])
+    outline_splitter = OutlineSplitter(llm=llm)
+    txt_path = os.path.join(
os.path.dirname(__file__), "../../../../tests/builder/data/儿科学_short.txt" + ) + docx_path = "/Users/zhangxinhong.zxh/Downloads/waikexue_short.docx" + test_dir = "/Users/zhangxinhong.zxh/Downloads/1127_medkag_book" + pdf_path = "/Users/zhangxinhong.zxh/Downloads/toaz.info-5dsm-5-pr_56e68a629dc4fe62699960dd5afbe362.pdf" + files = [ + os.path.join(test_dir, file) + for file in os.listdir(test_dir) + if file.endswith(".docx") + ] + files = [ + files[0], + ] + + def process_file(file): + chain = docx_reader >> outline_splitter + chunks = chain.invoke(file, max_workers=10) + dump_chunks(chunks, output_path=file.replace(".docx", ".json")) + + def process_txt(txt): + chain = txt_reader >> outline_splitter + chunks = chain.invoke(txt, max_workers=10) + dump_chunks(chunks, output_path=txt.replace(".txt", ".json")) + + def process_file_without_chain(file): + chunk = docx_reader.invoke(file) + chunks = outline_splitter.invoke(chunk) + dump_chunks(chunks, output_path=file.replace(".docx", ".json")) + + def process_txt_without_chain(txt): + chunk = txt_reader.invoke(txt) + chunks = outline_splitter.invoke(chunk) + dump_chunks(chunks, output_path=txt.replace(".txt", ".json")) + + def process_pdf_without_chain(pdf): + chunk = pdf_reader.invoke(pdf) + chunks = outline_splitter.invoke(chunk) + dump_chunks(chunks, output_path=pdf.replace(".pdf", ".json")) + + # with ThreadPoolExecutor(max_workers=10) as executor: + # futures = [executor.submit(process_file, file) for file in files] + + # for future in as_completed(futures): + # print(future.result()) + + process_file_without_chain(docx_path) + a = 1 + # chunk = docx_reader.invoke(docx_path) + # chunk = txt_reader.invoke(txt_path) + # chunks = length_splitter.invoke(chunk) + # chunks = outline_splitter.invoke(chunks) + # print(chunks) diff --git a/kag/builder/component/splitter/pattern_splitter.py b/kag/builder/component/splitter/pattern_splitter.py index 0b72f265..32f4737c 100644 --- a/kag/builder/component/splitter/pattern_splitter.py +++ b/kag/builder/component/splitter/pattern_splitter.py @@ -10,27 +10,37 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -from typing import Type, List, Union +# flake8: noqa import re -import os +from typing import Type, List, Union -from kag.builder.model.chunk import Chunk, ChunkTypeEnum -from kag.interface.builder.splitter_abc import SplitterABC + +from kag.builder.model.chunk import Chunk +from kag.interface import SplitterABC +from kag.common.utils import generate_hash_id from knext.common.base.runnable import Input, Output +@SplitterABC.register("pattern") +@SplitterABC.register("pattern_splitter") class PatternSplitter(SplitterABC): - def __init__(self, pattern_dict: dict = None, chunk_cut_num=None): + """ + A class for splitting text content based on specified patterns and chunking strategies. + """ + + def __init__(self, pattern_dict: dict = None, chunk_cut_num: int = None): """ - pattern_dict: - { - "pattern": 匹配pattern, - "group": { - "header":1, - "name":2, - "content":3 - } - } + Initializes the PatternSplitter with the given pattern dictionary and chunk cut number. + + Args: + pattern_dict (dict, optional): A dictionary containing the pattern and group mappings. + Defaults to a predefined pattern if not provided. + Example: + { + "pattern": r"(\d+).([^0-9]+?)?([^0-9第版].*?)(?=\d+\.|$)", + "group": {"header": 2, "name": 2, "content": 0} + } + chunk_cut_num (int, optional): The number of characters to cut chunks into. Defaults to None. 
""" super().__init__() if pattern_dict is None: @@ -53,6 +63,15 @@ def output_types(self) -> Type[Output]: return List[Chunk] def split_sentence(self, content): + """ + Splits the given content into sentences based on delimiters. + + Args: + content (str): The content to be split into sentences. + + Returns: + List[str]: A list of sentences extracted from the content. + """ sentence_delimiters = "。??!!;;\n" output = [] start = 0 @@ -76,7 +95,19 @@ def slide_window_chunk( sep: str = "\n", prefix: str = "SlideWindow", ) -> List[Chunk]: + """ + Splits the content into chunks using a sliding window approach. + Args: + content (Union[str, List[str]]): The content to be chunked. + chunk_size (int, optional): The maximum size of each chunk. Defaults to 2000. + window_length (int, optional): The length of the sliding window. Defaults to 300. + sep (str, optional): The separator to join sentences within a chunk. Defaults to "\n". + prefix (str, optional): The prefix to use for chunk names. Defaults to "SlideWindow". + + Returns: + List[Chunk]: A list of Chunk objects representing the chunked content. + """ if isinstance(content, str): content = self.split_sentence(content) splitted = [] @@ -103,7 +134,7 @@ def slide_window_chunk( for idx, sentences in enumerate(splitted): chunk_name = f"{prefix}#{idx}" chunk = Chunk( - id=Chunk.generate_hash_id(chunk_name), + id=generate_hash_id(chunk_name), name=chunk_name, content=sep.join(sentences), ) @@ -114,6 +145,15 @@ def chunk_split( self, chunk: Chunk, ) -> List[Chunk]: + """ + Splits the given chunk into smaller chunks based on the pattern and chunk cut number. + + Args: + chunk (Chunk): The chunk to be split. + + Returns: + List[Chunk]: A list of smaller Chunk objects. + """ text = chunk.content pattern = re.compile(self.pattern, re.DOTALL) @@ -127,7 +167,7 @@ def chunk_split( chunk = Chunk( chunk_header=match.group(self.group["header"]), name=match.group(self.group["name"]), - id=Chunk.generate_hash_id(match.group(self.group["content"])), + id=generate_hash_id(match.group(self.group["content"])), content=match.group(self.group["content"]), ) chunk = [chunk] @@ -145,43 +185,16 @@ def chunk_split( return chunks - def invoke(self, input: Chunk, **kwargs) -> List[Output]: + def _invoke(self, input: Chunk, **kwargs) -> List[Output]: + """ + Invokes the chunk splitting process on the given input. + + Args: + input (Chunk): The input chunk to be processed. + **kwargs: Additional keyword arguments, currently unused but kept for potential future expansion. + Returns: + List[Output]: A list of output chunks. 
+ """ chunks = self.chunk_split(input) return chunks - - def to_rest(self): - pass - - @classmethod - def from_rest(cls, rest_model): - pass - - -class LayeredPatternSpliter(PatternSplitter): - pass - - -def _test(): - pattern_dict = { - "pattern": r"(\d+)\.([^0-9]+?)?([^0-9第版].*?)(?=\d+\.|$)", - "group": {"header": 2, "name": 2, "content": 0}, - } - ds = PatternSplitter(pattern_dict=pattern_dict) - from kag.builder.component.reader.pdf_reader import PDFReader - - reader = PDFReader() - file_path = os.path.dirname(__file__) - test_file_path = os.path.join(file_path, "../../../../tests/builder/data/aiwen.pdf") - pre_output = reader._handle(test_file_path) - - handle_input = pre_output[0] - handle_result = ds._handle(handle_input) - print("handle_result", handle_result) - - return handle_result - - -if __name__ == "__main__": - res = _test() - print(res) diff --git a/kag/builder/component/splitter/semantic_splitter.py b/kag/builder/component/splitter/semantic_splitter.py index 40ba22b2..d5e15391 100644 --- a/kag/builder/component/splitter/semantic_splitter.py +++ b/kag/builder/component/splitter/semantic_splitter.py @@ -10,41 +10,56 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. import logging -import os import re from typing import List, Type -from kag.interface.builder import SplitterABC +from kag.interface import SplitterABC from kag.builder.prompt.semantic_seg_prompt import SemanticSegPrompt from kag.builder.model.chunk import Chunk +from kag.interface import LLMClient +from kag.common.conf import KAG_PROJECT_CONF +from kag.common.utils import generate_hash_id from knext.common.base.runnable import Input, Output -from kag.common.llm.client.llm_client import LLMClient logger = logging.getLogger(__name__) +@SplitterABC.register("semantic") +@SplitterABC.register("semantic_splitter") class SemanticSplitter(SplitterABC): """ A class for semantically splitting text into smaller chunks based on the content's structure and meaning. - Inherits from the Splitter class. + Inherits from the SplitterABC class. - Attributes: - kept_char_pattern (re.Pattern): Regex pattern to match Chinese/ASCII characters. - split_length (int): The maximum length of each chunk after splitting. - llm_client (LLMClient): Instance of LLMClient initialized with `model` config. - semantic_seg_op (SemanticSegPrompt): Instance of SemanticSegPrompt for semantic segmentation. """ - def __init__(self, split_length: int = 1000, **kwargs): - super().__init__(**kwargs) + def __init__( + self, + llm: LLMClient, + kept_char_pattern: str = None, + split_length: int = 1000, + ): + """ + Initializes the SemanticSplitter with the given LLMClient, kept character pattern, and split length. + + Args: + llm (LLMClient): Instance of LLMClient initialized with `model` config. + kept_char_pattern (str, optional): Regex pattern to match Chinese/ASCII characters. + Defaults to a predefined pattern if not provided. + split_length (int, optional): The maximum length of each chunk after splitting. Defaults to 1000. + **kwargs: Additional keyword arguments to be passed to the superclass. 
+ """ + super().__init__() # Chinese/ASCII characters - self.kept_char_pattern = re.compile( - r"[^\u4e00-\u9fa5\u3000-\u303F\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\x00-\x7F]+" - ) - self.split_length = int(split_length) - self.llm = self._init_llm() - language = os.getenv("KAG_PROMPT_LANGUAGE", "zh") - self.semantic_seg_op = SemanticSegPrompt(language) + if kept_char_pattern is None: + self.kept_char_pattern = re.compile( + r"[^\u4e00-\u9fa5\u3000-\u303F\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\x00-\x7F]+" + ) + else: + self.kept_char_pattern = re.compile(kept_char_pattern) + self.split_length = split_length + self.llm = llm + self.semantic_seg_op = SemanticSegPrompt(KAG_PROJECT_CONF.language) @property def input_types(self) -> Type[Input]: @@ -103,6 +118,8 @@ def semantic_chunk( """ result = self.llm.invoke({"input": org_chunk.content}, self.semantic_seg_op) splitted = self.parse_llm_output(org_chunk.content, result) + if len(splitted) == 0: + return [org_chunk] logger.debug(f"splitted = {splitted}") chunks = [] for idx, item in enumerate(splitted): @@ -113,30 +130,26 @@ def semantic_chunk( name=f"{org_chunk.name}#{split_name}", content=item["content"], abstract=item["name"], - **org_chunk.kwargs + **org_chunk.kwargs, ) chunks.append(chunk) else: print("chunk over size") innerChunk = Chunk( - id=Chunk.generate_hash_id(item["content"]), + id=generate_hash_id(item["content"]), name=f"{org_chunk.name}#{split_name}", content=item["content"], ) - chunks.extend( - self.semantic_chunk( - innerChunk, chunk_size - ) - ) + chunks.extend(self.semantic_chunk(innerChunk, chunk_size)) return chunks - def invoke(self, input: Input, **kwargs) -> List[Output]: + def _invoke(self, input: Input, **kwargs) -> List[Output]: """ Invokes the splitting process on the provided input. Args: input (Input): The input to be processed. - **kwargs: Additional keyword arguments. + **kwargs: Additional keyword arguments, currently unused but kept for potential future expansion. Returns: List[Output]: A list of outputs generated from the input. diff --git a/kag/builder/component/vectorizer/__init__.py b/kag/builder/component/vectorizer/__init__.py index 93aa6cd4..e69de29b 100644 --- a/kag/builder/component/vectorizer/__init__.py +++ b/kag/builder/component/vectorizer/__init__.py @@ -1,11 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. diff --git a/kag/builder/component/vectorizer/batch_vectorizer.py b/kag/builder/component/vectorizer/batch_vectorizer.py index 208f8e9f..9a2b1125 100644 --- a/kag/builder/component/vectorizer/batch_vectorizer.py +++ b/kag/builder/component/vectorizer/batch_vectorizer.py @@ -9,17 +9,18 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
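Before the vectorizer changes below, a usage note on the semantic splitter above: with _init_llm() gone, SemanticSplitter now receives its LLM client explicitly. A minimal wiring sketch, reusing the LLMClient.from_config(KAG_CONFIG.all_config["llm"]) call from the outline splitter's __main__ block; the placeholder Chunk and the public invoke entry point (inherited from the ABC, as in the other splitters) are assumptions:

from kag.common.conf import KAG_CONFIG
from kag.interface import LLMClient
from kag.builder.model.chunk import Chunk
from kag.builder.component.splitter.semantic_splitter import SemanticSplitter

# A minimal sketch, assuming the project config carries an "llm" section as in
# the outline splitter's __main__ block above.
llm = LLMClient.from_config(KAG_CONFIG.all_config["llm"])
splitter = SemanticSplitter(llm=llm, split_length=1000)

doc = Chunk(id="demo", name="demo", content="...")  # placeholder content
chunks = splitter.invoke(doc)  # falls back to [doc] if the LLM returns no splits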
-import os from collections import defaultdict from typing import List +from tenacity import stop_after_attempt, retry from kag.builder.model.sub_graph import SubGraph -from knext.common.base.runnable import Input, Output -from kag.common.vectorizer import Vectorizer -from kag.interface.builder.vectorizer_abc import VectorizerABC +from kag.common.conf import KAG_PROJECT_CONF + +from kag.common.utils import get_vector_field_name +from kag.interface import VectorizerABC, VectorizeModelABC from knext.schema.client import SchemaClient -from knext.project.client import ProjectClient from knext.schema.model.base import IndexTypeEnum +from knext.common.base.runnable import Input, Output class EmbeddingVectorPlaceholder(object): @@ -43,22 +44,15 @@ class EmbeddingVectorManager(object): def __init__(self): self._placeholders = [] - def _create_vector_field_name(self, property_key): - from kag.common.utils import to_snake_case - - name = f"{property_key}_vector" - name = to_snake_case(name) - return "_" + name - def get_placeholder(self, properties, vector_field): for property_key, property_value in properties.items(): - field_name = self._create_vector_field_name(property_key) + field_name = get_vector_field_name(property_key) if field_name != vector_field: continue if not property_value: return None if not isinstance(property_value, str): - message = f"property {property_key!r} must be string to generate embedding vector" + message = f"property {property_key!r} must be string to generate embedding vector, got {property_value} with type {type(property_value)}" raise RuntimeError(message) num = len(self._placeholders) placeholder = EmbeddingVectorPlaceholder( @@ -78,11 +72,10 @@ def _get_text_batch(self): return text_batch def _generate_vectors(self, vectorizer, text_batch, batch_size=32): - if isinstance(text_batch, str): - text_batch = [text_batch] texts = list(text_batch) if not texts: return [] + if len(texts) % batch_size == 0: n_batchs = len(texts) // batch_size else: @@ -99,9 +92,9 @@ def _fill_vectors(self, vectors, text_batch): for placeholder in placeholders: placeholder._embedding_vector = vector - def batch_generate(self, vectorizer): + def batch_generate(self, vectorizer, batch_size=32): text_batch = self._get_text_batch() - vectors = self._generate_vectors(vectorizer, text_batch) + vectors = self._generate_vectors(vectorizer, text_batch, batch_size) self._fill_vectors(vectors, text_batch) def patch(self): @@ -115,7 +108,7 @@ def __init__(self, vectorizer, vector_index_meta=None, extra_labels=("Entity",)) self._extra_labels = extra_labels self._vector_index_meta = vector_index_meta or {} - def batch_generate(self, node_batch): + def batch_generate(self, node_batch, batch_size=32): manager = EmbeddingVectorManager() vector_index_meta = self._vector_index_meta for node_item in node_batch: @@ -132,41 +125,49 @@ def batch_generate(self, node_batch): placeholder = manager.get_placeholder(properties, vector_field) if placeholder is not None: properties[vector_field] = placeholder - manager.batch_generate(self._vectorizer) + manager.batch_generate(self._vectorizer, batch_size) manager.patch() +@VectorizerABC.register("batch") +@VectorizerABC.register("batch_vectorizer") class BatchVectorizer(VectorizerABC): - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.project_id = self.project_id or os.getenv("KAG_PROJECT_ID") - self._init_graph_store() - self.vec_meta = self._init_vec_meta() - self.vectorizer = Vectorizer.from_config(self.vectorizer_config) + """ + A class for 
generating embedding vectors for node attributes in a SubGraph in batches. - def _init_graph_store(self): - """ - Initializes the Graph Store client. + This class inherits from VectorizerABC and provides the functionality to generate embedding vectors + for node attributes in a SubGraph in batches. It uses a specified vectorization model and processes + the nodes of a specified batch size. - This method retrieves the graph store configuration from environment variables and the project ID. - It then fetches the project configuration using the project ID and updates the graph store configuration - with any additional settings from the project. Finally, it creates and initializes the graph store client - using the updated configuration. + Attributes: + project_id (int): The ID of the project associated with the SubGraph. + vec_meta (defaultdict): Metadata for vector fields in the SubGraph. + vectorize_model (VectorizeModelABC): The model used for generating embedding vectors. + batch_size (int): The size of the batches in which to process the nodes. + """ - Args: - project_id (str): The id of project. + def __init__(self, vectorize_model: VectorizeModelABC, batch_size: int = 32): + """ + Initializes the BatchVectorizer with the specified vectorization model and batch size. - Returns: - GraphStore + Args: + vectorize_model (VectorizeModelABC): The model used for generating embedding vectors. + batch_size (int): The size of the batches in which to process the nodes. Defaults to 32. """ - graph_store_config = eval(os.getenv("KAG_GRAPH_STORE", "{}")) - vectorizer_config = eval(os.getenv("KAG_VECTORIZER", "{}")) - config = ProjectClient().get_config(self.project_id) - graph_store_config.update(config.get("graph_store", {})) - vectorizer_config.update(config.get("vectorizer", {})) - self.vectorizer_config = vectorizer_config + super().__init__() + self.project_id = KAG_PROJECT_CONF.project_id + # self._init_graph_store() + self.vec_meta = self._init_vec_meta() + self.vectorize_model = vectorize_model + self.batch_size = batch_size def _init_vec_meta(self): + """ + Initializes the vector metadata for the SubGraph. + + Returns: + defaultdict: Metadata for vector fields in the SubGraph. + """ vec_meta = defaultdict(list) schema_client = SchemaClient(project_id=self.project_id) spg_types = schema_client.load() @@ -176,32 +177,31 @@ def _init_vec_meta(self): IndexTypeEnum.Vector, IndexTypeEnum.TextAndVector, ]: - vec_meta[type_name].append( - self._create_vector_field_name(prop_name) - ) + vec_meta[type_name].append(get_vector_field_name(prop_name)) return vec_meta - def _create_vector_field_name(self, property_key): - from kag.common.utils import to_snake_case + @retry(stop=stop_after_attempt(3)) + def _generate_embedding_vectors(self, input_subgraph: SubGraph) -> SubGraph: + """ + Generates embedding vectors for the nodes in the input SubGraph. - name = f"{property_key}_vector" - name = to_snake_case(name) - return "_" + name + Args: + input_subgraph (SubGraph): The SubGraph for which to generate embedding vectors. - def _generate_embedding_vectors( - self, vectorizer: Vectorizer, input: SubGraph - ) -> SubGraph: + Returns: + SubGraph: The modified SubGraph with generated embedding vectors. 
+ """ node_list = [] node_batch = [] - for node in input.nodes: + for node in input_subgraph.nodes: if not node.id or not node.name: continue properties = {"id": node.id, "name": node.name} properties.update(node.properties) node_list.append((node, properties)) node_batch.append((node.label, properties.copy())) - generator = EmbeddingVectorGenerator(vectorizer, self.vec_meta) - generator.batch_generate(node_batch) + generator = EmbeddingVectorGenerator(self.vectorize_model, self.vec_meta) + generator.batch_generate(node_batch, self.batch_size) for (node, properties), (_node_label, new_properties) in zip( node_list, node_batch ): @@ -209,8 +209,18 @@ def _generate_embedding_vectors( if key in new_properties and new_properties[key] == value: del new_properties[key] node.properties.update(new_properties) - return input + return input_subgraph - def invoke(self, input: Input, **kwargs) -> List[Output]: - modified_input = self._generate_embedding_vectors(self.vectorizer, input) + def _invoke(self, input_subgraph: Input, **kwargs) -> List[Output]: + """ + Invokes the generation of embedding vectors for the input SubGraph. + + Args: + input_subgraph (Input): The SubGraph for which to generate embedding vectors. + **kwargs: Additional keyword arguments, currently unused but kept for potential future expansion. + + Returns: + List[Output]: A list containing the modified SubGraph with generated embedding vectors. + """ + modified_input = self._generate_embedding_vectors(input_subgraph) return [modified_input] diff --git a/kag/builder/component/writer/__init__.py b/kag/builder/component/writer/__init__.py index 3d92f23e..e69de29b 100644 --- a/kag/builder/component/writer/__init__.py +++ b/kag/builder/component/writer/__init__.py @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -from kag.builder.component.writer.kg_writer import KGWriter - -__all__ = [ - "KGWriter", -] diff --git a/kag/builder/component/writer/kg_writer.py b/kag/builder/component/writer/kg_writer.py index 155bf1bf..8b687b0d 100644 --- a/kag/builder/component/writer/kg_writer.py +++ b/kag/builder/component/writer/kg_writer.py @@ -9,14 +9,15 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
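The _generate_vectors helper in the batch vectorizer hunk above derives the batch count by ceiling division and embeds batch_size texts per call (with the whole pass wrapped in a tenacity retry at the _generate_embedding_vectors level). A tiny self-check of that slicing arithmetic, written against the same convention:

# Self-check of the ceil-division batching used by _generate_vectors above:
# batch_size texts per call, plus a shorter final batch when needed.
def batch_slices(n_texts: int, batch_size: int = 32):
    if n_texts % batch_size == 0:
        n_batches = n_texts // batch_size
    else:
        n_batches = n_texts // batch_size + 1
    return [(i * batch_size, min((i + 1) * batch_size, n_texts)) for i in range(n_batches)]

assert batch_slices(64) == [(0, 32), (32, 64)]
assert batch_slices(65) == [(0, 32), (32, 64), (64, 65)]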
+import json import logging -import os from enum import Enum from typing import Type, Dict, List -from knext.graph_algo.client import GraphAlgoClient +from knext.graph.client import GraphClient from kag.builder.model.sub_graph import SubGraph -from kag.interface.builder.writer_abc import SinkWriterABC +from kag.interface import SinkWriterABC +from kag.common.conf import KAG_PROJECT_CONF from knext.common.base.runnable import Input, Output logger = logging.getLogger(__name__) @@ -27,19 +28,30 @@ class AlterOperationEnum(str, Enum): Delete = "DELETE" +@SinkWriterABC.register("kg", as_default=True) +@SinkWriterABC.register("kg_writer", as_default=True) class KGWriter(SinkWriterABC): """ - A class that extends `SinkWriter` to handle writing data into a Neo4j knowledge graph. + A class for writing SubGraphs to a Knowledge Graph (KG) storage. - This class is responsible for configuring the graph store based on environment variables and - an optional project ID, initializing the Neo4j client, and setting up the schema. - It also manages semantic indexing and multi-threaded operations. + This class inherits from SinkWriterABC and provides the functionality to write SubGraphs + to a Knowledge Graph storage system. It supports operations like upsert and delete. """ - def __init__(self, project_id: str = None, **kwargs): + def __init__(self, project_id: int = None, **kwargs): + """ + Initializes the KGWriter with the specified project ID. + + Args: + project_id (int): The ID of the project associated with the KG. Defaults to None. + **kwargs: Additional keyword arguments passed to the superclass. + """ super().__init__(**kwargs) - self.project_id = project_id or os.getenv("KAG_PROJECT_ID") - self.client = GraphAlgoClient(project_id=project_id) + if project_id is None: + self.project_id = KAG_PROJECT_CONF.project_id + else: + self.project_id = project_id + self.client = GraphClient(project_id=self.project_id) @property def input_types(self) -> Type[Input]: @@ -49,25 +61,84 @@ def input_types(self) -> Type[Input]: def output_types(self) -> Type[Output]: return None + def format_label(self, label: str): + """ + Formats the label by adding the project namespace if it is not already present. + + Args: + label (str): The label to be formatted. + + Returns: + str: The formatted label. + """ + namespace = KAG_PROJECT_CONF.namespace + if label.split(".")[0] == namespace: + return label + return f"{namespace}.{label}" + + def standarlize_graph(self, graph): + for node in graph.nodes: + node.label = self.format_label(node.label) + for edge in graph.edges: + edge.from_type = self.format_label(edge.from_type) + edge.to_type = self.format_label(edge.to_type) + + for node in graph.nodes: + for k, v in node.properties.items(): + if k.startswith("_"): + continue + if not isinstance(v, str): + node.properties[k] = json.dumps(v, ensure_ascii=False) + for edge in graph.edges: + for k, v in edge.properties.items(): + if k.startswith("_"): + continue + if not isinstance(v, str): + edge.properties[k] = json.dumps(v, ensure_ascii=False) + + return graph + def invoke( - self, input: Input, alter_operation: str = AlterOperationEnum.Upsert, lead_to_builder: bool = False + self, + input: Input, + alter_operation: str = AlterOperationEnum.Upsert, + lead_to_builder: bool = False, + **kwargs, ) -> List[Output]: """ Invokes the specified operation (upsert or delete) on the graph store. Args: input (Input): The input object representing the subgraph to operate on.
- alter_operation (str): The type of operation to perform (Upsert or Delete). - lead_to_builder (str): enable lead to event infer builder + alter_operation (str): The type of operation to perform (Upsert or Delete). Defaults to Upsert. + lead_to_builder (bool): Enable lead to event infer builder. Defaults to False. Returns: List[Output]: A list of output objects (currently always [None]). """ - self.client.write_graph(sub_graph=input.to_dict(), operation=alter_operation, lead_to_builder=lead_to_builder) - return [None] + + input = self.standarlize_graph(input) + logger.debug(f"final graph to write: {input}") + self.client.write_graph( + sub_graph=input.to_dict(), + operation=alter_operation, + lead_to_builder=lead_to_builder, + ) + return [input] def _handle(self, input: Dict, alter_operation: str, **kwargs): - """The calling interface provided for SPGServer.""" + """ + The calling interface provided for SPGServer. + + Args: + input (Dict): The input dictionary representing the subgraph to operate on. + alter_operation (str): The type of operation to perform (Upsert or Delete). + **kwargs: Additional keyword arguments. + + Returns: + None: This method currently returns None. + """ _input = self.input_types.from_dict(input) - _output = self.invoke(_input, alter_operation) + _output = self.invoke(_input, alter_operation) # noqa + return None diff --git a/kag/builder/default_chain.py b/kag/builder/default_chain.py index ab04aff9..1f7ea2aa 100644 --- a/kag/builder/default_chain.py +++ b/kag/builder/default_chain.py @@ -9,149 +9,182 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. - import logging -import importlib -import os - -from kag.builder.component import SPGTypeMapping, KGWriter -from kag.builder.component.extractor import KAGExtractor -from kag.builder.component.splitter import LengthSplitter -from kag.builder.component.vectorizer.batch_vectorizer import BatchVectorizer -from knext.common.base.chain import Chain -from knext.builder.builder_chain_abc import BuilderChainABC +from concurrent.futures import ThreadPoolExecutor, as_completed +from kag.interface import ( + ReaderABC, + MappingABC, + ExtractorABC, + SplitterABC, + VectorizerABC, + PostProcessorABC, + SinkWriterABC, + KAGBuilderChain, +) +from kag.common.utils import generate_hash_id logger = logging.getLogger(__name__) -def get_reader(file_path: str): - file = os.path.basename(file_path) - suffix = file.split(".")[-1] - assert suffix.lower() in READER_MAPPING, f"{suffix} is not supported. 
Supported suffixes are: {list(READER_MAPPING.keys())}" - reader_path = READER_MAPPING.get(suffix.lower()) - mod_path, class_name = reader_path.rsplit('.', 1) - module = importlib.import_module(mod_path) - reader_class = getattr(module, class_name) - - return reader_class - - -READER_MAPPING = { - "csv": "kag.builder.component.reader.csv_reader.CSVReader", - "json": "kag.builder.component.reader.json_reader.JSONReader", - "txt": "kag.builder.component.reader.txt_reader.TXTReader", - "pdf": "kag.builder.component.reader.pdf_reader.PDFReader", - "docx": "kag.builder.component.reader.docx_reader.DocxReader", - "md": "kag.builder.component.reader.markdown_reader.MarkdownReader", -} - - -class DefaultStructuredBuilderChain(BuilderChainABC): +@KAGBuilderChain.register("structured") +@KAGBuilderChain.register("structured_builder_chain") +class DefaultStructuredBuilderChain(KAGBuilderChain): """ - A class representing a default SPG builder chain, used to import structured data based on schema definitions - - Steps: - 0. Initializing by a give SpgType name, which indicates the target of import. - 1. SourceReader: Reading structured dicts from a given file. - 2. SPGTypeMapping: Mapping source fields to the properties of target type, and assemble a sub graph. - By default, the same name mapping is used, which means importing the source field into a property with the same name. - 3. KGWriter: Writing sub graph into KG storage. - - Attributes: - spg_type_name (str): The name of the SPG type. + A class representing a default SPG builder chain, used to import structured data based on schema definitions. + It consists of a mapping component, a writer component, and an optional vectorizer component. """ - def __init__(self, spg_type_name: str, **kwargs): - super().__init__(**kwargs) - self.spg_type_name = spg_type_name - - def build(self, **kwargs): + def __init__( + self, + mapping: MappingABC, + writer: SinkWriterABC, + vectorizer: VectorizerABC = None, + ): """ - Builds the processing chain for the SPG. + Initializes the DefaultStructuredBuilderChain instance. Args: - **kwargs: Additional keyword arguments. - - Returns: - chain: The constructed processing chain. + mapping (MappingABC): The mapping component to be used. + writer (SinkWriterABC): The writer component to be used. + vectorizer (VectorizerABC, optional): The vectorizer component to be used. Defaults to None. """ - file_path = kwargs.get("file_path") - source = get_reader(file_path)(output_type="Dict") - mapping = SPGTypeMapping(spg_type_name=self.spg_type_name) - sink = KGWriter() - - chain = source >> mapping >> sink - return chain + self.mapping = mapping + self.writer = writer + self.vectorizer = vectorizer - def invoke(self, file_path, max_workers=10, **kwargs): - logger.info(f"begin processing file_path:{file_path}") + def build(self, **kwargs): """ - Invokes the processing chain with the given file path and optional parameters. + Construct the builder chain by connecting the mapping, vectorizer (if available), and writer components. Args: - file_path (str): The path to the input file. - max_workers (int, optional): The maximum number of workers. Defaults to 10. **kwargs: Additional keyword arguments. Returns: - The result of invoking the processing chain. + KAGBuilderChain: The constructed builder chain. 
""" - return super().invoke(file_path=file_path, max_workers=max_workers, **kwargs) + if self.vectorizer: + chain = self.mapping >> self.vectorizer >> self.writer + else: + chain = self.mapping >> self.writer + return chain -class DefaultUnstructuredBuilderChain(BuilderChainABC): - """ - A class representing a default KAG builder chain, used to extract graph from documents and import unstructured data. + # def get_component_with_ckpts(self): + # return [ + # self.mapping, + # self.vectorizer, + # self.writer, + # ] - Steps: - 0. Initializing. - 1. SourceReader: Reading chunks from a given file. - 2. LengthSplitter: Splitting chunk to smaller chunks. The chunk size can be adjusted through parameters. - 3. KAGExtractor: Extracting entities and relations from chunks, and assembling a sub graph. - By default,the extraction process includes NER and SPO Extraction. - 4. KGWriter: Writing sub graph into KG storage. + # def close_checkpointers(self): + # for node in self.get_component_with_ckpts(): + # if node and hasattr(node, "checkpointer"): + # node.checkpointer.close() - """ - def __init__(self, **kwargs): - super().__init__(**kwargs) +@KAGBuilderChain.register("unstructured") +@KAGBuilderChain.register("unstructured_builder_chain") +class DefaultUnstructuredBuilderChain(KAGBuilderChain): + """ + A class representing a default unstructured builder chain, used to build a knowledge graph from unstructured text data such as txt and pdf files. + It consists of a reader, splitter, extractor, vectorizer, optional post-processor, and writer components. + """ - def build(self, **kwargs) -> Chain: + def __init__( + self, + reader: ReaderABC, + splitter: SplitterABC, + extractor: ExtractorABC = None, + vectorizer: VectorizerABC = None, + writer: SinkWriterABC = None, + post_processor: PostProcessorABC = None, + ): """ - Builds the processing chain for the KAG. + Initializes the DefaultUnstructuredBuilderChain instance. Args: - **kwargs: Additional keyword arguments. - - Returns: - chain: The constructed processing chain. + reader (ReaderABC): The reader component to be used. + splitter (SplitterABC): The splitter component to be used. + extractor (ExtractorABC): The extractor component to be used. + vectorizer (VectorizerABC): The vectorizer component to be used. + writer (SinkWriterABC): The writer component to be used. + post_processor (PostProcessorABC, optional): The post-processor component to be used. Defaults to None. """ - file_path = kwargs.get("file_path") - split_length = kwargs.get("split_length") - window_length = kwargs.get("window_length") - source = get_reader(file_path)() - splitter = LengthSplitter(split_length, window_length) - extractor = KAGExtractor() - vectorizer = BatchVectorizer() - sink = KGWriter() - - chain = source >> splitter >> extractor >> vectorizer >> sink - return chain + self.reader = reader + self.splitter = splitter + self.extractor = extractor + self.vectorizer = vectorizer + self.post_processor = post_processor + self.writer = writer - def invoke(self, file_path: str, split_length: int = 500, window_length: int = 100, max_workers=10, **kwargs): - logger.info(f"begin processing file_path:{file_path}") + def build(self, **kwargs): + pass + + def invoke(self, input_data, max_workers=10, **kwargs): """ - Invokes the processing chain with the given file path and optional parameters. + Invokes the builder chain to process the input file. Args: - file_path (str): The path to the input file. - split_length (int, optional): The length at which the file should be split. 
Defaults to 500. - window_length (int, optional): The length of the processing window. Defaults to 100. - max_workers (int, optional): The maximum number of worker threads. Defaults to 10. - + input_data: The input data (e.g., a file path) to be processed. + max_workers (int, optional): The maximum number of threads to use. Defaults to 10. **kwargs: Additional keyword arguments. Returns: - The result of invoking the processing chain. + List: The final output from the builder chain. """ - return super().invoke(file_path=file_path, max_workers=max_workers, split_length=window_length, window_length=window_length, **kwargs) + + def execute_node(node, node_input, **kwargs): + if not isinstance(node_input, list): + node_input = [node_input] + node_output = [] + for item in node_input: + node_output.extend(node.invoke(item, **kwargs)) + return node_output + + def run_extract(chunk): + flow_data = [chunk] + input_key = chunk.hash_key + for node in [ + self.extractor, + self.vectorizer, + self.post_processor, + self.writer, + ]: + if node is None: + continue + flow_data = execute_node(node, flow_data, key=input_key) + return {input_key: flow_data[0]} + + reader_output = self.reader.invoke(input_data, key=generate_hash_id(input_data)) + splitter_output = [] + + for chunk in reader_output: + splitter_output.extend(self.splitter.invoke(chunk, key=chunk.hash_key)) + + processed_chunk_keys = kwargs.get("processed_chunk_keys", set()) + filtered_chunks = [] + processed = 0 + for chunk in splitter_output: + if chunk.hash_key not in processed_chunk_keys: + filtered_chunks.append(chunk) + else: + processed += 1 + logger.debug( + f"Total chunks: {len(splitter_output)}. Checkpointed: {processed}, Pending: {len(filtered_chunks)}." + ) + result = [] + with ThreadPoolExecutor(max_workers) as executor: + futures = [executor.submit(run_extract, chunk) for chunk in filtered_chunks] + + from tqdm import tqdm + + for inner_future in tqdm( + as_completed(futures), + total=len(futures), + desc="KAG Extraction From Chunk", + position=1, + leave=False, + ): + ret = inner_future.result() + result.append(ret) + return result diff --git a/kag/builder/model/chunk.py b/kag/builder/model/chunk.py index a5db11c3..526fffcd 100644 --- a/kag/builder/model/chunk.py +++ b/kag/builder/model/chunk.py @@ -9,9 +9,10 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -import hashlib from enum import Enum from typing import Dict, Any +from kag.common.utils import generate_hash_id +import json class ChunkTypeEnum(str, Enum): @@ -26,29 +27,27 @@ def __init__( name: str, content: str, type: ChunkTypeEnum = ChunkTypeEnum.Text, - **kwargs + **kwargs, ): self.id = id self.name = name self.type = type self.content = content self.kwargs = kwargs + for key, value in kwargs.items(): + setattr(self, key, value) - @staticmethod - def generate_hash_id(value): - if isinstance(value, str): - value = value.encode("utf-8") - hasher = hashlib.sha256() - hasher.update(value) - return hasher.hexdigest() + @property + def hash_key(self): + return generate_hash_id(f"{self.id}{self.name}{self.content}") def __str__(self): tmp = { "id": self.id, "name": self.name, - "content": self.content - if len(self.content) <= 64 - else self.content[:64] + " ...", + "content": ( + self.content if len(self.content) <= 64 else self.content[:64] + " ..."
+ ), } return f": {tmp}" @@ -59,7 +58,9 @@ def to_dict(self): "id": self.id, "name": self.name, "content": self.content, - "type": self.type.value if isinstance(self.type, ChunkTypeEnum) else self.type, + "type": ( + self.type.value if isinstance(self.type, ChunkTypeEnum) else self.type + ), "properties": self.kwargs, } @@ -72,3 +73,10 @@ def from_dict(cls, input_: Dict[str, Any]): type=input_.get("type"), **input_.get("properties", {}), ) + + +def dump_chunks(chunks, **kwargs): + if kwargs.get("output_path"): + with open(kwargs.get("output_path"), "w") as f: + for chunk in chunks: + f.write(json.dumps(chunk.to_dict(), ensure_ascii=False) + "\n") diff --git a/kag/builder/model/spg_record.py b/kag/builder/model/spg_record.py index 5c5b6825..f737dc00 100644 --- a/kag/builder/model/spg_record.py +++ b/kag/builder/model/spg_record.py @@ -23,145 +23,165 @@ class SPGRecord: """Data structure in operator, used to store entity information.""" def __init__(self, spg_type_name: SPGTypeName): + """ + Initializes a new instance of the SPGRecord class. + + Args: + spg_type_name (SPGTypeName): The type name of the SPG entity. + """ self._spg_type_name = spg_type_name self._properties = {} self._relations = {} @property def id(self) -> str: + """ + Gets the ID of the SPGRecord. + + Returns: + str: The ID of the SPGRecord. + """ return self.get_property("id", "") @property def name(self) -> str: + """ + Gets the name of the SPGRecord. + + Returns: + str: The name of the SPGRecord. + """ return self.get_property("name", self.id) @property def spg_type_name(self) -> SPGTypeName: - """Gets the spg_type_name of this SPGRecord. # noqa: E501 - + """ + Gets the SPG type name of this SPGRecord. - :return: The spg_type_name of this SPGRecord. # noqa: E501 - :rtype: str + Returns: + SPGTypeName: The SPG type name of this SPGRecord. """ return self._spg_type_name @spg_type_name.setter def spg_type_name(self, spg_type_name: SPGTypeName): - """Sets the spg_type_name of this SPGRecord. - + """ + Sets the SPG type name of this SPGRecord. - :param spg_type_name: The spg_type_name of this SPGRecord. # noqa: E501 - :type: str + Args: + spg_type_name (SPGTypeName): The SPG type name of this SPGRecord. """ self._spg_type_name = spg_type_name @property def properties(self) -> Dict[PropertyName, str]: - """Gets the properties of this SPGRecord. # noqa: E501 - + """ + Gets the properties of this SPGRecord. - :return: The properties of this SPGRecord. # noqa: E501 - :rtype: dict + Returns: + Dict[PropertyName, str]: The properties of this SPGRecord. """ return self._properties @properties.setter def properties(self, properties: Dict[PropertyName, str]): - """Sets the properties of this SPGRecord. - + """ + Sets the properties of this SPGRecord. - :param properties: The properties of this SPGRecord. # noqa: E501 - :type: dict + Args: + properties (Dict[PropertyName, str]): The properties of this SPGRecord. """ self._properties = properties @property def relations(self) -> Dict[str, str]: - """Gets the relations of this SPGRecord. # noqa: E501 - + """ + Gets the relations of this SPGRecord. - :return: The relations of this SPGRecord. # noqa: E501 - :rtype: dict + Returns: + Dict[str, str]: The relations of this SPGRecord. """ return self._relations @relations.setter def relations(self, relations: Dict[str, str]): - """Sets the properties of this SPGRecord. - + """ + Sets the relations of this SPGRecord. - :param relations: The relations of this SPGRecord. 
# noqa: E501 - :type: dict + Args: + relations (Dict[str, str]): The relations of this SPGRecord. """ self._relations = relations def get_property( self, property_name: PropertyName, default_value: str = None ) -> str: - """Gets a property of this SPGRecord by name. # noqa: E501 + """ + Gets a property of this SPGRecord by name. + Args: + property_name (PropertyName): The property name. + default_value (str, optional): If the property value is None, the default_value will be returned. Defaults to None. - :param property_name: The property name. # noqa: E501 - :param default_value: If property value is None, the default_value will be return. # noqa: E501 - :return: A property value. # noqa: E501 - :rtype: str + Returns: + str: The property value. """ return self.properties.get(property_name, default_value) def upsert_property(self, property_name: PropertyName, value: str): - """Upsert a property of this SPGRecord. # noqa: E501 - + """ + Upserts a property of this SPGRecord. - :param property_name: The updated property name. # noqa: E501 - :param value: The updated property value. # noqa: E501 - :type: str + Args: + property_name (PropertyName): The updated property name. + value (str): The updated property value. """ self.properties[property_name] = value return self def append_property(self, property_name: PropertyName, value: str): - """Append a property of this SPGRecord. # noqa: E501 - + """ + Appends a property of this SPGRecord. - :param property_name: The updated property name. # noqa: E501 - :param value: The updated property value. # noqa: E501 - :type: str + Args: + property_name (PropertyName): The updated property name. + value (str): The updated property value. """ property_value = self.get_property(property_name) if property_value: - property_value_list = property_value.split(',') + property_value_list = property_value.split(",") if value not in property_value_list: - self.properties[property_name] = property_value + ',' + value + self.properties[property_name] = property_value + "," + value else: self.properties[property_name] = value return self def upsert_properties(self, properties: Dict[PropertyName, str]): - """Upsert properties of this SPGRecord. # noqa: E501 - + """ + Upserts properties of this SPGRecord. - :param properties: The updated properties. # noqa: E501 - :type: dict + Args: + properties (Dict[PropertyName, str]): The updated properties. """ self.properties.update(properties) return self def remove_property(self, property_name: PropertyName): - """Removes a property of this SPGRecord. # noqa: E501 - + """ + Removes a property of this SPGRecord. - :param property_name: The property name. # noqa: E501 - :type: str + Args: + property_name (PropertyName): The property name. """ self.properties.pop(property_name) return self def remove_properties(self, property_names: List[PropertyName]): - """Removes properties by given names. # noqa: E501 - + """ + Removes properties by given names. - :param property_names: A list of property names. # noqa: E501 - :type: list + Args: + property_names (List[PropertyName]): A list of property names. """ for property_name in property_names: self.properties.pop(property_name) @@ -173,37 +193,39 @@ def get_relation( object_type_name: SPGTypeName, default_value: str = None, ) -> str: - """Gets a relation of this SPGRecord by name. # noqa: E501 + """ + Gets a relation of this SPGRecord by name. + Args: + relation_name (RelationName): The relation name. + object_type_name (SPGTypeName): The object SPG type name. 
+ default_value (str, optional): If the relation value is None, the default_value will be returned. Defaults to None. - :param relation_name: The relation name. # noqa: E501 - :param object_type_name: The object SPG type name. # noqa: E501 - :param default_value: If property value is None, the default_value will be return. # noqa: E501 - :return: A relation value. # noqa: E501 - :rtype: str + Returns: + str: The relation value. """ return self.relations.get(relation_name + "#" + object_type_name, default_value) def upsert_relation( self, relation_name: RelationName, object_type_name: SPGTypeName, value: str ): - """Upsert a relation of this SPGRecord. # noqa: E501 - + """ + Upserts a relation of this SPGRecord. - :param relation_name: The updated relation name. # noqa: E501 - :param object_type_name: The object SPG type name. # noqa: E501 - :param value: The updated relation value. # noqa: E501 - :type: str + Args: + relation_name (RelationName): The updated relation name. + object_type_name (SPGTypeName): The object SPG type name. + value (str): The updated relation value. """ self.relations[relation_name + "#" + object_type_name] = value return self def upsert_relations(self, relations: Dict[Tuple[RelationName, SPGTypeName], str]): - """Upsert relations of this SPGRecord. # noqa: E501 - + """ + Upserts relations of this SPGRecord. - :param relations: The updated relations. # noqa: E501 - :type: dict + Args: + relations (Dict[Tuple[RelationName, SPGTypeName], str]): The updated relations. """ for (relation_name, object_type_name), value in relations.items(): self.relations[relation_name + "#" + object_type_name] = value @@ -212,33 +234,43 @@ def upsert_relations(self, relations: Dict[Tuple[RelationName, SPGTypeName], str def remove_relation( self, relation_name: RelationName, object_type_name: SPGTypeName ): - """Removes a relation of this SPGRecord. # noqa: E501 - + """ + Removes a relation of this SPGRecord. - :param relation_name: The relation name. # noqa: E501 - :param object_type_name: The object SPG type name. # noqa: E501 - :type: str + Args: + relation_name (RelationName): The relation name. + object_type_name (SPGTypeName): The object SPG type name. """ self.relations.pop(relation_name + "#" + object_type_name) return self def remove_relations(self, relation_names: List[Tuple[RelationName, SPGTypeName]]): - """Removes relations by given names. # noqa: E501 - + """ + Removes relations by given names. - :param relation_names: A list of relation names. # noqa: E501 - :type: list + Args: + relation_names (List[Tuple[RelationName, SPGTypeName]]): A list of relation names. """ - for (relation_name, object_type_name) in relation_names: + for relation_name, object_type_name in relation_names: self.relations.pop(relation_name + "#" + object_type_name) return self def to_str(self): - """Returns the string representation of the model""" + """ + Returns the string representation of the model. + + Returns: + str: The string representation of the model. + """ return pprint.pformat(self.__dict__()) def to_dict(self): - """Returns the model properties as a dict""" + """ + Returns the model properties as a dict. + + Returns: + dict: The model properties as a dict. + """ return { "spgTypeName": self.spg_type_name, @@ -249,7 +281,12 @@ def to_dict(self): } def __dict__(self): - """Returns this SPGRecord as a dict""" + """ + Returns this SPGRecord as a dict. + + Returns: + dict: This SPGRecord as a dict. 
+ """ return { "spgTypeName": self.spg_type_name, "properties": self.properties, @@ -258,7 +295,15 @@ def __dict__(self): @classmethod def from_dict(cls, input: Dict[str, Any]): - """Returns the model from a dict""" + """ + Returns the model from a dict. + + Args: + input (Dict[str, Any]): The input dictionary. + + Returns: + SPGRecord: The model from the input dictionary. + """ spg_type_name = input.get("spgTypeName") _cls = cls(spg_type_name) properties = input.get("properties") @@ -272,5 +317,10 @@ def from_dict(cls, input: Dict[str, Any]): return _cls def __repr__(self): - """For `print` and `pprint`""" + """ + For `print` and `pprint`. + + Returns: + str: The string representation of the model. + """ return pprint.pformat(self.__dict__()) diff --git a/kag/builder/model/sub_graph.py b/kag/builder/model/sub_graph.py index ff4ebb7f..ccc4c5c6 100644 --- a/kag/builder/model/sub_graph.py +++ b/kag/builder/model/sub_graph.py @@ -10,10 +10,11 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. import pprint +import copy from typing import Dict, List, Any -from knext.schema.client import BASIC_TYPES from kag.builder.model.spg_record import SPGRecord +from knext.schema.client import BASIC_TYPES from knext.schema.model.base import BaseSpgType @@ -41,14 +42,14 @@ def from_spg_record(cls, idx, spg_record: SPGRecord): @staticmethod def unique_key(spg_record): - return spg_record.spg_type_name + '_' + spg_record.get_property("name", "") + return spg_record.spg_type_name + "_" + spg_record.get_property("name", "") def to_dict(self): return { "id": self.id, "name": self.name, "label": self.label, - "properties": self.properties, + "properties": copy.deepcopy(self.properties), } @classmethod @@ -57,11 +58,15 @@ def from_dict(cls, input: Dict): _id=input["id"], name=input["name"], label=input["label"], - properties=input["properties"], + properties=input.get("properties", {}), ) def __eq__(self, other): - return self.name == other.name and self.label == other.label and self.properties == other.properties + return ( + self.name == other.name + and self.label == other.label + and self.properties == other.properties + ) class Edge(object): @@ -74,7 +79,12 @@ class Edge(object): properties: Dict[str, str] def __init__( - self, _id: str, from_node: Node, to_node: Node, label: str, properties: Dict[str, str] + self, + _id: str, + from_node: Node, + to_node: Node, + label: str, + properties: Dict[str, str], ): self.from_id = from_node.id self.from_type = from_node.label @@ -88,12 +98,19 @@ def __init__( @classmethod def from_spg_record( - cls, s_idx, subject_record: SPGRecord, o_idx, object_record: SPGRecord, label: str + cls, + s_idx, + subject_record: SPGRecord, + o_idx, + object_record: SPGRecord, + label: str, ): from_node = Node.from_spg_record(s_idx, subject_record) to_node = Node.from_spg_record(o_idx, object_record) - return cls(_id="", from_node=from_node, to_node=to_node, label=label, properties={}) + return cls( + _id="", from_node=from_node, to_node=to_node, label=label, properties={} + ) def to_dict(self): return { @@ -103,21 +120,35 @@ def to_dict(self): "fromType": self.from_type, "toType": self.to_type, "label": self.label, - "properties": self.properties, + "properties": copy.deepcopy(self.properties), } @classmethod def from_dict(cls, input: Dict): return cls( _id=input["id"], - from_node=Node(_id=input["from"], name=input["from"],label=input["fromType"], properties={}), - to_node=Node(_id=input["to"], name=input["to"], 
label=input["toType"], properties={}), + from_node=Node( + _id=input["from"], + name=input["from"], + label=input["fromType"], + properties={}, + ), + to_node=Node( + _id=input["to"], name=input["to"], label=input["toType"], properties={} + ), label=input["label"], - properties=input["properties"], + properties=input.get("properties", {}), ) def __eq__(self, other): - return self.from_id == other.from_id and self.to_id == other.to_id and self.label == other.label and self.properties == other.properties and self.from_type == other.from_type and self.to_type == other.to_type + return ( + self.from_id == other.from_id + and self.to_id == other.to_id + and self.label == other.label + and self.properties == other.properties + and self.from_type == other.from_type + and self.to_type == other.to_type + ) class SubGraph(object): @@ -135,12 +166,18 @@ def add_node(self, id: str, name: str, label: str, properties=None): self.nodes.append(Node(_id=id, name=name, label=label, properties=properties)) return self - def add_edge(self, s_id: str, s_label: str, p: str, o_id: str, o_label: str, properties=None): + def add_edge( + self, s_id: str, s_label: str, p: str, o_id: str, o_label: str, properties=None + ): if not properties: properties = dict() s_node = Node(_id=s_id, name=s_id, label=s_label, properties={}) o_node = Node(_id=o_id, name=o_id, label=o_label, properties={}) - self.edges.append(Edge(_id="", from_node=s_node, to_node=o_node, label=p, properties=properties)) + self.edges.append( + Edge( + _id="", from_node=s_node, to_node=o_node, label=p, properties=properties + ) + ) return self def to_dict(self): @@ -152,7 +189,7 @@ def to_dict(self): def __repr__(self): return pprint.pformat(self.to_dict()) - def merge(self, sub_graph: 'SubGraph'): + def merge(self, sub_graph: "SubGraph"): self.nodes.extend(sub_graph.nodes) self.edges.extend(sub_graph.edges) @@ -164,21 +201,30 @@ def from_spg_record( for record in spg_records: s_id = record.id s_name = record.name - s_label = record.spg_type_name.split('.')[-1] + s_label = record.spg_type_name.split(".")[-1] properties = record.properties spg_type = spg_types.get(record.spg_type_name) for prop_name, prop_value in record.properties.items(): if prop_name in spg_type.properties: from knext.schema.model.property import Property + prop: Property = spg_type.properties.get(prop_name) - o_label = prop.object_type_name.split('.')[-1] + o_label = prop.object_type_name.split(".")[-1] if o_label not in BASIC_TYPES: - prop_value_list = prop_value.split(',') + prop_value_list = prop_value.split(",") for o_id in prop_value_list: - sub_graph.add_edge(s_id=s_id, s_label=s_label, p=prop_name, o_id=o_id, o_label=o_label) + sub_graph.add_edge( + s_id=s_id, + s_label=s_label, + p=prop_name, + o_id=o_id, + o_label=o_label, + ) properties.pop(prop_name) - sub_graph.add_node(id=s_id, name=s_name, label=s_label, properties=properties) + sub_graph.add_node( + id=s_id, name=s_name, label=s_label, properties=properties + ) return sub_graph diff --git a/kag/builder/operator/__init__.py b/kag/builder/operator/__init__.py index 123acd8d..93aa6cd4 100644 --- a/kag/builder/operator/__init__.py +++ b/kag/builder/operator/__init__.py @@ -9,4 +9,3 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
- diff --git a/kag/builder/prompt/__init__.py b/kag/builder/prompt/__init__.py index e69de29b..ad9e9bd2 100644 --- a/kag/builder/prompt/__init__.py +++ b/kag/builder/prompt/__init__.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +from kag.builder.prompt.default.ner import OpenIENERPrompt as DefaultOpenIENERPrompt +from kag.builder.prompt.default.std import ( + OpenIEEntitystandardizationdPrompt as DefaultOpenIEEntitystandardizationdPrompt, +) +from kag.builder.prompt.default.triple import ( + OpenIETriplePrompt as DefaultOpenIETriplePrompt, +) + +from kag.builder.prompt.medical.ner import OpenIENERPrompt as MedicalOpenIENERPrompt +from kag.builder.prompt.medical.std import ( + OpenIEEntitystandardizationdPrompt as MedicalOpenIEEntitystandardizationdPrompt, +) +from kag.builder.prompt.medical.triple import ( + OpenIETriplePrompt as MedicalOpenIETriplePrompt, +) + +from kag.builder.prompt.analyze_table_prompt import AnalyzeTablePrompt +from kag.builder.prompt.spg_prompt import SPGPrompt, SPGEntityPrompt, SPGEventPrompt +from kag.builder.prompt.semantic_seg_prompt import SemanticSegPrompt +from kag.builder.prompt.outline_prompt import OutlinePrompt + + +__all__ = [ + "DefaultOpenIENERPrompt", + "DefaultOpenIEEntitystandardizationdPrompt", + "DefaultOpenIETriplePrompt", + "MedicalOpenIENERPrompt", + "MedicalOpenIEEntitystandardizationdPrompt", + "MedicalOpenIETriplePrompt", + "AnalyzeTablePrompt", + "OutlinePrompt", + "SemanticSegPrompt", + "SPGPrompt", + "SPGEntityPrompt", + "SPGEventPrompt", +] diff --git a/kag/builder/prompt/analyze_table_prompt.py b/kag/builder/prompt/analyze_table_prompt.py index 00b9ade0..cda19aa2 100644 --- a/kag/builder/prompt/analyze_table_prompt.py +++ b/kag/builder/prompt/analyze_table_prompt.py @@ -13,34 +13,24 @@ import json import logging -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC logger = logging.getLogger(__name__) - - -class AnalyzeTablePrompt(PromptOp): +@PromptABC.register("analyze_table") +class AnalyzeTablePrompt(PromptABC): template_zh: str = """你是一个分析表格的专家, 从table中提取信息并分析,最后返回表格有效信息""" template_en: str = """You are an expert in knowledge graph extraction. Based on the schema defined by the constraint, extract all entities and their attributes from the input. Return NAN for attributes not explicitly mentioned in the input. 
Output the results in standard JSON format, as a list.""" - def __init__( - self, - language: str = "zh", - ): - super().__init__( - language=language, - ) - def build_prompt(self, variables) -> str: return json.dumps( { "instruction": self.template, - "table": variables.get("table",""), + "table": variables.get("table", ""), }, ensure_ascii=False, ) def parse_response(self, response: str, **kwargs): return response - diff --git a/kag/builder/prompt/default/ner.py b/kag/builder/prompt/default/ner.py index 1cc92310..66709a6d 100644 --- a/kag/builder/prompt/default/ner.py +++ b/kag/builder/prompt/default/ner.py @@ -12,66 +12,66 @@ import json from string import Template -from typing import List, Optional - -from kag.common.base.prompt_op import PromptOp +from typing import List +from kag.common.conf import KAG_PROJECT_CONF +from kag.interface import PromptABC from knext.schema.client import SchemaClient -class OpenIENERPrompt(PromptOp): - +@PromptABC.register("default_ner") +class OpenIENERPrompt(PromptABC): template_en = """ { "instruction": "You're a very effective entity extraction system. Please extract all the entities that are important for knowledge build and question, along with type, category and a brief description of the entity. The description of the entity is based on your OWN KNOWLEDGE AND UNDERSTANDING and does not need to be limited to the context. the entity's category belongs taxonomically to one of the items defined by schema, please also output the category. Note: Type refers to a specific, well-defined classification, such as Professor, Actor, while category is a broader group or class that may contain more than one type, such as Person, Works. Return an empty list if the entity type does not exist. Please respond in the format of a JSON string.You can refer to the example for extraction.", "schema": $schema, "example": [ { - "input": "The Rezort\nThe Rezort is a 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger.\n It stars Dougray Scott, Jessica De Gouw and Martin McCann.\n After humanity wins a devastating war against zombies, the few remaining undead are kept on a secure island, where they are hunted for sport.\n When something goes wrong with the island's security, the guests must face the possibility of a new outbreak.", + "input": "The Rezort is a 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger. It stars Dougray Scott, Jessica De Gouw and Martin McCann. After humanity wins a devastating war against zombies, the few remaining undead are kept on a secure island, where they are hunted for sport. When something goes wrong with the island's security, the guests must face the possibility of a new outbreak.", "output": [ { - "entity": "The Rezort", + "name": "The Rezort", "type": "Movie", "category": "Works", "description": "A 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger." }, { - "entity": "2015", + "name": "2015", "type": "Year", "category": "Date", "description": "The year the movie 'The Rezort' was released." }, { - "entity": "British", + "name": "British", "type": "Nationality", "category": "GeographicLocation", "description": "Great Britain, the island that includes England, Scotland, and Wales." }, { - "entity": "Steve Barker", + "name": "Steve Barker", "type": "Director", "category": "Person", "description": "Steve Barker is an English film director and screenwriter." 
}, { - "entity": "Paul Gerstenberger", + "name": "Paul Gerstenberger", "type": "Writer", "category": "Person", "description": "Paul is a writer and producer, known for The Rezort (2015), Primeval (2007) and House of Anubis (2011)." }, { - "entity": "Dougray Scott", + "name": "Dougray Scott", "type": "Actor", "category": "Person", "description": "Stephen Dougray Scott (born 26 November 1965) is a Scottish actor." }, { - "entity": "Jessica De Gouw", + "name": "Jessica De Gouw", "type": "Actor", "category": "Person", "description": "Jessica Elise De Gouw (born 15 February 1988) is an Australian actress. " }, { - "entity": "Martin McCann", + "name": "Martin McCann", "type": "Actor", "category": "Person", "description": "Martin McCann is an actor from Northern Ireland. In 2020, he was listed as number 48 on The Irish Times list of Ireland's greatest film actors" @@ -89,52 +89,52 @@ class OpenIENERPrompt(PromptOp): "schema": $schema, "example": [ { - "input": "《Rezort》\n《Rezort》是一部 2015 年英国僵尸恐怖片,由史蒂夫·巴克执导,保罗·格斯滕伯格编剧。\n 该片由道格瑞·斯科特、杰西卡·德·古维和马丁·麦凯恩主演。\n 在人类赢得与僵尸的毁灭性战争后,剩下的少数不死生物被关在一个安全的岛屿上,在那里他们被猎杀作为消遣。\n 当岛上的安全出现问题时,客人们必须面对新一轮疫情爆发的可能性。", + "input": "《Rezort》是一部 2015年英国僵尸恐怖片,由史蒂夫·巴克执导,保罗·格斯滕伯格编剧。该片由道格瑞·斯科特、杰西卡·德·古维和马丁·麦凯恩主演。在人类赢得与僵尸的毁灭性战争后,剩下的少数不死生物被关在一个安全的岛屿上,在那里他们被猎杀作为消遣。当岛上的安全出现问题时,客人们必须面对新一轮疫情爆发的可能性。", "output": [ { - "entity": "The Rezort", + "name": "The Rezort", "type": "Movie", "category": "Works", "description": "一部 2015 年英国僵尸恐怖片,由史蒂夫·巴克执导,保罗·格斯滕伯格编剧。" }, { - "entity": "2015", + "name": "2015", "type": "Year", "category": "Date", "description": "电影《The Rezort》上映的年份。" }, { - "entity": "英国", + "name": "英国", "type": "Nationality", "category": "GeographicLocation", "description": "大不列颠,包括英格兰、苏格兰和威尔士的岛屿。" }, { - "entity": "史蒂夫·巴克", + "name": "史蒂夫·巴克", "type": "Director", "category": "Person", "description": "史蒂夫·巴克 是一名英国电影导演和剧作家" }, { - "entity": "保罗·格斯滕伯格", + "name": "保罗·格斯滕伯格", "type": "Writer", "category": "Person", "description": "保罗·格斯滕伯格 (Paul Gerstenberger) 是一名作家和制片人,因《The Rezort》(2015 年)、《Primeval》(2007 年)和《House of Anubis》(2011 年)而闻名。" }, { - "entity": "道格雷·斯科特", + "name": "道格雷·斯科特", "type": "Actor", "category": "Person", "description": "斯蒂芬·道格雷·斯科特 (Stephen Dougray Scott,1965 年 11 月 26 日出生) 是一位苏格兰演员。" }, { - "entity": "杰西卡·德·古维", + "name": "杰西卡·德·古维", "type": "Actor", "category": "Person", "description": "杰西卡·伊莉斯·德·古维 (Jessica Elise De Gouw,1988 年 2 月 15 日出生) 是一位澳大利亚女演员。" }, { - "entity": "马丁·麦肯", + "name": "马丁·麦肯", "type": "Actor", "category": "Person", "description": "马丁·麦肯是来自北爱尔兰的演员。2020 年,他在《爱尔兰时报》爱尔兰最伟大电影演员名单中排名第 48 位" @@ -146,12 +146,14 @@ class OpenIENERPrompt(PromptOp): } """ - def __init__( - self, language: Optional[str] = "en", **kwargs - ): + def __init__(self, language: str = "", **kwargs): super().__init__(language, **kwargs) - self.schema = SchemaClient(project_id=self.project_id).extract_types() - self.template = Template(self.template).safe_substitute(schema=self.schema) + self.schema = SchemaClient( + project_id=KAG_PROJECT_CONF.project_id + ).extract_types() + self.template = Template(self.template).safe_substitute( + schema=json.dumps(self.schema) + ) @property def template_variables(self) -> List[str]: diff --git a/kag/builder/prompt/default/std.py b/kag/builder/prompt/default/std.py index d56f0090..8045a582 100644 --- a/kag/builder/prompt/default/std.py +++ b/kag/builder/prompt/default/std.py @@ -11,65 +11,66 @@ # or implied. 
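`OpenIENERPrompt` fills its template in two stages: `$schema` is baked in once at construction time via `Template.safe_substitute`, which leaves the still-unknown `$input` placeholder intact for later substitution. A self-contained sketch of that behaviour; the second substitution stands in for what `PromptABC.build_prompt` presumably does per call, so treat it as an assumption:

```python
import json
from string import Template

# Toy template with the same two placeholders as the NER prompt.
raw = '{"instruction": "...", "schema": $schema, "input": "$input"}'

# Stand-in for SchemaClient(project_id=...).extract_types().
schema = ["Person", "Works", "Date"]

# Stage 1 (at __init__): bake the schema in; safe_substitute leaves
# placeholders it was not given untouched instead of raising KeyError.
baked = Template(raw).safe_substitute(schema=json.dumps(schema))
assert "$input" in baked

# Stage 2 (per call): fill the user input.
prompt = Template(baked).safe_substitute(input="The Rezort is a 2015 film.")
print(prompt)
```

Using `safe_substitute` rather than `substitute` is what makes the two-stage filling possible: a plain `substitute` would fail at stage 1 because `$input` is not yet known.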
import json -from typing import Optional, List +from typing import List -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC -class OpenIEEntitystandardizationdPrompt(PromptOp): +@PromptABC.register("default_std") +class OpenIEEntitystandardizationdPrompt(PromptABC): template_en = """ { "instruction": "The `input` field contains a user provided context. The `named_entities` field contains extracted named entities from the context, which may be unclear abbreviations, aliases, or slang. To eliminate ambiguity, please attempt to provide the official names of these entities based on the context and your own knowledge. Note that entities with the same meaning can only have ONE official name. Please respond in the format of a single JSONArray string without any explanation, as shown in the `output` field of the provided example.", "example": { - "input": "American History\nWhen did the political party that favored harsh punishment of southern states after the Civil War, gain control of the House? Republicans regained control of the chamber they had lost in the 2006 midterm elections.", + "input": "American History.When did the political party that favored harsh punishment of southern states after the Civil War, gain control of the House? Republicans regained control of the chamber they had lost in the 2006 midterm elections.", "named_entities": [ - {"entity": "American", "category": "GeographicLocation"}, - {"entity": "political party", "category": "Organization"}, - {"entity": "southern states", "category": "GeographicLocation"}, - {"entity": "Civil War", "category": "Keyword"}, - {"entity": "House", "category": "Organization"}, - {"entity": "Republicans", "category": "Organization"}, - {"entity": "chamber", "category": "Organization"}, - {"entity": "2006 midterm elections", "category": "Date"} + {"name": "American", "category": "GeographicLocation"}, + {"name": "political party", "category": "Organization"}, + {"name": "southern states", "category": "GeographicLocation"}, + {"name": "Civil War", "category": "Keyword"}, + {"name": "House", "category": "Organization"}, + {"name": "Republicans", "category": "Organization"}, + {"name": "chamber", "category": "Organization"}, + {"name": "2006 midterm elections", "category": "Date"} ], "output": [ { - "entity": "American", + "name": "American", "category": "GeographicLocation", "official_name": "United States of America" }, { - "entity": "political party", + "name": "political party", "category": "Organization", "official_name": "Radical Republicans" }, { - "entity": "southern states", + "name": "southern states", "category": "GeographicLocation", "official_name": "Confederacy" }, { - "entity": "Civil War", + "name": "Civil War", "category": "Keyword", "official_name": "American Civil War" }, { - "entity": "House", + "name": "House", "category": "Organization", "official_name": "United States House of Representatives" }, { - "entity": "Republicans", + "name": "Republicans", "category": "Organization", "official_name": "Republican Party" }, { - "entity": "chamber", + "name": "chamber", "category": "Organization", "official_name": "United States House of Representatives" }, { - "entity": "midterm elections", + "name": "midterm elections", "category": "Date", "official_name": "United States midterm elections" } @@ -84,26 +85,26 @@ class OpenIEEntitystandardizationdPrompt(PromptOp): { "instruction": 
"input字段包含用户提供的上下文。命名实体字段包含从上下文中提取的命名实体,这些可能是含义不明的缩写、别名或俚语。为了消除歧义,请尝试根据上下文和您自己的知识提供这些实体的官方名称。请注意,具有相同含义的实体只能有一个官方名称。请按照提供的示例中的输出字段格式,以单个JSONArray字符串形式回复,无需任何解释。", "example": { - "input": "烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。\n3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。\n若体温降而复升或3天后仍不降者,应考虑SP的肺外感染,如腋胸、心包炎或关节炎等。治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", + "input": "烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。若体温降而复升或3天后仍不降者,应考虑SP的肺外感染,如腋胸、心包炎或关节炎等。治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", "named_entities": [ - {"entity": "烦躁不安", "category": "Symptom"}, - {"entity": "语妄", "category": "Symptom"}, - {"entity": "失眠", "category": "Symptom"}, - {"entity": "镇静药", "category": "Medicine"}, - {"entity": "肺外感染", "category": "Disease"}, - {"entity": "胸腔压力调节管", "category": "MedicalEquipment"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation"} + {"name": "烦躁不安", "category": "Symptom"}, + {"name": "语妄", "category": "Symptom"}, + {"name": "失眠", "category": "Symptom"}, + {"name": "镇静药", "category": "Medicine"}, + {"name": "肺外感染", "category": "Disease"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation"} ], "output": [ - {"entity": "烦躁不安", "category": "Symptom", "official_name": "焦虑不安"}, - {"entity": "语妄", "category": "Symptom", "official_name": "谵妄"}, - {"entity": "失眠", "category": "Symptom", "official_name": "失眠症"}, - {"entity": "镇静药", "category": "Medicine", "official_name": "镇静剂"}, - {"entity": "肺外感染", "category": "Disease", "official_name": "肺外感染"}, - {"entity": "胸腔压力调节管", "category": "MedicalEquipment", "official_name": "胸腔引流管"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment", "official_name": "负压吸引装置"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation", "official_name": "闭式负压引流"} + {"name": "烦躁不安", "category": "Symptom", "official_name": "焦虑不安"}, + {"name": "语妄", "category": "Symptom", "official_name": "谵妄"}, + {"name": "失眠", "category": "Symptom", "official_name": "失眠症"}, + {"name": "镇静药", "category": "Medicine", "official_name": "镇静剂"}, + {"name": "肺外感染", "category": "Disease", "official_name": "肺外感染"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment", "official_name": "胸腔引流管"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment", "official_name": "负压吸引装置"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation", "official_name": "闭式负压引流"} ] }, "input": $input, @@ -111,15 +112,11 @@ class OpenIEEntitystandardizationdPrompt(PromptOp): } """ - def __init__(self, language: Optional[str] = "en"): - super().__init__(language) - @property def template_variables(self) -> List[str]: return ["input", "named_entities"] def parse_response(self, response: str, **kwargs): - rsp = response if isinstance(rsp, str): rsp = json.loads(rsp) @@ -134,10 +131,10 @@ def parse_response(self, response: str, **kwargs): entities = kwargs.get("named_entities", []) for entity in standardized_entity: merged.append(entity) - entities_with_offical_name.add(entity["entity"]) + entities_with_offical_name.add(entity["name"]) # in case llm ignores some entities for entity in entities: - if entity["entity"] not in entities_with_offical_name: - entity["official_name"] = entity["entity"] + if entity["name"] not in entities_with_offical_name: + entity["official_name"] = entity["name"] merged.append(entity) return merged diff --git a/kag/builder/prompt/default/triple.py 
b/kag/builder/prompt/default/triple.py index c870604c..03584a6b 100644 --- a/kag/builder/prompt/default/triple.py +++ b/kag/builder/prompt/default/triple.py @@ -11,66 +11,67 @@ # or implied. import json -from typing import Optional, List +from typing import List -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC -class OpenIETriplePrompt(PromptOp): +@PromptABC.register("default_triple") +class OpenIETriplePrompt(PromptABC): template_en = """ { - "instruction": "You are an expert specializing in carrying out open information extraction (OpenIE). Please extract any possible relations (including subject, predicate, object) from the given text, and list them following the json format {\"triples\": [[\"subject\", \"predicate\", \"object\"]]}\n. If there are none, do not list them.\n.\n\nPay attention to the following requirements:\n- Each triple should contain at least one, but preferably two, of the named entities in the entity_list.\n- Clearly resolve pronouns to their specific names to maintain clarity.", + "instruction": "You are an expert specializing in carrying out open information extraction (OpenIE). Please extract any possible relations (including subject, predicate, object) from the given text, and list them following the json format {\"triples\": [[\"subject\", \"predicate\", \"object\"]]}. If there are none, do not list them. Pay attention to the following requirements: - Each triple should contain at least one, but preferably two, of the named entities in the entity_list. - Clearly resolve pronouns to their specific names to maintain clarity.", "entity_list": $entity_list, "input": "$input", "example": { - "input": "The Rezort\nThe Rezort is a 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger.\n It stars Dougray Scott, Jessica De Gouw and Martin McCann.\n After humanity wins a devastating war against zombies, the few remaining undead are kept on a secure island, where they are hunted for sport.\n When something goes wrong with the island's security, the guests must face the possibility of a new outbreak.", + "input": "The Rezort is a 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger. It stars Dougray Scott, Jessica De Gouw and Martin McCann. After humanity wins a devastating war against zombies, the few remaining undead are kept on a secure island, where they are hunted for sport.
When something goes wrong with the island's security, the guests must face the possibility of a new outbreak.", "entity_list": [ { - "entity": "The Rezort", + "name": "The Rezort", "category": "Works" }, { - "entity": "2015", + "name": "2015", "category": "Others" }, { - "entity": "British", + "name": "British", "category": "GeographicLocation" }, { - "entity": "Steve Barker", + "name": "Steve Barker", "category": "Person" }, { - "entity": "Paul Gerstenberger", + "name": "Paul Gerstenberger", "category": "Person" }, { - "entity": "Dougray Scott", + "name": "Dougray Scott", "category": "Person" }, { - "entity": "Jessica De Gouw", + "name": "Jessica De Gouw", "category": "Person" }, { - "entity": "Martin McCann", + "name": "Martin McCann", "category": "Person" }, { - "entity": "zombies", + "name": "zombies", "category": "Creature" }, { - "entity": "zombie horror film", + "name": "zombie horror film", "category": "Concept" }, { - "entity": "humanity", + "name": "humanity", "category": "Concept" }, { - "entity": "secure island", + "name": "secure island", "category": "GeographicLocation" } ], @@ -151,16 +152,16 @@ class OpenIETriplePrompt(PromptOp): "entity_list": $entity_list, "input": "$input", "example": { - "input": "烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。\n3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。\n若体温降而复升或3天后仍不降者,应考虑SP的肺外感染,如腋胸、心包炎或关节炎等。治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", + "input": "烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。若体温降而复升或3天后仍不降者,应考虑SP的肺外感染,如腋胸、心包炎或关节炎等。治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", "entity_list": [ - {"entity": "烦躁不安", "category": "Symptom"}, - {"entity": "语妄", "category": "Symptom"}, - {"entity": "失眠", "category": "Symptom"}, - {"entity": "镇静药", "category": "Medicine"}, - {"entity": "肺外感染", "category": "Disease"}, - {"entity": "胸腔压力调节管", "category": "MedicalEquipment"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation"} + {"name": "烦躁不安", "category": "Symptom"}, + {"name": "语妄", "category": "Symptom"}, + {"name": "失眠", "category": "Symptom"}, + {"name": "镇静药", "category": "Medicine"}, + {"name": "肺外感染", "category": "Disease"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation"} ], "output":[ ["烦躁不安", "酌用", "镇静药"], @@ -178,9 +179,6 @@ class OpenIETriplePrompt(PromptOp): } """ - def __init__(self, language: Optional[str] = "en"): - super().__init__(language) - @property def template_variables(self) -> List[str]: return ["entity_list", "input"] diff --git a/kag/builder/prompt/medical/ner.py b/kag/builder/prompt/medical/ner.py index 07c6298a..1e2ce65e 100644 --- a/kag/builder/prompt/medical/ner.py +++ b/kag/builder/prompt/medical/ner.py @@ -12,14 +12,14 @@ import json from string import Template -from typing import List, Optional - -from kag.common.base.prompt_op import PromptOp +from typing import List +from kag.common.conf import KAG_PROJECT_CONF +from kag.interface import PromptABC from knext.schema.client import SchemaClient -class OpenIENERPrompt(PromptOp): - +@PromptABC.register("medical_ner") +class OpenIENERPrompt(PromptABC): template_zh = """ { "instruction": "你是命名实体识别的专家。请从输入中提取与模式定义匹配的实体。如果不存在该类型的实体,请返回一个空列表。请以JSON字符串格式回应。你可以参照example进行抽取。", @@ -28,14 +28,14 @@ class OpenIENERPrompt(PromptOp): { "input": 
"烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。\n3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。\n若体温降而复升或3天后仍不降者,应考虑SP的肺外感染。\n治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", "output": [ - {"entity": "烦躁不安", "category": "Symptom"}, - {"entity": "语妄", "category": "Symptom"}, - {"entity": "失眠", "category": "Symptom"}, - {"entity": "镇静药", "category": "Medicine"}, - {"entity": "肺外感染", "category": "Disease"}, - {"entity": "胸腔压力调节管", "category": "MedicalEquipment"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation"} + {"name": "烦躁不安", "category": "Symptom"}, + {"name": "语妄", "category": "Symptom"}, + {"name": "失眠", "category": "Symptom"}, + {"name": "镇静药", "category": "Medicine"}, + {"name": "肺外感染", "category": "Disease"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation"} ] } ], @@ -45,11 +45,11 @@ class OpenIENERPrompt(PromptOp): template_en = template_zh - def __init__( - self, language: Optional[str] = "en", **kwargs - ): + def __init__(self, language: str = "", **kwargs): super().__init__(language, **kwargs) - self.schema = SchemaClient(project_id=self.project_id).extract_types() + self.schema = SchemaClient( + project_id=KAG_PROJECT_CONF.project_id + ).extract_types() self.template = Template(self.template).safe_substitute(schema=self.schema) @property diff --git a/kag/builder/prompt/medical/std.py b/kag/builder/prompt/medical/std.py index 88ec1283..19f2232c 100644 --- a/kag/builder/prompt/medical/std.py +++ b/kag/builder/prompt/medical/std.py @@ -11,37 +11,37 @@ # or implied. import json -from typing import Optional, List +from typing import List -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC -class OpenIEEntitystandardizationdPrompt(PromptOp): - +@PromptABC.register("medical_std") +class OpenIEEntitystandardizationdPrompt(PromptABC): template_zh = """ { "instruction": "input字段包含用户提供的上下文。命名实体字段包含从上下文中提取的命名实体,这些可能是含义不明的缩写、别名或俚语。为了消除歧义,请尝试根据上下文和您自己的知识提供这些实体的官方名称。请注意,具有相同含义的实体只能有一个官方名称。请按照提供的示例中的输出字段格式,以单个JSONArray字符串形式回复,无需任何解释。", "example": { "input": "烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。\n3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。\n若体温降而复升或3天后仍不降者,应考虑SP的肺外感染,如腋胸、心包炎或关节炎等。治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", "named_entities": [ - {"entity": "烦躁不安", "category": "Symptom"}, - {"entity": "语妄", "category": "Symptom"}, - {"entity": "失眠", "category": "Symptom"}, - {"entity": "镇静药", "category": "Medicine"}, - {"entity": "肺外感染", "category": "Disease"}, - {"entity": "胸腔压力调节管", "category": "MedicalEquipment"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation"} + {"name": "烦躁不安", "category": "Symptom"}, + {"name": "语妄", "category": "Symptom"}, + {"name": "失眠", "category": "Symptom"}, + {"name": "镇静药", "category": "Medicine"}, + {"name": "肺外感染", "category": "Disease"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation"} ], "output": [ - {"entity": "烦躁不安", "category": "Symptom", "official_name": "焦虑不安"}, - {"entity": "语妄", "category": "Symptom", "official_name": "谵妄"}, - {"entity": "失眠", "category": "Symptom", "official_name": "失眠症"}, - {"entity": "镇静药", "category": "Medicine", "official_name": "镇静剂"}, - {"entity": "肺外感染", "category": "Disease", "official_name": "肺外感染"}, - {"entity": "胸腔压力调节管", "category": 
"MedicalEquipment", "official_name": "胸腔引流管"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment", "official_name": "负压吸引装置"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation", "official_name": "闭式负压引流"} + {"name": "烦躁不安", "category": "Symptom", "official_name": "焦虑不安"}, + {"name": "语妄", "category": "Symptom", "official_name": "谵妄"}, + {"name": "失眠", "category": "Symptom", "official_name": "失眠症"}, + {"name": "镇静药", "category": "Medicine", "official_name": "镇静剂"}, + {"name": "肺外感染", "category": "Disease", "official_name": "肺外感染"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment", "official_name": "胸腔引流管"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment", "official_name": "负压吸引装置"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation", "official_name": "闭式负压引流"} ] }, "input": $input, @@ -51,15 +51,11 @@ class OpenIEEntitystandardizationdPrompt(PromptOp): template_en = template_zh - def __init__(self, language: Optional[str] = "en"): - super().__init__(language) - @property def template_variables(self) -> List[str]: return ["input", "named_entities"] def parse_response(self, response: str, **kwargs): - rsp = response if isinstance(rsp, str): rsp = json.loads(rsp) @@ -74,10 +70,10 @@ def parse_response(self, response: str, **kwargs): entities = kwargs.get("named_entities", []) for entity in standardized_entity: merged.append(entity) - entities_with_offical_name.add(entity["entity"]) + entities_with_offical_name.add(entity["name"]) # in case llm ignores some entities for entity in entities: - if entity["entity"] not in entities_with_offical_name: - entity["official_name"] = entity["entity"] + if entity["name"] not in entities_with_offical_name: + entity["official_name"] = entity["name"] merged.append(entity) return merged diff --git a/kag/builder/prompt/medical/triple.py b/kag/builder/prompt/medical/triple.py index 2b5aaff8..1c573fac 100644 --- a/kag/builder/prompt/medical/triple.py +++ b/kag/builder/prompt/medical/triple.py @@ -11,13 +11,13 @@ # or implied. import json -from typing import Optional, List, Dict, Any +from typing import List -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC -class OpenIETriplePrompt(PromptOp): - +@PromptABC.register("medical_triple") +class OpenIETriplePrompt(PromptABC): template_zh = """ { "instruction": "您是一位专门从事开放信息提取(OpenIE)的专家。请从input字段的文本中提取任何可能的关系(包括主语、谓语、宾语),并按照JSON格式列出它们,须遵循example字段的示例格式。请注意以下要求:1. 每个三元组应至少包含entity_list实体列表中的一个,但最好是两个命名实体。2. 
明确地将代词解析为特定名称,以保持清晰度。", @@ -26,14 +26,14 @@ class OpenIETriplePrompt(PromptOp): "example": { "input": "烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。\n3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。\n若体温降而复升或3天后仍不降者,应考虑SP的肺外感染,如腋胸、心包炎或关节炎等。治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", "entity_list": [ - {"entity": "烦躁不安", "category": "Symptom"}, - {"entity": "语妄", "category": "Symptom"}, - {"entity": "失眠", "category": "Symptom"}, - {"entity": "镇静药", "category": "Medicine"}, - {"entity": "肺外感染", "category": "Disease"}, - {"entity": "胸腔压力调节管", "category": "MedicalEquipment"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation"} + {"name": "烦躁不安", "category": "Symptom"}, + {"name": "语妄", "category": "Symptom"}, + {"name": "失眠", "category": "Symptom"}, + {"name": "镇静药", "category": "Medicine"}, + {"name": "肺外感染", "category": "Disease"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation"} ], "output":[ ["烦躁不安", "酌用", "镇静药"], @@ -53,9 +53,6 @@ class OpenIETriplePrompt(PromptOp): template_en = template_zh - def __init__(self, language: Optional[str] = "en"): - super().__init__(language) - @property def template_variables(self) -> List[str]: return ["entity_list", "input"] diff --git a/kag/builder/prompt/oneke_prompt.py b/kag/builder/prompt/oneke_prompt.py deleted file mode 100644 index 25c3dd69..00000000 --- a/kag/builder/prompt/oneke_prompt.py +++ /dev/null @@ -1,518 +0,0 @@ -# -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
- -import json -import logging -import re -from abc import ABC -from typing import List, Dict, Any -from collections import defaultdict - -from knext.schema.model.schema_helper import SPGTypeName -from kag.builder.model.spg_record import SPGRecord -from kag.builder.prompt.spg_prompt import SPGPrompt -import uuid - -logger = logging.getLogger(__name__) - - -class OneKEPrompt(SPGPrompt, ABC): - template_zh: str = "" - template_en: str = "" - - def __init__(self, **kwargs): - types_list = kwargs.get("types_list", []) - language = kwargs.get("language", "zh") - with_description = kwargs.get("with_description", False) - split_num = kwargs.get("split_num", 4) - super().__init__(types_list, **kwargs) - self.language = language - if language == "zh": - self.template = self.template_zh - else: - self.template = self.template_en - self.with_description = with_description - self.split_num = split_num - - self._init_render_variables() - self._render() - - self.params = kwargs - - def build_prompt(self, variables: Dict[str, str]) -> List[str]: - instructions = [] - for schema in self.schema_list: - instructions.append( - json.dumps( - { - "instruction": self.template, - "schema": schema, - "input": variables.get("input"), - }, - ensure_ascii=False, - ) - ) - return instructions - - def parse_response(self, response: str) -> List[SPGRecord]: - raise NotImplementedError - - def _render(self): - raise NotImplementedError - - def multischema_split_by_num(self, split_num, schemas: List[Any]): - negative_length = max(len(schemas) // split_num, 1) * split_num - total_schemas = [] - for i in range(0, negative_length, split_num): - total_schemas.append(schemas[i : i + split_num]) - - remain_len = max(1, split_num // 2) - tmp_schemas = schemas[negative_length:] - if len(schemas) - negative_length >= remain_len and len(tmp_schemas) > 0: - total_schemas.append(tmp_schemas) - elif len(tmp_schemas) > 0: - total_schemas[-1].extend(tmp_schemas) - return total_schemas - - -class OneKE_NERPrompt(OneKEPrompt): - template_zh: str = ( - "你是专门进行实体抽取的专家。请从input中抽取出符合schema定义的实体,不存在的实体类型返回空列表。请按照JSON字符串的格式回答。" - ) - template_en: str = "You are an expert in named entity recognition. Please extract entities that match the schema definition from the input. Return an empty list if the entity type does not exist. Please respond in the format of a JSON string." 
- - def __init__( - self, - entity_types: List[SPGTypeName], - language: str = "zh", - with_description: bool = False, - split_num: int = 4, - **kwargs, - ): - super().__init__( - types_list=entity_types, - language=language, - with_description=with_description, - split_num=split_num, - **kwargs, - ) - - def parse_response(self, response: str) -> List[SPGRecord]: - if isinstance(response, list) and len(response) > 0: - response = response[0] - try: - ent_obj = json.loads(response) - except json.decoder.JSONDecodeError: - logger.error("OneKE_NERPrompt response JSONDecodeError error.") - return [] - if type(ent_obj) != dict: - logger.error("OneKE_NERPrompt response type error.") - return [] - - spg_records = [] - for type_zh, values in ent_obj.items(): - if type_zh not in self.spg_type_schema_info_zh: - logger.warning(f"Unrecognized entity_type: {type_zh}") - continue - type_en, _ = self.spg_type_schema_info_zh[type_zh] - for value in values: - spg_record = SPGRecord(type_en) - spg_record.upsert_properties({"id": value, "name": value}) - spg_records.append(spg_record) - return spg_records - - def _render(self): - entity_list = [] - for spg_type in self.spg_types: - entity_list.append(spg_type.name_zh) - self.schema_list = self.multischema_split_by_num(self.split_num, entity_list) - - -class OneKE_SPOPrompt(OneKEPrompt): - template_zh: str = ( - "你是专门进行SPO三元组抽取的专家。请从input中抽取出符合schema定义的spo关系三元组,不存在的关系返回空列表。请按照JSON字符串的格式回答。" - ) - template_en: str = "You are an expert in spo(subject, predicate, object) triples extraction. Please extract SPO relationship triples that match the schema definition from the input. Return an empty list for relationships that do not exist. Please respond in the format of a JSON string." - - def __init__( - self, - spo_types: List[SPGTypeName], - language: str = "zh", - with_description: bool = False, - split_num: int = 4, - **kwargs, - ): - super().__init__( - types_list=spo_types, - language=language, - with_description=with_description, - split_num=split_num, - **kwargs, - ) - self.properties_mapper = {} - self.relations_mapper = {} - - def parse_response(self, response: str) -> List[SPGRecord]: - if isinstance(response, list) and len(response) > 0: - response = response[0] - try: - re_obj = json.loads(response) - except json.decoder.JSONDecodeError: - logger.error("OneKE_REPrompt response JSONDecodeError error.") - return [] - if type(re_obj) != dict: - logger.error("OneKE_REPrompt response type error.") - return [] - - relation_dcir = defaultdict(list) - for relation_zh, values in re_obj.items(): - if relation_zh not in self.property_info_zh[relation_zh]: - logger.warning(f"Unrecognized relation: {relation_zh}") - continue - if values and isinstance(values, list): - for value in values: - if ( - type(value) != dict - or "subject" not in value - or "object" not in value - ): - logger.warning("OneKE_REPrompt response type error.") - continue - s_zh, o_zh = value.get("subject", ""), value.get("object", "") - relation_dcir[relation_zh].append((s_zh, o_zh)) - - spg_records = [] - for relation_zh, sub_obj_list in relation_dcir.items(): - sub_dict = defaultdict(list) - for s_zh, o_zh in sub_obj_list: - sub_dict[s_zh].append(o_zh) - for s_zh, o_list in sub_dict.items(): - if s_zh in self.spg_type_schema_info_zh: - logger.warning(f"Unrecognized subject_type: {s_zh}") - continue - object_value = ",".join(o_list) - s_type_zh = self.properties_mapper.get(relation_zh, None) - if s_type_zh is not None: - s_type_en, _ = self.spg_type_schema_info_zh[s_type_zh] - relation_en, _ = 
self.property_info_zh[relation_zh] - spg_record = SPGRecord(s_type_en).upsert_properties( - {"id": s_zh, "name": s_zh} - ) - spg_record.upsert_property(relation_en, object_value) - else: - s_type_zh, o_type_zh = self.relations_mapper.get( - relation_zh, [None, None] - ) - if s_type_zh is None or o_type_zh is None: - logger.warning(f"Unrecognized relation: {relation_zh}") - continue - s_type_en, _ = self.spg_type_schema_info_zh[s_type_zh] - spg_record = SPGRecord(s_type_en).upsert_properties( - {"id": s_zh, "name": s_zh} - ) - relation_en, _, object_type = self.relation_info_zh[s_type_zh][ - relation_zh - ] - spg_record.upsert_relation(relation_en, object_type, object_value) - spg_records.append(spg_record) - return spg_records - - def _render(self): - spo_list = [] - for spg_type in self.spg_types: - type_en, _ = self.spg_type_schema_info_zh[spg_type] - for v in spg_type.properties.values(): - spo_list.append( - { - "subject_type": spg_type.name_zh, - "predicate": v.name_zh, - "object_type": "文本", - } - ) - self.properties_mapper[v.name_zh] = spg_type - for v in spg_type.relations.values(): - _, _, object_type = self.relation_info_en[type_en][v.name] - spo_list.append( - { - "subject_type": spg_type.name_zh, - "predicate": v.name_zh, - "object_type": object_type, - } - ) - self.relations_mapper[v.name_zh] = [spg_type, object_type] - self.schema_list = self.multischema_split_by_num(self.split_num, spo_list) - - -class OneKE_REPrompt(OneKE_SPOPrompt): - template_zh: str = ( - "你是专门进行关系抽取的专家。请从input中抽取出符合schema定义的关系三元组,不存在的关系返回空列表。请按照JSON字符串的格式回答。" - ) - template_en: str = "You are an expert in relationship extraction. Please extract relationship triples that match the schema definition from the input. Return an empty list for relationships that do not exist. Please respond in the format of a JSON string." - - def __init__( - self, - relation_types: List[SPGTypeName], - language: str = "zh", - with_description: bool = False, - split_num: int = 4, - **kwargs, - ): - super().__init__( - relation_types, language, with_description, split_num, **kwargs - ) - - def _render(self): - re_list = [] - for spg_type in self.spg_types: - type_en, _ = self.spg_type_schema_info_zh[spg_type] - for v in spg_type.properties.values(): - re_list.append(v.name_zh) - self.properties_mapper[v.name_zh] = spg_type - for v in spg_type.relations.values(): - v_zh, _, object_type = self.relation_info_en[type_en][v.name] - re_list.append(v.name_zh) - self.relations_mapper[v.name_zh] = [spg_type, object_type] - self.schema_list = self.multischema_split_by_num(self.split_num, re_list) - - -class OneKE_KGPrompt(OneKEPrompt): - template_zh: str = "你是一个图谱实体知识结构化专家。根据输入实体类型(entity type)的schema描述,从文本中抽取出相应的实体实例和其属性信息,不存在的属性不输出, 属性存在多值就返回列表,并输出为可解析的json格式。" - template_en: str = "You are an expert in structured knowledge systems for graph entities. Based on the schema description of the input entity type, you extract the corresponding entity instances and their attribute information from the text. Attributes that do not exist should not be output. If an attribute has multiple values, a list should be returned. The results should be output in a parsable JSON format." 
- - def __init__( - self, - entity_types: List[SPGTypeName], - language: str = "zh", - with_description: bool = False, - split_num: int = 4, - **kwargs, - ): - super().__init__( - types_list=entity_types, - language=language, - with_description=with_description, - split_num=split_num, - **kwargs, - ) - - def parse_response(self, response: str) -> List[SPGRecord]: - if isinstance(response, list) and len(response) > 0: - response = response[0] - try: - re_obj = json.loads(response) - except json.decoder.JSONDecodeError: - logger.error("OneKE_KGPrompt response JSONDecodeError error.") - return [] - if type(re_obj) != dict: - logger.error("OneKE_KGPrompt response type error.") - return [] - - spg_records = [] - for type_zh, type_value in re_obj.items(): - if type_zh not in self.spg_type_schema_info_zh: - logger.warning(f"Unrecognized entity_type: {type_zh}") - continue - type_en, _ = self.spg_type_schema_info_zh[type_zh] - if type_value and isinstance(type_value, dict): - for name, attrs in type_value.items(): - spg_record = SPGRecord(type_en).upsert_properties( - {"id": name, "name": name} - ) - for attr_zh, attr_value in attrs.items(): - if isinstance(attr_value, list): - attr_value = ",".join(attr_value) - if attr_zh in self.property_info_zh[type_zh]: - attr_en, _, object_type = self.property_info_zh[type_zh][ - attr_zh - ] - spg_record.upsert_property(attr_en, attr_value) - elif attr_zh in self.relation_info_zh[type_zh]: - attr_en, _, object_type = self.relation_info_zh[type_zh][ - attr_zh - ] - spg_record.upsert_relation(attr_en, object_type, attr_value) - else: - logger.warning(f"Unrecognized attribute: {attr_zh}") - continue - if object_type == "Integer": - matches = re.findall(r"\d+", attr_value) - if matches: - spg_record.upsert_property(attr_en, matches[0]) - elif object_type == "Float": - matches = re.findall(r"\d+(?:\.\d+)?", attr_value) - if matches: - spg_record.upsert_property(attr_en, matches[0]) - spg_records.append(spg_record) - return spg_records - - def _render(self): - spo_list = [] - for spg_type in self.spg_types: - if not self.with_description: - attributes = [] - attributes.extend( - [ - v.name_zh - for k, v in spg_type.properties.items() - if k not in self.ignored_properties - ] - ) - attributes.extend( - [ - v.name_zh - for k, v in spg_type.relations.items() - if v.name_zh not in attributes - and k not in self.ignored_relations - ] - ) - else: - attributes = {} - attributes.update( - { - v.name_zh: v.desc or "" - for k, v in spg_type.properties.items() - if k not in self.ignored_properties - } - ) - attributes.update( - { - v.name_zh: v.desc or "" - for k, v in spg_type.relations.items() - if v.name_zh not in attributes - and k not in self.ignored_relations - } - ) - entity_type = spg_type.name_zh - spo_list.append({"entity_type": entity_type, "attributes": attributes}) - - self.schema_list = self.multischema_split_by_num(self.split_num, spo_list) - - -class OneKE_EEPrompt(OneKEPrompt): - template_zh: str = "你是专门进行事件提取的专家。请从input中抽取出符合schema定义的事件,不存在的事件返回空列表,不存在的论元返回NAN,如果论元存在多值请返回列表。请按照JSON字符串的格式回答。" - template_en: str = "You are an expert in event extraction. Please extract events from the input that conform to the schema definition. Return an empty list for events that do not exist, and return NAN for arguments that do not exist. If an argument has multiple values, please return a list. Respond in the format of a JSON string." 
- - def __init__( - self, - event_types: List[SPGTypeName], - language: str = "zh", - with_description: bool = False, - split_num: int = 4, - **kwargs, - ): - super().__init__( - types_list=event_types, - language=language, - with_description=with_description, - split_num=split_num, - **kwargs, - ) - - def parse_response(self, response: str) -> List[SPGRecord]: - if isinstance(response, list) and len(response) > 0: - response = response[0] - try: - ee_obj = json.loads(response) - except json.decoder.JSONDecodeError: - logger.error("OneKE_EEPrompt response JSONDecodeError error.") - return [] - if type(ee_obj) != dict: - logger.error("OneKE_EEPrompt response type error.") - return [] - - spg_records = [] - for type_zh, type_values in ee_obj.items(): - if type_zh not in self.spg_type_schema_info_zh: - logger.warning(f"Unrecognized event_type: {type_zh}") - continue - type_en, _ = self.spg_type_schema_info_zh[type_zh] - if type_values and isinstance(type_values, list): - for type_value in type_values: - uuid_4 = uuid.uuid4() - spg_record = ( - SPGRecord(type_en) - .upsert_property("id", str(uuid_4)) - .upsert_property("name", type_zh) - ) - arguments = type_value.get("arguments") - if arguments and isinstance(arguments, dict): - for attr_zh, attr_value in arguments.items(): - if isinstance(attr_value, list): - attr_value = ",".join(attr_value) - if attr_zh in self.property_info_zh[type_zh]: - attr_en, _, object_type = self.property_info_zh[ - type_zh - ][attr_zh] - spg_record.upsert_property(attr_en, attr_value) - elif attr_zh in self.relation_info_zh[type_zh]: - attr_en, _, object_type = self.relation_info_zh[ - type_zh - ][attr_zh] - spg_record.upsert_relation( - attr_en, object_type, attr_value - ) - else: - logger.warning(f"Unrecognized attribute: {attr_zh}") - continue - if object_type == "Integer": - matches = re.findall(r"\d+", attr_value) - if matches: - spg_record.upsert_property(attr_en, matches[0]) - elif object_type == "Float": - matches = re.findall(r"\d+(?:\.\d+)?", attr_value) - if matches: - spg_record.upsert_property(attr_en, matches[0]) - spg_records.append(spg_record) - return spg_records - - def _render(self): - event_list = [] - for spg_type in self.spg_types: - if not self.with_description: - arguments = [] - arguments.extend( - [ - v.name_zh - for k, v in spg_type.properties.items() - if k not in self.ignored_properties - ] - ) - arguments.extend( - [ - v.name_zh - for k, v in spg_type.relations.items() - if v.name_zh not in arguments - and k not in self.ignored_relations - ] - ) - else: - arguments = {} - arguments.update( - { - v.name_zh: v.desc or "" - for k, v in spg_type.properties.items() - if k not in self.ignored_properties - } - ) - arguments.update( - { - v.name_zh: v.desc or "" - for k, v in spg_type.relations.items() - if v.name_zh not in arguments - and k not in self.ignored_relations - } - ) - event_type = spg_type.name_zh - event_list.append( - {"event_type": event_type, "trigger": True, "arguments": arguments} - ) - self.schema_list = self.multischema_split_by_num(self.split_num, event_list) diff --git a/kag/builder/prompt/outline_align_prompt.py b/kag/builder/prompt/outline_align_prompt.py new file mode 100644 index 00000000..0fb0aa30 --- /dev/null +++ b/kag/builder/prompt/outline_align_prompt.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +from typing import Optional, List +from kag.interface import PromptABC +import ast + + +@PromptABC.register("outline_align") +class OutlineAlignPrompt(PromptABC): + template_zh = """ +{ + "instruction": "请分析以下大纲列表,统一调整标题的层级。遵循以下规则: +1. 相同类型的标题应该有相同的层级,例如所有'第X章'都应该是同一层级 +2. 层级关系应该符合逻辑,例如: + - 章(1级) > 节(2级) > 条(3级) + - 部分(1级) > 章(2级) > 节(3级) +3. 考虑标题的上下文关系,确保层级的连贯性 +4. 如果标题不含明确的层级标识,根据其内容和上下文推断合适的层级 + +请务必按照以下格式返回,不要返回其他任何内容,请返回调整后的大纲列表,格式为: +[(标题1, 层级1), (标题2, 层级2), ...] + +输入的大纲列表为: +$outlines", + "example": [ + { + "input": [ + ("第一章 绪论", 2), + ("第一节 研究背景", 1), + ("第二章 文献综述", 1), + ("第二节 研究方法", 2) + ], + "output": [ + ("第一章 绪论", 1), + ("第一节 研究背景", 2), + ("第二章 文献综述", 1), + ("第二节 研究方法", 2) + ] + } + ] +} +""" + + template_en = """ +{ + "instruction": "Please analyze the following outline list and unify the levels of titles according to these rules: +1. Similar types of titles should have the same level (e.g., all 'Chapter X' should be at the same level) +2. Level relationships should follow logic, e.g.: + - Chapter(1) > Section(2) > Article(3) + - Part(1) > Chapter(2) > Section(3) +3. Consider context relationships between titles to ensure level continuity +4. For titles without clear level indicators, infer appropriate levels based on content and context + +Please return the adjusted outline list in the format: +[(title1, level1), (title2, level2), ...] + +Input outline list: +$outlines", + "example": [ + { + "input": [ + ("Chapter 1 Introduction", 2), + ("Section 1.1 Background", 1), + ("Chapter 2 Literature Review", 1), + ("Section 2.1 Methods", 2) + ], + "output": [ + ("Chapter 1 Introduction", 1), + ("Section 1.1 Background", 2), + ("Chapter 2 Literature Review", 1), + ("Section 2.1 Methods", 2) + ] + } + ] +} +""" + + def __init__(self, language: Optional[str] = "zh"): + super().__init__(language) + + @property + def template_variables(self) -> List[str]: + return ["outlines"] + + def parse_response(self, response: str, **kwargs): + if isinstance(response, str): + cleaned_data = response.strip("`python\n[] \n") + cleaned_data = "[" + cleaned_data + "]" + return ast.literal_eval(cleaned_data) + if isinstance(response, dict) and "output" in response: + return response["output"] + return response diff --git a/kag/builder/prompt/outline_prompt.py b/kag/builder/prompt/outline_prompt.py index f7911a69..01cc299f 100644 --- a/kag/builder/prompt/outline_prompt.py +++ b/kag/builder/prompt/outline_prompt.py @@ -10,74 +10,43 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -import json from typing import Optional, List -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC +import ast -class OutlinePrompt(PromptOp): +@PromptABC.register("outline") +class OutlinePrompt(PromptABC): template_zh = """ { - "instruction": "\n请理解input字段中的文本内容,识别文本的结构和组成部分,并帮我提取出以下内容的标题,可能有多个标题分散在文本的各个地方,仅返属于原文的回标题文本即可,不要返回其他任何内容,须按照python list的格式回答,具体形式请遵从example字段中给出的若干例子。", + "instruction": "\n给定一段纯文本内容,请提取其中的标题,并返回一个列表。每个标题应包含以下信息:\n- 标题文本\n- 标题级别(例如 1 表示一级标题,2 表示二级标题等)\n\n假设标题遵循以下规则:\n1. 标题通常带有数字,我们的文本可能是从一些图片OCR生成的,所以标题可能隐藏在段落中,尽可能找出这些隐藏在段落中带有数字的标题\n2. 
标题的级别可以通过以下方式推断:\n - 一级标题:通常是篇章级别的内容。\n - 二级标题:通常是章节级别的内容,具有简洁的文字描述,有时以 \"第X部分\"、\"第X章\"、\"Part X\" 等类似形式开头。\n - 三级标题及以下:通常是段落或细节级别的标题,可能包含数字编号(如\"1.\"或\"1.1\"),或者较长且具体的描述(如\"1.1 子标题\"或\"第1节 概述\")。\n3. 标题的级别也可以通过上下文判断:\n - 如果两个标题之间的文本内容非常短(例如少于一定字数),后面的标题可能是更高或相同级别的标题。\n - 连续编号的标题(如“第1条”“第2条”)通常属于同一级别。\n - 标题层级通常由其数字层次决定,例如“1”“1.1”“1.1.1”依次为 1 级、2 级、3 级。\n - 如果一个标题包含关键词如“部分”“章”“节”“条”,且其长度适中(例如 5 至 20 个字符),该标题的级别往往比更长或更短的标题要高。\n4. 以下标题可以直接忽略:\n - 含有纯数字或仅由数字和标点组成的标题(例如“1.”、“2.1”等)。\n - 重复出现的标题(例如页眉或页脚被误识别为标题的情况)。\n5. 如果某些内容无法明确判断为标题,或者不符合上述规则,请忽略。\n\n请根据上述规则,返回一个包含标题和对应级别的列表,格式如下:\n[\n (\"标题文本1\", 1),\n (\"标题文本2\", 2),\n (\"标题文本3\", 3),\n ...\n],我还会给你提供之前内容抽取出的目录current_outlines,你需要根据当前已经抽取的目录,自行判断抽取标题的粒度以及对应的等级", "input": "$input", + "current_outline:": "$current_outline", "example": [ { - "input": "第8条 原 则 - -1.各成员方在制订或修正其法律和规章时,可采取必要措施以保护公众健康和营养,并促进对其社会经济和技术发展至关重要部门的公众利益,只要该措施符合本协议规定。 - -2.可能需要采取与本协议的规定相一致的适当的措施,以防止知识产权所有者滥用知识产权或藉以对贸易进行不合理限制或实行对国际间的技术转让产生不利影响的作法。 - -第二部分 关于知识产权的效力、范围及使用的标准 - -第1节 版权及相关权利 - -第9条 与《伯尔尼公约》的关系", + "input": "第8条 原 则\n\n1.各成员方在制订或修正其法律和规章时,可采取必要措施以保护公众健康和营养,并促进对其社会经济和技术发展至关重要部门的公众利益,只要该措施符合本协议规定。\n\n2.可能需要采取与本协议的规定相一致的适当的措施,以防止知识产权所有者滥用知识产权或藉以对贸易进行不合理限制或实行对国际间的技术转让产生不利影响的作法。\n\n第二部分 关于知识产权的效力、范围及使用的标准\n\n第1节 版权及相关权利\n\n第9条 与《伯尔尼公约》的关系", "output": [ - "第8条 原 则", - "第二部分 关于知识产权的效力、范围及使用的标准", - "第1节 版权及相关权利", - "第9条 与《伯尔尼公约》的关系" - ], + ("第8条 原 则",3), + ("第二部分 关于知识产权的效力、范围及使用的标准",1), + ("第1节 版权及相关权利",2), + ("第9条 与《伯尔尼公约》的关系",3) + ] }, { - "input": "第16条 授予权利 - -1.已注册商标所有者应拥有阻止所有未经其同意的第三方在贸易中使用与已注册商标相同或相似的商品或服务的,其使用有可能招致混淆的相同或相似的标志。在对相同商品或服务使用相同标志的情况下,应推定存在混淆之可能。上述权利不应妨碍任何现行的优先权,也不应影响各成员方以使用为条件获得注册权的可能性。 - -2.1967《巴黎公约》第6条副则经对细节作必要修改后应适用于服务。在确定一个商标是否为知名商标时,各成员方应考虑到有关部分的公众对该商标的了解,包括由于该商标的推行而在有关成员方得到的了解。 - -3.1967《巴黎公约》第6条副则经对细节作必要修改后应适用于与已注册商标的商品和服务不相似的商品或服务,条件是该商标与该商品和服务有关的使用会表明该商品或服务与已注册商标所有者之间的联系,而且已注册商标所有者的利益有可能为此种使用所破坏。 - -第17条 例 外\n ", + "input": "第16条 授予权利\n\n1.已注册商标所有者应拥有阻止所有未经其同意的第三方在贸易中使用与已注册商标相同或相似的商品或服务的,其使用有可能招致混淆的相同或相似的标志。在对相同商品或服务使用相同标志的情况下,应推定存在混淆之可能。上述权利不应妨碍任何现行的优先权,也不应影响各成员方以使用为条件获得注册权的可能性。\n\n2.1967《巴黎公约》第6条副则经对细节作必要修改后应适用于服务。在确定一个商标是否为知名商标时,各成员方应考虑到有关部分的公众对该商标的了解,包括由于该商标的推行而在有关成员方得到的了解。\n\n3.1967《巴黎公约》第6条副则经对细节作必要修改后应适用于与已注册商标的商品和服务不相似的商品或服务,条件是该商标与该商品和服务有关的使用会表明该商品或服务与已注册商标所有者之间的联系,而且已注册商标所有者的利益有可能为此种使用所破坏。\n\n第17条 例 外\n ", "output": [ - "第16条 授予权利", - "第17条 例 外" - ], + ("第16条 授予权利",3), + ("第17条 例 外",3) + ] }, { - "input":"的做法。 - -(4)此类使用应是非独占性的。 - -(5)此类使用应是不可转让的,除非是同享有此类使用的那部分企业或信誉一道转让。 - -(6)任何此类使用之授权,均应主要是为授权此类使用的成员方国内市场供应之目的。 - -(7)在被授权人的合法利益受到充分保护的条件下,当导致此类使用授权的情况下不复存在和可能不再产生时,有义务将其终止;应有动机的请求,主管当局应有权对上述情况的继续存在进行检查。 - -(8)考虑到授权的经济价值,应视具体情况向权利人支付充分的补偿金。 - -(9)任何与此类使用之授权有关的决定,其法律效力应接受该成员方境内更高当局的司法审查或其他独立审查。 - -(10)任何与为此类使用而提供的补偿金有关的决定,应接受成员方境内更高当局的司法审查或其他独立审查。 -", - "output": [], - }, + "input": "的做法。\n\n(4)此类使用应是非独占性的。\n\n(5)此类使用应是不可转让的,除非是同享有此类使用的那部分企业或信誉一道转让。\n\n(6)任何此类使用之授权,均应主要是为授权此类使用的成员方国内市场供应之目的。\n\n(7)在被授权人的合法利益受到充分保护的条件下,当导致此类使用授权的情况下不复存在和可能不再产生时,有义务将其终止;应有动机的请求,主管当局应有权对上述情况的继续存在进行检查。\n\n(8)考虑到授权的经济价值,应视具体情况向权利人支付充分的补偿金。\n\n(9)任何与此类使用之授权有关的决定,其法律效力应接受该成员方境内更高当局的司法审查或其他独立审查。\n\n(10)任何与为此类使用而提供的补偿金有关的决定,应接受成员方境内更高当局的司法审查或其他独立审查。\n", + "output": [] + } ] -} - """ +} +""" template_en = """ { @@ -147,11 +116,16 @@ def __init__(self, language: Optional[str] = "zh"): @property def template_variables(self) -> List[str]: - return ["input"] + return ["input", "current_outline"] def parse_response(self, response: str, **kwargs): if isinstance(response, str): - response = json.loads(response) + cleaned_data = response.strip("`python\n[] \n") # 
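strip Markdown code fences and extra whitespace
+        # Illustrative of what the cleanup expects: an LLM reply such as
+        #   ```python
+        #   ("第16条 授予权利", 3),
+        #   ("第17条 例 外", 3)
+        #   ```
+        # is reduced to the bare tuples, re-wrapped in brackets below, and
+        # parsed into [("第16条 授予权利", 3), ("第17条 例 外", 3)].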
+        cleaned_data = "[" + cleaned_data + "]"  # restore the list format
+
+        # use ast.literal_eval to convert the string into an actual list object
+        list_data = ast.literal_eval(cleaned_data)
+        return list_data
 
         if isinstance(response, dict) and "output" in response:
             response = response["output"]
 
diff --git a/kag/builder/prompt/semantic_seg_prompt.py b/kag/builder/prompt/semantic_seg_prompt.py
index fb09b564..9399cda6 100644
--- a/kag/builder/prompt/semantic_seg_prompt.py
+++ b/kag/builder/prompt/semantic_seg_prompt.py
@@ -11,12 +11,13 @@
 # or implied.
 
 import json
-from typing import Optional, List
+from typing import List
 
-from kag.common.base.prompt_op import PromptOp
+from kag.interface import PromptABC
 
 
-class SemanticSegPrompt(PromptOp):
+@PromptABC.register("semantic_seg")
+class SemanticSegPrompt(PromptABC):
     template_zh = """
 {
     "instruction": "\n请理解input字段中的文本内容,识别文本的结构和组成部分,并按照语义主题确定分割点,将其切分成互不重叠的若干小节。如果文章有章节等可识别的结构信息,请直接按照顶层结构进行切分。\n请按照schema定义的字段返回,包含小节摘要和小节起始点。须按照JSON字符串的格式回答。具体形式请遵从example字段中给出的若干例子。",
@@ -111,9 +112,6 @@ class SemanticSegPrompt(PromptOp):
 }
     """
 
-    def __init__(self, language: Optional[str] = "zh"):
-        super().__init__(language)
-
     @property
     def template_variables(self) -> List[str]:
         return ["input"]
diff --git a/kag/builder/prompt/spg_prompt.py b/kag/builder/prompt/spg_prompt.py
index f14678de..a094f1ea 100644
--- a/kag/builder/prompt/spg_prompt.py
+++ b/kag/builder/prompt/spg_prompt.py
@@ -12,244 +12,589 @@
 
 import json
 import logging
-from abc import ABC
+import copy
 from typing import List, Dict
 
-from kag.common.base.prompt_op import PromptOp
+from kag.interface import PromptABC
 from knext.schema.client import SchemaClient
-from knext.schema.model.base import BaseSpgType, SpgTypeEnum
+from knext.schema.model.base import SpgTypeEnum, ConstraintTypeEnum
 from knext.schema.model.schema_helper import SPGTypeName
 from kag.builder.model.spg_record import SPGRecord
+from kag.common.conf import KAG_PROJECT_CONF
+from knext.schema.client import OTHER_TYPE
 
 logger = logging.getLogger(__name__)
 
 
-class SPGPrompt(PromptOp, ABC):
-    spg_types: Dict[str, BaseSpgType]
+class SPGPrompt(PromptABC):
+    """
+    Base class for generating SPG schema-based entity/event extraction prompts.
+
+    Attributes:
+        ignored_types (List[str]): List of SPG types to be ignored.
+        ignored_properties (List[str]): List of properties to be ignored.
+        default_properties (Dict[str, str]): Default properties for SPG types.
+        ignored_relations (List[str]): List of relations to be ignored.
+    """
+
     ignored_types: List[str] = ["Chunk"]
-    ignored_properties: List[str] = ["id", "name", "description", "stdId", "eventTime", "desc", "semanticType"]
+    ignored_properties: List[str] = [
+        "id",
+        "stdId",
+        "desc",
+        "description",
+        "eventTime",
+    ]
+    default_properties: Dict[str, str] = {
+        "name": "Text",
+    }
+
     ignored_relations: List[str] = ["isA"]
-    basic_types = {"Text": "文本", "Integer": "整型", "Float": "浮点型"}
 
     def __init__(
         self,
-        spg_type_names: List[SPGTypeName],
-        language: str = "zh",
+        spg_type_names: List[SPGTypeName] = [],
+        language: str = "",
         **kwargs,
     ):
+        """
+        Initializes the SPGPrompt instance.
+
+        Args:
+            spg_type_names (List[SPGTypeName], optional): List of SPG type names. Defaults to [].
+            language (str, optional): Language for the prompt. Defaults to "".
+            **kwargs: Additional keyword arguments. 
+ """ super().__init__(language=language, **kwargs) - self.all_schema_types = SchemaClient(project_id=self.project_id).load() + self.schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() self.spg_type_names = spg_type_names if not spg_type_names: - self.spg_types = self.all_schema_types + self.spg_types = self.schema else: - self.spg_types = {k: v for k, v in self.all_schema_types.items() if k in spg_type_names} - self.schema_list = [] - - self._init_render_variables() + self.spg_types = { + k: v for k, v in self.schema.items() if k in spg_type_names + } + self.create_prompt_schema() + # self._init_render_variables() @property def template_variables(self) -> List[str]: + """ + Returns the list of template variables used in the prompt. + + Returns: + List[str]: List of template variables. + """ return ["schema", "input"] - def _init_render_variables(self): - self.type_en_to_zh = {"Text": "文本", "Integer": "整型", "Float": "浮点型"} - self.type_zh_to_en = { - "文本": "Text", - "整型": "Integer", - "浮点型": "Float", - } - self.prop_en_to_zh = {} - self.prop_zh_to_en = {} - for type_name, spg_type in self.all_schema_types.items(): - self.type_en_to_zh[type_name] = spg_type.name_zh - self.type_en_to_zh[spg_type.name_zh] = type_name - self.prop_zh_to_en[type_name] = {} - self.prop_en_to_zh[type_name] = {} - for _prop in spg_type.properties.values(): - if _prop.name in self.ignored_properties: + def get_accept_types(self): + """ + Returns the list of accepted SPG types. + + Returns: + List[SpgTypeEnum]: List of accepted SPG types. + """ + return [ + SpgTypeEnum.Entity, + SpgTypeEnum.Concept, + SpgTypeEnum.Event, + ] + + def build_prompt(self, variables: Dict[str, str]) -> str: + """ + Builds the prompt using the provided variables. + + Args: + variables (Dict[str, str]): Dictionary of variables to be used in the prompt. + + Returns: + str: The built prompt. + """ + return super().build_prompt( + { + "schema": copy.deepcopy(self.prompt_schema), + "input": variables.get("input"), + } + ) + + def process_property_name(self, name: str): + """ + Process property name by removing descriptions enclosed in parentheses. + Args: + name (dict): property names (possibly containing descriptions in parentheses) + + Returns: + str: A new string having the descriptions in parentheses removed. + + Example: + >>> name = 'authors(authors of work, such as director, actor, lyricist, composer and singer)' + >>> process_property_name(input_properties) + 'authors' + """ + + return name.split("(")[0] + + def process_property_names(self, properties: Dict): + """ + Process property names by removing descriptions enclosed in parentheses. + + This method iterates through the given dictionary of properties, removes any + descriptions enclosed in parentheses from the property names, and returns a new + dictionary with the processed names. If a property value is itself a dictionary, + this method will recursively process it. + + Args: + properties (dict): A dictionary where keys are property names (possibly containing + descriptions in parentheses) and values are either property values + or nested dictionaries. + + Returns: + dict: A new dictionary with the same structure as the input, but with all property + names having their descriptions in parentheses removed. + Example: + >>> input_properties = { + ... "authors(authors of work, such as director, actor, lyricist, composer and singer)": "John Doe" + ... 
} + >>> process_property_names(input_properties) + {'authors': 'John Doe'} + """ + output = {} + for k, v in properties.items(): + k = self.process_property_name(k) + if isinstance(v, dict): + output[k] = self.process_property_names(v) + else: + output[k] = v + return output + + def parse_response(self, response: str, **kwargs) -> List[SPGRecord]: + """ + Parses the response string into a list of SPG records. + + Args: + response (str): The response string to be parsed. + **kwargs: Additional keyword arguments. + + Returns: + List[SPGRecord]: List of parsed SPG records. + """ + rsp = response + if isinstance(rsp, str): + rsp = json.loads(rsp) + if isinstance(rsp, dict) and "output" in rsp: + rsp = rsp["output"] + outputs = [] + for item in rsp: + if "category" not in item or item["category"] not in self.schema: + continue + properties = item.get("properties", {}) + if "name" not in properties: + continue + output = {} + output["category"] = item["category"] + output["name"] = properties.pop("name") + output["properties"] = self.process_property_names(properties) + outputs.append(output) + return outputs + + def create_prompt_schema(self): + """ + Creates the schema for extraction prompt based on the project schema. + """ + prompt_schema = [] + accept_types = self.get_accept_types() + for type_name, spg_type in self.spg_types.items(): + if type_name in self.ignored_types: + continue + if spg_type.spg_type_enum not in accept_types: + continue + type_desc = spg_type.desc + properties = copy.deepcopy(self.default_properties) + for k, v in spg_type.properties.items(): + if k in self.ignored_properties or k in self.default_properties: continue - self.prop_en_to_zh[type_name][_prop.name] = _prop.name_zh - self.prop_zh_to_en[type_name][_prop.name_zh] = _prop.name - for _rel in spg_type.relations.values(): - if _rel.is_dynamic: + multi_value = ConstraintTypeEnum.MultiValue.value in v.constraint + obj_type_name = v.object_type_name.split(".")[-1] + if multi_value: + obj_type_name = f"List[{obj_type_name}]" + if v.desc: + v_name = f"{v.name}({v.desc})" + else: + v_name = v.name + properties[v_name] = obj_type_name + + for k, v in spg_type.relations.items(): + if k in self.ignored_relations or k in self.default_properties: + continue + if v.name in properties: continue - self.prop_en_to_zh[type_name][_rel.name] = _rel.name_zh - self.prop_zh_to_en[type_name][_rel.name_zh] = _rel.name + obj_type_name = v.object_type_name.split(".")[-1] + if v.desc: + v_name = f"{v.name}({v.desc})" + else: + v_name = v.name + properties[v_name] = obj_type_name + + if type_desc: + prompt_schema.append( + {f"{type_name}({type_desc})": {"properties": properties}} + ) + else: + prompt_schema.append({type_name: {"properties": properties}}) - def _render(self): - raise NotImplementedError + self.prompt_schema = prompt_schema -class SPG_KGPrompt(SPGPrompt): - template_zh: str = """ - { - "instruction": "你是一个图谱知识抽取的专家, 基于constraint 定义的schema,从input 中抽取出所有的实体及其属性,input中未明确提及的属性返回NAN,以标准json 格式输出,结果返回list", - "schema": $schema, +@PromptABC.register("spg_entity") +class SPGEntityPrompt(SPGPrompt): + template_zh: dict = { + "instruction": "作为一个图谱知识抽取的专家, 你需要基于定义了实体类型及对应属性的schema,从input字段的文本中抽取出所有的实体及其属性,schema中标记为List的属性返回list,未能提取的属性返回null。以标准json list格式输出,list中每个元素形如{category: properties},你可以参考example字段中给出的示例格式。注意实体属性的SemanticType指的是一个相比实体类型更具体且明确定义的类型,例如Person类型的SemanticType可以是Professor或Actor。", "example": [ - { - "input": 
"甲状腺结节是指在甲状腺内的肿块,可随吞咽动作随甲状腺而上下移动,是临床常见的病症,可由多种病因引起。临床上有多种甲状腺疾病,如甲状腺退行性变、炎症、自身免疫以及新生物等都可以表现为结节。甲状腺结节可以单发,也可以多发,多发结节比单发结节的发病率高,但单发结节甲状腺癌的发生率较高。患者通常可以选择在普外科,甲状腺外科,内分泌科,头颈外科挂号就诊。有些患者可以触摸到自己颈部前方的结节。在大多情况下,甲状腺结节没有任何症状,甲状腺功能也是正常的。甲状腺结节进展为其它甲状腺疾病的概率只有1%。有些人会感觉到颈部疼痛、咽喉部异物感,或者存在压迫感。当甲状腺结节发生囊内自发性出血时,疼痛感会更加强烈。治疗方面,一般情况下可以用放射性碘治疗,复方碘口服液(Lugol液)等,或者服用抗甲状腺药物来抑制甲状腺激素的分泌。目前常用的抗甲状腺药物是硫脲类化合物,包括硫氧嘧啶类的丙基硫氧嘧啶(PTU)和甲基硫氧嘧啶(MTU)及咪唑类的甲硫咪唑和卡比马唑。", - "schema": { - "Disease": { - "properties": { - "complication": "并发症", - "commonSymptom": "常见症状", - "applicableMedicine": "适用药品", - "department": "就诊科室", - "diseaseSite": "发病部位", - } - },"Medicine": { - "properties": { - } - } + { + "input": "周杰伦(Jay Chou),1979年1月18日出生于台湾省新北市,祖籍福建省永春县,华语流行乐男歌手、音乐人、演员、导演、编剧,毕业于淡江中学。2000年,发行个人首张音乐专辑《Jay》 [26]。2023年凭借《最伟大的作品》获得第一届浪潮音乐大赏年度制作、最佳作曲、最佳音乐录影带三项大奖。", + "output": [ + { + "category": "Person", + "properties": { + "name": "周杰伦", + "semanticType": "Musician", + "description": "华语流行乐男歌手、音乐人、演员、导演、编剧", + }, + }, + { + "category": "GeographicLocation", + "properties": { + "name": "台湾省新北市", + "semanticType": "City", + "description": "周杰伦的出生地", + }, + }, + { + "category": "GeographicLocation", + "properties": { + "name": "福建省永春县", + "semanticType": "County", + "description": "周杰伦的祖籍", + }, + }, + { + "category": "Organization", + "properties": { + "name": "淡江中学", + "semanticType": "School", + "description": "周杰伦的毕业学校", + }, + }, + { + "category": "Works", + "properties": { + "name": "Jay", + "semanticType": "Album", + "description": "周杰伦的个人首张音乐专辑", + }, + }, + { + "category": "Works", + "properties": { + "name": "最伟大的作品", + "semanticType": "MusicVideo", + "description": "周杰伦凭借此作品获得多项音乐大奖", + }, + }, + ], } - "output": [ - { - "entity": "甲状腺结节", - "category":"Disease" - "properties": { - "complication": "甲状腺癌", - "commonSymptom": ["颈部疼痛", "咽喉部异物感", "压迫感"], - "applicableMedicine": ["复方碘口服液(Lugol液)", "丙基硫氧嘧啶(PTU)", "甲基硫氧嘧啶(MTU)", "甲硫咪唑", "卡比马唑"], - "department": ["普外科", "甲状腺外科", "内分泌科", "头颈外科"], - "diseaseSite": "甲状腺", - } - },{ - "entity":"复方碘口服液(Lugol液)", - "category":"Medicine" - },{ - "entity":"丙基硫氧嘧啶(PTU)", - "category":"Medicine" - },{ - "entity":"甲基硫氧嘧啶(MTU)", - "category":"Medicine" - },{ - "entity":"甲硫咪唑", - "category":"Medicine" - },{ - "entity":"卡比马唑", - "category":"Medicine" - } - ], - "input": "$input" + ], } - """ - template_en: str = """ - { - "instruction": "You are an expert in knowledge graph extraction. Based on the schema defined by constraints, extract all entities and their attributes from the input. For attributes not explicitly mentioned in the input, return NAN. Output the results in standard JSON format as a list.", - "schema": $schema, + template_en: dict = { + "instruction": "As an expert in graph knowledge extraction, you need to extract all entities and their properties from the text in the input field based on a schema that defines entity types and their corresponding attributes. Attributes marked as List in the schema should return a list, and attributes not extracted should return null. Output the results in a standard JSON list format, where each element in the list is in the form of {category: properties}. You can refer to the example format provided in the example field. 
Note that the SemanticType of an entity attribute refers to a more specific and clearly defined type compared to the entity type itself, such as Professor or Actor for the Person type.",
+        "example": [
+            {
+                "input": "Jay Chou, born on January 18, 1979, in New Taipei City, Taiwan Province, with ancestral roots in Yongchun County, Fujian Province, is a renowned male singer, musician, actor, director, and screenwriter in the realm of Chinese pop music. He graduated from Tamkang University. In 2000, he released his debut solo album, Jay [26]. 
In 2023, he was honored with three major awards at the inaugural Wave Music Awards for Best Production, Best Composition, and Best Music Video for his album The Greatest Work.", + "output": [ + { + "category": "Person", + "properties": { + "name": "Jay Chou", + "semanticType": "Musician", + "description": "renowned male singer, musician, actor, director, and screenwriter in the realm of Chinese pop music", + }, + }, + { + "category": "GeographicLocation", + "properties": { + "name": "New Taipei City, Taiwan Province", + "semanticType": "City", + "description": "Jay Chou's birthplace", + }, + }, + { + "category": "GeographicLocation", + "properties": { + "name": "Yongchun County, Fujian Province", + "semanticType": "County", + "description": "Jay Chou's ancestral roots", + }, + }, + { + "category": "Organization", + "properties": { + "name": "Tamkang University", + "semanticType": "University", + "description": "Jay Chou's alma mater", + }, + }, + { + "category": "Works", + "properties": { + "name": "Jay", + "semanticType": "Album", + "description": "Jay Chou's debut solo album", + }, + }, + { + "category": "Works", + "properties": { + "name": "The Greatest Work", + "semanticType": "Album", + "description": "Jay Chou's album for which he won multiple awards", + }, + }, + ], } + ], + } + + def get_accept_types(self): + return [ + SpgTypeEnum.Entity, + SpgTypeEnum.Concept, + ] + + +@PromptABC.register("spg_event") +class SPGEventPrompt(SPGPrompt): + template_zh: dict = { + "instruction": "作为一个知识图谱图谱事件抽取的专家, 你需要基于定义的事件类型及对应属性的schema,从input字段的文本中抽取出所有的事件及其属性,schema中标记为List的属性返回list,未能提取的属性返回null。以标准json list格式输出,list中每个元素形如{category: properties},你可以参考example字段中给出的示例格式。", + "example": { + "input": "1986年,周星驰被调入无线电视台戏剧组;同年,他在单元情景剧《哥哥的女友》中饰演可爱活泼又略带羞涩的潘家伟,这也是他第一次在情景剧中担任男主角;之后,他还在温兆伦、郭晋安等人主演的电视剧中跑龙套。", "output": [ { - "entity": "Thyroid Nodule", - "category": "Disease", + "category": "Event", + "properties": { + "name": "周星驰被调入无线电视台戏剧组", + "abstract": "1986年,周星驰被调入无线电视台戏剧组。", + "subject": "周星驰", + "time": "1986年", + "location": "无线电视台", + "participants": [], + "semanticType": "调动", + }, + }, + { + "category": "Event", "properties": { - "complication": "Thyroid Cancer", - "commonSymptom": ["Neck Pain", "Foreign Body Sensation in the Throat", "Feeling of Pressure"], - "applicableMedicine": ["Lugol's Solution (Compound Iodine Oral Solution)", "Propylthiouracil (PTU)", "Methylthiouracil (MTU)", "Methimazole", "Carbimazole"],\n "department": ["General Surgery", "Thyroid Surgery", "Endocrinology", "Head and Neck Surgery"],\n "diseaseSite": "Thyroid"\n }\n },\n {\n "entity": "Lugol's Solution (Compound Iodine Oral Solution)", - "category": "Medicine" + "name": "周星驰在《哥哥的女友》中饰演潘家伟", + "abstract": "1986年,周星驰在单元情景剧《哥哥的女友》中饰演可爱活泼又略带羞涩的潘家伟,这也是他第一次在情景剧中担任男主角。", + "subject": "周星驰", + "time": "1986年", + "location": None, + "participants": [], + "semanticType": "演出", + }, }, { - "entity": "Propylthiouracil (PTU)", - "category": "Medicine" + "category": "Event", + "properties": { + "name": "周星驰跑龙套", + "abstract": "1986年,周星驰在温兆伦、郭晋安等人主演的电视剧中跑龙套。", + "subject": "周星驰", + "time": "1986年", + "location": None, + "participants": ["温兆伦", "郭晋安"], + "semanticType": "演出", + }, }, + ], + }, + } + + template_en: dict = { + "instruction": "As an expert in knowledge graph event extraction, you need to extract all events and their attributes from the text in the input field based on the defined event types and corresponding attribute schema. 
For attributes marked as List in the schema, return them as a list, and for attributes that cannot be extracted, return null. Output in the standard JSON list format, with each element in the list having the form {category: properties}. You can refer to the example format provided in the example field.",
+        "example": {
+            "input": "In 1986, Stephen Chow was transferred to the drama department of Television Broadcasts Limited (TVB). In the same year, he played the role of Pan Jiawei, a lovable, lively, and slightly shy character, in the episodic situational comedy My Brother's Girlfriend. This was his first time taking on a lead role in a sitcom. Later, he also had minor roles in TV series starring actors such as Deric Wan and Roger Kwok.",
+            "output": [
+                {
+                    "category": "Event",
+                    "properties": {
+                        "name": "Stephen Chow was transferred to the drama department of TVB",
+                        "abstract": "In 1986, Stephen Chow was transferred to the drama department of Television Broadcasts Limited (TVB).",
+                        "subject": "Stephen Chow",
+                        "time": "1986",
+                        "location": "Television Broadcasts Limited (TVB)",
+                        "participants": [],
+                        "semanticType": "调动",
+                    },
+                },
+                {
+                    "category": "Event",
+                    "properties": {
+                        "name": "Stephen Chow played Pan Jiawei in My Brother's Girlfriend",
+                        "abstract": "In 1986, Stephen Chow played the role of Pan Jiawei, a lovable, lively, and slightly shy character, in the episodic situational comedy My Brother's Girlfriend. This was his first time taking on a lead role in a sitcom.",
+                        "subject": "Stephen Chow",
+                        "time": "1986",
+                        "location": None,
+                        "participants": [],
+                        "semanticType": "演出",
+                    },
+                },
+                {
+                    "category": "Event",
+                    "properties": {
+                        "name": "Stephen Chow had minor roles in TV series",
+                        "abstract": "Later, Stephen Chow also had minor roles in TV series starring actors such as Deric Wan and Roger Kwok.",
+                        "subject": "Stephen Chow",
+                        "time": None,
+                        "location": None,
+                        "participants": ["Deric Wan", "Roger Kwok"],
+                        "semanticType": "演出",
+                    },
+                },
+            ],
+        },
+    }
+
+    def get_accept_types(self):
+        return [
+            SpgTypeEnum.Event,
+        ]
+
+
+@PromptABC.register("spg_relation")
+class SPGRelationPrompt(SPGPrompt):
+    template_zh: dict = {
+        "instruction": "您是一位专门从事开放信息提取(OpenIE)的专家。schema定义了你需要关注的实体类型以及可选的用括号包围的类型解释,entity_list是一组实体列表。请从input字段的文本中提取任何可能的[主语实体,主语实体类型,谓语,宾语实体,宾语实体类型]五元组,并按照JSON列表格式列出它们。请严格遵循以下要求:\n1. 主语实体和宾语实体应至少有一个包含在entity_list实体列表,但不要求都包含\n2. 主语和宾语实体类型必须是schema定义的类型,否则无效,\n3. 
明确地将代词解析为对应名称,以保持清晰度。", + "example": { + "input": "1986年,周星驰被调入无线电视台戏剧组;同年,他在单元情景剧《哥哥的女友》中饰演可爱活泼又略带羞涩的潘家伟,这也是他第一次在情景剧中担任男主角;之后,他还在温兆伦、郭晋安等人主演的电视剧中跑龙套。", + "entity_list": [ + {"name": "周星驰", "category": "Person"}, + {"name": "无线电视台", "category": "Organization"}, + {"name": "哥哥的女友", "category": "Works"}, + {"name": "潘家伟", "category": "Person"}, + {"name": "温兆伦", "category": "Person"}, + {"name": "郭晋安", "category": "Person"}, + ], + "output": [ + ["周星驰", "Person", "被调入", "无线电视台", "Organization"], + ["周星驰", "Person", "出演", "哥哥的女朋友", "Works"], + ["周星驰", "Person", "饰演", "潘家伟", "Person"], + ["周星驰", "Person", "共演", "温兆伦", "Person"], + ["周星驰", "Person", "共演", "郭晋安", "Person"], + [ + "周星驰", + "Person", + "跑龙套", + "温兆伦、郭晋安等人主演的电视剧", + "Works", + ], + ], + }, + } + + template_en: dict = { + "instruction": "You are an expert in Open Information Extraction (OpenIE). The schema defines the entity types you need to focus on, along with optional type explanations enclosed in parentheses. The entity_list is a set of entity lists. Please extract any possible [subject entity, subject entity class type, predicate, object entity, object entity type] quintuples from the text in the input field and list them in JSON list format. Please adhere strictly to the following requirements:1. At least one of the subject entity and object entity must appear in the entity_list.\n2. The subject and object entity types must be defined in the schema; otherwise, they are considered invalid.\n3.Resolve pronouns to their corresponding names explicitly to maintain clarity.", + "example": { + "input": "In 1986, Stephen Chow was transferred to the drama division of TVB; that same year, he played the cute, lively, and slightly shy Pan Jiawei in the situational drama 'My Brother's Girlfriend,' which was also his first time as the male lead in a situational drama; later, he also appeared as an extra in TV dramas starring Deric Wan, Roger Kwok, and others.", + "entity_list": [ + {"name": "Stephen Chow", "category": "Person"}, + {"name": "TVB", "category": "Organization"}, + {"name": "My Brother's Girlfriend", "category": "Works"}, + {"name": "Pan Jiawei", "category": "Person"}, + {"name": "Deric Wan", "category": "Person"}, + {"name": "Roger Kwok", "category": "Person"}, + ], + "output": [ + ["Stephen Chow", "Person", "was transferred to", "TVB", "Organization"], + [ + "Stephen Chow", + "Person", + "starred in", + "My Brother's Girlfriend", + "Works", + ], + ["Stephen Chow", "Person", "played", "Pan Jiawei", "Person"], + ["Stephen Chow", "Person", "co-starred with", "Deric Wan", "Person"], + ["Stephen Chow", "Person", "co-starred with", "Roger Kwok", "Person"], + [ + "Stephen Chow", + "Person", + "appeared as an extra in", + "TV dramas starring Deric Wan, Roger Kwok, and others", + "Works", + ], + ], + }, + } + + def get_accept_types(self): + """ + Returns the list of accepted SPG types. + + Returns: + List[SpgTypeEnum]: List of accepted SPG types. + """ + return [ + SpgTypeEnum.Entity, + SpgTypeEnum.Concept, + ] def build_prompt(self, variables: Dict[str, str]) -> str: - schema = {} - for tmpSchema in self.schema_list: - schema.update(tmpSchema) + """ + Builds the prompt using the provided variables. + + Args: + variables (Dict[str, str]): Dictionary of variables to be used in the prompt. - return super().build_prompt({"schema": schema, "input": variables.get("input")}) + Returns: + str: The built prompt. 
+ """ + schema = [] + for item in self.prompt_schema: + schema.extend(item.keys()) + return super().build_prompt( + { + "schema": schema, + "input": variables.get("input"), + } + ) def parse_response(self, response: str, **kwargs) -> List[SPGRecord]: + """ + Parses the response string into a list of SPG records. + + Args: + response (str): The response string to be parsed. + **kwargs: Additional keyword arguments. + + Returns: + List[SPGRecord]: List of parsed SPG records. + """ rsp = response if isinstance(rsp, str): rsp = json.loads(rsp) if isinstance(rsp, dict) and "output" in rsp: rsp = rsp["output"] - if isinstance(rsp, dict) and "named_entities" in rsp: - entities = rsp["named_entities"] - else: - entities = rsp - - return entities - - def _render(self): - spo_list = [] - for type_name, spg_type in self.spg_types.items(): - if spg_type.spg_type_enum not in [SpgTypeEnum.Entity, SpgTypeEnum.Concept, SpgTypeEnum.Event]: + outputs = [] + for item in rsp: + if len(item) != 5: continue - constraint = {} - properties = {} - properties.update( - { - v.name: (f"{v.name_zh}" if not v.desc else f"{v.name_zh},{v.desc}") if self.language == "zh" else (f"{v.name}" if not v.desc else f"{v.name}, {v.desc}") - for k, v in spg_type.properties.items() - if k not in self.ignored_properties - } - ) - properties.update( - { - f"{v.name}#{v.object_type_name_en}": ( - f"{v.name_zh},类型是{v.object_type_name_zh}" - if not v.desc - else f"{v.name_zh},{v.desc},类型是{v.object_type_name_zh}" - ) if self.language == "zh" else ( - f"{v.name}, the type is {v.object_type_name_en}" - if not v.desc - else f"{v.name},{v.desc}, the type is {v.object_type_name_en}" - ) - for k, v in spg_type.relations.items() - if not v.is_dynamic and k not in self.ignored_relations - } - ) - constraint.update({"properties": properties}) - spo_list.append({type_name: constraint}) - - self.schema_list = spo_list + s_name, s_label, predicate, o_name, o_label = item + s_label = self.process_property_name(s_label) + o_label = self.process_property_name(o_label) + # force convert to OTHER_TYPE or just drop it? + if s_label not in self.schema: + s_label = OTHER_TYPE + if o_label not in self.schema: + o_label = OTHER_TYPE + outputs.append([s_name, s_label, predicate, o_name, o_label]) + return outputs diff --git a/kag/builder/component/extractor/user_defined_extractor.py b/kag/builder/prompt/utils.py similarity index 54% rename from kag/builder/component/extractor/user_defined_extractor.py rename to kag/builder/prompt/utils.py index ada267c2..79984001 100644 --- a/kag/builder/component/extractor/user_defined_extractor.py +++ b/kag/builder/prompt/utils.py @@ -10,20 +10,15 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
-from typing import Dict, List
+from kag.interface import PromptABC
 
-from knext.common.base.runnable import Input, Output
-from kag.interface.builder import ExtractorABC
 
+def init_prompt_with_fallback(prompt_name, biz_scene):
+    try:
+        return PromptABC.from_config({"type": f"{biz_scene}_{prompt_name}"})
+    except Exception as e:
+        print(
+            f"failed to initialize prompt with biz scene {biz_scene}, falling back to the default biz scene, info: {e}"
+        )
 
-class UserDefinedExtractor(ExtractorABC):
-    @property
-    def input_types(self) -> Input:
-        return Dict[str, str]
-
-    @property
-    def output_types(self) -> Output:
-        return Dict[str, str]
-
-    def invoke(self, input: Input, **kwargs) -> List[Output]:
-        return input
+    return PromptABC.from_config({"type": f"default_{prompt_name}"})
diff --git a/kag/builder/runner.py b/kag/builder/runner.py
new file mode 100644
index 00000000..c1420d85
--- /dev/null
+++ b/kag/builder/runner.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+# Copyright 2023 OpenSPG Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied.
+
+
+import os
+import traceback
+import logging
+import threading
+from typing import Dict
+from tqdm import tqdm
+
+from kag.common.conf import KAG_PROJECT_CONF
+from kag.common.registry import Registrable
+from kag.common.utils import reset, bold, red, generate_hash_id
+from kag.common.checkpointer import CheckpointerManager
+from kag.interface import KAGBuilderChain, ScannerABC
+
+from kag.builder.model.sub_graph import SubGraph
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+logger = logging.getLogger()
+
+
+def str_abstract(value: str):
+    """
+    Abstracts a string value by returning the base name if it is a file path, or the first 10 characters otherwise.
+
+    Args:
+        value (str): The string value to be abstracted.
+
+    Returns:
+        str: The abstracted string value.
+    """
+    if os.path.exists(value):
+        return os.path.basename(value)
+    return value[:10]
+
+
+def dict_abstract(value: Dict):
+    """
+    Abstracts each value in a dictionary by converting it to a string and then abstracting the string.
+
+    Args:
+        value (Dict): The dictionary to be abstracted.
+
+    Returns:
+        Dict: The abstracted dictionary.
+    """
+    output = {}
+    for k, v in value.items():
+        output[k] = str_abstract(str(v))
+    return output
+
+
+def generate_hash_id_and_abstract(value):
+    hash_id = generate_hash_id(value)
+    if isinstance(value, dict):
+        abstract = dict_abstract(value)
+    else:
+        abstract = str_abstract(value)
+    return hash_id, abstract
+
+
+class BuilderChainRunner(Registrable):
+    """
+    A class that manages the execution of a KAGBuilderChain with parallel processing and checkpointing.
+
+    This class provides methods to initialize the runner, process input data, and manage checkpoints for tracking processed data.
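+
+    Example (illustrative; the component configs are placeholders that must
+    match scanner/chain implementations registered in your project):
+
+        runner = BuilderChainRunner.from_config(
+            {
+                "scanner": {"type": "dir"},
+                "chain": {...},  # a registered KAGBuilderChain config
+            }
+        )
+        runner.invoke("./builder/data")
+    """
+
+    def __init__(
+        self,
+        scanner: ScannerABC,
+        chain: KAGBuilderChain,
+        num_chains: int = 2,
+        num_threads_per_chain: int = 8,
+    ):
+        """
+        Initializes the BuilderChainRunner instance.
+
+        Args:
+            scanner (ScannerABC): The source scanner to generate input data.
+            chain (KAGBuilderChain): The builder chain to process the input data. 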
+            num_chains (int, optional): The number of parallel threads to use, with each thread launching a builder chain instance. Defaults to 2.
+            num_threads_per_chain (int, optional): The number of parallel workers within a builder chain. Defaults to 8.
+
+        Note: the checkpoint directory is not a constructor argument; it is
+        read from KAG_PROJECT_CONF.ckpt_dir.
+        """
+        self.scanner = scanner
+        self.chain = chain
+        self.num_chains = num_chains
+        self.num_threads_per_chain = num_threads_per_chain
+        self.ckpt_dir = KAG_PROJECT_CONF.ckpt_dir
+
+        self.checkpointer = CheckpointerManager.get_checkpointer(
+            {
+                "type": "txt",
+                "ckpt_dir": self.ckpt_dir,
+                "rank": self.scanner.sharding_info.get_rank(),
+                "world_size": self.scanner.sharding_info.get_world_size(),
+            }
+        )
+        self.processed_chunks = CheckpointerManager.get_checkpointer(
+            {
+                "type": "zodb",
+                "ckpt_dir": os.path.join(self.ckpt_dir, "chain"),
+                "rank": self.scanner.sharding_info.get_rank(),
+                "world_size": self.scanner.sharding_info.get_world_size(),
+            }
+        )
+        self._local = threading.local()
+
+    def invoke(self, input):
+        """
+        Processes the input data using the builder chain in parallel and manages checkpoints.
+
+        Args:
+            input: The input data to be processed.
+        """
+
+        # def process(thread_local, chain_conf, data, data_id, data_abstract):
+        #     try:
+        #         if not hasattr(thread_local, "chain"):
+        #             if chain_conf:
+        #                 thread_local.chain = KAGBuilderChain.from_config(chain_conf)
+        #             else:
+        #                 thread_local.chain = self.chain
+        #         result = thread_local.chain.invoke(
+        #             data, max_workers=self.num_threads_per_chain
+        #         )
+        #         return data, data_id, data_abstract, result
+        #     except Exception:
+        #         traceback.print_exc()
+        #         return None
+
+        def process(data, data_id, data_abstract):
+            try:
+                result = self.chain.invoke(
+                    data,
+                    max_workers=self.num_threads_per_chain,
+                    processed_chunk_keys=self.processed_chunks.keys(),
+                )
+                return data, data_id, data_abstract, result
+            except Exception:
+                traceback.print_exc()
+                return None
+
+        futures = []
+        print(f"Processing {input}")
+        success = 0
+        try:
+            with ThreadPoolExecutor(self.num_chains) as executor:
+                for item in self.scanner.generate(input):
+                    item_id, item_abstract = generate_hash_id_and_abstract(item)
+                    if self.checkpointer.exists(item_id):
+                        continue
+                    fut = executor.submit(
+                        process,
+                        item,
+                        item_id,
+                        item_abstract,
+                    )
+                    futures.append(fut)
+
+                for future in tqdm(
+                    as_completed(futures),
+                    total=len(futures),
+                    desc="Progress",
+                    position=0,
+                ):
+                    result = future.result()
+                    if result is not None:
+                        item, item_id, item_abstract, chain_output = result
+                        num_nodes = 0
+                        num_edges = 0
+                        num_subgraphs = 0
+                        # chain output may mix SubGraphs and dicts mapping
+                        # processed chunk keys to SubGraphs.
+                        for output_item in chain_output:
+                            if isinstance(output_item, SubGraph):
+                                num_nodes += len(output_item.nodes)
+                                num_edges += len(output_item.edges)
+                                num_subgraphs += 1
+                            elif isinstance(output_item, dict):
+                                for k, v in output_item.items():
+                                    self.processed_chunks.write_to_ckpt(k, k)
+                                    if isinstance(v, SubGraph):
+                                        num_nodes += len(v.nodes)
+                                        num_edges += len(v.edges)
+                                        num_subgraphs += 1
+
+                        info = {
+                            "num_nodes": num_nodes,
+                            "num_edges": num_edges,
+                            "num_subgraphs": num_subgraphs,
+                        }
+                        self.checkpointer.write_to_ckpt(
+                            item_id, {"abstract": item_abstract, "graph_stat": info}
+                        )
+                        success += 1
+        except Exception:
+            traceback.print_exc()
+        CheckpointerManager.close()
+        msg = (
+            f"{bold}{red}Done processing {len(futures)} records, with {success} successfully processed and {len(futures)-success} failures encountered.\n"
+            f"The log file is located at {self.checkpointer._ckpt_file_path}. 
" + f"Please access this file to obtain detailed task statistics.{reset}" + ) + print(msg) + + +BuilderChainRunner.register("base", as_default=True)(BuilderChainRunner) diff --git a/kag/common/__init__.py b/kag/common/__init__.py index 123acd8d..93aa6cd4 100644 --- a/kag/common/__init__.py +++ b/kag/common/__init__.py @@ -9,4 +9,3 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. - diff --git a/kag/common/arks_pb2.py b/kag/common/arks_pb2.py index 0a693f00..01462624 100644 --- a/kag/common/arks_pb2.py +++ b/kag/common/arks_pb2.py @@ -6,191 +6,166 @@ # 参考文档: https://yuque.antfin-inc.com/ai-infra/ndhopc/smk38dcs9zqr1ssh#Kb7e0 import sys -_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) + +_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode("latin1")) from google.protobuf.internal import enum_type_wrapper from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message from google.protobuf import reflection as _reflection from google.protobuf import symbol_database as _symbol_database + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() - - DESCRIPTOR = _descriptor.FileDescriptor( - name='arks.proto', - package='arks', - syntax='proto2', - serialized_options=_b('\n\025com.alipay.arks.proto'), - serialized_pb=_b('\n\narks.proto\x12\x04\x61rks\"\xfc\x01\n\x13InferTensorContents\x12\x14\n\x0cstring_value\x18\x01 \x03(\t\x12\x12\n\nbool_value\x18\x02 \x03(\x08\x12\x11\n\tint_value\x18\x03 \x03(\x05\x12\x13\n\x0bint64_value\x18\x04 \x03(\x03\x12\x12\n\nuint_value\x18\x05 \x03(\r\x12\x14\n\x0cuint64_value\x18\x06 \x03(\x04\x12\x12\n\nfp32_value\x18\x07 \x03(\x02\x12\x12\n\nfp64_value\x18\x08 \x03(\x01\x12\x12\n\nbyte_value\x18\t \x03(\x0c\x12-\n\x04type\x18\n \x01(\x0e\x32\x11.arks.ContentType:\x0cTYPE_INVALID\"q\n\x04Pair\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\x12+\n\x08\x63ontents\x18\x03 \x01(\x0b\x32\x19.arks.InferTensorContents\x12\x10\n\x08pb_value\x18\x04 \x03(\x0c\x12\x0e\n\x06shapes\x18\x05 \x03(\x05\"\x97\x01\n\x06RowKey\x12\x0f\n\x07row_key\x18\x01 \x01(\t\x12\x10\n\x08versions\x18\x02 \x03(\x03\x12\x1a\n\x12\x61nt_fea_track_info\x18\x03 \x01(\t\x12\'\n\npartitions\x18\x04 \x03(\x0b\x32\x13.arks.PartitionInfo\x12%\n\x11realtime_features\x18\x05 \x03(\x0b\x32\n.arks.Pair\",\n\rPartitionInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\xb9\x01\n\x04Item\x12\x0f\n\x07item_id\x18\x01 \x02(\t\x12\x1c\n\x08\x66\x65\x61tures\x18\x02 \x03(\x0b\x32\n.arks.Pair\x12\x1e\n\nattributes\x18\x03 \x03(\x0b\x32\n.arks.Pair\x12\r\n\x05score\x18\x04 \x01(\x02\x12 \n\tsub_items\x18\x05 \x03(\x0b\x32\r.arks.SubItem\x12\x1d\n\x11is_features_valid\x18\x06 \x03(\x08\x42\x02\x10\x01\x12\x12\n\x06scores\x18\x07 \x03(\x02\x42\x02\x10\x01\"\x9a\x01\n\x07SubItem\x12\x0f\n\x07item_id\x18\x01 \x01(\t\x12\x1c\n\x08\x66\x65\x61tures\x18\x02 \x03(\x0b\x32\n.arks.Pair\x12\r\n\x05score\x18\x03 \x01(\x02\x12\x1d\n\x11is_features_valid\x18\x04 \x03(\x08\x42\x02\x10\x01\x12\x12\n\x06scores\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x1e\n\nattributes\x18\x06 \x03(\x0b\x32\n.arks.Pair\"\xc7\x03\n\x08SeekPlan\x12\x14\n\x0cstorage_type\x18\x01 \x01(\t\x12\r\n\x05table\x18\x02 \x01(\t\x12\x15\n\rcolumn_family\x18\x03 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x04 \x03(\t\x12\x12\n\nkvpair_sep\x18\x05 
\x01(\t\x12\x0e\n\x06kv_sep\x18\x06 \x01(\t\x12\x0f\n\x07\x63luster\x18\x07 \x01(\t\x12\x1e\n\x08row_keys\x18\x08 \x03(\x0b\x32\x0c.arks.RowKey\x12\x12\n\ntimeout_ms\x18\t \x01(\x05\x12\x1b\n\x13\x63\x61\x63he_expire_second\x18\n \x01(\x05\x12\x10\n\x08url_user\x18\x0b \x01(\t\x12\x10\n\x08url_item\x18\x0c \x01(\t\x12\x17\n\x0f\x61nt_feature_req\x18\r \x01(\x0c\x12\n\n\x02id\x18\x0e \x01(\t\x12\x16\n\x0ekb_feature_req\x18\x0f \x01(\x0c\x12\x11\n\tdebuginfo\x18\x10 \x01(\t\x12\x11\n\tseparator\x18\x11 \x01(\t\x12=\n\x12item_sequence_type\x18\x12 \x01(\x0e\x32\x16.arks.ItemSequenceType:\tTYPE_NONE\x12\"\n\x0emissing_values\x18\x13 \x03(\x0b\x32\n.arks.Pair\"\x8f\x01\n\x0b\x44umpReqInfo\x12\x0e\n\x06time_s\x18\x01 \x01(\x05\x12\x0e\n\x06oss_id\x18\x02 \x01(\t\x12\x0f\n\x07oss_key\x18\x03 \x01(\t\x12\x13\n\x0btarget_addr\x18\x04 \x01(\t\x12\x10\n\x08query_id\x18\x05 \x01(\x03\x12\r\n\x05token\x18\x06 \x01(\t\x12\x0b\n\x03\x61pp\x18\x07 \x01(\t\x12\x0c\n\x04host\x18\x08 \x01(\t\"\xb3\x04\n\x0b\x41rksRequest\x12\x12\n\x07version\x18\x01 \x01(\x05:\x01\x31\x12\r\n\x05\x64\x65\x62ug\x18\x02 \x01(\x05\x12\x0f\n\x07is_ping\x18\x03 \x01(\x08\x12\x12\n\nsession_id\x18\x04 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x05 \x01(\t\x12\x0b\n\x03uid\x18\x06 \x01(\t\x12 \n\x0cuser_profile\x18\x07 \x03(\x0b\x32\n.arks.Pair\x12\"\n\x0escene_features\x18\x08 \x03(\x0b\x32\n.arks.Pair\x12\x19\n\x05items\x18\t \x03(\x0b\x32\n.arks.Item\x12\x15\n\x07is_sort\x18\n \x01(\x08:\x04true\x12\x11\n\x05\x63ount\x18\x0b \x01(\x05:\x02\x31\x30\x12.\n\nout_format\x18\x0c \x01(\x0e\x32\x16.arks.OutputFormatType:\x02PB\x12\x12\n\nchain_name\x18\r \x01(\t\x12\x0b\n\x03scm\x18\x0e \x01(\t\x12\x12\n\nscene_name\x18\x0f \x01(\t\x12\x14\n\x0citem_schemas\x18\x10 \x03(\t\x12\x18\n\x10sub_item_schemas\x18\x11 \x03(\t\x12\"\n\nseek_plans\x18\x12 \x03(\x0b\x32\x0e.arks.SeekPlan\x12(\n\rdump_req_info\x18\x13 \x01(\x0b\x32\x11.arks.DumpReqInfo\x12\x10\n\x08\x61pp_name\x18\x14 \x01(\t\x12\x16\n\x0ereq_timeout_ms\x18\x15 \x01(\x04\x12\x16\n\x0e\x63lient_version\x18\x16 \x01(\t\x12\n\n\x02ip\x18\x17 \x01(\t\"\xba\x02\n\x0c\x41rksResponse\x12,\n\nerror_code\x18\x01 \x01(\x0e\x32\x0f.arks.ErrorCode:\x07SUCCESS\x12\x12\n\nsession_id\x18\x02 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x03 \x01(\t\x12 \n\x0cuser_profile\x18\x04 \x03(\x0b\x32\n.arks.Pair\x12\x19\n\x05items\x18\x05 \x03(\x0b\x32\n.arks.Item\x12\x11\n\tdebug_msg\x18\x06 \x01(\t\x12\x0b\n\x03scm\x18\x07 \x01(\t\x12\"\n\nseek_plans\x18\x08 \x03(\x0b\x32\x0e.arks.SeekPlan\x12\x0f\n\x07\x65rr_msg\x18\t \x01(\t\x12\x10\n\x08\x61lgo_ret\x18\n \x01(\x05\x12\x10\n\x08\x61lgo_msg\x18\x0b \x01(\t\x12\x11\n\ttrace_msg\x18\x0c \x01(\t\x12\n\n\x02rt\x18\r 
\x01(\x05*T\n\x10OutputFormatType\x12\x06\n\x02PB\x10\x01\x12\x08\n\x04JSON\x10\x02\x12\x08\n\x04TEXT\x10\x03\x12\r\n\tSNAPPY_PB\x10\x04\x12\x06\n\x02\x46\x42\x10\x05\x12\r\n\tSNAPPY_FB\x10\x06*\x86\x01\n\tErrorCode\x12\x0b\n\x07SUCCESS\x10\x00\x12\x0b\n\x07TIMEOUT\x10\x01\x12\r\n\tSCENE_ERR\x10\x02\x12\r\n\tPARAM_ERR\x10\x03\x12\x0e\n\nSYSTEM_ERR\x10\x04\x12\x0f\n\x0bSERVICE_ERR\x10\x05\x12\x10\n\x0c\x46LOW_CONTROL\x10\x06\x12\x0e\n\nOTHERS_ERR\x10\x07*\xae\x01\n\x0b\x43ontentType\x12\x10\n\x0cTYPE_INVALID\x10\x00\x12\r\n\tTYPE_BOOL\x10\x01\x12\x0e\n\nTYPE_INT32\x10\x02\x12\x0e\n\nTYPE_INT64\x10\x03\x12\x0f\n\x0bTYPE_UINT32\x10\x04\x12\x0f\n\x0bTYPE_UINT64\x10\x05\x12\r\n\tTYPE_FP32\x10\x06\x12\r\n\tTYPE_FP64\x10\x07\x12\x0f\n\x0bTYPE_STRING\x10\x08\x12\r\n\tTYPE_BYTE\x10\t*A\n\x10ItemSequenceType\x12\r\n\tTYPE_NONE\x10\x00\x12\x0f\n\x0bTYPE_CONCAT\x10\x01\x12\r\n\tTYPE_FLAT\x10\x02\x42\x17\n\x15\x63om.alipay.arks.proto') + name="arks.proto", + package="arks", + syntax="proto2", + serialized_options=_b("\n\025com.alipay.arks.proto"), + serialized_pb=_b( + '\n\narks.proto\x12\x04\x61rks"\xfc\x01\n\x13InferTensorContents\x12\x14\n\x0cstring_value\x18\x01 \x03(\t\x12\x12\n\nbool_value\x18\x02 \x03(\x08\x12\x11\n\tint_value\x18\x03 \x03(\x05\x12\x13\n\x0bint64_value\x18\x04 \x03(\x03\x12\x12\n\nuint_value\x18\x05 \x03(\r\x12\x14\n\x0cuint64_value\x18\x06 \x03(\x04\x12\x12\n\nfp32_value\x18\x07 \x03(\x02\x12\x12\n\nfp64_value\x18\x08 \x03(\x01\x12\x12\n\nbyte_value\x18\t \x03(\x0c\x12-\n\x04type\x18\n \x01(\x0e\x32\x11.arks.ContentType:\x0cTYPE_INVALID"q\n\x04Pair\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\x12+\n\x08\x63ontents\x18\x03 \x01(\x0b\x32\x19.arks.InferTensorContents\x12\x10\n\x08pb_value\x18\x04 \x03(\x0c\x12\x0e\n\x06shapes\x18\x05 \x03(\x05"\x97\x01\n\x06RowKey\x12\x0f\n\x07row_key\x18\x01 \x01(\t\x12\x10\n\x08versions\x18\x02 \x03(\x03\x12\x1a\n\x12\x61nt_fea_track_info\x18\x03 \x01(\t\x12\'\n\npartitions\x18\x04 \x03(\x0b\x32\x13.arks.PartitionInfo\x12%\n\x11realtime_features\x18\x05 \x03(\x0b\x32\n.arks.Pair",\n\rPartitionInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t"\xb9\x01\n\x04Item\x12\x0f\n\x07item_id\x18\x01 \x02(\t\x12\x1c\n\x08\x66\x65\x61tures\x18\x02 \x03(\x0b\x32\n.arks.Pair\x12\x1e\n\nattributes\x18\x03 \x03(\x0b\x32\n.arks.Pair\x12\r\n\x05score\x18\x04 \x01(\x02\x12 \n\tsub_items\x18\x05 \x03(\x0b\x32\r.arks.SubItem\x12\x1d\n\x11is_features_valid\x18\x06 \x03(\x08\x42\x02\x10\x01\x12\x12\n\x06scores\x18\x07 \x03(\x02\x42\x02\x10\x01"\x9a\x01\n\x07SubItem\x12\x0f\n\x07item_id\x18\x01 \x01(\t\x12\x1c\n\x08\x66\x65\x61tures\x18\x02 \x03(\x0b\x32\n.arks.Pair\x12\r\n\x05score\x18\x03 \x01(\x02\x12\x1d\n\x11is_features_valid\x18\x04 \x03(\x08\x42\x02\x10\x01\x12\x12\n\x06scores\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x1e\n\nattributes\x18\x06 \x03(\x0b\x32\n.arks.Pair"\xc7\x03\n\x08SeekPlan\x12\x14\n\x0cstorage_type\x18\x01 \x01(\t\x12\r\n\x05table\x18\x02 \x01(\t\x12\x15\n\rcolumn_family\x18\x03 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x04 \x03(\t\x12\x12\n\nkvpair_sep\x18\x05 \x01(\t\x12\x0e\n\x06kv_sep\x18\x06 \x01(\t\x12\x0f\n\x07\x63luster\x18\x07 \x01(\t\x12\x1e\n\x08row_keys\x18\x08 \x03(\x0b\x32\x0c.arks.RowKey\x12\x12\n\ntimeout_ms\x18\t \x01(\x05\x12\x1b\n\x13\x63\x61\x63he_expire_second\x18\n \x01(\x05\x12\x10\n\x08url_user\x18\x0b \x01(\t\x12\x10\n\x08url_item\x18\x0c \x01(\t\x12\x17\n\x0f\x61nt_feature_req\x18\r \x01(\x0c\x12\n\n\x02id\x18\x0e \x01(\t\x12\x16\n\x0ekb_feature_req\x18\x0f 
\x01(\x0c\x12\x11\n\tdebuginfo\x18\x10 \x01(\t\x12\x11\n\tseparator\x18\x11 \x01(\t\x12=\n\x12item_sequence_type\x18\x12 \x01(\x0e\x32\x16.arks.ItemSequenceType:\tTYPE_NONE\x12"\n\x0emissing_values\x18\x13 \x03(\x0b\x32\n.arks.Pair"\x8f\x01\n\x0b\x44umpReqInfo\x12\x0e\n\x06time_s\x18\x01 \x01(\x05\x12\x0e\n\x06oss_id\x18\x02 \x01(\t\x12\x0f\n\x07oss_key\x18\x03 \x01(\t\x12\x13\n\x0btarget_addr\x18\x04 \x01(\t\x12\x10\n\x08query_id\x18\x05 \x01(\x03\x12\r\n\x05token\x18\x06 \x01(\t\x12\x0b\n\x03\x61pp\x18\x07 \x01(\t\x12\x0c\n\x04host\x18\x08 \x01(\t"\xb3\x04\n\x0b\x41rksRequest\x12\x12\n\x07version\x18\x01 \x01(\x05:\x01\x31\x12\r\n\x05\x64\x65\x62ug\x18\x02 \x01(\x05\x12\x0f\n\x07is_ping\x18\x03 \x01(\x08\x12\x12\n\nsession_id\x18\x04 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x05 \x01(\t\x12\x0b\n\x03uid\x18\x06 \x01(\t\x12 \n\x0cuser_profile\x18\x07 \x03(\x0b\x32\n.arks.Pair\x12"\n\x0escene_features\x18\x08 \x03(\x0b\x32\n.arks.Pair\x12\x19\n\x05items\x18\t \x03(\x0b\x32\n.arks.Item\x12\x15\n\x07is_sort\x18\n \x01(\x08:\x04true\x12\x11\n\x05\x63ount\x18\x0b \x01(\x05:\x02\x31\x30\x12.\n\nout_format\x18\x0c \x01(\x0e\x32\x16.arks.OutputFormatType:\x02PB\x12\x12\n\nchain_name\x18\r \x01(\t\x12\x0b\n\x03scm\x18\x0e \x01(\t\x12\x12\n\nscene_name\x18\x0f \x01(\t\x12\x14\n\x0citem_schemas\x18\x10 \x03(\t\x12\x18\n\x10sub_item_schemas\x18\x11 \x03(\t\x12"\n\nseek_plans\x18\x12 \x03(\x0b\x32\x0e.arks.SeekPlan\x12(\n\rdump_req_info\x18\x13 \x01(\x0b\x32\x11.arks.DumpReqInfo\x12\x10\n\x08\x61pp_name\x18\x14 \x01(\t\x12\x16\n\x0ereq_timeout_ms\x18\x15 \x01(\x04\x12\x16\n\x0e\x63lient_version\x18\x16 \x01(\t\x12\n\n\x02ip\x18\x17 \x01(\t"\xba\x02\n\x0c\x41rksResponse\x12,\n\nerror_code\x18\x01 \x01(\x0e\x32\x0f.arks.ErrorCode:\x07SUCCESS\x12\x12\n\nsession_id\x18\x02 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x03 \x01(\t\x12 \n\x0cuser_profile\x18\x04 \x03(\x0b\x32\n.arks.Pair\x12\x19\n\x05items\x18\x05 \x03(\x0b\x32\n.arks.Item\x12\x11\n\tdebug_msg\x18\x06 \x01(\t\x12\x0b\n\x03scm\x18\x07 \x01(\t\x12"\n\nseek_plans\x18\x08 \x03(\x0b\x32\x0e.arks.SeekPlan\x12\x0f\n\x07\x65rr_msg\x18\t \x01(\t\x12\x10\n\x08\x61lgo_ret\x18\n \x01(\x05\x12\x10\n\x08\x61lgo_msg\x18\x0b \x01(\t\x12\x11\n\ttrace_msg\x18\x0c \x01(\t\x12\n\n\x02rt\x18\r \x01(\x05*T\n\x10OutputFormatType\x12\x06\n\x02PB\x10\x01\x12\x08\n\x04JSON\x10\x02\x12\x08\n\x04TEXT\x10\x03\x12\r\n\tSNAPPY_PB\x10\x04\x12\x06\n\x02\x46\x42\x10\x05\x12\r\n\tSNAPPY_FB\x10\x06*\x86\x01\n\tErrorCode\x12\x0b\n\x07SUCCESS\x10\x00\x12\x0b\n\x07TIMEOUT\x10\x01\x12\r\n\tSCENE_ERR\x10\x02\x12\r\n\tPARAM_ERR\x10\x03\x12\x0e\n\nSYSTEM_ERR\x10\x04\x12\x0f\n\x0bSERVICE_ERR\x10\x05\x12\x10\n\x0c\x46LOW_CONTROL\x10\x06\x12\x0e\n\nOTHERS_ERR\x10\x07*\xae\x01\n\x0b\x43ontentType\x12\x10\n\x0cTYPE_INVALID\x10\x00\x12\r\n\tTYPE_BOOL\x10\x01\x12\x0e\n\nTYPE_INT32\x10\x02\x12\x0e\n\nTYPE_INT64\x10\x03\x12\x0f\n\x0bTYPE_UINT32\x10\x04\x12\x0f\n\x0bTYPE_UINT64\x10\x05\x12\r\n\tTYPE_FP32\x10\x06\x12\r\n\tTYPE_FP64\x10\x07\x12\x0f\n\x0bTYPE_STRING\x10\x08\x12\r\n\tTYPE_BYTE\x10\t*A\n\x10ItemSequenceType\x12\r\n\tTYPE_NONE\x10\x00\x12\x0f\n\x0bTYPE_CONCAT\x10\x01\x12\r\n\tTYPE_FLAT\x10\x02\x42\x17\n\x15\x63om.alipay.arks.proto' + ), ) _OUTPUTFORMATTYPE = _descriptor.EnumDescriptor( - name='OutputFormatType', - full_name='arks.OutputFormatType', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='PB', index=0, number=1, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='JSON', index=1, number=2, - 
serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TEXT', index=2, number=3, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='SNAPPY_PB', index=3, number=4, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='FB', index=4, number=5, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='SNAPPY_FB', index=5, number=6, - serialized_options=None, - type=None), - ], - containing_type=None, - serialized_options=None, - serialized_start=2422, - serialized_end=2506, + name="OutputFormatType", + full_name="arks.OutputFormatType", + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name="PB", index=0, number=1, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="JSON", index=1, number=2, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TEXT", index=2, number=3, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="SNAPPY_PB", index=3, number=4, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="FB", index=4, number=5, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="SNAPPY_FB", index=5, number=6, serialized_options=None, type=None + ), + ], + containing_type=None, + serialized_options=None, + serialized_start=2422, + serialized_end=2506, ) _sym_db.RegisterEnumDescriptor(_OUTPUTFORMATTYPE) OutputFormatType = enum_type_wrapper.EnumTypeWrapper(_OUTPUTFORMATTYPE) _ERRORCODE = _descriptor.EnumDescriptor( - name='ErrorCode', - full_name='arks.ErrorCode', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='SUCCESS', index=0, number=0, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TIMEOUT', index=1, number=1, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='SCENE_ERR', index=2, number=2, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PARAM_ERR', index=3, number=3, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='SYSTEM_ERR', index=4, number=4, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='SERVICE_ERR', index=5, number=5, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='FLOW_CONTROL', index=6, number=6, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='OTHERS_ERR', index=7, number=7, - serialized_options=None, - type=None), - ], - containing_type=None, - serialized_options=None, - serialized_start=2509, - serialized_end=2643, + name="ErrorCode", + full_name="arks.ErrorCode", + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name="SUCCESS", index=0, number=0, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TIMEOUT", index=1, number=1, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="SCENE_ERR", index=2, number=2, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="PARAM_ERR", index=3, number=3, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="SYSTEM_ERR", index=4, number=4, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="SERVICE_ERR", index=5, number=5, serialized_options=None, type=None + ), + 
_descriptor.EnumValueDescriptor( + name="FLOW_CONTROL", index=6, number=6, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="OTHERS_ERR", index=7, number=7, serialized_options=None, type=None + ), + ], + containing_type=None, + serialized_options=None, + serialized_start=2509, + serialized_end=2643, ) _sym_db.RegisterEnumDescriptor(_ERRORCODE) ErrorCode = enum_type_wrapper.EnumTypeWrapper(_ERRORCODE) _CONTENTTYPE = _descriptor.EnumDescriptor( - name='ContentType', - full_name='arks.ContentType', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='TYPE_INVALID', index=0, number=0, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_BOOL', index=1, number=1, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_INT32', index=2, number=2, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_INT64', index=3, number=3, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_UINT32', index=4, number=4, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_UINT64', index=5, number=5, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_FP32', index=6, number=6, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_FP64', index=7, number=7, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_STRING', index=8, number=8, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_BYTE', index=9, number=9, - serialized_options=None, - type=None), - ], - containing_type=None, - serialized_options=None, - serialized_start=2646, - serialized_end=2820, + name="ContentType", + full_name="arks.ContentType", + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name="TYPE_INVALID", index=0, number=0, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_BOOL", index=1, number=1, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_INT32", index=2, number=2, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_INT64", index=3, number=3, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_UINT32", index=4, number=4, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_UINT64", index=5, number=5, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_FP32", index=6, number=6, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_FP64", index=7, number=7, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_STRING", index=8, number=8, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_BYTE", index=9, number=9, serialized_options=None, type=None + ), + ], + containing_type=None, + serialized_options=None, + serialized_start=2646, + serialized_end=2820, ) _sym_db.RegisterEnumDescriptor(_CONTENTTYPE) ContentType = enum_type_wrapper.EnumTypeWrapper(_CONTENTTYPE) _ITEMSEQUENCETYPE = _descriptor.EnumDescriptor( - name='ItemSequenceType', - full_name='arks.ItemSequenceType', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='TYPE_NONE', index=0, number=0, - 
serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_CONCAT', index=1, number=1, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_FLAT', index=2, number=2, - serialized_options=None, - type=None), - ], - containing_type=None, - serialized_options=None, - serialized_start=2822, - serialized_end=2887, + name="ItemSequenceType", + full_name="arks.ItemSequenceType", + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name="TYPE_NONE", index=0, number=0, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_CONCAT", index=1, number=1, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_FLAT", index=2, number=2, serialized_options=None, type=None + ), + ], + containing_type=None, + serialized_options=None, + serialized_start=2822, + serialized_end=2887, ) _sym_db.RegisterEnumDescriptor(_ITEMSEQUENCETYPE) @@ -224,1044 +199,2131 @@ TYPE_FLAT = 2 - _INFERTENSORCONTENTS = _descriptor.Descriptor( - name='InferTensorContents', - full_name='arks.InferTensorContents', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='string_value', full_name='arks.InferTensorContents.string_value', index=0, - number=1, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='bool_value', full_name='arks.InferTensorContents.bool_value', index=1, - number=2, type=8, cpp_type=7, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='int_value', full_name='arks.InferTensorContents.int_value', index=2, - number=3, type=5, cpp_type=1, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='int64_value', full_name='arks.InferTensorContents.int64_value', index=3, - number=4, type=3, cpp_type=2, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='uint_value', full_name='arks.InferTensorContents.uint_value', index=4, - number=5, type=13, cpp_type=3, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='uint64_value', full_name='arks.InferTensorContents.uint64_value', index=5, - number=6, type=4, cpp_type=4, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='fp32_value', full_name='arks.InferTensorContents.fp32_value', index=6, - number=7, type=2, cpp_type=6, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - 
is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='fp64_value', full_name='arks.InferTensorContents.fp64_value', index=7, - number=8, type=1, cpp_type=5, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='byte_value', full_name='arks.InferTensorContents.byte_value', index=8, - number=9, type=12, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='type', full_name='arks.InferTensorContents.type', index=9, - number=10, type=14, cpp_type=8, label=1, - has_default_value=True, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=21, - serialized_end=273, + name="InferTensorContents", + full_name="arks.InferTensorContents", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="string_value", + full_name="arks.InferTensorContents.string_value", + index=0, + number=1, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="bool_value", + full_name="arks.InferTensorContents.bool_value", + index=1, + number=2, + type=8, + cpp_type=7, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="int_value", + full_name="arks.InferTensorContents.int_value", + index=2, + number=3, + type=5, + cpp_type=1, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="int64_value", + full_name="arks.InferTensorContents.int64_value", + index=3, + number=4, + type=3, + cpp_type=2, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="uint_value", + full_name="arks.InferTensorContents.uint_value", + index=4, + number=5, + type=13, + cpp_type=3, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="uint64_value", + full_name="arks.InferTensorContents.uint64_value", + index=5, + number=6, + type=4, + cpp_type=4, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + 
enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="fp32_value", + full_name="arks.InferTensorContents.fp32_value", + index=6, + number=7, + type=2, + cpp_type=6, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="fp64_value", + full_name="arks.InferTensorContents.fp64_value", + index=7, + number=8, + type=1, + cpp_type=5, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="byte_value", + full_name="arks.InferTensorContents.byte_value", + index=8, + number=9, + type=12, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="type", + full_name="arks.InferTensorContents.type", + index=9, + number=10, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=21, + serialized_end=273, ) _PAIR = _descriptor.Descriptor( - name='Pair', - full_name='arks.Pair', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='key', full_name='arks.Pair.key', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='value', full_name='arks.Pair.value', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='contents', full_name='arks.Pair.contents', index=2, - number=3, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='pb_value', full_name='arks.Pair.pb_value', index=3, - number=4, type=12, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='shapes', full_name='arks.Pair.shapes', index=4, - number=5, type=5, cpp_type=1, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, 
extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=275, - serialized_end=388, + name="Pair", + full_name="arks.Pair", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="key", + full_name="arks.Pair.key", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="value", + full_name="arks.Pair.value", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="contents", + full_name="arks.Pair.contents", + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="pb_value", + full_name="arks.Pair.pb_value", + index=3, + number=4, + type=12, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="shapes", + full_name="arks.Pair.shapes", + index=4, + number=5, + type=5, + cpp_type=1, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=275, + serialized_end=388, ) _ROWKEY = _descriptor.Descriptor( - name='RowKey', - full_name='arks.RowKey', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='row_key', full_name='arks.RowKey.row_key', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='versions', full_name='arks.RowKey.versions', index=1, - number=2, type=3, cpp_type=2, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='ant_fea_track_info', full_name='arks.RowKey.ant_fea_track_info', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, 
file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='partitions', full_name='arks.RowKey.partitions', index=3, - number=4, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='realtime_features', full_name='arks.RowKey.realtime_features', index=4, - number=5, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=391, - serialized_end=542, + name="RowKey", + full_name="arks.RowKey", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="row_key", + full_name="arks.RowKey.row_key", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="versions", + full_name="arks.RowKey.versions", + index=1, + number=2, + type=3, + cpp_type=2, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="ant_fea_track_info", + full_name="arks.RowKey.ant_fea_track_info", + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="partitions", + full_name="arks.RowKey.partitions", + index=3, + number=4, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="realtime_features", + full_name="arks.RowKey.realtime_features", + index=4, + number=5, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=391, + serialized_end=542, ) _PARTITIONINFO = _descriptor.Descriptor( - name='PartitionInfo', - full_name='arks.PartitionInfo', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='arks.PartitionInfo.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - 
serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='value', full_name='arks.PartitionInfo.value', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=544, - serialized_end=588, + name="PartitionInfo", + full_name="arks.PartitionInfo", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="name", + full_name="arks.PartitionInfo.name", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="value", + full_name="arks.PartitionInfo.value", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=544, + serialized_end=588, ) _ITEM = _descriptor.Descriptor( - name='Item', - full_name='arks.Item', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='item_id', full_name='arks.Item.item_id', index=0, - number=1, type=9, cpp_type=9, label=2, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='features', full_name='arks.Item.features', index=1, - number=2, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='attributes', full_name='arks.Item.attributes', index=2, - number=3, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='score', full_name='arks.Item.score', index=3, - number=4, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='sub_items', full_name='arks.Item.sub_items', index=4, - number=5, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='is_features_valid', 
full_name='arks.Item.is_features_valid', index=5, - number=6, type=8, cpp_type=7, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=_b('\020\001'), file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='scores', full_name='arks.Item.scores', index=6, - number=7, type=2, cpp_type=6, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=_b('\020\001'), file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=591, - serialized_end=776, + name="Item", + full_name="arks.Item", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="item_id", + full_name="arks.Item.item_id", + index=0, + number=1, + type=9, + cpp_type=9, + label=2, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="features", + full_name="arks.Item.features", + index=1, + number=2, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="attributes", + full_name="arks.Item.attributes", + index=2, + number=3, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="score", + full_name="arks.Item.score", + index=3, + number=4, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="sub_items", + full_name="arks.Item.sub_items", + index=4, + number=5, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="is_features_valid", + full_name="arks.Item.is_features_valid", + index=5, + number=6, + type=8, + cpp_type=7, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=_b("\020\001"), + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="scores", + full_name="arks.Item.scores", + index=6, + number=7, + type=2, + cpp_type=6, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=_b("\020\001"), + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + 
is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=591, + serialized_end=776, ) _SUBITEM = _descriptor.Descriptor( - name='SubItem', - full_name='arks.SubItem', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='item_id', full_name='arks.SubItem.item_id', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='features', full_name='arks.SubItem.features', index=1, - number=2, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='score', full_name='arks.SubItem.score', index=2, - number=3, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='is_features_valid', full_name='arks.SubItem.is_features_valid', index=3, - number=4, type=8, cpp_type=7, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=_b('\020\001'), file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='scores', full_name='arks.SubItem.scores', index=4, - number=5, type=2, cpp_type=6, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=_b('\020\001'), file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='attributes', full_name='arks.SubItem.attributes', index=5, - number=6, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=779, - serialized_end=933, + name="SubItem", + full_name="arks.SubItem", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="item_id", + full_name="arks.SubItem.item_id", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="features", + full_name="arks.SubItem.features", + index=1, + number=2, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="score", + full_name="arks.SubItem.score", + index=2, + number=3, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), 
+ message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="is_features_valid", + full_name="arks.SubItem.is_features_valid", + index=3, + number=4, + type=8, + cpp_type=7, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=_b("\020\001"), + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="scores", + full_name="arks.SubItem.scores", + index=4, + number=5, + type=2, + cpp_type=6, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=_b("\020\001"), + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="attributes", + full_name="arks.SubItem.attributes", + index=5, + number=6, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=779, + serialized_end=933, ) _SEEKPLAN = _descriptor.Descriptor( - name='SeekPlan', - full_name='arks.SeekPlan', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='storage_type', full_name='arks.SeekPlan.storage_type', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='table', full_name='arks.SeekPlan.table', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='column_family', full_name='arks.SeekPlan.column_family', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='columns', full_name='arks.SeekPlan.columns', index=3, - number=4, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='kvpair_sep', full_name='arks.SeekPlan.kvpair_sep', index=4, - number=5, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='kv_sep', full_name='arks.SeekPlan.kv_sep', index=5, - number=6, type=9, cpp_type=9, label=1, - has_default_value=False, 
default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='cluster', full_name='arks.SeekPlan.cluster', index=6, - number=7, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='row_keys', full_name='arks.SeekPlan.row_keys', index=7, - number=8, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='timeout_ms', full_name='arks.SeekPlan.timeout_ms', index=8, - number=9, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='cache_expire_second', full_name='arks.SeekPlan.cache_expire_second', index=9, - number=10, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='url_user', full_name='arks.SeekPlan.url_user', index=10, - number=11, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='url_item', full_name='arks.SeekPlan.url_item', index=11, - number=12, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='ant_feature_req', full_name='arks.SeekPlan.ant_feature_req', index=12, - number=13, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=_b(""), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='id', full_name='arks.SeekPlan.id', index=13, - number=14, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='kb_feature_req', full_name='arks.SeekPlan.kb_feature_req', index=14, - number=15, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=_b(""), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='debuginfo', full_name='arks.SeekPlan.debuginfo', index=15, - number=16, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - 
is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='separator', full_name='arks.SeekPlan.separator', index=16, - number=17, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='item_sequence_type', full_name='arks.SeekPlan.item_sequence_type', index=17, - number=18, type=14, cpp_type=8, label=1, - has_default_value=True, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='missing_values', full_name='arks.SeekPlan.missing_values', index=18, - number=19, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=936, - serialized_end=1391, + name="SeekPlan", + full_name="arks.SeekPlan", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="storage_type", + full_name="arks.SeekPlan.storage_type", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="table", + full_name="arks.SeekPlan.table", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="column_family", + full_name="arks.SeekPlan.column_family", + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="columns", + full_name="arks.SeekPlan.columns", + index=3, + number=4, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="kvpair_sep", + full_name="arks.SeekPlan.kvpair_sep", + index=4, + number=5, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="kv_sep", + full_name="arks.SeekPlan.kv_sep", + index=5, + number=6, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + 
message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="cluster", + full_name="arks.SeekPlan.cluster", + index=6, + number=7, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="row_keys", + full_name="arks.SeekPlan.row_keys", + index=7, + number=8, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="timeout_ms", + full_name="arks.SeekPlan.timeout_ms", + index=8, + number=9, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="cache_expire_second", + full_name="arks.SeekPlan.cache_expire_second", + index=9, + number=10, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="url_user", + full_name="arks.SeekPlan.url_user", + index=10, + number=11, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="url_item", + full_name="arks.SeekPlan.url_item", + index=11, + number=12, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="ant_feature_req", + full_name="arks.SeekPlan.ant_feature_req", + index=12, + number=13, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b(""), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="id", + full_name="arks.SeekPlan.id", + index=13, + number=14, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="kb_feature_req", + full_name="arks.SeekPlan.kb_feature_req", + index=14, + number=15, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b(""), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="debuginfo", + 
full_name="arks.SeekPlan.debuginfo", + index=15, + number=16, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="separator", + full_name="arks.SeekPlan.separator", + index=16, + number=17, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="item_sequence_type", + full_name="arks.SeekPlan.item_sequence_type", + index=17, + number=18, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="missing_values", + full_name="arks.SeekPlan.missing_values", + index=18, + number=19, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=936, + serialized_end=1391, ) _DUMPREQINFO = _descriptor.Descriptor( - name='DumpReqInfo', - full_name='arks.DumpReqInfo', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='time_s', full_name='arks.DumpReqInfo.time_s', index=0, - number=1, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='oss_id', full_name='arks.DumpReqInfo.oss_id', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='oss_key', full_name='arks.DumpReqInfo.oss_key', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='target_addr', full_name='arks.DumpReqInfo.target_addr', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='query_id', full_name='arks.DumpReqInfo.query_id', index=4, - number=5, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - 
_descriptor.FieldDescriptor( - name='token', full_name='arks.DumpReqInfo.token', index=5, - number=6, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='app', full_name='arks.DumpReqInfo.app', index=6, - number=7, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='host', full_name='arks.DumpReqInfo.host', index=7, - number=8, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1394, - serialized_end=1537, + name="DumpReqInfo", + full_name="arks.DumpReqInfo", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="time_s", + full_name="arks.DumpReqInfo.time_s", + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="oss_id", + full_name="arks.DumpReqInfo.oss_id", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="oss_key", + full_name="arks.DumpReqInfo.oss_key", + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="target_addr", + full_name="arks.DumpReqInfo.target_addr", + index=3, + number=4, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="query_id", + full_name="arks.DumpReqInfo.query_id", + index=4, + number=5, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="token", + full_name="arks.DumpReqInfo.token", + index=5, + number=6, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, 
+ file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="app", + full_name="arks.DumpReqInfo.app", + index=6, + number=7, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="host", + full_name="arks.DumpReqInfo.host", + index=7, + number=8, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=1394, + serialized_end=1537, ) _ARKSREQUEST = _descriptor.Descriptor( - name='ArksRequest', - full_name='arks.ArksRequest', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='version', full_name='arks.ArksRequest.version', index=0, - number=1, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=1, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='debug', full_name='arks.ArksRequest.debug', index=1, - number=2, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='is_ping', full_name='arks.ArksRequest.is_ping', index=2, - number=3, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='session_id', full_name='arks.ArksRequest.session_id', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='bucket_name', full_name='arks.ArksRequest.bucket_name', index=4, - number=5, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='uid', full_name='arks.ArksRequest.uid', index=5, - number=6, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='user_profile', full_name='arks.ArksRequest.user_profile', index=6, - number=7, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - 
name='scene_features', full_name='arks.ArksRequest.scene_features', index=7, - number=8, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='items', full_name='arks.ArksRequest.items', index=8, - number=9, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='is_sort', full_name='arks.ArksRequest.is_sort', index=9, - number=10, type=8, cpp_type=7, label=1, - has_default_value=True, default_value=True, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='count', full_name='arks.ArksRequest.count', index=10, - number=11, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=10, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='out_format', full_name='arks.ArksRequest.out_format', index=11, - number=12, type=14, cpp_type=8, label=1, - has_default_value=True, default_value=1, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='chain_name', full_name='arks.ArksRequest.chain_name', index=12, - number=13, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='scm', full_name='arks.ArksRequest.scm', index=13, - number=14, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='scene_name', full_name='arks.ArksRequest.scene_name', index=14, - number=15, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='item_schemas', full_name='arks.ArksRequest.item_schemas', index=15, - number=16, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='sub_item_schemas', full_name='arks.ArksRequest.sub_item_schemas', index=16, - number=17, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='seek_plans', full_name='arks.ArksRequest.seek_plans', index=17, - number=18, type=11, cpp_type=10, label=3, - 
has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='dump_req_info', full_name='arks.ArksRequest.dump_req_info', index=18, - number=19, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='app_name', full_name='arks.ArksRequest.app_name', index=19, - number=20, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='req_timeout_ms', full_name='arks.ArksRequest.req_timeout_ms', index=20, - number=21, type=4, cpp_type=4, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='client_version', full_name='arks.ArksRequest.client_version', index=21, - number=22, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='ip', full_name='arks.ArksRequest.ip', index=22, - number=23, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1540, - serialized_end=2103, + name="ArksRequest", + full_name="arks.ArksRequest", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="version", + full_name="arks.ArksRequest.version", + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=1, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="debug", + full_name="arks.ArksRequest.debug", + index=1, + number=2, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="is_ping", + full_name="arks.ArksRequest.is_ping", + index=2, + number=3, + type=8, + cpp_type=7, + label=1, + has_default_value=False, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="session_id", + full_name="arks.ArksRequest.session_id", + index=3, + number=4, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + 
default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="bucket_name", + full_name="arks.ArksRequest.bucket_name", + index=4, + number=5, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="uid", + full_name="arks.ArksRequest.uid", + index=5, + number=6, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="user_profile", + full_name="arks.ArksRequest.user_profile", + index=6, + number=7, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="scene_features", + full_name="arks.ArksRequest.scene_features", + index=7, + number=8, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="items", + full_name="arks.ArksRequest.items", + index=8, + number=9, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="is_sort", + full_name="arks.ArksRequest.is_sort", + index=9, + number=10, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="count", + full_name="arks.ArksRequest.count", + index=10, + number=11, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=10, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="out_format", + full_name="arks.ArksRequest.out_format", + index=11, + number=12, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=1, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="chain_name", + full_name="arks.ArksRequest.chain_name", + index=12, + number=13, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="scm", + 
full_name="arks.ArksRequest.scm", + index=13, + number=14, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="scene_name", + full_name="arks.ArksRequest.scene_name", + index=14, + number=15, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="item_schemas", + full_name="arks.ArksRequest.item_schemas", + index=15, + number=16, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="sub_item_schemas", + full_name="arks.ArksRequest.sub_item_schemas", + index=16, + number=17, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="seek_plans", + full_name="arks.ArksRequest.seek_plans", + index=17, + number=18, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="dump_req_info", + full_name="arks.ArksRequest.dump_req_info", + index=18, + number=19, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="app_name", + full_name="arks.ArksRequest.app_name", + index=19, + number=20, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="req_timeout_ms", + full_name="arks.ArksRequest.req_timeout_ms", + index=20, + number=21, + type=4, + cpp_type=4, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="client_version", + full_name="arks.ArksRequest.client_version", + index=21, + number=22, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="ip", + full_name="arks.ArksRequest.ip", + index=22, + number=23, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + 
enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=1540, + serialized_end=2103, ) _ARKSRESPONSE = _descriptor.Descriptor( - name='ArksResponse', - full_name='arks.ArksResponse', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='error_code', full_name='arks.ArksResponse.error_code', index=0, - number=1, type=14, cpp_type=8, label=1, - has_default_value=True, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='session_id', full_name='arks.ArksResponse.session_id', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='bucket_name', full_name='arks.ArksResponse.bucket_name', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='user_profile', full_name='arks.ArksResponse.user_profile', index=3, - number=4, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='items', full_name='arks.ArksResponse.items', index=4, - number=5, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='debug_msg', full_name='arks.ArksResponse.debug_msg', index=5, - number=6, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='scm', full_name='arks.ArksResponse.scm', index=6, - number=7, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='seek_plans', full_name='arks.ArksResponse.seek_plans', index=7, - number=8, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='err_msg', full_name='arks.ArksResponse.err_msg', index=8, - number=9, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - 
is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='algo_ret', full_name='arks.ArksResponse.algo_ret', index=9, - number=10, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='algo_msg', full_name='arks.ArksResponse.algo_msg', index=10, - number=11, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='trace_msg', full_name='arks.ArksResponse.trace_msg', index=11, - number=12, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='rt', full_name='arks.ArksResponse.rt', index=12, - number=13, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2106, - serialized_end=2420, + name="ArksResponse", + full_name="arks.ArksResponse", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="error_code", + full_name="arks.ArksResponse.error_code", + index=0, + number=1, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="session_id", + full_name="arks.ArksResponse.session_id", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="bucket_name", + full_name="arks.ArksResponse.bucket_name", + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="user_profile", + full_name="arks.ArksResponse.user_profile", + index=3, + number=4, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="items", + full_name="arks.ArksResponse.items", + index=4, + number=5, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + 
is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="debug_msg", + full_name="arks.ArksResponse.debug_msg", + index=5, + number=6, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="scm", + full_name="arks.ArksResponse.scm", + index=6, + number=7, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="seek_plans", + full_name="arks.ArksResponse.seek_plans", + index=7, + number=8, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="err_msg", + full_name="arks.ArksResponse.err_msg", + index=8, + number=9, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="algo_ret", + full_name="arks.ArksResponse.algo_ret", + index=9, + number=10, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="algo_msg", + full_name="arks.ArksResponse.algo_msg", + index=10, + number=11, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="trace_msg", + full_name="arks.ArksResponse.trace_msg", + index=11, + number=12, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="rt", + full_name="arks.ArksResponse.rt", + index=12, + number=13, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=2106, + serialized_end=2420, ) -_INFERTENSORCONTENTS.fields_by_name['type'].enum_type = _CONTENTTYPE -_PAIR.fields_by_name['contents'].message_type = _INFERTENSORCONTENTS -_ROWKEY.fields_by_name['partitions'].message_type = _PARTITIONINFO -_ROWKEY.fields_by_name['realtime_features'].message_type = _PAIR 
-_ITEM.fields_by_name['features'].message_type = _PAIR -_ITEM.fields_by_name['attributes'].message_type = _PAIR -_ITEM.fields_by_name['sub_items'].message_type = _SUBITEM -_SUBITEM.fields_by_name['features'].message_type = _PAIR -_SUBITEM.fields_by_name['attributes'].message_type = _PAIR -_SEEKPLAN.fields_by_name['row_keys'].message_type = _ROWKEY -_SEEKPLAN.fields_by_name['item_sequence_type'].enum_type = _ITEMSEQUENCETYPE -_SEEKPLAN.fields_by_name['missing_values'].message_type = _PAIR -_ARKSREQUEST.fields_by_name['user_profile'].message_type = _PAIR -_ARKSREQUEST.fields_by_name['scene_features'].message_type = _PAIR -_ARKSREQUEST.fields_by_name['items'].message_type = _ITEM -_ARKSREQUEST.fields_by_name['out_format'].enum_type = _OUTPUTFORMATTYPE -_ARKSREQUEST.fields_by_name['seek_plans'].message_type = _SEEKPLAN -_ARKSREQUEST.fields_by_name['dump_req_info'].message_type = _DUMPREQINFO -_ARKSRESPONSE.fields_by_name['error_code'].enum_type = _ERRORCODE -_ARKSRESPONSE.fields_by_name['user_profile'].message_type = _PAIR -_ARKSRESPONSE.fields_by_name['items'].message_type = _ITEM -_ARKSRESPONSE.fields_by_name['seek_plans'].message_type = _SEEKPLAN -DESCRIPTOR.message_types_by_name['InferTensorContents'] = _INFERTENSORCONTENTS -DESCRIPTOR.message_types_by_name['Pair'] = _PAIR -DESCRIPTOR.message_types_by_name['RowKey'] = _ROWKEY -DESCRIPTOR.message_types_by_name['PartitionInfo'] = _PARTITIONINFO -DESCRIPTOR.message_types_by_name['Item'] = _ITEM -DESCRIPTOR.message_types_by_name['SubItem'] = _SUBITEM -DESCRIPTOR.message_types_by_name['SeekPlan'] = _SEEKPLAN -DESCRIPTOR.message_types_by_name['DumpReqInfo'] = _DUMPREQINFO -DESCRIPTOR.message_types_by_name['ArksRequest'] = _ARKSREQUEST -DESCRIPTOR.message_types_by_name['ArksResponse'] = _ARKSRESPONSE -DESCRIPTOR.enum_types_by_name['OutputFormatType'] = _OUTPUTFORMATTYPE -DESCRIPTOR.enum_types_by_name['ErrorCode'] = _ERRORCODE -DESCRIPTOR.enum_types_by_name['ContentType'] = _CONTENTTYPE -DESCRIPTOR.enum_types_by_name['ItemSequenceType'] = _ITEMSEQUENCETYPE +_INFERTENSORCONTENTS.fields_by_name["type"].enum_type = _CONTENTTYPE +_PAIR.fields_by_name["contents"].message_type = _INFERTENSORCONTENTS +_ROWKEY.fields_by_name["partitions"].message_type = _PARTITIONINFO +_ROWKEY.fields_by_name["realtime_features"].message_type = _PAIR +_ITEM.fields_by_name["features"].message_type = _PAIR +_ITEM.fields_by_name["attributes"].message_type = _PAIR +_ITEM.fields_by_name["sub_items"].message_type = _SUBITEM +_SUBITEM.fields_by_name["features"].message_type = _PAIR +_SUBITEM.fields_by_name["attributes"].message_type = _PAIR +_SEEKPLAN.fields_by_name["row_keys"].message_type = _ROWKEY +_SEEKPLAN.fields_by_name["item_sequence_type"].enum_type = _ITEMSEQUENCETYPE +_SEEKPLAN.fields_by_name["missing_values"].message_type = _PAIR +_ARKSREQUEST.fields_by_name["user_profile"].message_type = _PAIR +_ARKSREQUEST.fields_by_name["scene_features"].message_type = _PAIR +_ARKSREQUEST.fields_by_name["items"].message_type = _ITEM +_ARKSREQUEST.fields_by_name["out_format"].enum_type = _OUTPUTFORMATTYPE +_ARKSREQUEST.fields_by_name["seek_plans"].message_type = _SEEKPLAN +_ARKSREQUEST.fields_by_name["dump_req_info"].message_type = _DUMPREQINFO +_ARKSRESPONSE.fields_by_name["error_code"].enum_type = _ERRORCODE +_ARKSRESPONSE.fields_by_name["user_profile"].message_type = _PAIR +_ARKSRESPONSE.fields_by_name["items"].message_type = _ITEM +_ARKSRESPONSE.fields_by_name["seek_plans"].message_type = _SEEKPLAN +DESCRIPTOR.message_types_by_name["InferTensorContents"] = _INFERTENSORCONTENTS 
+DESCRIPTOR.message_types_by_name["Pair"] = _PAIR +DESCRIPTOR.message_types_by_name["RowKey"] = _ROWKEY +DESCRIPTOR.message_types_by_name["PartitionInfo"] = _PARTITIONINFO +DESCRIPTOR.message_types_by_name["Item"] = _ITEM +DESCRIPTOR.message_types_by_name["SubItem"] = _SUBITEM +DESCRIPTOR.message_types_by_name["SeekPlan"] = _SEEKPLAN +DESCRIPTOR.message_types_by_name["DumpReqInfo"] = _DUMPREQINFO +DESCRIPTOR.message_types_by_name["ArksRequest"] = _ARKSREQUEST +DESCRIPTOR.message_types_by_name["ArksResponse"] = _ARKSRESPONSE +DESCRIPTOR.enum_types_by_name["OutputFormatType"] = _OUTPUTFORMATTYPE +DESCRIPTOR.enum_types_by_name["ErrorCode"] = _ERRORCODE +DESCRIPTOR.enum_types_by_name["ContentType"] = _CONTENTTYPE +DESCRIPTOR.enum_types_by_name["ItemSequenceType"] = _ITEMSEQUENCETYPE _sym_db.RegisterFileDescriptor(DESCRIPTOR) -InferTensorContents = _reflection.GeneratedProtocolMessageType('InferTensorContents', (_message.Message,), dict( - DESCRIPTOR = _INFERTENSORCONTENTS, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.InferTensorContents) - )) +InferTensorContents = _reflection.GeneratedProtocolMessageType( + "InferTensorContents", + (_message.Message,), + dict( + DESCRIPTOR=_INFERTENSORCONTENTS, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.InferTensorContents) + ), +) _sym_db.RegisterMessage(InferTensorContents) -Pair = _reflection.GeneratedProtocolMessageType('Pair', (_message.Message,), dict( - DESCRIPTOR = _PAIR, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.Pair) - )) +Pair = _reflection.GeneratedProtocolMessageType( + "Pair", + (_message.Message,), + dict( + DESCRIPTOR=_PAIR, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.Pair) + ), +) _sym_db.RegisterMessage(Pair) -RowKey = _reflection.GeneratedProtocolMessageType('RowKey', (_message.Message,), dict( - DESCRIPTOR = _ROWKEY, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.RowKey) - )) +RowKey = _reflection.GeneratedProtocolMessageType( + "RowKey", + (_message.Message,), + dict( + DESCRIPTOR=_ROWKEY, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.RowKey) + ), +) _sym_db.RegisterMessage(RowKey) -PartitionInfo = _reflection.GeneratedProtocolMessageType('PartitionInfo', (_message.Message,), dict( - DESCRIPTOR = _PARTITIONINFO, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.PartitionInfo) - )) +PartitionInfo = _reflection.GeneratedProtocolMessageType( + "PartitionInfo", + (_message.Message,), + dict( + DESCRIPTOR=_PARTITIONINFO, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.PartitionInfo) + ), +) _sym_db.RegisterMessage(PartitionInfo) -Item = _reflection.GeneratedProtocolMessageType('Item', (_message.Message,), dict( - DESCRIPTOR = _ITEM, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.Item) - )) +Item = _reflection.GeneratedProtocolMessageType( + "Item", + (_message.Message,), + dict( + DESCRIPTOR=_ITEM, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.Item) + ), +) _sym_db.RegisterMessage(Item) -SubItem = _reflection.GeneratedProtocolMessageType('SubItem', (_message.Message,), dict( - DESCRIPTOR = _SUBITEM, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.SubItem) - )) +SubItem = _reflection.GeneratedProtocolMessageType( + "SubItem", + (_message.Message,), + dict( + DESCRIPTOR=_SUBITEM, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.SubItem) + ), +) 
_sym_db.RegisterMessage(SubItem) -SeekPlan = _reflection.GeneratedProtocolMessageType('SeekPlan', (_message.Message,), dict( - DESCRIPTOR = _SEEKPLAN, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.SeekPlan) - )) +SeekPlan = _reflection.GeneratedProtocolMessageType( + "SeekPlan", + (_message.Message,), + dict( + DESCRIPTOR=_SEEKPLAN, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.SeekPlan) + ), +) _sym_db.RegisterMessage(SeekPlan) -DumpReqInfo = _reflection.GeneratedProtocolMessageType('DumpReqInfo', (_message.Message,), dict( - DESCRIPTOR = _DUMPREQINFO, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.DumpReqInfo) - )) +DumpReqInfo = _reflection.GeneratedProtocolMessageType( + "DumpReqInfo", + (_message.Message,), + dict( + DESCRIPTOR=_DUMPREQINFO, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.DumpReqInfo) + ), +) _sym_db.RegisterMessage(DumpReqInfo) -ArksRequest = _reflection.GeneratedProtocolMessageType('ArksRequest', (_message.Message,), dict( - DESCRIPTOR = _ARKSREQUEST, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.ArksRequest) - )) +ArksRequest = _reflection.GeneratedProtocolMessageType( + "ArksRequest", + (_message.Message,), + dict( + DESCRIPTOR=_ARKSREQUEST, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.ArksRequest) + ), +) _sym_db.RegisterMessage(ArksRequest) -ArksResponse = _reflection.GeneratedProtocolMessageType('ArksResponse', (_message.Message,), dict( - DESCRIPTOR = _ARKSRESPONSE, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.ArksResponse) - )) +ArksResponse = _reflection.GeneratedProtocolMessageType( + "ArksResponse", + (_message.Message,), + dict( + DESCRIPTOR=_ARKSRESPONSE, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.ArksResponse) + ), +) _sym_db.RegisterMessage(ArksResponse) DESCRIPTOR._options = None -_ITEM.fields_by_name['is_features_valid']._options = None -_ITEM.fields_by_name['scores']._options = None -_SUBITEM.fields_by_name['is_features_valid']._options = None -_SUBITEM.fields_by_name['scores']._options = None +_ITEM.fields_by_name["is_features_valid"]._options = None +_ITEM.fields_by_name["scores"]._options = None +_SUBITEM.fields_by_name["is_features_valid"]._options = None +_SUBITEM.fields_by_name["scores"]._options = None # @@protoc_insertion_point(module_scope) diff --git a/kag/common/base/prompt_op.py b/kag/common/base/prompt_op.py deleted file mode 100644 index 057e35bf..00000000 --- a/kag/common/base/prompt_op.py +++ /dev/null @@ -1,184 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import importlib -import inspect -import os -import sys -from abc import ABC -from string import Template -from typing import List - - -BUILDER_PROMPT_PATH = "kag.builder.prompt" -SOLVER_PROMPT_PATH = "kag.solver.prompt" - - -class PromptOp(ABC): - """ - Provides a template for generating and parsing prompts related to specific business scenes. 
- - Subclasses must implement the template strings for specific languages (English or Chinese) - and override the `template_variables` and `parse_response` methods. - """ - - """English template string""" - template_en: str = "" - """Chinese template string""" - template_zh: str = "" - - def __init__(self, language: str, **kwargs): - """ - Initializes the PromptOp instance with the selected language. - - Args: - language (str): The language for the prompt, should be either "en" or "zh". - - Raises: - AssertionError: If the provided language is not supported. - """ - - assert language in ["en", "zh"], f"language[{language}] is not supported." - self.template = self.template_en if language == "en" else self.template_zh - self.language = language - self.template_variables_value = {} - if "project_id" in kwargs: - self.project_id = kwargs["project_id"] - - @property - def template_variables(self) -> List[str]: - """ - Gets the list of template variables. - - Must be implemented by subclasses. - - Returns: - - List[str]: A list of template variable names. - - Raises: - - NotImplementedError: If the subclass does not implement this method. - """ - - raise NotImplementedError( - f"{self.__class__.__name__} need to implement `template_variables` method." - ) - - def process_template_string_to_avoid_dollar_problem(self, template_string): - new_template_str = template_string.replace('$', '$$') - for var in self.template_variables: - new_template_str = new_template_str.replace(f'$${var}', f'${var}') - return new_template_str - - def build_prompt(self, variables) -> str: - """ - Build a prompt based on the template and provided variables. - - This method replaces placeholders in the template with actual variable values. - If a variable is not provided, it defaults to an empty string. - - Parameters: - - variables: A dictionary containing variable names and their corresponding values. - - Returns: - - A string or list of strings, depending on the template content. - """ - - self.template_variables_value = variables - template_string = self.process_template_string_to_avoid_dollar_problem(self.template) - template = Template(template_string) - return template.substitute(**variables) - - def parse_response(self, response: str, **kwargs): - """ - Parses the response string. - - Must be implemented by subclasses. - - Parameters: - - response (str): The response string to be parsed. - - Raises: - - NotImplementedError: If the subclass does not implement this method. - """ - - raise NotImplementedError( - f"{self.__class__.__name__} need to implement `parse_response` method." - ) - - @classmethod - def load(cls, biz_scene: str, type: str): - """ - Dynamically loads the corresponding PromptOp subclass object based on the business scene and type. - - Parameters: - - biz_scene (str): The name of the business scene. - - type (str): The type of prompt. - - Returns: - - subclass of PromptOp: The loaded PromptOp subclass object. - - Raises: - - ImportError: If the specified module or class does not exist. 
- """ - dir_paths = [ - os.path.join(os.getenv("KAG_PROJECT_ROOT_PATH", ""), "builder", "prompt"), - os.path.join(os.getenv("KAG_PROJECT_ROOT_PATH", ""), "solver", "prompt"), - ] - module_paths = [ - '.'.join([BUILDER_PROMPT_PATH, biz_scene, type]), - '.'.join([SOLVER_PROMPT_PATH, biz_scene, type]), - '.'.join([BUILDER_PROMPT_PATH, 'default', type]), - '.'.join([SOLVER_PROMPT_PATH, 'default', type]), - ] - - def find_class_from_dir(dir, type): - sys.path.append(dir) - - for root, dirs, files in os.walk(dir): - for file in files: - if file.endswith(".py") and file.startswith(f"{type}."): - module_name = file[:-3] - try: - module = importlib.import_module(module_name) - except ImportError: - continue - cls_found = find_class_from_module(module) - if cls_found: - return cls_found - return None - - def find_class_from_module(module): - classes = inspect.getmembers(module, inspect.isclass) - for class_name, class_obj in classes: - import kag - if issubclass(class_obj, kag.common.base.prompt_op.PromptOp) and inspect.getmodule(class_obj) == module: - return class_obj - return None - - for dir_path in dir_paths: - try: - cls_found = find_class_from_dir(dir_path, type) - if cls_found: - return cls_found - except ImportError: - continue - - for module_path in module_paths: - try: - module = importlib.import_module(module_path) - cls_found = find_class_from_module(module) - if cls_found: - return cls_found - except ModuleNotFoundError: - continue - - raise ValueError(f'Not support prompt with biz_scene[{biz_scene}] and type[{type}]') diff --git a/kag/common/benchmarks/evaUtils.py b/kag/common/benchmarks/evaUtils.py index f443e8a0..3543f74f 100644 --- a/kag/common/benchmarks/evaUtils.py +++ b/kag/common/benchmarks/evaUtils.py @@ -1,5 +1,7 @@ import re +import json import string +import traceback from collections import Counter @@ -17,15 +19,16 @@ def normalize_answer(s): Returns: str: The standardized answer string. """ + def remove_articles(text): - return re.sub(r'\b(a|an|the)\b', ' ', text) + return re.sub(r"\b(a|an|the)\b", " ", text) def white_space_fix(text): - return ' '.join(text.split()) + return " ".join(text.split()) def remove_punc(text): exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) + return "".join(ch for ch in text if ch not in exclude) def lower(text): return str(text).lower() @@ -52,10 +55,16 @@ def f1_score(prediction, ground_truth): ZERO_METRIC = (0, 0, 0) - if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth: + if ( + normalized_prediction in ["yes", "no", "noanswer"] + and normalized_prediction != normalized_ground_truth + ): return ZERO_METRIC - if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth: + if ( + normalized_ground_truth in ["yes", "no", "noanswer"] + and normalized_prediction != normalized_ground_truth + ): return ZERO_METRIC prediction_tokens = normalized_prediction.split() @@ -78,35 +87,156 @@ def f1_score(prediction, ground_truth): def exact_match_score(prediction, ground_truth): """ Calculates the exact match score between a predicted answer and the ground truth answer. - + This function normalizes both the predicted answer and the ground truth answer before comparing them. Normalization is performed to ensure that non-essential differences such as spaces and case are ignored. - + Parameters: prediction (str): The predicted answer string. ground_truth (str): The ground truth answer string. 
-    
+
     Returns:
     int: 1 if the predicted answer exactly matches the ground truth answer, otherwise 0.
     """
     return 1 if normalize_answer(prediction) == normalize_answer(ground_truth) else 0
 
+
 def get_em_f1(prediction, gold):
     """
     Calculates the Exact Match (EM) score and F1 score between the prediction and the gold standard.
-    
+
     This function evaluates the performance of a model in text similarity tasks
     by calculating the EM score and F1 score to measure the accuracy of the predictions.
-    
+
     Parameters:
     prediction (str): The output predicted by the model.
     gold (str): The gold standard output (i.e., the correct output).
-    
+
     Returns:
     tuple: A tuple containing two floats, the EM score and the F1 score.
            The EM score represents the exact match accuracy, while the F1 score is a combination of precision and recall.
     """
     em = exact_match_score(prediction, gold)
     f1, precision, recall = f1_score(prediction, gold)
-    
-    return float(em), f1
\ No newline at end of file
+
+    return float(em), f1
+
+
+def compare_summarization_answers(
+    query,
+    answer1,
+    answer2,
+    *,
+    api_key="EMPTY",
+    base_url="http://127.0.0.1:38080/v1",
+    model="gpt-4o-mini",
+    language="English",
+    retries=3,
+):
+    """
+    Given a query and two answers, compare the answers with an LLM for Comprehensiveness, Diversity and Empowerment.
+
+    This function is adapted from LightRAG for evaluating GraphRAG and LightRAG in QFS (query-focused summarization)
+    tasks:
+
+      https://github.com/HKUDS/LightRAG/blob/45cea6e/examples/batch_eval.py
+
+    Parameters:
+    query (str): The query input to the LLMs.
+    answer1 (str): Answer generated by an LLM.
+    answer2 (str): Answer generated by another LLM.
+    api_key (str): API key to use when invoking the evaluating LLM.
+    base_url (str): Base URL to use when invoking the evaluating LLM.
+    model (str): Model name to use when invoking the evaluating LLM.
+    language (str): Language of the explanations.
+    retries (int): Number of retries.
+
+    Returns:
+    dict: The parsed evaluation metrics generated by the evaluating LLM, or None if all retries fail.
+    """
+    from openai import OpenAI
+
+    sys_prompt = """
+    ---Role---
+    You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
+    """
+    prompt = f"""
+    You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
+
+    - **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question?
+    - **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question?
+    - **Empowerment**: How well does the answer help the reader understand and make informed judgments about the topic?
+
+    For each criterion, give each answer a score between 0 and 10, choose the better answer (either Answer 1 or Answer 2) and explain why.
+    Then, give each answer an overall score between 0 and 10, and select an overall winner based on these three categories.
+
+    Here is the question:
+    {query}
+
+    Here are the two answers:
+
+    **Answer 1:**
+    {answer1}
+
+    **Answer 2:**
+    {answer2}
+
+    Evaluate both answers using the three criteria listed above and provide detailed explanations for each criterion.
+ + Output your evaluation in the following JSON format: + + {{ + "Comprehensiveness": {{ + "Score 1": [Score of Answer 1 - an integer between 0 and 10], + "Score 2": [Score of Answer 2 - an integer between 0 and 10], + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Provide explanation in {language} here]" + }}, + "Diversity": {{ + "Score 1": [Score of Answer 1 - an integer between 0 and 10], + "Score 2": [Score of Answer 2 - an integer between 0 and 10], + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Provide explanation in {language} here]" + }}, + "Empowerment": {{ + "Score 1": [Score of Answer 1 - an integer between 0 and 10], + "Score 2": [Score of Answer 2 - an integer between 0 and 10], + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Provide explanation in {language} here]" + }}, + "Overall": {{ + "Score 1": [Score of Answer 1 - an integer between 0 and 10], + "Score 2": [Score of Answer 2 - an integer between 0 and 10], + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Summarize why this answer is the overall winner based on the three criteria in {language}]" + }} + }} + """ + for index in range(retries): + content = None + try: + client = OpenAI(api_key=api_key, base_url=base_url) + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": sys_prompt}, + {"role": "user", "content": prompt}, + ], + ) + content = response.choices[0].message.content + if content.startswith("```json") and content.endswith("```"): + content = content[7:-3] + metrics = json.loads(content) + return metrics + except Exception: + if index == retries - 1: + message = ( + f"Comparing summarization answers failed.\n" + f"query: {query}\n" + f"answer1: {answer1}\n" + f"answer2: {answer2}\n" + f"content: {content}\n" + f"exception:\n{traceback.format_exc()}" + ) + print(message) + return None diff --git a/kag/common/benchmarks/evaluate.py b/kag/common/benchmarks/evaluate.py index 4b920f93..1a574627 100644 --- a/kag/common/benchmarks/evaluate.py +++ b/kag/common/benchmarks/evaluate.py @@ -1,22 +1,25 @@ - from typing import List +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor, as_completed from .evaUtils import get_em_f1 +from .evaUtils import compare_summarization_answers -class Evaluate(): +class Evaluate: """ provide evaluation for benchmarks, such as em、f1、answer_similarity, answer_correctness """ - def __init__(self, embedding_factory = "text-embedding-ada-002"): + + def __init__(self, embedding_factory="text-embedding-ada-002"): self.embedding_factory = embedding_factory def evaForSimilarity(self, predictionlist: List[str], goldlist: List[str]): """ evaluate the similarity between prediction and gold #TODO """ - # data_samples = { + # data_samples = { # 'question': [], # 'answer': predictionlist, # 'ground_truth': goldlist @@ -29,7 +32,6 @@ def evaForSimilarity(self, predictionlist: List[str], goldlist: List[str]): # return np.average(score.to_pandas()[['answer_similarity']]) return 0.0 - def getBenchMark(self, predictionlist: List[str], goldlist: List[str]): """ Calculates and returns evaluation metrics between predictions and ground truths. @@ -45,21 +47,113 @@ def getBenchMark(self, predictionlist: List[str], goldlist: List[str]): dict: Dictionary containing EM, F1 score, and answer similarity. 
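For orientation, a minimal sketch of how these benchmark metrics are consumed; the answer strings are invented placeholders:

from kag.common.benchmarks.evaluate import Evaluate

evaluator = Evaluate()
metrics = evaluator.getBenchMark(
    predictionlist=["Barack Obama", "yes"],
    goldlist=["Barack Obama", "no"],
)
# EM/F1 are averaged over the list; answer_similarity is currently stubbed to 0.0.
print(metrics)  # -> {'em': 0.5, 'f1': 0.5, 'answer_similarity': 0.0}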
""" # Initialize total metrics - total_metrics = {'em': 0.0, 'f1': 0.0, 'answer_similarity': 0.0} - + total_metrics = {"em": 0.0, "f1": 0.0, "answer_similarity": 0.0} + # Iterate over prediction and gold lists to calculate EM and F1 scores for prediction, gold in zip(predictionlist, goldlist): - em, f1 = get_em_f1(prediction, gold) # Call external function to calculate EM and F1 - total_metrics['em'] += em # Accumulate EM score - total_metrics['f1'] += f1 # Accumulate F1 score - + em, f1 = get_em_f1( + prediction, gold + ) # Call external function to calculate EM and F1 + total_metrics["em"] += em # Accumulate EM score + total_metrics["f1"] += f1 # Accumulate F1 score + # Calculate average EM and F1 scores - total_metrics['em'] /= len(predictionlist) - total_metrics['f1'] /= len(predictionlist) - + total_metrics["em"] /= len(predictionlist) + total_metrics["f1"] /= len(predictionlist) + # Call method to calculate answer similarity - total_metrics['answer_similarity'] = self.evaForSimilarity(predictionlist, goldlist) + total_metrics["answer_similarity"] = self.evaForSimilarity( + predictionlist, goldlist + ) # Return evaluation metrics dictionary return total_metrics + def getSummarizationMetrics( + self, + queries: List[str], + answers1: List[str], + answers2: List[str], + *, + api_key="EMPTY", + base_url="http://127.0.0.1:38080/v1", + model="gpt-4o-mini", + language="English", + retries=3, + max_workers=50, + ): + """ + Calculates and returns QFS (query-focused summarization) evaluation metrics + for the given queries, answers1 and answers2. + + This function evaluates the triple (query, answer1, answer2) by feeding it + into an evaluating LLM specified as `api_key`, `base_url` and `model`. + + Parameters: + queries (List[str]): List of queries. + answers1 (List[str]): List of answers generated by an LLM (LLM-1). + answers2 (List[str]): List of answers generated by another LLM (LLM-2). + api_key (str): API key to use when invoke the evaluating LLM. + base_url (str): base url to use when invoke the evaluating LLM. + model (str): model name to use when invoke the evaluating LLM. + language (str): language of the explanation + retries (int): number of retries + max_workers (int): number of workers + + Returns: + dict: Dictionary containing the average metrics and the responses + generated by the evaluating LLM. 
+ """ + responses = [None] * len(queries) + all_keys = "Comprehensiveness", "Diversity", "Empowerment", "Overall" + all_items = "Score 1", "Score 2" + average_metrics = {key: {item: 0.0 for item in all_items} for key in all_keys} + success_count = 0 + + def process_sample(index, query, answer1, answer2): + metrics = compare_summarization_answers( + query, + answer1, + answer2, + api_key=api_key, + base_url=base_url, + model=model, + language=language, + retries=retries, + ) + if metrics is None: + print( + f"fail to compare answers of query {index + 1}.\n" + f" query: {query}\n" + f" answer1: {answer1}\n" + f" answer2: {answer2}\n" + ) + else: + responses[index] = metrics + return metrics + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [ + executor.submit(process_sample, index, query, answer1, answer2) + for index, (query, answer1, answer2) in enumerate( + zip(queries, answers1, answers2) + ) + ] + for future in tqdm( + as_completed(futures), total=len(futures), desc="Evaluating: " + ): + metrics = future.result() + if metrics is not None: + for key in all_keys: + for item in all_items: + average_metrics[key][item] += metrics[key][item] + success_count += 1 + if success_count > 0: + for key in all_keys: + for item in all_items: + average_metrics[key][item] /= success_count + result = { + "average_metrics": average_metrics, + "responses": responses, + } + return result diff --git a/kag/common/llm/config/__init__.py b/kag/common/checkpointer/__init__.py similarity index 62% rename from kag/common/llm/config/__init__.py rename to kag/common/checkpointer/__init__.py index 9a3a13aa..d2deddb2 100644 --- a/kag/common/llm/config/__init__.py +++ b/kag/common/checkpointer/__init__.py @@ -9,15 +9,9 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. +from kag.common.checkpointer.base import CheckPointer, CheckpointerManager +from kag.common.checkpointer.txt_checkpointer import TxtCheckPointer +from kag.common.checkpointer.bin_checkpointer import BinCheckPointer -from kag.common.llm.config.openai import OpenAIConfig -from kag.common.llm.config.base import LLMConfig -from kag.common.llm.config.vllm import VLLMConfig -from kag.common.llm.config.ollama import OllamaConfig -__all__ = [ - "OpenAIConfig", - "LLMConfig", - "VLLMConfig", - "OllamaConfig" -] +__all__ = ["CheckPointer", "CheckpointerManager", "TxtCheckPointer", "BinCheckPointer"] diff --git a/kag/common/checkpointer/base.py b/kag/common/checkpointer/base.py new file mode 100644 index 00000000..17c2f6fd --- /dev/null +++ b/kag/common/checkpointer/base.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import os +import threading +from kag.common.registry import Registrable +from kag.common.utils import reset, bold, red, generate_hash_id + + +class CheckPointer(Registrable): + """ + A class for managing checkpoints in a distributed environment. 
+
+    This class provides methods to open, read, write, and close checkpoint files.
+    It is designed to handle checkpoints in a distributed setting, where multiple
+    processes may be writing checkpoints in parallel.
+
+    Attributes:
+        ckpt_file_name (str): The format string for checkpoint file names.
+    """
+
+    ckpt_file_name = "kag_checkpoint_{}_{}.ckpt"
+
+    def __init__(self, ckpt_dir: str, rank: int = 0, world_size: int = 1):
+        """
+        Initializes the CheckPointer with the given checkpoint directory, rank, and world size.
+
+        Args:
+            ckpt_dir (str): The directory where checkpoint files are stored.
+            rank (int): The rank of the current process (default is 0).
+            world_size (int): The total number of processes in the distributed environment (default is 1).
+        """
+        self._ckpt_dir = ckpt_dir
+        if not os.path.exists(ckpt_dir):
+            os.makedirs(ckpt_dir, exist_ok=True)
+        self.rank = rank
+        self.world_size = world_size
+        self._ckpt_file_path = os.path.join(
+            self._ckpt_dir, CheckPointer.ckpt_file_name.format(rank, world_size)
+        )
+        self._ckpt = self.open()
+        self._closed = False
+        if self.size() > 0:
+            print(
+                f"{bold}{red}Existing checkpoint found in {self._ckpt_dir}, with {self.size()} records.{reset}"
+            )
+
+    def open(self):
+        """
+        Opens the checkpoint file and returns the checkpoint object.
+
+        Returns:
+            Any: The checkpoint object, which can be used for reading and writing.
+        """
+        raise NotImplementedError("open not implemented yet.")
+
+    def read_from_ckpt(self, key):
+        """
+        Reads a value from the checkpoint file using the specified key.
+
+        Args:
+            key (str): The key to retrieve the value from the checkpoint.
+
+        Returns:
+            Any: The value associated with the key in the checkpoint.
+        """
+        raise NotImplementedError("read_from_ckpt not implemented yet.")
+
+    def write_to_ckpt(self, key, value):
+        """
+        Writes a value to the checkpoint file using the specified key.
+
+        Args:
+            key (str): The key to store the value in the checkpoint.
+            value (Any): The value to be stored in the checkpoint.
+        """
+        raise NotImplementedError("write_to_ckpt not implemented yet.")
+
+    def _close(self):
+        """
+        Closes the checkpoint file.
+        """
+        raise NotImplementedError("close not implemented yet.")
+
+    def close(self):
+        """
+        Closes the checkpoint file.
+        """
+        if not self._closed:
+            self._close()
+            self._closed = True
+
+    def exists(self, key):
+        """
+        Checks if a key exists in the checkpoint file.
+
+        Args:
+            key (str): The key to check for existence in the checkpoint.
+
+        Returns:
+            bool: True if the key exists in the checkpoint, False otherwise.
+        """
+        raise NotImplementedError("exists not implemented yet.")
+
+    def keys(self):
+        """
+        Returns the key set contained in the checkpoint file.
+
+        Returns:
+            set: The key set contained in the checkpoint.
+        """
+
+        raise NotImplementedError("keys not implemented yet.")
+
+    def size(self):
+        """
+        Returns the number of records in the checkpoint file.
+
+        Returns:
+            int: The number of records in the checkpoint file.
+        """
+
+        raise NotImplementedError("size not implemented yet.")
+
+    def __contains__(self, key):
+        """
+        Defines the behavior of the `in` operator for the object.
+
+        Args:
+            key (str): The key to check for existence in the checkpoint.
+
+        Returns:
+            bool: True if the key exists in the checkpoint, False otherwise.
+        """
+
+        return self.exists(key)
+
+
+class CheckpointerManager:
+    """
+    Manages the lifecycle of CheckPointer objects.
+
+    This class provides a thread-safe mechanism to retrieve and close CheckPointer
+    instances based on a configuration.
It uses a global dictionary to cache + CheckPointer objects, ensuring that each configuration corresponds to a unique + instance. + """ + + _CKPT_OBJS = {} + _LOCK = threading.Lock() + + @staticmethod + def get_checkpointer(config): + """ + Retrieves or creates a CheckPointer instance based on the provided configuration. + + Args: + config (dict): The configuration used to initialize the CheckPointer. + + Returns: + CheckPointer: A CheckPointer instance corresponding to the configuration. + """ + with CheckpointerManager._LOCK: + key = generate_hash_id(config) + if key not in CheckpointerManager._CKPT_OBJS: + ckpter = CheckPointer.from_config(config) + CheckpointerManager._CKPT_OBJS[key] = ckpter + return CheckpointerManager._CKPT_OBJS[key] + + @staticmethod + def close(): + """ + Closes all cached CheckPointer instances. + + This method iterates through all cached CheckPointer objects and calls their + `close` method to release resources. After calling this method, the cache + will be cleared. + """ + with CheckpointerManager._LOCK: + for v in CheckpointerManager._CKPT_OBJS.values(): + v.close() + CheckpointerManager._CKPT_OBJS.clear() diff --git a/kag/common/checkpointer/bin_checkpointer.py b/kag/common/checkpointer/bin_checkpointer.py new file mode 100644 index 00000000..e247972a --- /dev/null +++ b/kag/common/checkpointer/bin_checkpointer.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import shelve +import logging +import transaction +import threading +import pickle +import BTrees.OOBTree +from ZODB import DB +from ZODB.FileStorage import FileStorage +from kag.common.checkpointer.base import CheckPointer + +logger = logging.getLogger() + + +@CheckPointer.register("bin") +class BinCheckPointer(CheckPointer): + """ + A subclass of CheckPointer that uses shelve for binary checkpoint management. + + This class extends the CheckPointer class to provide binary checkpoint + management using the shelve module. It supports opening, reading, writing, + and closing checkpoint files in a binary format. + """ + + def open(self): + """ + Opens the checkpoint file using shelve in writeback mode. + + Returns: + Any: The shelve object representing the checkpoint file. + """ + return shelve.open(self._ckpt_file_path, "c", writeback=True) + + def exists(self, key): + """ + Checks if a key exists in the checkpoint file. + + Args: + key (str): The key to check for existence in the checkpoint. + + Returns: + bool: True if the key exists in the checkpoint, False otherwise. + """ + return key in self._ckpt + + def read_from_ckpt(self, key): + """ + Reads a value from the checkpoint file using the specified key. + + Args: + key (str): The key to retrieve the value from the checkpoint. + + Returns: + Any: The value associated with the key in the checkpoint. + """ + return self._ckpt[key] + + def write_to_ckpt(self, key, value): + """ + Writes a value to the checkpoint file using the specified key. + + Args: + key (str): The key to store the value in the checkpoint. + value (Any): The value to be stored in the checkpoint. 
+        """
+        self._ckpt[key] = value
+        self._ckpt.sync()
+
+    def _close(self):
+        """
+        Closes the checkpoint file and ensures data is written to disk.
+        """
+        self._ckpt.sync()
+        self._ckpt.close()
+
+    def size(self):
+        """
+        Returns the number of entries in the checkpoint.
+
+        Returns:
+            int: The number of entries in the checkpoint.
+        """
+
+        return len(self._ckpt)
+
+    def keys(self):
+        return set(self._ckpt.keys())
+
+
+@CheckPointer.register("zodb")
+class ZODBCheckPointer(CheckPointer):
+    """
+    A CheckPointer implementation that uses ZODB as the underlying storage.
+
+    This class provides methods to open, read, write, and close checkpoints using ZODB.
+    """
+
+    def __init__(self, ckpt_dir: str, rank: int = 0, world_size: int = 1):
+        """
+        Initializes the ZODBCheckPointer with the given checkpoint directory, rank, and world size.
+
+        Args:
+            ckpt_dir (str): The directory where checkpoint files are stored.
+            rank (int): The rank of the current process (default is 0).
+            world_size (int): The total number of processes in the distributed environment (default is 1).
+        """
+        self._lock = threading.Lock()
+        super().__init__(ckpt_dir, rank, world_size)
+
+    def open(self):
+        """
+        Opens the ZODB database and ensures the checkpoint data tree exists.
+
+        Returns:
+            DB: The ZODB database object used for checkpoint storage.
+        """
+        with self._lock:
+            storage = FileStorage(self._ckpt_file_path)
+            db = DB(storage)
+            with db.transaction() as conn:
+                if not hasattr(conn.root, "data"):
+                    conn.root.data = BTrees.OOBTree.BTree()
+            return db
+
+    def read_from_ckpt(self, key):
+        """
+        Reads a value from the checkpoint using the specified key.
+
+        Args:
+            key (str): The key to retrieve the value from the checkpoint.
+
+        Returns:
+            Any: The value associated with the key in the checkpoint.
+        """
+        with self._lock:
+            with self._ckpt.transaction() as conn:
+                obj = conn.root.data.get(key, None)
+                if obj:
+                    return pickle.loads(obj)
+                else:
+                    return None
+
+    def write_to_ckpt(self, key, value):
+        """
+        Writes a value to the checkpoint using the specified key.
+        By default, ZODB tracks modifications to the written object (value) and
+        continuously synchronizes these changes to the storage. For example, if
+        the value is a `SubGraph` object, subsequent modifications to its
+        attributes will be synchronized, which is not what we expect.
+        Therefore, we use `pickle` to serialize the value object before writing it,
+        ensuring that the object behaves as an immutable object.
+
+        Args:
+            key (str): The key to store the value in the checkpoint.
+            value (Any): The value to be stored in the checkpoint.
+        """
+        with self._lock:
+            try:
+                with self._ckpt.transaction() as conn:
+                    conn.root.data[key] = pickle.dumps(value)
+            except Exception as e:
+                logger.warning(f"failed to write checkpoint {key} to db, info: {e}")
+
+    def _close(self):
+        """
+        Closes the ZODB database connection.
+        """
+        with self._lock:
+            try:
+                transaction.commit()
+            except Exception:
+                transaction.abort()
+            if self._ckpt is not None:
+                self._ckpt.close()
+
+    def exists(self, key):
+        """
+        Checks if a key exists in the checkpoint.
+
+        Args:
+            key (str): The key to check for existence in the checkpoint.
+
+        Returns:
+            bool: True if the key exists in the checkpoint, False otherwise.
+        """
+        with self._lock:
+            with self._ckpt.transaction() as conn:
+                return key in conn.root.data
+
+    def size(self):
+        """
+        Returns the number of entries in the checkpoint.
+ + This method calculates the size of the checkpoint by counting the number + of keys stored in the checkpoint's data dictionary. It ensures thread-safe + access to the checkpoint by using a lock. + + Returns: + int: The number of entries in the checkpoint. + """ + with self._lock: + with self._ckpt.transaction() as conn: + return len(conn.root.data) + + def keys(self): + with self._lock: + with self._ckpt.transaction() as conn: + return set(conn.root.data.keys()) diff --git a/kag/common/checkpointer/ckpt/kag_checkpoint_0_1.ckpt.db b/kag/common/checkpointer/ckpt/kag_checkpoint_0_1.ckpt.db new file mode 100644 index 00000000..71e41cd7 Binary files /dev/null and b/kag/common/checkpointer/ckpt/kag_checkpoint_0_1.ckpt.db differ diff --git a/kag/common/checkpointer/txt_checkpointer.py b/kag/common/checkpointer/txt_checkpointer.py new file mode 100644 index 00000000..5f58afde --- /dev/null +++ b/kag/common/checkpointer/txt_checkpointer.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import os +import json +from kag.common.checkpointer.base import CheckPointer + + +@CheckPointer.register("txt") +class TxtCheckPointer(CheckPointer): + """ + A subclass of CheckPointer that uses a text file for checkpoint management. + + This class extends the CheckPointer class to provide checkpoint management + using a text file. It supports opening, reading, writing, and closing + checkpoint files in a text format. Each checkpoint entry is stored as a + JSON object in the file. + """ + + def open(self): + """ + Opens the checkpoint file and loads existing data into a dictionary. + + Returns: + dict: A dictionary containing the checkpoint data. + """ + ckpt = {} + if os.path.exists(self._ckpt_file_path): + with open(self._ckpt_file_path, "r") as reader: + for line in reader: + data = json.loads(line) + ckpt[data["id"]] = data["value"] + self._writer = open(self._ckpt_file_path, "a") + return ckpt + + def exists(self, key): + """ + Checks if a key exists in the checkpoint file. + + Args: + key (str): The key to check for existence in the checkpoint. + + Returns: + bool: True if the key exists in the checkpoint, False otherwise. + """ + return key in self._ckpt + + def read_from_ckpt(self, key): + """ + Reads a value from the checkpoint file using the specified key. + + Args: + key (str): The key to retrieve the value from the checkpoint. + + Returns: + Any: The value associated with the key in the checkpoint. + """ + return self._ckpt[key] + + def write_to_ckpt(self, key, value): + """ + Writes a value to the checkpoint file using the specified key. + + Args: + key (str): The key to store the value in the checkpoint. + value (Any): The value to be stored in the checkpoint. + """ + self._ckpt[key] = value + self._writer.write(json.dumps({"id": key, "value": value}, ensure_ascii=False)) + self._writer.write("\n") + self._writer.flush() + + def _close(self): + """ + Closes the checkpoint file and ensures data is written to disk. 
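Taken together, a typical round trip through the checkpointer registry might look like the sketch below. The config dict follows the Registrable convention used elsewhere in KAG, where "type" selects the registered subclass ("txt", "bin" or "zodb"); that key name is an assumption to verify against your Registrable setup.

from kag.common.checkpointer import CheckpointerManager

ckpt = CheckpointerManager.get_checkpointer({"type": "txt", "ckpt_dir": "ckpt"})
if "chunk-42" not in ckpt:          # __contains__ delegates to exists()
    ckpt.write_to_ckpt("chunk-42", {"status": "done"})
print(ckpt.read_from_ckpt("chunk-42"))  # -> {'status': 'done'}
CheckpointerManager.close()             # closes and clears all cached instances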
+        """
+        self._writer.flush()
+        self._writer.close()
+
+    def size(self):
+        return len(self._ckpt)
+
+    def keys(self):
+        return set(self._ckpt.keys())
diff --git a/kag/common/conf.py b/kag/common/conf.py
new file mode 100644
index 00000000..044b8faa
--- /dev/null
+++ b/kag/common/conf.py
@@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+# Copyright 2023 OpenSPG Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied.
+import copy
+import os
+import logging
+import yaml
+import json
+import pprint
+from pathlib import Path
+from typing import Union, Optional
+
+from knext.project.client import ProjectClient
+
+
+class KAGConstants(object):
+    LOCAL_SCHEMA_URL = "http://localhost:8887"
+    DEFAULT_KAG_CONFIG_FILE_NAME = "default_config.yaml"
+    KAG_CONFIG_FILE_NAME = "kag_config.yaml"
+    DEFAULT_KAG_CONFIG_PATH = os.path.join(
+        os.path.dirname(__file__), DEFAULT_KAG_CONFIG_FILE_NAME
+    )
+    KAG_CFG_PREFIX = "KAG"
+    GLOBAL_CONFIG_KEY = "global"
+    PROJECT_CONFIG_KEY = "project"
+    KAG_NAMESPACE_KEY = "namespace"
+    KAG_PROJECT_ID_KEY = "id"
+    KAG_PROJECT_HOST_ADDR_KEY = "host_addr"
+    KAG_LANGUAGE_KEY = "language"
+    KAG_CKPT_DIR_KEY = "checkpoint_path"
+    KAG_BIZ_SCENE_KEY = "biz_scene"
+    ENV_KAG_PROJECT_ID = "KAG_PROJECT_ID"
+    ENV_KAG_PROJECT_HOST_ADDR = "KAG_PROJECT_HOST_ADDR"
+    ENV_KAG_DEBUG_DUMP_CONFIG = "KAG_DEBUG_DUMP_CONFIG"
+    KAG_SIMILAR_EDGE_NAME = "similar"
+
+    KS8_ENV_TF_CONFIG = "TF_CONFIG"
+    K8S_ENV_MASTER_ADDR = "MASTER_ADDR"
+    K8S_ENV_MASTER_PORT = "MASTER_PORT"
+    K8S_ENV_WORLD_SIZE = "WORLD_SIZE"
+    K8S_ENV_RANK = "RANK"
+    K8S_ENV_POD_NAME = "POD_NAME"
+
+
+class KAGGlobalConf:
+    def __init__(self):
+        self._extra = {}
+
+    def initialize(self, **kwargs):
+        self.project_id = kwargs.pop(
+            KAGConstants.KAG_PROJECT_ID_KEY,
+            os.getenv(KAGConstants.ENV_KAG_PROJECT_ID, "1"),
+        )
+        self.host_addr = kwargs.pop(
+            KAGConstants.KAG_PROJECT_HOST_ADDR_KEY,
+            os.getenv(KAGConstants.ENV_KAG_PROJECT_HOST_ADDR, "http://127.0.0.1:8887"),
+        )
+        self.biz_scene = kwargs.pop(KAGConstants.KAG_BIZ_SCENE_KEY, "default")
+        self.language = kwargs.pop(KAGConstants.KAG_LANGUAGE_KEY, "en")
+        self.namespace = kwargs.pop(KAGConstants.KAG_NAMESPACE_KEY, None)
+        self.ckpt_dir = kwargs.pop(KAGConstants.KAG_CKPT_DIR_KEY, "ckpt")
+
+        # drop the previously set extra attributes, then set the remaining
+        # configs as instance attributes directly
+        for k in self._extra.keys():
+            if hasattr(self, k):
+                delattr(self, k)
+
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+        self._extra = kwargs
+
+        print(
+            f"Done initializing project config with host_addr {self.host_addr} and project_id {self.project_id}"
+        )
+
+
+def _closest_cfg(
+    path: Union[str, os.PathLike] = ".",
+    prev_path: Optional[Union[str, os.PathLike]] = None,
+) -> str:
+    """
+    Return the path to the closest kag_config.yaml file by traversing the current
+    directory and its parents
+    """
+    if prev_path is not None and str(path) == str(prev_path):
+        return ""
+    path = Path(path).resolve()
+    cfg_file = path / KAGConstants.KAG_CONFIG_FILE_NAME
+    if cfg_file.exists():
+        return str(cfg_file)
+    return _closest_cfg(path.parent, path)
+
+
+def load_config(prod: bool = False):
+    """
+    Load the KAG config as a dict, either from the KAG server (in production mode) or from the closest kag_config.yaml file.
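For reference, a hypothetical minimal kag_config.yaml covering the keys that `KAGGlobalConf.initialize` and `init_log_config` consume (all values are placeholders); it is parsed here with `yaml.safe_load` just to show the resulting dict shape:

import yaml

config = yaml.safe_load(
    """
project:
  id: "1"
  host_addr: "http://127.0.0.1:8887"
  namespace: "MyKB"
  language: "en"
  biz_scene: "default"
  checkpoint_path: "ckpt"
log:
  level: "INFO"
"""
)
print(config["project"]["host_addr"])  # -> http://127.0.0.1:8887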
+ """ + if prod: + project_id = os.getenv(KAGConstants.ENV_KAG_PROJECT_ID) + host_addr = os.getenv(KAGConstants.ENV_KAG_PROJECT_HOST_ADDR) + project_client = ProjectClient(host_addr=host_addr) + project = project_client.get_by_id(project_id) + config = json.loads(project.config) + if "project" not in config: + config["project"] = { + KAGConstants.KAG_PROJECT_ID_KEY: project_id, + KAGConstants.KAG_PROJECT_HOST_ADDR_KEY: host_addr, + KAGConstants.KAG_NAMESPACE_KEY: project.namespace, + } + prompt_config = config.pop("prompt", {}) + for key in [KAGConstants.KAG_LANGUAGE_KEY, KAGConstants.KAG_BIZ_SCENE_KEY]: + if key in prompt_config: + config["project"][key] = prompt_config[key] + if "vectorizer" in config and "vectorize_model" not in config: + config["vectorize_model"] = config["vectorizer"] + return config + else: + config_file = _closest_cfg() + if os.path.exists(config_file) and os.path.isfile(config_file): + print(f"found config file: {config_file}") + with open(config_file, "r") as reader: + config = reader.read() + return yaml.safe_load(config) + else: + return {} + + +class KAGConfigMgr: + def __init__(self): + self.config = {} + self.global_config = KAGGlobalConf() + self._is_initialized = False + + def init_log_config(self, config): + log_conf = config.get("log", {}) + if log_conf: + log_level = log_conf.get("level", "INFO") + else: + log_level = "INFO" + logging.basicConfig(level=logging.getLevelName(log_level)) + logging.getLogger("neo4j.notifications").setLevel(logging.ERROR) + logging.getLogger("neo4j.io").setLevel(logging.INFO) + logging.getLogger("neo4j.pool").setLevel(logging.INFO) + + def initialize(self, prod: bool = True): + config = load_config(prod) + if self._is_initialized: + print( + "Reinitialize the KAG configuration, an operation that should exclusively be triggered within the Java invocation context." 
+ ) + print(f"original config: {self.config}") + print(f"new config: {config}") + self.prod = prod + self.config = config + global_config = self.config.get(KAGConstants.PROJECT_CONFIG_KEY, {}) + self.global_config.initialize(**global_config) + self.init_log_config(self.config) + self._is_initialized = True + + @property + def all_config(self): + return copy.deepcopy(self.config) + + +KAG_CONFIG = KAGConfigMgr() + +KAG_PROJECT_CONF = KAG_CONFIG.global_config + + +def init_env(): + project_id = os.getenv(KAGConstants.ENV_KAG_PROJECT_ID) + host_addr = os.getenv(KAGConstants.ENV_KAG_PROJECT_HOST_ADDR) + if project_id and host_addr: + prod = True + else: + prod = False + global KAG_CONFIG + KAG_CONFIG.initialize(prod) + + if prod: + msg = "Done init config from server" + else: + msg = "Done init config from local file" + os.environ[KAGConstants.ENV_KAG_PROJECT_ID] = str(KAG_PROJECT_CONF.project_id) + os.environ[KAGConstants.ENV_KAG_PROJECT_HOST_ADDR] = str(KAG_PROJECT_CONF.host_addr) + if len(KAG_CONFIG.all_config) > 0: + dump_flag = os.getenv(KAGConstants.ENV_KAG_DEBUG_DUMP_CONFIG) + if dump_flag is not None and dump_flag.strip() == "1": + print(f"{msg}:") + pprint.pprint(KAG_CONFIG.all_config, indent=2) + else: + print( + f"{msg}: set {KAGConstants.ENV_KAG_DEBUG_DUMP_CONFIG}=1 to dump config" + ) + else: + print("No config found.") diff --git a/kag/common/default_config.cfg b/kag/common/default_config.cfg deleted file mode 100644 index 04de60db..00000000 --- a/kag/common/default_config.cfg +++ /dev/null @@ -1,33 +0,0 @@ - -[project] -with_server = True -host_addr = http://127.0.0.1:8887 - -[vectorizer] -vectorizer = kag.common.vectorizer.OpenAIVectorizer -model = bge-m3 -api_key = EMPTY -base_url = http://127.0.0.1:11434/v1 -vector_dimensions = 1024 - -[llm] -client_type = ollama -base_url = http://localhost:11434/api/generate -model = llama3.1 - - -[indexer] -with_semantic = False -similarity_threshold = 0.8 - -[retriever] -with_semantic = False -pagerank_threshold = 0.9 -match_threshold = 0.8 -top_k = 10 - -[schedule] -interval_minutes = -1 - -[log] -level = INFO \ No newline at end of file diff --git a/kag/common/env.py b/kag/common/env.py index 60e9907f..916726de 100644 --- a/kag/common/env.py +++ b/kag/common/env.py @@ -1,117 +1,145 @@ # -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. -import logging import os -import sys -from configparser import ConfigParser as CP -from pathlib import Path -from typing import Union, Optional - -import kag.common as common - -class ConfigParser(CP): - def __init__(self,defaults=None): - CP.__init__(self,defaults=defaults) - def optionxform(self, optionstr): - return optionstr - - -LOCAL_SCHEMA_URL = "http://localhost:8887" -DEFAULT_KAG_CONFIG_FILE_NAME = "default_config.cfg" -DEFAULT_KAG_CONFIG_PATH = os.path.join(common.__path__[0], DEFAULT_KAG_CONFIG_FILE_NAME) -KAG_CFG_PREFIX = "KAG" - - -def init_env(): - """Initialize environment to use command-line tool from inside a project - dir. This sets the Scrapy settings module and modifies the Python path to - be able to locate the project module. 
- """ - project_cfg, root_path = get_config() - - init_kag_config(Path(root_path) / "kag_config.cfg") - - -def get_config(): - """ - Get kag config file as a ConfigParser. - """ - local_cfg_path = _closest_cfg() - local_cfg = ConfigParser() - local_cfg.read(local_cfg_path) - - projdir = "" - if local_cfg_path: - projdir = str(Path(local_cfg_path).parent) - if projdir not in sys.path: - sys.path.append(projdir) - - return local_cfg, projdir - - -def _closest_cfg( - path: Union[str, os.PathLike] = ".", - prev_path: Optional[Union[str, os.PathLike]] = None, -) -> str: - """ - Return the path to the closest .kag.cfg file by traversing the current - directory and its parents - """ - if prev_path is not None and str(path) == str(prev_path): - return "" - path = Path(path).resolve() - cfg_file = path / "kag_config.cfg" - if cfg_file.exists(): - return str(cfg_file) - return _closest_cfg(path.parent, path) - - -def get_cfg_files(): - """ - Get global and local kag config files and paths. - """ - local_cfg_path = _closest_cfg() - local_cfg = ConfigParser() - local_cfg.read(local_cfg_path) - - if local_cfg_path: - projdir = str(Path(local_cfg_path).parent) - if projdir not in sys.path: - sys.path.append(projdir) - - return local_cfg, local_cfg_path - - - -def init_kag_config(config_path: Union[str, Path] = None): - if not config_path or isinstance(config_path, Path) and not config_path.exists(): - config_path = DEFAULT_KAG_CONFIG_PATH - kag_cfg = ConfigParser() - kag_cfg.read(config_path) - os.environ["KAG_PROJECT_ROOT_PATH"] = os.path.abspath(os.path.dirname(config_path)) - - for section in kag_cfg.sections(): - sec_cfg = {} - for key, value in kag_cfg.items(section): - item_cfg_key = f"{KAG_CFG_PREFIX}_{section}_{key}".upper() - os.environ[item_cfg_key] = value - sec_cfg[key] = value - sec_cfg_key = f"{KAG_CFG_PREFIX}_{section}".upper() - os.environ[sec_cfg_key] = str(sec_cfg) - if section == "log": - for key, value in kag_cfg.items(section): - if key == "level": - logging.basicConfig(level=logging.getLevelName(value)) - # neo4j log level set to be default error - logging.getLogger("neo4j.notifications").setLevel(logging.ERROR) - logging.getLogger("neo4j.io").setLevel(logging.INFO) - logging.getLogger("neo4j.pool").setLevel(logging.INFO) +import json +import time +import datetime +import socket +import traceback +from kag.common.conf import KAGConstants + + +def parse_tf_config(): + tf_config_str = os.environ.get(KAGConstants.KS8_ENV_TF_CONFIG, None) + if tf_config_str is None: + return None + else: + return json.loads(tf_config_str) + + +def get_role_number(config, role_name): + role_info = config["cluster"].get(role_name, None) + if role_info is None: + return 0 + else: + return len(role_info) + + +def get_rank(default=None): + if KAGConstants.K8S_ENV_RANK in os.environ: + return int(os.environ[KAGConstants.K8S_ENV_RANK]) + + tf_config = parse_tf_config() + if tf_config is None: + return default + + num_master = get_role_number(tf_config, "master") + task_type = tf_config["task"]["type"] + task_index = tf_config["task"]["index"] + if task_type == "master": + rank = task_index + elif task_type == "worker": + rank = num_master + task_index + else: + rank = default + + return rank + + +def get_world_size(default=None): + if KAGConstants.K8S_ENV_WORLD_SIZE in os.environ: + return os.environ[KAGConstants.K8S_ENV_WORLD_SIZE] + + tf_config = parse_tf_config() + if tf_config is None: + return default + + num_master = get_role_number(tf_config, "master") + num_worker = get_role_number(tf_config, 
"worker") + + return num_master + num_worker + + +def get_master_port(default=None): + return os.environ.get(KAGConstants.K8S_ENV_MASTER_PORT, default) + + +def get_master_addr(default=None): + if KAGConstants.K8S_ENV_MASTER_ADDR in os.environ: + return os.environ[KAGConstants.K8S_ENV_MASTER_ADDR] + + tf_config = parse_tf_config() + if tf_config is None: + return default + + return tf_config["cluster"]["worker"][0] + + +def host2tensor(master_port): + import torch + + host_str = socket.gethostbyname(socket.gethostname()) + host = [int(x) for x in host_str.split(".")] + host.append(int(master_port)) + host_tensor = torch.tensor(host) + return host_tensor + + +def tensor2host(host_tensor): + host_tensor = host_tensor.tolist() + host = ".".join([str(x) for x in host_tensor[0:4]]) + port = host_tensor[4] + return f"{host}:{port}" + + +def sync_hosts(): + import torch + import torch.distributed as dist + + rank = get_rank() + if rank is None: + raise ValueError("can't get rank of container") + rank = int(rank) + + world_size = get_world_size() + if world_size is None: + raise ValueError("can't get world_size of container") + world_size = int(world_size) + + master_port = get_master_port() + if master_port is None: + raise ValueError("can't get master_port of container") + master_port = int(master_port) + + while True: + try: + dist.init_process_group( + backend="gloo", + rank=rank, + world_size=world_size, + timeout=datetime.timedelta(days=1), + ) + break + except Exception as e: + error_traceback = traceback.format_exc() + print(f"failed to init process group, info: {e}\n\n\n{error_traceback}") + time.sleep(60) + print("Done init process group, get all hosts...") + host_tensors = [torch.tensor([0, 0, 0, 0, 0]) for x in range(world_size)] + dist.all_gather(host_tensors, host2tensor(master_port)) + # we need to destory torch process group to release MASTER_PORT, otherwise the server + # can't serving on it . + print("Done get all hosts, destory process group...") + dist.destroy_process_group() + time.sleep(10) + return [tensor2host(x) for x in host_tensors] + + +def extract_job_name_from_pod_name(pod_name): + if "-ptjob" in pod_name: + return pod_name.rsplit("-ptjob", maxsplit=1)[0] + elif "-tfjob" in pod_name: + return pod_name.rsplit("-tfjob", maxsplit=1)[0] + elif "-mpijob" in pod_name: + return pod_name.rsplit("-mpijob", maxsplit=1)[0] + else: + return None diff --git a/kag/common/graphstore/graph_store.py b/kag/common/graphstore/graph_store.py index 8877ad2b..1cc65f83 100644 --- a/kag/common/graphstore/graph_store.py +++ b/kag/common/graphstore/graph_store.py @@ -49,7 +49,9 @@ def upsert_node(self, label, properties, id_key="id", extra_labels=("Entity",)): pass @abstractmethod - def upsert_nodes(self, label, properties_list, id_key="id", extra_labels=("Entity",)): + def upsert_nodes( + self, label, properties_list, id_key="id", extra_labels=("Entity",) + ): """ Insert or update multiple nodes. @@ -112,10 +114,18 @@ def delete_nodes(self, label, id_values, id_key="id"): pass @abstractmethod - def upsert_relationship(self, start_node_label, start_node_id_value, - end_node_label, end_node_id_value, - rel_type, properties, upsert_nodes=True, - start_node_id_key="id", end_node_id_key="id"): + def upsert_relationship( + self, + start_node_label, + start_node_id_value, + end_node_label, + end_node_id_value, + rel_type, + properties, + upsert_nodes=True, + start_node_id_key="id", + end_node_id_key="id", + ): """ Insert or update a relationship. 
@@ -133,9 +143,16 @@ def upsert_relationship(self, start_node_label, start_node_id_value, pass @abstractmethod - def upsert_relationships(self, start_node_label, end_node_label, rel_type, - relationships, upsert_nodes=True, start_node_id_key="id", - end_node_id_key="id"): + def upsert_relationships( + self, + start_node_label, + end_node_label, + rel_type, + relationships, + upsert_nodes=True, + start_node_id_key="id", + end_node_id_key="id", + ): """ Insert or update multiple relationships. @@ -151,9 +168,16 @@ def upsert_relationships(self, start_node_label, end_node_label, rel_type, pass @abstractmethod - def delete_relationship(self, start_node_label, start_node_id_value, - end_node_label, end_node_id_value, - rel_type, start_node_id_key="id", end_node_id_key="id"): + def delete_relationship( + self, + start_node_label, + start_node_id_value, + end_node_label, + end_node_id_value, + rel_type, + start_node_id_key="id", + end_node_id_key="id", + ): """ Delete a specified relationship. @@ -169,9 +193,16 @@ def delete_relationship(self, start_node_label, start_node_id_value, pass @abstractmethod - def delete_relationships(self, start_node_label, start_node_id_values, - end_node_label, end_node_id_values, rel_type, - start_node_id_key="id", end_node_id_key="id"): + def delete_relationships( + self, + start_node_label, + start_node_id_values, + end_node_label, + end_node_id_values, + rel_type, + start_node_id_key="id", + end_node_id_key="id", + ): """ Delete multiple relationships. @@ -211,9 +242,16 @@ def create_text_index(self, labels, property_keys, index_name=None): pass @abstractmethod - def create_vector_index(self, label, property_key, index_name=None, - vector_dimensions=768, metric_type="cosine", - hnsw_m=None, hnsw_ef_construction=None): + def create_vector_index( + self, + label, + property_key, + index_name=None, + vector_dimensions=768, + metric_type="cosine", + hnsw_m=None, + hnsw_ef_construction=None, + ): """ Create a vector index. @@ -239,7 +277,9 @@ def delete_index(self, index_name): pass @abstractmethod - def text_search(self, query_string, label_constraints=None, topk=10, index_name=None): + def text_search( + self, query_string, label_constraints=None, topk=10, index_name=None + ): """ Perform a text search. @@ -255,7 +295,15 @@ def text_search(self, query_string, label_constraints=None, topk=10, index_name= pass @abstractmethod - def vector_search(self, label, property_key, query_text_or_vector, topk=10, index_name=None, ef_search=None): + def vector_search( + self, + label, + property_key, + query_text_or_vector, + topk=10, + index_name=None, + ef_search=None, + ): """ Perform a vector search. diff --git a/kag/common/graphstore/neo4j_graph_store.py b/kag/common/graphstore/neo4j_graph_store.py index 33b46d9d..97bd5c47 100644 --- a/kag/common/graphstore/neo4j_graph_store.py +++ b/kag/common/graphstore/neo4j_graph_store.py @@ -10,7 +10,6 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
import logging -import os import re import threading import time @@ -25,18 +24,20 @@ logger = logging.getLogger(__name__) + class SingletonMeta(ABCMeta): """ Thread-safe Singleton metaclass """ + _instances = {} _lock = threading.Lock() def __call__(cls, *args, **kwargs): - uri = kwargs.get('uri') - user = kwargs.get('user') - password = kwargs.get('password') - database = kwargs.get('database', 'neo4j') + uri = kwargs.get("uri") + user = kwargs.get("user") + password = kwargs.get("password") + database = kwargs.get("database", "neo4j") key = (cls, uri, user, password, database) with cls._lock: @@ -46,12 +47,19 @@ def __call__(cls, *args, **kwargs): class Neo4jClient(GraphStore, metaclass=SingletonMeta): - - def __init__(self, uri, user, password, database="neo4j", init_type="write", interval_minutes=10): + def __init__( + self, + uri, + user, + password, + database="neo4j", + init_type="write", + interval_minutes=10, + ): self._driver = GraphDatabase.driver(uri, auth=(user, password)) logger.info(f"init Neo4jClient uri: {uri} database: {database}") self._database = database - self._lucene_special_chars = "\\+-!():^[]\"{}~*?|&/" + self._lucene_special_chars = '\\+-!():^[]"{}~*?|&/' self._lucene_pattern = self._get_lucene_pattern() self._simple_ident = "[A-Za-z_][A-Za-z0-9_]*" self._simple_ident_pattern = re.compile(self._simple_ident) @@ -71,14 +79,16 @@ def close(self): self._driver.close() def schedule_constraint(self, interval_minutes): - def job(): try: self._labels = self._create_unique_constraint() self._update_pagerank_graph() except Exception as e: import traceback - logger.error(f"Error run scheduled job: {traceback.format_exc()}") + + logger.error( + f"Error run scheduled job, info: {e},\ntraceback:\n {traceback.format_exc()}" + ) def run_scheduled_tasks(): while True: @@ -116,7 +126,9 @@ def _create_unique_index_constraint(self, label, session): try: result = session.run(create_constraint_query) result.consume() - logger.debug(f"Unique constraint created for constraint_name: {constraint_name}") + logger.debug( + f"Unique constraint created for constraint_name: {constraint_name}" + ) except Exception as e: logger.debug(f"warn creating constraint for {constraint_name}: {e}") self._create_index_constraint(self, label, session) @@ -186,7 +198,12 @@ def _collect_text_index_info(self, schema_types): label_property_keys = {} for property_key in properties: index_type = properties[property_key].index_type - if property_key == "name" or index_type and index_type in (IndexTypeEnum.Text, IndexTypeEnum.TextAndVector): + if ( + property_key == "name" + or index_type + and index_type + in (IndexTypeEnum.Text, IndexTypeEnum.TextAndVector) + ): label_property_keys[property_key] = True if label_property_keys: labels[label] = True @@ -199,9 +216,13 @@ def upsert_node(self, label, properties, id_key="id", extra_labels=("Entity",)): if label not in self._labels: self._create_unique_index_constraint(self, label, session) try: - return session.execute_write(self._upsert_node, self, label, id_key, properties, extra_labels) + return session.execute_write( + self._upsert_node, self, label, id_key, properties, extra_labels + ) except Exception as e: - logger.error(f"upsert_node label:{label} properties:{properties} Exception: {e}") + logger.error( + f"upsert_node label:{label} properties:{properties} Exception: {e}" + ) return None @staticmethod @@ -209,23 +230,36 @@ def _upsert_node(tx, self, label, id_key, properties, extra_labels): if not label: logger.warning("label cannot be None or empty strings") 
return None - query = (f"MERGE (n:{self._escape_neo4j(label)} {{{self._escape_neo4j(id_key)}: $properties.{self._escape_neo4j(id_key)}}}) " - "SET n += $properties ") + query = ( + f"MERGE (n:{self._escape_neo4j(label)} {{{self._escape_neo4j(id_key)}: $properties.{self._escape_neo4j(id_key)}}}) " + "SET n += $properties " + ) if extra_labels: query += f", n:{':'.join(self._escape_neo4j(extra_label) for extra_label in extra_labels)} " query += "RETURN n" result = tx.run(query, properties=properties) return result.single()[0] - def upsert_nodes(self, label, properties_list, id_key="id", extra_labels=("Entity",)): + def upsert_nodes( + self, label, properties_list, id_key="id", extra_labels=("Entity",) + ): self._preprocess_node_properties_list(label, properties_list, extra_labels) with self._driver.session(database=self._database) as session: if label not in self._labels: self._create_unique_index_constraint(self, label, session) try: - return session.execute_write(self._upsert_nodes, self, label, properties_list, id_key, extra_labels) + return session.execute_write( + self._upsert_nodes, + self, + label, + properties_list, + id_key, + extra_labels, + ) except Exception as e: - logger.error(f"upsert_nodes label:{label} properties:{properties_list} Exception: {e}") + logger.error( + f"upsert_nodes label:{label} properties:{properties_list} Exception: {e}" + ) return None @staticmethod @@ -233,14 +267,16 @@ def _upsert_nodes(tx, self, label, properties_list, id_key, extra_labels): if not label: logger.warning("label cannot be None or empty strings") return None - query = ("UNWIND $properties_list AS properties " - f"MERGE (n:{self._escape_neo4j(label)} {{{self._escape_neo4j(id_key)}: properties.{self._escape_neo4j(id_key)}}}) " - "SET n += properties ") + query = ( + "UNWIND $properties_list AS properties " + f"MERGE (n:{self._escape_neo4j(label)} {{{self._escape_neo4j(id_key)}: properties.{self._escape_neo4j(id_key)}}}) " + "SET n += properties " + ) if extra_labels: query += f", n:{':'.join(self._escape_neo4j(extra_label) for extra_label in extra_labels)} " query += "RETURN n" result = tx.run(query, properties_list=properties_list) - return [record['n'] for record in result] + return [record["n"] for record in result] def _get_embedding_vector(self, properties, vector_field): for property_key, property_value in properties.items(): @@ -256,7 +292,9 @@ def _get_embedding_vector(self, properties, vector_field): vector = self.vectorizer.vectorize(property_value) return vector except Exception as e: - logger.info(f"An error occurred while vectorizing property {property_key!r}: {e}") + logger.info( + f"An error occurred while vectorizing property {property_key!r}: {e}" + ) return None return None @@ -287,7 +325,9 @@ def batch_preprocess_node_properties(self, node_batch, extra_labels=("Entity",)) return class EmbeddingVectorPlaceholder(object): - def __init__(self, number, properties, vector_field, property_key, property_value): + def __init__( + self, number, properties, vector_field, property_key, property_value + ): self._number = number self._properties = properties self._vector_field = vector_field @@ -317,7 +357,9 @@ def get_placeholder(self, graph_store, properties, vector_field): message = f"property {property_key!r} must be string to generate embedding vector" raise RuntimeError(message) num = len(self._placeholders) - placeholder = EmbeddingVectorPlaceholder(num, properties, vector_field, property_key, property_value) + placeholder = EmbeddingVectorPlaceholder( + num, properties, 
vector_field, property_key, property_value + ) self._placeholders.append(placeholder) return placeholder return None @@ -364,7 +406,9 @@ def patch(self): for vector_field in vec_meta[label]: if vector_field in properties: continue - placeholder = manager.get_placeholder(self, properties, vector_field) + placeholder = manager.get_placeholder( + self, properties, vector_field + ) if placeholder is not None: properties[vector_field] = placeholder manager.batch_vectorize(self._vectorizer) @@ -406,25 +450,58 @@ def _delete_nodes(tx, self, label, id_key, id_values): query = f"UNWIND $id_values AS id_value MATCH (n:{self._escape_neo4j(label)} {{{self._escape_neo4j(id_key)}: id_value}}) DETACH DELETE n" tx.run(query, id_values=id_values) - def upsert_relationship(self, start_node_label, start_node_id_value, - end_node_label, end_node_id_value, rel_type, - properties, upsert_nodes=True, start_node_id_key="id", end_node_id_key="id"): + def upsert_relationship( + self, + start_node_label, + start_node_id_value, + end_node_label, + end_node_id_value, + rel_type, + properties, + upsert_nodes=True, + start_node_id_key="id", + end_node_id_key="id", + ): rel_type = self._escape_neo4j(rel_type) with self._driver.session(database=self._database) as session: try: - return session.execute_write(self._upsert_relationship, self, start_node_label, start_node_id_key, - start_node_id_value, end_node_label, end_node_id_key, - end_node_id_value, rel_type, properties, upsert_nodes) + return session.execute_write( + self._upsert_relationship, + self, + start_node_label, + start_node_id_key, + start_node_id_value, + end_node_label, + end_node_id_key, + end_node_id_value, + rel_type, + properties, + upsert_nodes, + ) except Exception as e: - logger.error(f"upsert_relationship rel_type:{rel_type} properties:{properties} Exception: {e}") + logger.error( + f"upsert_relationship rel_type:{rel_type} properties:{properties} Exception: {e}" + ) return None @staticmethod - def _upsert_relationship(tx, self, start_node_label, start_node_id_key, start_node_id_value, - end_node_label, end_node_id_key, end_node_id_value, - rel_type, properties, upsert_nodes): + def _upsert_relationship( + tx, + self, + start_node_label, + start_node_id_key, + start_node_id_value, + end_node_label, + end_node_id_key, + end_node_id_value, + rel_type, + properties, + upsert_nodes, + ): if not start_node_label or not end_node_label or not rel_type: - logger.warning("start_node_label, end_node_label, and rel_type cannot be None or empty strings") + logger.warning( + "start_node_label, end_node_label, and rel_type cannot be None or empty strings" + ) return None if upsert_nodes: query = ( @@ -438,25 +515,59 @@ def _upsert_relationship(tx, self, start_node_label, start_node_id_key, start_no f"(b:{self._escape_neo4j(end_node_label)} {{{self._escape_neo4j(end_node_id_key)}: $end_node_id_value}}) " f"MERGE (a)-[r:{self._escape_neo4j(rel_type)}]->(b) SET r += $properties RETURN r" ) - result = tx.run(query, start_node_id_value=start_node_id_value, - end_node_id_value=end_node_id_value, properties=properties) + result = tx.run( + query, + start_node_id_value=start_node_id_value, + end_node_id_value=end_node_id_value, + properties=properties, + ) return result.single() - def upsert_relationships(self, start_node_label, end_node_label, rel_type, relations, - upsert_nodes=True, start_node_id_key="id", end_node_id_key="id"): + def upsert_relationships( + self, + start_node_label, + end_node_label, + rel_type, + relations, + upsert_nodes=True, + 
start_node_id_key="id", + end_node_id_key="id", + ): with self._driver.session(database=self._database) as session: try: - return session.execute_write(self._upsert_relationships, self, relations, start_node_label, - start_node_id_key, end_node_label, end_node_id_key, rel_type, upsert_nodes) + return session.execute_write( + self._upsert_relationships, + self, + relations, + start_node_label, + start_node_id_key, + end_node_label, + end_node_id_key, + rel_type, + upsert_nodes, + ) except Exception as e: - logger.error(f"upsert_relationships rel_type:{rel_type} relations:{relations} Exception: {e}") + logger.error( + f"upsert_relationships rel_type:{rel_type} relations:{relations} Exception: {e}" + ) return None @staticmethod - def _upsert_relationships(tx, self, relations, start_node_label, start_node_id_key, - end_node_label, end_node_id_key, rel_type, upsert_nodes): + def _upsert_relationships( + tx, + self, + relations, + start_node_label, + start_node_id_key, + end_node_label, + end_node_id_key, + rel_type, + upsert_nodes, + ): if not start_node_label or not end_node_label or not rel_type: - logger.warning("start_node_label, end_node_label, and rel_type cannot be None or empty strings") + logger.warning( + "start_node_label, end_node_label, and rel_type cannot be None or empty strings" + ) return None if upsert_nodes: query = ( @@ -473,51 +584,111 @@ def _upsert_relationships(tx, self, relations, start_node_label, start_node_id_k f"MERGE (a)-[r:{self._escape_neo4j(rel_type)}]->(b) SET r += relationship.properties RETURN r" ) - result = tx.run(query, relations=relations, - start_node_label=start_node_label, start_node_id_key=start_node_id_key, - end_node_label=end_node_label, end_node_id_key=end_node_id_key, - rel_type=rel_type) - return [record['r'] for record in result] - - def delete_relationship(self, start_node_label, start_node_id_value, - end_node_label, end_node_id_value, rel_type, - start_node_id_key="id", end_node_id_key="id"): + result = tx.run( + query, + relations=relations, + start_node_label=start_node_label, + start_node_id_key=start_node_id_key, + end_node_label=end_node_label, + end_node_id_key=end_node_id_key, + rel_type=rel_type, + ) + return [record["r"] for record in result] + + def delete_relationship( + self, + start_node_label, + start_node_id_value, + end_node_label, + end_node_id_value, + rel_type, + start_node_id_key="id", + end_node_id_key="id", + ): with self._driver.session(database=self._database) as session: try: - session.execute_write(self._delete_relationship, self, start_node_label, start_node_id_key, - start_node_id_value, end_node_label, end_node_id_key, - end_node_id_value, rel_type) + session.execute_write( + self._delete_relationship, + self, + start_node_label, + start_node_id_key, + start_node_id_value, + end_node_label, + end_node_id_key, + end_node_id_value, + rel_type, + ) except Exception as e: logger.error(f"delete_relationship rel_type:{rel_type} Exception: {e}") - @staticmethod - def _delete_relationship(tx, self, start_node_label, start_node_id_key, start_node_id_value, - end_node_label, end_node_id_key, end_node_id_value, rel_type): + def _delete_relationship( + tx, + self, + start_node_label, + start_node_id_key, + start_node_id_value, + end_node_label, + end_node_id_key, + end_node_id_value, + rel_type, + ): query = ( f"MATCH (a:{self._escape_neo4j(start_node_label)} {{{self._escape_neo4j(start_node_id_key)}: $start_node_id_value}})-[r:{self._escape_neo4j(rel_type)}]->" f"(b:{self._escape_neo4j(end_node_label)} 
{{{self._escape_neo4j(end_node_id_key)}: $end_node_id_value}}) DELETE r" ) - tx.run(query, start_node_id_value=start_node_id_value, end_node_id_value=end_node_id_value) + tx.run( + query, + start_node_id_value=start_node_id_value, + end_node_id_value=end_node_id_value, + ) - def delete_relationships(self, start_node_label, start_node_id_values, - end_node_label, end_node_id_values, rel_type, - start_node_id_key="id", end_node_id_key="id"): + def delete_relationships( + self, + start_node_label, + start_node_id_values, + end_node_label, + end_node_id_values, + rel_type, + start_node_id_key="id", + end_node_id_key="id", + ): with self._driver.session(database=self._database) as session: - session.execute_write(self._delete_relationships, self, - start_node_label, start_node_id_key, start_node_id_values, - end_node_label, end_node_id_key, end_node_id_values, rel_type) + session.execute_write( + self._delete_relationships, + self, + start_node_label, + start_node_id_key, + start_node_id_values, + end_node_label, + end_node_id_key, + end_node_id_values, + rel_type, + ) @staticmethod - def _delete_relationships(tx, self, start_node_label, start_node_id_key, start_node_id_values, - end_node_label, end_node_id_key, end_node_id_values, rel_type): + def _delete_relationships( + tx, + self, + start_node_label, + start_node_id_key, + start_node_id_values, + end_node_label, + end_node_id_key, + end_node_id_values, + rel_type, + ): query = ( "UNWIND $start_node_id_values AS start_node_id_value " "UNWIND $end_node_id_values AS end_node_id_value " f"MATCH (a:{self._escape_neo4j(start_node_label)} {{{self._escape_neo4j(start_node_id_key)}: start_node_id_value}})-[r:{self._escape_neo4j(rel_type)}]->" f"(b:{self._escape_neo4j(end_node_label)} {{{self._escape_neo4j(end_node_id_key)}: end_node_id_value}}) DELETE r" ) - tx.run(query, start_node_id_values=start_node_id_values, end_node_id_values=end_node_id_values) + tx.run( + query, + start_node_id_values=start_node_id_values, + end_node_id_values=end_node_id_values, + ) def _get_lucene_pattern(self): string = re.escape(self._lucene_special_chars) @@ -539,7 +710,7 @@ def _get_utf16_codepoints(self, string): for ch in string: data = ch.encode("utf-16-le") for i in range(0, len(data), 2): - value = int.from_bytes(data[i:i+2], "little") + value = int.from_bytes(data[i : i + 2], "little") result.append(value) return tuple(result) @@ -562,6 +733,7 @@ def _escape_neo4j(self, name): def _to_snake_case(self, name): import re + words = re.findall("[A-Za-z][a-z0-9]*", name) result = "_".join(words).lower() return result @@ -578,7 +750,9 @@ def _create_vector_field_name(self, property_key): def create_index(self, label, property_key, index_name=None): with self._driver.session(database=self._database) as session: - session.execute_write(self._create_index, self, label, property_key, index_name) + session.execute_write( + self._create_index, self, label, property_key, index_name + ) @staticmethod def _create_index(tx, self, label, property_key, index_name): @@ -596,50 +770,87 @@ def create_text_index(self, labels, property_keys, index_name=None): if index_name is None: index_name = "_default_text_index" label_spec = "|".join(self._escape_neo4j(label) for label in labels) - property_spec = ", ".join(f"n.{self._escape_neo4j(key)}" for key in property_keys) + property_spec = ", ".join( + f"n.{self._escape_neo4j(key)}" for key in property_keys + ) query = ( f"CREATE FULLTEXT INDEX {self._escape_neo4j(index_name)} IF NOT EXISTS " f"FOR (n:{label_spec}) ON EACH 
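One of the smaller helpers touched here, `_get_utf16_codepoints`, is easy to verify in isolation: each character is encoded as UTF-16LE and split into 16-bit code units, which is the granularity the Lucene escaping logic works at. A standalone copy:

```python
def get_utf16_codepoints(string: str) -> tuple:
    result = []
    for ch in string:
        data = ch.encode("utf-16-le")
        for i in range(0, len(data), 2):
            # Each pair of little-endian bytes is one UTF-16 code unit.
            result.append(int.from_bytes(data[i : i + 2], "little"))
    return tuple(result)

assert get_utf16_codepoints("A") == (65,)
# Non-BMP characters decompose into a surrogate pair:
assert get_utf16_codepoints("\U0001d11e") == (0xD834, 0xDD1E)
```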
[{property_spec}]" ) + def do_create_text_index(tx): tx.run(query) + with self._driver.session(database=self._database) as session: session.execute_write(do_create_text_index) return index_name - def create_vector_index(self, label, property_key, index_name=None, - vector_dimensions=768, metric_type="cosine", - hnsw_m=None, hnsw_ef_construction=None): + def create_vector_index( + self, + label, + property_key, + index_name=None, + vector_dimensions=768, + metric_type="cosine", + hnsw_m=None, + hnsw_ef_construction=None, + ): if index_name is None: index_name = self._create_vector_index_name(label, property_key) if not property_key.lower().endswith("vector"): property_key = self._create_vector_field_name(property_key) with self._driver.session(database=self._database) as session: - session.execute_write(self._create_vector_index, self, label, property_key, index_name, - vector_dimensions, metric_type, hnsw_m, hnsw_ef_construction) + session.execute_write( + self._create_vector_index, + self, + label, + property_key, + index_name, + vector_dimensions, + metric_type, + hnsw_m, + hnsw_ef_construction, + ) self.refresh_vector_index_meta(force=True) return index_name @staticmethod - def _create_vector_index(tx, self, label, property_key, index_name, vector_dimensions, metric_type, hnsw_m, hnsw_ef_construction): + def _create_vector_index( + tx, + self, + label, + property_key, + index_name, + vector_dimensions, + metric_type, + hnsw_m, + hnsw_ef_construction, + ): query = ( f"CREATE VECTOR INDEX {self._escape_neo4j(index_name)} IF NOT EXISTS FOR (n:{self._escape_neo4j(label)}) ON (n.{self._escape_neo4j(property_key)}) " - "OPTIONS { indexConfig: {" - " `vector.dimensions`: $vector_dimensions," - " `vector.similarity_function`: $metric_type" + "OPTIONS { indexConfig: {" + " `vector.dimensions`: $vector_dimensions," + " `vector.similarity_function`: $metric_type" ) if hnsw_m is not None: query += ", `vector.hnsw.m`: $hnsw_m" if hnsw_ef_construction is not None: query += ", `vector.hnsw.ef_construction`: $hnsw_ef_construction" query += "}}" - tx.run(query, vector_dimensions=vector_dimensions, metric_type=metric_type, - hnsw_m=hnsw_m, hnsw_ef_construction=hnsw_ef_construction) + tx.run( + query, + vector_dimensions=vector_dimensions, + metric_type=metric_type, + hnsw_m=hnsw_m, + hnsw_ef_construction=hnsw_ef_construction, + ) def refresh_vector_index_meta(self, force=False): import time + if not force and time.time() - self._vec_meta_ts < self._vec_meta_timeout: return + def do_refresh_vector_index_meta(tx): query = "SHOW VECTOR INDEX" res = tx.run(query) @@ -647,14 +858,17 @@ def do_refresh_vector_index_meta(tx): meta = dict() for record in data: if record["entityType"] == "NODE": - label, = record["labelsOrTypes"] - vector_field, = record["properties"] - if vector_field.startswith("_") and vector_field.endswith("_vector"): + (label,) = record["labelsOrTypes"] + (vector_field,) = record["properties"] + if vector_field.startswith("_") and vector_field.endswith( + "_vector" + ): if label not in meta: meta[label] = [] meta[label].append(vector_field) self._vec_meta = meta self._vec_meta_ts = time.time() + with self._driver.session(database=self._database) as session: session.execute_read(do_refresh_vector_index_meta) @@ -678,7 +892,9 @@ def vectorizer(self): def vectorizer(self, value): self._vectorizer = value - def text_search(self, query_string, label_constraints=None, topk=10, index_name=None): + def text_search( + self, query_string, label_constraints=None, topk=10, index_name=None + ): if 
index_name is None: index_name = "_default_text_index" if label_constraints is None: @@ -686,31 +902,48 @@ def text_search(self, query_string, label_constraints=None, topk=10, index_name= elif isinstance(label_constraints, str): label_constraints = self._escape_neo4j(label_constraints) elif isinstance(label_constraints, (list, tuple)): - label_constraints = "|".join(self._escape_neo4j(label_constraint) for label_constraint in label_constraints) + label_constraints = "|".join( + self._escape_neo4j(label_constraint) + for label_constraint in label_constraints + ) else: message = f"invalid label_constraints: {label_constraints!r}" raise RuntimeError(message) if label_constraints is None: - query = ("CALL db.index.fulltext.queryNodes($index_name, $query_string) " - "YIELD node AS node, score " - "RETURN node, score") + query = ( + "CALL db.index.fulltext.queryNodes($index_name, $query_string) " + "YIELD node AS node, score " + "RETURN node, score" + ) else: - query = ("CALL db.index.fulltext.queryNodes($index_name, $query_string) " - "YIELD node AS node, score " - f"WHERE (node:{label_constraints}) " - "RETURN node, score") + query = ( + "CALL db.index.fulltext.queryNodes($index_name, $query_string) " + "YIELD node AS node, score " + f"WHERE (node:{label_constraints}) " + "RETURN node, score" + ) query += " LIMIT $topk" query_string = self._make_lucene_query(query_string) def do_text_search(tx): - res = tx.run(query, query_string=query_string, topk=topk, index_name=index_name) + res = tx.run( + query, query_string=query_string, topk=topk, index_name=index_name + ) data = res.data() return data with self._driver.session(database=self._database) as session: return session.execute_read(do_text_search) - def vector_search(self, label, property_key, query_text_or_vector, topk=10, index_name=None, ef_search=None): + def vector_search( + self, + label, + property_key, + query_text_or_vector, + topk=10, + index_name=None, + ef_search=None, + ): if ef_search is not None: if ef_search < topk: message = f"ef_search must be greater than or equal to topk; {ef_search!r} is invalid" @@ -719,13 +952,17 @@ def vector_search(self, label, property_key, query_text_or_vector, topk=10, inde if index_name is None: vec_meta = self._vec_meta if label not in vec_meta: - logger.warning(f"vector index not defined for label, return empty. label: {label}, " - f"property_key: {property_key}, query_text_or_vector: {query_text_or_vector}.") + logger.warning( + f"vector index not defined for label, return empty. label: {label}, " + f"property_key: {property_key}, query_text_or_vector: {query_text_or_vector}." + ) return [] vector_field = self._create_vector_field_name(property_key) if vector_field not in vec_meta[label]: - logger.warning(f"vector index not defined for field, return empty. label: {label}, " - f"property_key: {property_key}, query_text_or_vector: {query_text_or_vector}.") + logger.warning( + f"vector index not defined for field, return empty. label: {label}, " + f"property_key: {property_key}, query_text_or_vector: {query_text_or_vector}." 
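`text_search` funnels everything through `db.index.fulltext.queryNodes`, optionally post-filtering by label. A hedged read-transaction sketch; the index name `_default_text_index` is the hunk's default, the search term is invented:

```python
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

def do_text_search(tx, query_string, topk=10):
    # Fulltext lookup, scored by Lucene relevance; LIMIT takes a parameter.
    query = (
        "CALL db.index.fulltext.queryNodes($index_name, $query_string) "
        "YIELD node AS node, score "
        "RETURN node, score LIMIT $topk"
    )
    res = tx.run(
        query,
        index_name="_default_text_index",
        query_string=query_string,
        topk=topk,
    )
    return res.data()

with driver.session(database="neo4j") as session:
    hits = session.execute_read(do_text_search, "alice")
```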
+ ) return [] if index_name is None: index_name = self._create_vector_index_name(label, property_key) @@ -736,16 +973,27 @@ def vector_search(self, label, property_key, query_text_or_vector, topk=10, inde def do_vector_search(tx): if ef_search is not None: - query = ("CALL db.index.vector.queryNodes($index_name, $ef_search, $query_vector) " - "YIELD node, score " - "RETURN node, score, labels(node) as __labels__" - f"LIMIT {topk}") - res = tx.run(query, query_vector=query_vector, ef_search=ef_search, index_name=index_name) + query = ( + "CALL db.index.vector.queryNodes($index_name, $ef_search, $query_vector) " + "YIELD node, score " + "RETURN node, score, labels(node) as __labels__" + f"LIMIT {topk}" + ) + res = tx.run( + query, + query_vector=query_vector, + ef_search=ef_search, + index_name=index_name, + ) else: - query = ("CALL db.index.vector.queryNodes($index_name, $topk, $query_vector) " - "YIELD node, score " - "RETURN node, score, labels(node) as __labels__") - res = tx.run(query, query_vector=query_vector, topk=topk, index_name=index_name) + query = ( + "CALL db.index.vector.queryNodes($index_name, $topk, $query_vector) " + "YIELD node, score " + "RETURN node, score, labels(node) as __labels__" + ) + res = tx.run( + query, query_vector=query_vector, topk=topk, index_name=index_name + ) data = res.data() for record in data: record["node"]["__labels__"] = record["__labels__"] @@ -757,41 +1005,59 @@ def do_vector_search(tx): def _create_all_graph(self, graph_name): with self._driver.session(database=self._database) as session: - logger.debug(f"create pagerank graph graph_name:{graph_name} database:{self._database}") - result = session.run(f""" + logger.debug( + f"create pagerank graph graph_name:{graph_name} database:{self._database}" + ) + result = session.run( + f""" CALL gds.graph.exists('{graph_name}') YIELD exists WHERE exists CALL gds.graph.drop('{graph_name}') YIELD graphName RETURN graphName - """) + """ + ) summary = result.consume() - logger.debug(f"create pagerank graph exists graph_name:{graph_name} database:{self._database} succeed " - f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}") + logger.debug( + f"create pagerank graph exists graph_name:{graph_name} database:{self._database} succeed " + f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}" + ) - result = session.run(f""" + result = session.run( + f""" CALL gds.graph.project('{graph_name}','*','*') YIELD graphName, nodeCount AS nodes, relationshipCount AS rels RETURN graphName, nodes, rels - """) + """ + ) summary = result.consume() - logger.debug(f"create pagerank graph graph_name:{graph_name} database:{self._database} succeed " - f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}") + logger.debug( + f"create pagerank graph graph_name:{graph_name} database:{self._database} succeed " + f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}" + ) def _drop_all_graph(self, graph_name): with self._driver.session(database=self._database) as session: - logger.debug(f"drop pagerank graph graph_name:{graph_name} database:{self._database}") - result = session.run(f""" + logger.debug( + f"drop pagerank graph graph_name:{graph_name} database:{self._database}" + ) + result = session.run( + f""" CALL gds.graph.exists('{graph_name}') YIELD exists WHERE exists CALL gds.graph.drop('{graph_name}') YIELD graphName RETURN graphName - """) + """ + ) result.consume() - logger.debug(f"drop pagerank graph 
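One detail worth noticing in `do_vector_search`: in the `ef_search` branch the adjacent literals `"... as __labels__"` and `f"LIMIT {topk}"` appear to concatenate with no separating space, yielding `__labels__LIMIT`, and the reformat preserves that. A standalone sketch of the KNN call with the space restored; index name and query vector are assumptions:

```python
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

def do_vector_search(tx, query_vector, topk=10, ef_search=100):
    query = (
        "CALL db.index.vector.queryNodes($index_name, $ef_search, $query_vector) "
        "YIELD node, score "
        "RETURN node, score, labels(node) AS __labels__ "  # trailing space matters
        f"LIMIT {topk}"
    )
    res = tx.run(
        query,
        index_name="_entity_name_vector_index",  # hypothetical index name
        ef_search=ef_search,
        query_vector=query_vector,
    )
    return res.data()

with driver.session(database="neo4j") as session:
    data = session.execute_read(do_vector_search, [0.1] * 768)
```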
graph_name:{graph_name} database:{self._database} succeed") + logger.debug( + f"drop pagerank graph graph_name:{graph_name} database:{self._database} succeed" + ) def execute_pagerank(self, iterations=20, damping_factor=0.85): with self._driver.session(database=self._database) as session: - return session.execute_write(self._execute_pagerank, iterations, damping_factor) + return session.execute_write( + self._execute_pagerank, iterations, damping_factor + ) @staticmethod def _execute_pagerank(tx, iterations, damping_factor): @@ -809,7 +1075,9 @@ def get_pagerank_scores(self, start_nodes, target_type): with self._driver.session(database=self._database) as session: all_graph = self._allGraph self._exists_all_graph(session, all_graph) - data = session.execute_write(self._get_pagerank_scores, self, all_graph, start_nodes, target_type) + data = session.execute_write( + self._get_pagerank_scores, self, all_graph, start_nodes, target_type + ) return data @staticmethod @@ -817,13 +1085,15 @@ def _get_pagerank_scores(tx, self, graph_name, start_nodes, return_type): match_clauses = [] match_identify = [] for index, node in enumerate(start_nodes): - node_type, node_name = node['type'], node['name'] + node_type, node_name = node["type"], node["name"] node_identify = f"node_{index}" - match_clauses.append(f"MATCH ({node_identify}:{self._escape_neo4j(node_type)} {{name: '{escape_single_quotes(node_name)}'}})") + match_clauses.append( + f"MATCH ({node_identify}:{self._escape_neo4j(node_type)} {{name: '{escape_single_quotes(node_name)}'}})" + ) match_identify.append(node_identify) - match_query = ' '.join(match_clauses) - match_identify_str = ', '.join(match_identify) + match_query = " ".join(match_clauses) + match_identify_str = ", ".join(match_identify) pagerank_query = f""" {match_query} @@ -845,16 +1115,20 @@ def _get_pagerank_scores(tx, self, graph_name, start_nodes, return_type): def _exists_all_graph(session, graph_name): try: logger.debug(f"exists pagerank graph graph_name:{graph_name}") - result = session.run(f""" + result = session.run( + f""" CALL gds.graph.exists('{graph_name}') YIELD exists WHERE NOT exists CALL gds.graph.project('{graph_name}','*','*') YIELD graphName, nodeCount AS nodes, relationshipCount AS rels RETURN graphName, nodes, rels - """) + """ + ) summary = result.consume() - logger.debug(f"exists pagerank graph graph_name:{graph_name} succeed " - f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}") + logger.debug( + f"exists pagerank graph graph_name:{graph_name} succeed " + f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}" + ) except Exception as e: logger.debug(f"Error exists pagerank graph {graph_name}: {e}") @@ -873,18 +1147,26 @@ def _count(tx, self, label): def create_database(self, database): with self._driver.session(database=self._database) as session: database = database.lower() - result = session.run(f"CREATE DATABASE {self._escape_neo4j(database)} IF NOT EXISTS") + result = session.run( + f"CREATE DATABASE {self._escape_neo4j(database)} IF NOT EXISTS" + ) summary = result.consume() - logger.info(f"create_database {database} succeed " - f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}") + logger.info( + f"create_database {database} succeed " + f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}" + ) def delete_all_data(self, database): if self._database != database: - raise ValueError(f"Error: Current database ({self._database}) 
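`delete_all_data` clears the graph in bounded batches rather than one giant transaction, which keeps transaction memory flat on large stores. The loop, reduced to its essentials:

```python
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

with driver.session(database="neo4j") as session:
    while True:
        # Delete at most 100k nodes per transaction, reporting the batch size.
        result = session.run(
            "MATCH (n) WITH n LIMIT 100000 DETACH DELETE n RETURN count(*)"
        )
        if result.single()[0] == 0:  # nothing left to delete
            break
```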
is not the same as the target database ({database}).") + raise ValueError( + f"Error: Current database ({self._database}) is not the same as the target database ({database})." + ) with self._driver.session(database=database) as session: while True: - result = session.run("MATCH (n) WITH n LIMIT 100000 DETACH DELETE n RETURN count(*)") + result = session.run( + "MATCH (n) WITH n LIMIT 100000 DETACH DELETE n RETURN count(*)" + ) count = result.single()[0] logger.info(f"Deleted {count} nodes in this batch.") if count == 0: @@ -893,7 +1175,9 @@ def delete_all_data(self, database): def run_cypher_query(self, database, query, parameters=None): if database and self._database != database: - raise ValueError(f"Current database ({self._database}) is not the same as the target database ({database}).") + raise ValueError( + f"Current database ({self._database}) is not the same as the target database ({database})." + ) with self._driver.session(database=database) as session: result = session.run(query, parameters) diff --git a/kag/common/graphstore/rest/__init__.py b/kag/common/graphstore/rest/__init__.py index 923147a3..2cce4606 100644 --- a/kag/common/graphstore/rest/__init__.py +++ b/kag/common/graphstore/rest/__init__.py @@ -35,4 +35,6 @@ from kag.common.graphstore.rest.models.edge_record_instance import EdgeRecordInstance from kag.common.graphstore.rest.models.upsert_edge_request import UpsertEdgeRequest from kag.common.graphstore.rest.models.upsert_vertex_request import UpsertVertexRequest -from kag.common.graphstore.rest.models.vertex_record_instance import VertexRecordInstance +from kag.common.graphstore.rest.models.vertex_record_instance import ( + VertexRecordInstance, +) diff --git a/kag/common/graphstore/rest/graph_api.py b/kag/common/graphstore/rest/graph_api.py index e2875966..13dcd5ea 100644 --- a/kag/common/graphstore/rest/graph_api.py +++ b/kag/common/graphstore/rest/graph_api.py @@ -18,10 +18,7 @@ import six from kag.common.rest.api_client import ApiClient -from kag.common.rest.exceptions import ( # noqa: F401 - ApiTypeError, - ApiValueError -) +from kag.common.rest.exceptions import ApiTypeError, ApiValueError # noqa: F401 class GraphApi(object): @@ -57,7 +54,7 @@ def graph_delete_edge_post(self, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.graph_delete_edge_post_with_http_info(**kwargs) # noqa: E501 def graph_delete_edge_post_with_http_info(self, **kwargs): # noqa: E501 @@ -86,26 +83,24 @@ def graph_delete_edge_post_with_http_info(self, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'delete_edge_request' - ] + all_params = ["delete_edge_request"] all_params.extend( [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' + "async_req", + "_return_http_data_only", + "_preload_content", + "_request_timeout", ] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( "Got an unexpected keyword argument '%s'" " to method graph_delete_edge_post" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] collection_formats = {} @@ -119,34 +114,42 @@ def graph_delete_edge_post_with_http_info(self, **kwargs): # noqa: E501 local_var_files = {} body_params = None - if 'delete_edge_request' in local_var_params: - body_params = local_var_params['delete_edge_request'] + if "delete_edge_request" in local_var_params: + body_params = local_var_params["delete_edge_request"] # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json"] + ) # noqa: E501 # HTTP header `Content-Type` - header_params['Content-Type'] = self.api_client.select_header_content_type( # noqa: E501 - ['application/json']) # noqa: E501 + header_params[ + "Content-Type" + ] = self.api_client.select_header_content_type( # noqa: E501 + ["application/json"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/graph/deleteEdge', 'POST', + "/graph/deleteEdge", + "POST", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='object', # noqa: E501 + response_type="object", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get( + "_return_http_data_only" + ), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def graph_delete_vertex_post(self, **kwargs): # noqa: E501 """delete_vertex # noqa: E501 @@ -169,7 +172,7 @@ def graph_delete_vertex_post(self, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.graph_delete_vertex_post_with_http_info(**kwargs) # noqa: E501 def graph_delete_vertex_post_with_http_info(self, **kwargs): # noqa: E501 @@ -198,26 +201,24 @@ def graph_delete_vertex_post_with_http_info(self, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'delete_vertex_request' - ] + all_params = ["delete_vertex_request"] all_params.extend( [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' + "async_req", + "_return_http_data_only", + "_preload_content", + "_request_timeout", ] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( "Got an unexpected keyword argument '%s'" " to method graph_delete_vertex_post" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] collection_formats = {} @@ -231,34 +232,42 @@ def graph_delete_vertex_post_with_http_info(self, **kwargs): # noqa: E501 local_var_files = {} body_params = None - if 'delete_vertex_request' in local_var_params: - body_params = local_var_params['delete_vertex_request'] + if "delete_vertex_request" in local_var_params: + body_params = local_var_params["delete_vertex_request"] # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json"] + ) # noqa: E501 # HTTP header `Content-Type` - header_params['Content-Type'] = self.api_client.select_header_content_type( # noqa: E501 - ['application/json']) # noqa: E501 + header_params[ + "Content-Type" + ] = self.api_client.select_header_content_type( # noqa: E501 + ["application/json"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/graph/deleteVertex', 'POST', + "/graph/deleteVertex", + "POST", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='object', # noqa: E501 + response_type="object", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get( + "_return_http_data_only" + ), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def graph_upsert_edge_post(self, **kwargs): # noqa: E501 """upsert_edge # noqa: E501 @@ -281,7 +290,7 @@ def graph_upsert_edge_post(self, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.graph_upsert_edge_post_with_http_info(**kwargs) # noqa: E501 def graph_upsert_edge_post_with_http_info(self, **kwargs): # noqa: E501 @@ -310,26 +319,24 @@ def graph_upsert_edge_post_with_http_info(self, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'upsert_edge_request' - ] + all_params = ["upsert_edge_request"] all_params.extend( [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' + "async_req", + "_return_http_data_only", + "_preload_content", + "_request_timeout", ] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( "Got an unexpected keyword argument '%s'" " to method graph_upsert_edge_post" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] collection_formats = {} @@ -343,34 +350,42 @@ def graph_upsert_edge_post_with_http_info(self, **kwargs): # noqa: E501 local_var_files = {} body_params = None - if 'upsert_edge_request' in local_var_params: - body_params = local_var_params['upsert_edge_request'] + if "upsert_edge_request" in local_var_params: + body_params = local_var_params["upsert_edge_request"] # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json"] + ) # noqa: E501 # HTTP header `Content-Type` - header_params['Content-Type'] = self.api_client.select_header_content_type( # noqa: E501 - ['application/json']) # noqa: E501 + header_params[ + "Content-Type" + ] = self.api_client.select_header_content_type( # noqa: E501 + ["application/json"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/graph/upsertEdge', 'POST', + "/graph/upsertEdge", + "POST", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='object', # noqa: E501 + response_type="object", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get( + "_return_http_data_only" + ), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def graph_upsert_vertex_post(self, **kwargs): # noqa: E501 """upsert_vertex # noqa: E501 @@ -393,7 +408,7 @@ def graph_upsert_vertex_post(self, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.graph_upsert_vertex_post_with_http_info(**kwargs) # noqa: E501 def graph_upsert_vertex_post_with_http_info(self, **kwargs): # noqa: E501 @@ -422,26 +437,24 @@ def graph_upsert_vertex_post_with_http_info(self, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'upsert_vertex_request' - ] + all_params = ["upsert_vertex_request"] all_params.extend( [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' + "async_req", + "_return_http_data_only", + "_preload_content", + "_request_timeout", ] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( "Got an unexpected keyword argument '%s'" " to method graph_upsert_vertex_post" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] collection_formats = {} @@ -455,31 +468,39 @@ def graph_upsert_vertex_post_with_http_info(self, **kwargs): # noqa: E501 local_var_files = {} body_params = None - if 'upsert_vertex_request' in local_var_params: - body_params = local_var_params['upsert_vertex_request'] + if "upsert_vertex_request" in local_var_params: + body_params = local_var_params["upsert_vertex_request"] # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json"] + ) # noqa: E501 # HTTP header `Content-Type` - header_params['Content-Type'] = self.api_client.select_header_content_type( # noqa: E501 - ['application/json']) # noqa: E501 + header_params[ + "Content-Type" + ] = self.api_client.select_header_content_type( # noqa: E501 + ["application/json"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/graph/upsertVertex', 'POST', + "/graph/upsertVertex", + "POST", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='object', # noqa: E501 + response_type="object", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get( + "_return_http_data_only" + ), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) diff --git a/kag/common/graphstore/rest/models/__init__.py b/kag/common/graphstore/rest/models/__init__.py index 9660757a..ef11492f 100644 --- a/kag/common/graphstore/rest/models/__init__.py +++ b/kag/common/graphstore/rest/models/__init__.py @@ -16,4 +16,6 @@ from kag.common.graphstore.rest.models.edge_record_instance import EdgeRecordInstance from kag.common.graphstore.rest.models.upsert_edge_request import UpsertEdgeRequest from kag.common.graphstore.rest.models.upsert_vertex_request import UpsertVertexRequest -from kag.common.graphstore.rest.models.vertex_record_instance import VertexRecordInstance +from 
kag.common.graphstore.rest.models.vertex_record_instance import ( + VertexRecordInstance, +) diff --git a/kag/common/graphstore/rest/models/delete_edge_request.py b/kag/common/graphstore/rest/models/delete_edge_request.py index 4dc2984f..6d0a03ed 100644 --- a/kag/common/graphstore/rest/models/delete_edge_request.py +++ b/kag/common/graphstore/rest/models/delete_edge_request.py @@ -32,17 +32,13 @@ class DeleteEdgeRequest(object): attribute_map (dict): The key is attribute name and the value is json key in definition. """ - openapi_types = { - 'project_id': 'int', - 'edges': 'list[EdgeRecordInstance]' - } + openapi_types = {"project_id": "int", "edges": "list[EdgeRecordInstance]"} - attribute_map = { - 'project_id': 'projectId', - 'edges': 'edges' - } + attribute_map = {"project_id": "projectId", "edges": "edges"} - def __init__(self, project_id=None, edges=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, project_id=None, edges=None, local_vars_configuration=None + ): # noqa: E501 """DeleteEdgeRequest - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -73,8 +69,12 @@ def project_id(self, project_id): :param project_id: The project_id of this DeleteEdgeRequest. # noqa: E501 :type: int """ - if self.local_vars_configuration.client_side_validation and project_id is None: # noqa: E501 - raise ValueError("Invalid value for `project_id`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and project_id is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `project_id`, must not be `None`" + ) # noqa: E501 self._project_id = project_id @@ -96,8 +96,12 @@ def edges(self, edges): :param edges: The edges of this DeleteEdgeRequest. # noqa: E501 :type: list[EdgeRecordInstance] """ - if self.local_vars_configuration.client_side_validation and edges is None: # noqa: E501 - raise ValueError("Invalid value for `edges`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and edges is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `edges`, must not be `None`" + ) # noqa: E501 self._edges = edges @@ -108,18 +112,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/kag/common/graphstore/rest/models/delete_vertex_request.py b/kag/common/graphstore/rest/models/delete_vertex_request.py index 1e9b980a..f6384a20 100644 --- a/kag/common/graphstore/rest/models/delete_vertex_request.py +++ b/kag/common/graphstore/rest/models/delete_vertex_request.py @@ -32,17 +32,13 @@ class DeleteVertexRequest(object): attribute_map (dict): The key is attribute name and the value is json key in definition. 
""" - openapi_types = { - 'project_id': 'int', - 'vertices': 'list[VertexRecordInstance]' - } + openapi_types = {"project_id": "int", "vertices": "list[VertexRecordInstance]"} - attribute_map = { - 'project_id': 'projectId', - 'vertices': 'vertices' - } + attribute_map = {"project_id": "projectId", "vertices": "vertices"} - def __init__(self, project_id=None, vertices=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, project_id=None, vertices=None, local_vars_configuration=None + ): # noqa: E501 """DeleteVertexRequest - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -73,8 +69,12 @@ def project_id(self, project_id): :param project_id: The project_id of this DeleteVertexRequest. # noqa: E501 :type: int """ - if self.local_vars_configuration.client_side_validation and project_id is None: # noqa: E501 - raise ValueError("Invalid value for `project_id`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and project_id is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `project_id`, must not be `None`" + ) # noqa: E501 self._project_id = project_id @@ -96,8 +96,12 @@ def vertices(self, vertices): :param vertices: The vertices of this DeleteVertexRequest. # noqa: E501 :type: list[VertexRecordInstance] """ - if self.local_vars_configuration.client_side_validation and vertices is None: # noqa: E501 - raise ValueError("Invalid value for `vertices`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and vertices is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `vertices`, must not be `None`" + ) # noqa: E501 self._vertices = vertices @@ -108,18 +112,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/kag/common/graphstore/rest/models/edge_record_instance.py b/kag/common/graphstore/rest/models/edge_record_instance.py index 77873ddd..e901fdde 100644 --- a/kag/common/graphstore/rest/models/edge_record_instance.py +++ b/kag/common/graphstore/rest/models/edge_record_instance.py @@ -33,24 +33,33 @@ class EdgeRecordInstance(object): and the value is json key in definition. 
""" openapi_types = { - 'src_type': 'str', - 'src_id': 'str', - 'dst_type': 'str', - 'dst_id': 'str', - 'label': 'str', - 'properties': 'object' + "src_type": "str", + "src_id": "str", + "dst_type": "str", + "dst_id": "str", + "label": "str", + "properties": "object", } attribute_map = { - 'src_type': 'srcType', - 'src_id': 'srcId', - 'dst_type': 'dstType', - 'dst_id': 'dstId', - 'label': 'label', - 'properties': 'properties' + "src_type": "srcType", + "src_id": "srcId", + "dst_type": "dstType", + "dst_id": "dstId", + "label": "label", + "properties": "properties", } - def __init__(self, src_type=None, src_id=None, dst_type=None, dst_id=None, label=None, properties=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, + src_type=None, + src_id=None, + dst_type=None, + dst_id=None, + label=None, + properties=None, + local_vars_configuration=None, + ): # noqa: E501 """EdgeRecordInstance - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -89,8 +98,12 @@ def src_type(self, src_type): :param src_type: The src_type of this EdgeRecordInstance. # noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and src_type is None: # noqa: E501 - raise ValueError("Invalid value for `src_type`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and src_type is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `src_type`, must not be `None`" + ) # noqa: E501 self._src_type = src_type @@ -112,8 +125,12 @@ def src_id(self, src_id): :param src_id: The src_id of this EdgeRecordInstance. # noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and src_id is None: # noqa: E501 - raise ValueError("Invalid value for `src_id`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and src_id is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `src_id`, must not be `None`" + ) # noqa: E501 self._src_id = src_id @@ -135,8 +152,12 @@ def dst_type(self, dst_type): :param dst_type: The dst_type of this EdgeRecordInstance. # noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and dst_type is None: # noqa: E501 - raise ValueError("Invalid value for `dst_type`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and dst_type is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `dst_type`, must not be `None`" + ) # noqa: E501 self._dst_type = dst_type @@ -158,8 +179,12 @@ def dst_id(self, dst_id): :param dst_id: The dst_id of this EdgeRecordInstance. # noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and dst_id is None: # noqa: E501 - raise ValueError("Invalid value for `dst_id`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and dst_id is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `dst_id`, must not be `None`" + ) # noqa: E501 self._dst_id = dst_id @@ -181,8 +206,12 @@ def label(self, label): :param label: The label of this EdgeRecordInstance. 
# noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and label is None: # noqa: E501 - raise ValueError("Invalid value for `label`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and label is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `label`, must not be `None`" + ) # noqa: E501 self._label = label @@ -204,8 +233,12 @@ def properties(self, properties): :param properties: The properties of this EdgeRecordInstance. # noqa: E501 :type: object """ - if self.local_vars_configuration.client_side_validation and properties is None: # noqa: E501 - raise ValueError("Invalid value for `properties`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and properties is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `properties`, must not be `None`" + ) # noqa: E501 self._properties = properties @@ -216,18 +249,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/kag/common/graphstore/rest/models/upsert_edge_request.py b/kag/common/graphstore/rest/models/upsert_edge_request.py index 7dd1c89a..5cd69ed1 100644 --- a/kag/common/graphstore/rest/models/upsert_edge_request.py +++ b/kag/common/graphstore/rest/models/upsert_edge_request.py @@ -33,18 +33,24 @@ class UpsertEdgeRequest(object): and the value is json key in definition. """ openapi_types = { - 'project_id': 'int', - 'upsert_adjacent_vertices': 'bool', - 'edges': 'list[EdgeRecordInstance]' + "project_id": "int", + "upsert_adjacent_vertices": "bool", + "edges": "list[EdgeRecordInstance]", } attribute_map = { - 'project_id': 'projectId', - 'upsert_adjacent_vertices': 'upsertAdjacentVertices', - 'edges': 'edges' + "project_id": "projectId", + "upsert_adjacent_vertices": "upsertAdjacentVertices", + "edges": "edges", } - def __init__(self, project_id=None, upsert_adjacent_vertices=None, edges=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, + project_id=None, + upsert_adjacent_vertices=None, + edges=None, + local_vars_configuration=None, + ): # noqa: E501 """UpsertEdgeRequest - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -77,8 +83,12 @@ def project_id(self, project_id): :param project_id: The project_id of this UpsertEdgeRequest. 
# noqa: E501 :type: int """ - if self.local_vars_configuration.client_side_validation and project_id is None: # noqa: E501 - raise ValueError("Invalid value for `project_id`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and project_id is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `project_id`, must not be `None`" + ) # noqa: E501 self._project_id = project_id @@ -100,8 +110,13 @@ def upsert_adjacent_vertices(self, upsert_adjacent_vertices): :param upsert_adjacent_vertices: The upsert_adjacent_vertices of this UpsertEdgeRequest. # noqa: E501 :type: bool """ - if self.local_vars_configuration.client_side_validation and upsert_adjacent_vertices is None: # noqa: E501 - raise ValueError("Invalid value for `upsert_adjacent_vertices`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation + and upsert_adjacent_vertices is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `upsert_adjacent_vertices`, must not be `None`" + ) # noqa: E501 self._upsert_adjacent_vertices = upsert_adjacent_vertices @@ -123,8 +138,12 @@ def edges(self, edges): :param edges: The edges of this UpsertEdgeRequest. # noqa: E501 :type: list[EdgeRecordInstance] """ - if self.local_vars_configuration.client_side_validation and edges is None: # noqa: E501 - raise ValueError("Invalid value for `edges`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and edges is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `edges`, must not be `None`" + ) # noqa: E501 self._edges = edges @@ -135,18 +154,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/kag/common/graphstore/rest/models/upsert_vertex_request.py b/kag/common/graphstore/rest/models/upsert_vertex_request.py index 6ed6cec1..682968b8 100644 --- a/kag/common/graphstore/rest/models/upsert_vertex_request.py +++ b/kag/common/graphstore/rest/models/upsert_vertex_request.py @@ -32,17 +32,13 @@ class UpsertVertexRequest(object): attribute_map (dict): The key is attribute name and the value is json key in definition. 
""" - openapi_types = { - 'project_id': 'int', - 'vertices': 'list[VertexRecordInstance]' - } + openapi_types = {"project_id": "int", "vertices": "list[VertexRecordInstance]"} - attribute_map = { - 'project_id': 'projectId', - 'vertices': 'vertices' - } + attribute_map = {"project_id": "projectId", "vertices": "vertices"} - def __init__(self, project_id=None, vertices=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, project_id=None, vertices=None, local_vars_configuration=None + ): # noqa: E501 """UpsertVertexRequest - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -73,8 +69,12 @@ def project_id(self, project_id): :param project_id: The project_id of this UpsertVertexRequest. # noqa: E501 :type: int """ - if self.local_vars_configuration.client_side_validation and project_id is None: # noqa: E501 - raise ValueError("Invalid value for `project_id`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and project_id is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `project_id`, must not be `None`" + ) # noqa: E501 self._project_id = project_id @@ -96,8 +96,12 @@ def vertices(self, vertices): :param vertices: The vertices of this UpsertVertexRequest. # noqa: E501 :type: list[VertexRecordInstance] """ - if self.local_vars_configuration.client_side_validation and vertices is None: # noqa: E501 - raise ValueError("Invalid value for `vertices`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and vertices is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `vertices`, must not be `None`" + ) # noqa: E501 self._vertices = vertices @@ -108,18 +112,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/kag/common/graphstore/rest/models/vertex_record_instance.py b/kag/common/graphstore/rest/models/vertex_record_instance.py index 8fe12ca2..710891c1 100644 --- a/kag/common/graphstore/rest/models/vertex_record_instance.py +++ b/kag/common/graphstore/rest/models/vertex_record_instance.py @@ -33,20 +33,27 @@ class VertexRecordInstance(object): and the value is json key in definition. 
""" openapi_types = { - 'type': 'str', - 'id': 'str', - 'properties': 'object', - 'vectors': 'object' + "type": "str", + "id": "str", + "properties": "object", + "vectors": "object", } attribute_map = { - 'type': 'type', - 'id': 'id', - 'properties': 'properties', - 'vectors': 'vectors' + "type": "type", + "id": "id", + "properties": "properties", + "vectors": "vectors", } - def __init__(self, type=None, id=None, properties=None, vectors=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, + type=None, + id=None, + properties=None, + vectors=None, + local_vars_configuration=None, + ): # noqa: E501 """VertexRecordInstance - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -81,8 +88,12 @@ def type(self, type): :param type: The type of this VertexRecordInstance. # noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and type is None: # noqa: E501 - raise ValueError("Invalid value for `type`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and type is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `type`, must not be `None`" + ) # noqa: E501 self._type = type @@ -104,7 +115,9 @@ def id(self, id): :param id: The id of this VertexRecordInstance. # noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and id is None: # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and id is None + ): # noqa: E501 raise ValueError("Invalid value for `id`, must not be `None`") # noqa: E501 self._id = id @@ -127,8 +140,12 @@ def properties(self, properties): :param properties: The properties of this VertexRecordInstance. # noqa: E501 :type: object """ - if self.local_vars_configuration.client_side_validation and properties is None: # noqa: E501 - raise ValueError("Invalid value for `properties`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and properties is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `properties`, must not be `None`" + ) # noqa: E501 self._properties = properties @@ -150,8 +167,12 @@ def vectors(self, vectors): :param vectors: The vectors of this VertexRecordInstance. 
# noqa: E501 :type: object """ - if self.local_vars_configuration.client_side_validation and vectors is None: # noqa: E501 - raise ValueError("Invalid value for `vectors`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and vectors is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `vectors`, must not be `None`" + ) # noqa: E501 self._vectors = vectors @@ -162,18 +183,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/kag/common/llm/__init__.py b/kag/common/llm/__init__.py index cee64ad5..5d3bfc1e 100644 --- a/kag/common/llm/__init__.py +++ b/kag/common/llm/__init__.py @@ -10,14 +10,18 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. + +from kag.common.llm.openai_client import OpenAIClient +from kag.common.llm.vllm_client import VLLMClient +from kag.common.llm.ollama_client import OllamaClient from kag.common.llm.llm_config_checker import LLMConfigChecker -from kag.common.llm.client.vllm_client import VLLMClient -from kag.common.llm.client.ollama_client import OllamaClient -from kag.common.llm.client.openai_client import OpenAIClient +from kag.common.llm.mock_llm import MockLLMClient __all__ = [ - "LLMConfigChecker", - "VLLMClient", + "LLMClient", "OpenAIClient", - "OllamaClient" + "VLLMClient", + "OllamaClient", + "MockLLMClient", + "LLMConfigChecker", ] diff --git a/kag/common/llm/client/llm_client.py b/kag/common/llm/client/llm_client.py deleted file mode 100644 index 3720516d..00000000 --- a/kag/common/llm/client/llm_client.py +++ /dev/null @@ -1,178 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
- -import os -import json -from pathlib import Path -from typing import Union, Dict, List, Any -import logging -import traceback -import yaml - -from kag.common.base.prompt_op import PromptOp -from kag.common.llm.config import * - - -logger = logging.getLogger(__name__) - - -class LLMClient: - # Define the model type - model: str - - config_cls_map = { - "maas": OpenAIConfig, - "vllm": VLLMConfig, - "ollama": OllamaConfig, - } - - def __init__(self, **kwargs): - self.model = kwargs.get("model", None) - - @classmethod - def get_config_cls(self,config:dict): - client_type = config.get("client_type", None) - return LLMClient.config_cls_map.get(client_type, None) - - @classmethod - def get_llm_cls(self,config: LLMConfig): - from kag.common.llm.client import VLLMClient,OpenAIClient,OllamaClient - return { - VLLMConfig: VLLMClient, - OpenAIConfig: OpenAIClient, - OllamaConfig: OllamaClient, - }[config.__class__] - - @classmethod - def from_config(cls, config: Union[str, dict]): - """ - Initialize an LLMClient instance from a configuration file or dictionary. - - :param config: Path to a configuration file or a configuration dictionary - :return: Initialized LLMClient instance - :raises FileNotFoundError: If the configuration file is not found - :raises ValueError: If the model type is unsupported - """ - if isinstance(config, str): - config_path = Path(config) - if config_path.is_file(): - try: - with open(config_path, "r") as f: - nn_config = yaml.safe_load(f) - except: - logger.error(f"Failed to parse config file") - raise - else: - logger.error(f"Config file not found: {config}") - raise FileNotFoundError(f"Config file not found: {config}") - else: - # If config is already a dictionary, use it directly - nn_config = config - - config_cls = LLMClient.get_config_cls(nn_config) - if config_cls is None: - logger.error(f"Unsupported model type: {nn_config.get('client_type', None)}") - raise ValueError(f"Unsupported model type") - llm_config = config_cls(**nn_config) - llm_cls = LLMClient.get_llm_cls(llm_config) - return llm_cls(llm_config) - - - def __call__(self, prompt: Union[str, dict, list]) -> str: - """ - Perform inference on the given prompt and return the result. - - :param prompt: Input prompt for inference - :return: Inference result - :raises NotImplementedError: If the subclass has not implemented this method - """ - raise NotImplementedError - - def call_with_json_parse(self, prompt: Union[str, dict, list]): - """ - Perform inference on the given prompt and attempt to parse the result as JSON. - - :param prompt: Input prompt for inference - :return: Parsed result - :raises NotImplementedError: If the subclass has not implemented this method - """ - res = self(prompt) - _end = res.rfind("```") - _start = res.find("```json") - if _end != -1 and _start != -1: - json_str = res[_start + len("```json"): _end].strip() - else: - json_str = res - try: - json_result = json.loads(json_str) - except: - return res - return json_result - - def invoke(self, variables: Dict[str, Any], prompt_op: PromptOp, with_json_parse: bool = True, - with_except: bool = False): - """ - Call the model and process the result. 
- - :param variables: Variables used to build the prompt - :param prompt_op: Prompt operation object for building and parsing prompts - :param with_json_parse: Whether to attempt parsing the response as JSON - :param with_except: Whether to raise exception - :return: Processed result list - """ - result = [] - prompt = prompt_op.build_prompt(variables) - logger.debug(f"Prompt: {prompt}") - if not prompt: - return result - response = "" - try: - response = self.call_with_json_parse(prompt=prompt) if with_json_parse else self(prompt) - logger.debug(f"Response: {response}") - result = prompt_op.parse_response(response, model=self.model, **variables) - logger.debug(f"Result: {result}") - except Exception as e: - import traceback - logger.debug(f"Error {e} during invocation: {traceback.format_exc()}") - if with_except: - raise RuntimeError(f"call llm exception! llm output = {response} , llm input={prompt}, err={e}") - return result - - def batch(self, variables: Dict[str, Any], prompt_op: PromptOp, with_json_parse: bool = True) -> List: - """ - Batch process prompts. - - :param variables: Variables used to build the prompts - :param prompt_op: Prompt operation object for building and parsing prompts - :param with_json_parse: Whether to attempt parsing the response as JSON - :return: List of all processed results - """ - results = [] - prompts = prompt_op.build_prompt(variables) - # If there is only one prompt, call the `invoke` method directly - if isinstance(prompts, str): - return self.invoke(variables, prompt_op, with_json_parse=with_json_parse) - - for idx, prompt in enumerate(prompts, start=0): - logger.debug(f"Prompt_{idx}: {prompt}") - try: - response = self.call_with_json_parse(prompt=prompt) if with_json_parse else self(prompt) - logger.debug(f"Response_{idx}: {response}") - result = prompt_op.parse_response(response, idx=idx, model=self.model, **variables) - logger.debug(f"Result_{idx}: {result}") - results.extend(result) - except Exception as e: - logger.error(f"Error processing prompt {idx}: {e}") - logger.debug(traceback.format_exc()) - continue - return results - \ No newline at end of file diff --git a/kag/common/llm/client/ollama_client.py b/kag/common/llm/client/ollama_client.py deleted file mode 100644 index a4f04408..00000000 --- a/kag/common/llm/client/ollama_client.py +++ /dev/null @@ -1,77 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
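The monolithic client above is deleted, but its fenced-JSON extraction logic survives verbatim in each new client's `call_with_json_parse`. For reference, the same parsing as a standalone helper (the fence markers are assembled with string math so they do not terminate this example block):

```python
import json

FENCE = "`" * 3               # three backticks, the fence marker
JSON_FENCE = FENCE + "json"   # opening marker of a fenced JSON payload


def extract_json(rsp: str):
    """Prefer the fenced JSON payload when present; on parse failure,
    fall back to the raw response, exactly as the clients do."""
    _end = rsp.rfind(FENCE)
    _start = rsp.find(JSON_FENCE)
    if _end != -1 and _start != -1:
        json_str = rsp[_start + len(JSON_FENCE) : _end].strip()
    else:
        json_str = rsp
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        return rsp
```

One consequence worth noting: because parse failures return the raw string instead of raising, downstream callers must type-check the result.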
- -import os -import ast -import re -import json -import time -import uuid -import html -from binascii import b2a_hex -from datetime import datetime -from pathlib import Path -from typing import Union, Dict, List, Any -from urllib import request -from collections import defaultdict - -from openai import OpenAI -import logging -from ollama import Client - -import requests -import traceback -from Crypto.Cipher import AES -from requests import RequestException - -from kag.common import arks_pb2 -from kag.common.base.prompt_op import PromptOp -from kag.common.llm.config import OllamaConfig - -from kag.common.llm.client.llm_client import LLMClient - - -# logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger(__name__) - -class OllamaClient(LLMClient): - def __init__(self, llm_config: OllamaConfig): - self.model = llm_config.model - self.base_url = llm_config.base_url - self.param = {} - self.client = Client(host=self.base_url) - - def sync_request(self, prompt,image=None): - # import pdb; pdb.set_trace() - response = self.client.generate(model=self.model, prompt=prompt, stream=False) - content = response["response"] - content = content.replace("”", "”").replace("“", "“") - content = content.replace("·", "") - - return content - - def __call__(self, prompt,image=None): - return self.sync_request(prompt,image) - - def call_with_json_parse(self, prompt): - rsp = self.sync_request(prompt) - _end = rsp.rfind("```") - _start = rsp.find("```json") - if _end != -1 and _start != -1: - json_str = rsp[_start + len("```json"): _end].strip() - else: - json_str = rsp - try: - json_result = json.loads(json_str) - except: - return rsp - return json_result diff --git a/kag/common/llm/config/base.py b/kag/common/llm/config/base.py deleted file mode 100644 index 40f4442a..00000000 --- a/kag/common/llm/config/base.py +++ /dev/null @@ -1,9 +0,0 @@ -"""LLM Parameters model.""" - -from pydantic import BaseModel, Field - - - -class LLMConfig(BaseModel): - """LLM Config model.""" - diff --git a/kag/common/llm/config/enums.py b/kag/common/llm/config/enums.py deleted file mode 100644 index 8741cf74..00000000 --- a/kag/common/llm/config/enums.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing 'PipelineCacheConfig', 'PipelineFileCacheConfig' and 'PipelineMemoryCacheConfig' models.""" - -from __future__ import annotations - -from enum import Enum - - -class CacheType(str, Enum): - """The cache configuration type for the pipeline.""" - - file = "file" - """The file cache configuration type.""" - memory = "memory" - """The memory cache configuration type.""" - none = "none" - """The none cache configuration type.""" - blob = "blob" - """The blob cache configuration type.""" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -class InputFileType(str, Enum): - """The input file type for the pipeline.""" - - csv = "csv" - """The CSV input type.""" - text = "text" - """The text input type.""" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -class InputType(str, Enum): - """The input type for the pipeline.""" - - file = "file" - """The file storage type.""" - blob = "blob" - """The blob storage type.""" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -class StorageType(str, Enum): - """The storage type for the pipeline.""" - - file = "file" - """The file storage type.""" - memory = "memory" - """The memory storage type.""" - blob = "blob" - """The blob storage type.""" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -class ReportingType(str, Enum): - """The reporting configuration type for the pipeline.""" - - file = "file" - """The file reporting configuration type.""" - console = "console" - """The console reporting configuration type.""" - blob = "blob" - """The blob reporting configuration type.""" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -class TextEmbeddingTarget(str, Enum): - """The target to use for text embeddings.""" - - all = "all" - required = "required" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -class LLMType(str, Enum): - """LLMType enum class definition.""" - - # Embeddings - OpenAIEmbedding = "openai_embedding" - AzureOpenAIEmbedding = "azure_openai_embedding" - - # Raw Completion - OpenAI = "openai" - AzureOpenAI = "azure_openai" - - # Chat Completion - OpenAIChat = "openai_chat" - AzureOpenAIChat = "azure_openai_chat" - - # Debug - StaticResponse = "static_response" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' diff --git a/kag/common/llm/config/llm_config.cfg b/kag/common/llm/config/llm_config.cfg deleted file mode 100644 index a780c3fa..00000000 --- a/kag/common/llm/config/llm_config.cfg +++ /dev/null @@ -1,67 +0,0 @@ - -#-----------------------------------------------------------------------------------# -# openai SDK maas. 
client_type = maas # - # -# TongYi # -[llm] # -client_type = maas # -base_url = https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions # -api_key = "put your tongyi api key here" # -model = qwen-turbo # # - # -# Deepseek # -[llm] # -client_type = maas # -base_url = https://api.deepseek.com/beta # -api_key = "put your deepseek api key here" # -model = deepseek-chat # - # -# OpenAI # -[llm] # -client_type = maas # -base_url = https://api.openai.com/v1/chat/completions # -api_key = "put your openai api key here" # -model = gpt-3.5-turbo # - # -#-----------------------------------------------------------------------------------# - - - - -#-----------------------------------------------------------------------------------# -# local llm service. client_type = vllm # - # -# vllm # -[llm] # -client_type = vllm # -base_url = http://localhost:8000/v1/chat/completions # -model = qwen-7b-chat # - # -#-----------------------------------------------------------------------------------# - - - - -#-----------------------------------------------------------------------------------# -# maya llm service. client_type = maya # - # -[llm] # -client_type = maya # -scene_name = Qwen2_7B_Instruct_Knowledge # -chain_name = v1 # -lora_name = humming-v25 # - # -#-----------------------------------------------------------------------------------# - - - - -#-----------------------------------------------------------------------------------# - # -# ollama # -[llm] -client_type = ollama -base_url = http://localhost:11434/api/generate -model = llama3.1 # - # -#-----------------------------------------------------------------------------------# diff --git a/kag/common/llm/config/ollama.py b/kag/common/llm/config/ollama.py deleted file mode 100644 index 595ad8c5..00000000 --- a/kag/common/llm/config/ollama.py +++ /dev/null @@ -1,11 +0,0 @@ -from pydantic import Field -from kag.common.llm.config.base import LLMConfig - - -class OllamaConfig(LLMConfig): - model: str = Field( - description="model name." - ) - base_url: str = Field( - description="post url." - ) \ No newline at end of file diff --git a/kag/common/llm/config/openai.py b/kag/common/llm/config/openai.py deleted file mode 100644 index dc54bd44..00000000 --- a/kag/common/llm/config/openai.py +++ /dev/null @@ -1,20 +0,0 @@ -from pydantic import Field -from kag.common.llm.config.base import LLMConfig - - -class OpenAIConfig(LLMConfig): - api_key: str = Field( - description="api key." - ) - stream: bool = Field( - description="if use stream mode",default=False - ) - model: str = Field( - description="model name." - ) - temperature: float = Field( - description="temperature.",default=0.7 - ) - base_url: str = Field( - description="post url." - ) \ No newline at end of file diff --git a/kag/common/llm/config/proxy.py b/kag/common/llm/config/proxy.py deleted file mode 100644 index 62c43b65..00000000 --- a/kag/common/llm/config/proxy.py +++ /dev/null @@ -1,9 +0,0 @@ -from kag.common.llm.config.base import ProxyLLMConfig - - -class GPTProxyLLMConfig(ProxyLLMConfig): - pass - - -class DeepSeekProxyLLMConfig(ProxyLLMConfig): - pass diff --git a/kag/common/llm/config/vllm.py b/kag/common/llm/config/vllm.py deleted file mode 100644 index 6a018eb4..00000000 --- a/kag/common/llm/config/vllm.py +++ /dev/null @@ -1,11 +0,0 @@ -from pydantic import Field -from kag.common.llm.config.base import LLMConfig - - -class VLLMConfig(LLMConfig): - model: str = Field( - description="model name." - ) - base_url: str = Field( - description="post url." 
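All of the pydantic `LLMConfig` classes and the sample `llm_config.cfg` are removed along with the old client package; what used to be an INI-style `[llm]` section keyed by `client_type` becomes a plain dict whose `type` key drives the registry. A rough, hand-drawn mapping with placeholder values carried over from the deleted sample file:

```python
# Old llm_config.cfg section (INI-style, selected by client_type):
#   [llm]
#   client_type = maas
#   base_url = https://api.openai.com/v1/chat/completions
#   api_key = "put your openai api key here"
#   model = gpt-3.5-turbo
#
# New registry-style equivalent; "maas" is an alias OpenAIClient registers.
# base_url is shown as the API root on the assumption that the new client
# goes through the openai SDK rather than posting to chat/completions itself.
llm_conf = {
    "type": "maas",
    "base_url": "https://api.openai.com/v1",
    "api_key": "put your openai api key here",
    "model": "gpt-3.5-turbo",
}
```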
- ) \ No newline at end of file diff --git a/kag/common/llm/llm_config_checker.py b/kag/common/llm/llm_config_checker.py index c2ea3d84..7e9dd844 100644 --- a/kag/common/llm/llm_config_checker.py +++ b/kag/common/llm/llm_config_checker.py @@ -31,7 +31,8 @@ def check(self, config: str) -> str: :rtype: str :raises RuntimeError: if the config is invalid """ - from kag.common.llm.client import LLMClient + from kag.interface import LLMClient + config = json.loads(config) llm_client = LLMClient.from_config(config) try: @@ -39,12 +40,13 @@ def check(self, config: str) -> str: return res except Exception as ex: raise RuntimeError(f"invalid llm config: {config}, for details: {ex}") - + + if __name__ == "__main__": - config = ''' + config = """ {"client_type" :"ollama", "base_url" : "http://localhost:11434/", "model" : "llama3.1" } - ''' + """ config_checker = LLMConfigChecker() res = config_checker.check(config) diff --git a/kag/common/llm/mock_llm.py b/kag/common/llm/mock_llm.py new file mode 100644 index 00000000..dc685d28 --- /dev/null +++ b/kag/common/llm/mock_llm.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import time +import json +from kag.interface import LLMClient + + +@LLMClient.register("mock") +class MockLLMClient(LLMClient): + """ + MockLLMClient is a mock implementation of the LLMClient class, used for testing purposes. + + This class provides a method to simulate the behavior of a language model client by matching input prompts. + """ + + def __init__(self): + """ + Initializes the MockLLMClient instance. + """ + pass + + def match_input(self, prompt): + """ + Simulates the behavior of a language model call by matching the input prompt. + + Args: + prompt: The input prompt to be matched. 
+ """ + time.sleep(0.3) # mimic llm call + if "You're a very effective entity extraction system" in prompt: + return [ + { + "entity": "The Rezort", + "type": "Movie", + "category": "Works", + "description": "A 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger.", + }, + { + "entity": "2015", + "type": "Year", + "category": "Date", + "description": "The year the movie 'The Rezort' was released.", + }, + ] + if "please attempt to provide the official names of" in prompt: + return [ + { + "entity": "The Rezort", + "type": "Movie", + "category": "Works", + "description": "A 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger.", + }, + { + "entity": "2015", + "type": "Year", + "category": "Date", + "description": "The year the movie 'The Rezort' was released.", + }, + ] + if ( + "You are an expert specializing in carrying out open information extraction" + in prompt + ): + return [ + ["The Rezort", "is", "zombie horror film"], + ["The Rezort", "publish at", "2015"], + ] + return "I am an intelligent assistant" + + def __call__(self, prompt): + return json.dumps(self.match_input(prompt)) + + def call_with_json_parse(self, prompt): + return self.match_input(prompt) diff --git a/kag/common/llm/ollama_client.py b/kag/common/llm/ollama_client.py new file mode 100644 index 00000000..82868978 --- /dev/null +++ b/kag/common/llm/ollama_client.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import json + +import logging +from ollama import Client + +from kag.interface import LLMClient +from tenacity import retry, stop_after_attempt + + +# logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + + +@LLMClient.register("ollama") +class OllamaClient(LLMClient): + """ + A client class for interacting with the Ollama API. + + This class provides methods to make synchronous requests to the Ollama API, handle model calls, and parse responses. + """ + + def __init__(self, model: str, base_url: str): + """ + Initializes the OllamaClient instance. + + Args: + model (str): The model to use for requests. + base_url (str): The base URL for the Ollama API. + """ + self.model = model + self.base_url = base_url + self.param = {} + self.client = Client(host=self.base_url) + self.check() + + def sync_request(self, prompt, image=None): + """ + Makes a synchronous request to the Ollama API with the given prompt. + + Args: + prompt: The prompt to send to the Ollama API. + image: Optional image data to include in the request. + + Returns: + str: The content of the response from the Ollama API. + """ + response = self.client.generate(model=self.model, prompt=prompt, stream=False) + content = response["response"] + content = content.replace("”", "”").replace("“", "“") + content = content.replace("·", "") + + return content + + def __call__(self, prompt, image=None): + """ + Executes a model request when the object is called and returns the result. + + Parameters: + prompt (str): The prompt provided to the model. 
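`MockLLMClient` keys canned fixtures off substrings of the builder prompts, which makes pipeline tests runnable without a live model. A small usage sketch based on the matching rules above:

```python
from kag.common.llm.mock_llm import MockLLMClient

mock = MockLLMClient()

# Unrecognized prompts fall through to the canned assistant string...
assert mock.call_with_json_parse("anything else") == "I am an intelligent assistant"

# ...while recognized extraction prompts return the fixture entities.
rows = mock.call_with_json_parse(
    "You're a very effective entity extraction system. <chunk text here>"
)
assert rows[0]["entity"] == "The Rezort"
```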
+ + Returns: + str: The response content generated by the model. + """ + + return self.sync_request(prompt, image) + + @retry(stop=stop_after_attempt(3)) + def call_with_json_parse(self, prompt): + """ + Calls the model and attempts to parse the response into JSON format. + + Parameters: + prompt (str): The prompt provided to the model. + + Returns: + Union[dict, str]: If the response is valid JSON, returns the parsed dictionary; otherwise, returns the original response. + """ + + rsp = self.sync_request(prompt) + _end = rsp.rfind("```") + _start = rsp.find("```json") + if _end != -1 and _start != -1: + json_str = rsp[_start + len("```json") : _end].strip() + else: + json_str = rsp + try: + json_result = json.loads(json_str) + except: + return rsp + return json_result diff --git a/kag/common/llm/client/openai_client.py b/kag/common/llm/openai_client.py similarity index 62% rename from kag/common/llm/client/openai_client.py rename to kag/common/llm/openai_client.py index 6a96e687..47f6dbeb 100644 --- a/kag/common/llm/client/openai_client.py +++ b/kag/common/llm/openai_client.py @@ -12,52 +12,55 @@ import json -from typing import Union from openai import OpenAI import logging -from kag.common.llm.client.llm_client import LLMClient -from kag.common.llm.config import OpenAIConfig +from kag.interface import LLMClient +from tenacity import retry, stop_after_attempt -# logging.basicConfig(level=logging.DEBUG) +logging.getLogger("openai").setLevel(logging.ERROR) +logging.getLogger("httpx").setLevel(logging.ERROR) logger = logging.getLogger(__name__) +@LLMClient.register("maas") +@LLMClient.register("openai") class OpenAIClient(LLMClient): """ A client class for interacting with the OpenAI API. Initializes the client with an API key, base URL, streaming option, temperature parameter, and default model. - Parameters: - api_key (str): The OpenAI API key. - base_url (str): The base URL of the API. - stream (bool, optional): Whether to process responses in a streaming manner. Default is False. - temperature (int, optional): Sampling temperature to control the randomness of the model's output. Default is 0.7. - model (str, optional): The default model to use. - - Attributes: - api_key (str): The OpenAI API key. - base_url (str): The base URL of the API. - model (str): The default model to use. - stream (bool): Whether to process responses in a streaming manner. - temperature (float): Sampling temperature. - client (OpenAI): An instance of the OpenAI API client. """ + def __init__( - self, - llm_config:OpenAIConfig + self, + api_key: str, + base_url: str, + model: str, + stream: bool = False, + temperature: float = 0.7, ): - # Initialize the OpenAIClient object - self.api_key = llm_config.api_key - self.base_url = llm_config.base_url - self.model = llm_config.model - self.stream = llm_config.stream - self.temperature = llm_config.temperature - self.client = OpenAI(api_key=self.api_key, base_url=self.base_url) + """ + Initializes the OpenAIClient instance. + + Args: + api_key (str): The API key for accessing the OpenAI API. + base_url (str): The base URL for the OpenAI API. + model (str): The default model to use for requests. + stream (bool, optional): Whether to stream the response. Defaults to False. + temperature (float, optional): The temperature parameter for the model. Defaults to 0.7. 
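New in this version of the client: `call_with_json_parse` is wrapped with tenacity's `@retry(stop=stop_after_attempt(3))`. Since the method swallows JSON parse failures by returning the raw response, the retries fire only on raised exceptions (for example, transport errors from the Ollama call), not on malformed JSON. A minimal illustration of the retry semantics:

```python
from tenacity import retry, stop_after_attempt

attempts = {"n": 0}

@retry(stop=stop_after_attempt(3))
def flaky():
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise ConnectionError("transient failure")
    return "ok"

assert flaky() == "ok" and attempts["n"] == 3  # third attempt succeeds
```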
+ """ + self.api_key = api_key + self.base_url = base_url + self.model = model + self.stream = stream + self.temperature = temperature + self.client = OpenAI(api_key=self.api_key, base_url=self.base_url) + self.check() - def __call__(self, prompt:str, image_url:str=None): + def __call__(self, prompt: str, image_url: str = None): """ Executes a model request when the object is called and returns the result. @@ -71,18 +74,12 @@ def __call__(self, prompt:str, image_url:str=None): if image_url: message = [ {"role": "system", "content": "you are a helpful assistant"}, - {"role": "user", "content": [ - { - "type": "text", - "text": prompt - }, - { - "type": "image_url", - "image_url": { - "url": image_url - } - } - ] + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], }, ] response = self.client.chat.completions.create( @@ -93,7 +90,7 @@ def __call__(self, prompt:str, image_url:str=None): ) rsp = response.choices[0].message.content return rsp - + else: message = [ {"role": "system", "content": "you are a helpful assistant"}, @@ -108,6 +105,7 @@ def __call__(self, prompt:str, image_url:str=None): rsp = response.choices[0].message.content return rsp + @retry(stop=stop_after_attempt(3)) def call_with_json_parse(self, prompt): """ Calls the model and attempts to parse the response into JSON format. @@ -123,11 +121,11 @@ def call_with_json_parse(self, prompt): _end = rsp.rfind("```") _start = rsp.find("```json") if _end != -1 and _start != -1: - json_str = rsp[_start + len("```json"): _end].strip() + json_str = rsp[_start + len("```json") : _end].strip() else: json_str = rsp try: json_result = json.loads(json_str) except: return rsp - return json_result \ No newline at end of file + return json_result diff --git a/kag/common/llm/client/vllm_client.py b/kag/common/llm/vllm_client.py similarity index 50% rename from kag/common/llm/client/vllm_client.py rename to kag/common/llm/vllm_client.py index b1154403..6f430f3e 100644 --- a/kag/common/llm/client/vllm_client.py +++ b/kag/common/llm/vllm_client.py @@ -10,46 +10,49 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -import os -import ast -import re + import json -import time -import uuid -import html -from binascii import b2a_hex -from datetime import datetime -from pathlib import Path -from typing import Union, Dict, List, Any -from urllib import request -from collections import defaultdict - -from openai import OpenAI import logging - import requests -import traceback -from Crypto.Cipher import AES -from requests import RequestException - -from kag.common import arks_pb2 -from kag.common.base.prompt_op import PromptOp -from kag.common.llm.config import VLLMConfig - -from kag.common.llm.client.llm_client import LLMClient +from kag.interface import LLMClient +from tenacity import retry, stop_after_attempt # logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) + +@LLMClient.register("vllm") class VLLMClient(LLMClient): - def __init__(self, llm_config: VLLMConfig): - self.model = llm_config.model - self.base_url = llm_config.base_url + """ + A client class for interacting with a language model deployed by VLLM. + + This class provides methods to make synchronous requests to the VLLM server, handle model calls, and parse responses. + """ + + def __init__(self, model: str, base_url: str): + """ + Initializes the VLLMClient instance. 
+ + Args: + model (str): The model to use for requests. + base_url (str): The base URL for the VLLM API. + """ + self.model = model + self.base_url = base_url self.param = {} + self.check() def sync_request(self, prompt): - # import pdb; pdb.set_trace() + """ + Makes a synchronous request to the VLLM API with the given prompt. + + Args: + prompt: The prompt to send to the VLLM API. + + Returns: + str: The content of the response from the VLLM API. + """ self.param["messages"] = prompt self.param["model"] = self.model @@ -66,18 +69,37 @@ def sync_request(self, prompt): return content def __call__(self, prompt): - content = [ - {"role": "user", "content": prompt} - ] + """ + Executes a model request when the object is called and returns the result. + + Parameters: + prompt (str): The prompt provided to the model. + + Returns: + str: The response content generated by the model. + """ + + content = [{"role": "user", "content": prompt}] return self.sync_request(content) + @retry(stop=stop_after_attempt(3)) def call_with_json_parse(self, prompt): + """ + Calls the model and attempts to parse the response into JSON format. + + Parameters: + prompt (str): The prompt provided to the model. + + Returns: + Union[dict, str]: If the response is valid JSON, returns the parsed dictionary; otherwise, returns the original response. + """ + content = [{"role": "user", "content": prompt}] rsp = self.sync_request(content) _end = rsp.rfind("```") _start = rsp.find("```json") if _end != -1 and _start != -1: - json_str = rsp[_start + len("```json"): _end].strip() + json_str = rsp[_start + len("```json") : _end].strip() else: json_str = rsp try: diff --git a/kag/common/llm/client/__init__.py b/kag/common/registry/__init__.py similarity index 60% rename from kag/common/llm/client/__init__.py rename to kag/common/registry/__init__.py index b26f378a..3ab66aed 100644 --- a/kag/common/llm/client/__init__.py +++ b/kag/common/registry/__init__.py @@ -10,15 +10,16 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -from kag.common.llm.client.openai_client import OpenAIClient -from kag.common.llm.client.vllm_client import VLLMClient -from kag.common.llm.client.llm_client import LLMClient -from kag.common.llm.client.ollama_client import OllamaClient +from kag.common.registry.registrable import Registrable, ConfigurationError +from kag.common.registry.lazy import Lazy +from kag.common.registry.functor import Functor +from kag.common.registry.utils import import_modules_from_path __all__ = [ - "OpenAIClient", - "LLMClient", - "VLLMClient", - "OllamaClient" + "Registrable", + "ConfigurationError", + "Lazy", + "Functor", + "import_modules_from_path", ] diff --git a/kag/common/registry/functor.py b/kag/common/registry/functor.py new file mode 100644 index 00000000..e2286ecd --- /dev/null +++ b/kag/common/registry/functor.py @@ -0,0 +1,166 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
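The new `functor.py` that begins above makes plain functions registrable. Complementing the docstring example shown below, a sketch of the config round-trip with a hypothetical function: the registered type plus the bound keyword arguments serialize via `to_config` and rebuild via `from_config`.

```python
from kag.common.registry import Functor

@Functor.register("greet")
def greet(name: str, excited: bool = False):
    return f"Hello, {name}{'!' if excited else '.'}"

func = Functor.from_config({"type": "greet", "name": "KAG", "excited": True})
assert func() == "Hello, KAG!"

# to_config keeps the register type and the partial's bound kwargs:
rebuilt = Functor.from_config(func.to_config())
assert rebuilt() == func()
```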
+ +import logging +import collections +from kag.common.registry.registrable import ( + Registrable, + ConfigurationError, + RegistrableType, + create_kwargs, +) +from types import FunctionType + +from typing import Type, Union, Callable, Dict, cast +from functools import partial +from pyhocon import ConfigTree, ConfigFactory + +logger = logging.getLogger() + + +@Registrable.register("functor") +class Functor(Registrable): + """ + A special `Registrable` for functions(NOT classes). + It is used to register user defined functions. The registered function will acquire the + ability of instantiate from configuration. + + e.g.: + + @Functor.register("simple1") + def simple_func1(name: "str", age: list = []): + print(f"name = {name}") + print(f"age = {age}") + return "+".join(age) + conf1 = {"type": "simple1", "name": "zzs", "age": ["1", "2", "3"]} + func = Functor.from_config(conf1) + func() # same as: simple_func1(name = "zzs", age = ["1", "2", "3"]) + + We can also serialize it backto configuration: + + reconstructed_conf = func.to_config() + reconstructed_func = Functor.from_config(reconstructed_conf) + """ + + def __init__(self, function: partial, register_type: str): + self._func = function + self.__register_type__ = register_type + + def __call__(self, *args, **kwargs): + return self._func(*args, **kwargs) + + @classmethod + def register( + cls: Type[RegistrableType], + name: str, + exist_ok: bool = True, + as_default=False, + ): + registry = Registrable._registry[cls] + if as_default: + cls.default_implementation = name + + def add_function_to_registry(func: FunctionType): + # Add to registry, raise an error if key has already been used. + if name in registry: + if exist_ok: + message = ( + f"{name} has already been registered as {registry[name]}, but " + f"exist_ok=True, so overwriting it with {func}" + ) + logger.info(message) + else: + message = ( + f"Cannot register {name} as {cls.__name__}; " + f"name already in use for {registry[name]}" + ) + raise ConfigurationError(message) + registry[name] = func + + return func + + return add_function_to_registry + + @classmethod + def from_config( + cls: Type[RegistrableType], + params: Union[str, Dict, ConfigTree], + constructor_to_call: Callable[..., RegistrableType] = None, + constructor_to_inspect: Union[ + Callable[..., RegistrableType], Callable[[RegistrableType], None] + ] = None, + ) -> RegistrableType: + + if isinstance(params, str): + params = ConfigFactory.from_dict({"type": params}) + elif isinstance(params, collections.abc.Mapping) and not isinstance( + params, ConfigTree + ): + params = ConfigFactory.from_dict(params) + + if not isinstance(params, ConfigTree): + raise ConfigurationError( + f"from_config was passed a `{params}` object that was not able to convert to `ConfigTree`. " + "This probably indicates malformed parameters." + f"This happened when constructing an object of type {cls}." + ) + + # registered_funcs = Registrable._registry.get(cls) + registered_funcs = cls.list_available() + if len(registered_funcs) == 0: + raise ConfigurationError("There are no registered functions.") + + as_registrable = cast(Type[Functor], cls) + default_choice = as_registrable.default_implementation + # call with BaseClass.from_prams, should use `type` to point out which subclasss to use + choice = params.pop("type", default_choice) + choices = as_registrable.list_available() + + if choice not in choices: + message = ( + f"{choice} not in acceptable choices for type: {choices}. " + "You should make sure the class is correctly registerd. 
" + ) + raise ConfigurationError(message) + + function = Registrable._registry[as_registrable][choice] + # setattr(function, "__register_type__", choice) + constructor_to_inspect = cast(Callable[..., RegistrableType], function) + accepts_kwargs, kwargs = create_kwargs( + constructor_to_inspect, + cls, + params, + ) + if accepts_kwargs: + params.clear() + if len(params) > 0: + raise ConfigurationError( + f"These params are not used for constructing {cls}:\n{params}" + ) + + return cls(partial(function, **kwargs), choice) + + def to_config(self) -> ConfigTree: + config = {} + + if hasattr(self, "__register_type__") and self.__register_type__: + config["type"] = self.__register_type__ + + for k, v in self._func.keywords.items(): + if k in self.NonParams: + continue + if hasattr(v, "to_config"): + conf = v.to_config() + else: + conf = self._to_config(v) + config[k] = conf + return ConfigFactory.from_dict(config) diff --git a/kag/common/registry/lazy.py b/kag/common/registry/lazy.py new file mode 100644 index 00000000..1b3f281e --- /dev/null +++ b/kag/common/registry/lazy.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import inspect +from pyhocon import ConfigTree +from typing import Callable, Generic, TypeVar, Type, Union, Any + +T = TypeVar("T") + + +class Lazy(Generic[T]): + """ + This class is for use when constructing objects using `Registrable`, when an argument to a + constructor has a _sequential dependency_ with another argument to the same constructor. + + For example, in a `Trainer` class you might want to take a `Model` and an `Optimizer` as arguments, + but the `Optimizer` needs to be constructed using the parameters from the `Model`. You can give + the type annotation `Lazy[Optimizer]` to the optimizer argument, then inside the constructor + call `optimizer.construct(parameters=model.parameters)`. + + This is only recommended for use when you have registered a `@classmethod` as the constructor + for your class, instead of using `__init__`. Having a `Lazy[]` type annotation on an argument + to an `__init__` method makes your class completely dependent on being constructed using the + `Registrable` pipeline, which is not a good idea. + + The actual implementation here is incredibly simple; the logic that handles the lazy + construction is actually found in `Registrable`, where we have a special case for a `Lazy` type + annotation. 
+ + ```python + @classmethod + def my_constructor( + cls, + some_object: Lazy[MyObject], + optional_object: Lazy[MyObject] = None, + required_object_with_default: Lazy[MyObject] = Lazy(MyObjectDefault), + ) -> MyClass: + obj1 = some_object.construct() + obj2 = None if optional_object is None else optional_object.construct() + obj3 = required_object_with_default.construct() + ``` + + """ + + def __init__( + self, constructor: Union[Type[T], Callable[..., T]], original_params: Any = None + ): + constructor_to_use: Callable[..., T] + + if inspect.isclass(constructor): + + def constructor_to_use(**kwargs): + return constructor.from_config(ConfigTree({}), **kwargs) + + else: + constructor_to_use = constructor + + self._constructor = constructor_to_use + self.original_params = original_params + + def construct(self, **kwargs) -> T: + return self._constructor(**kwargs) diff --git a/kag/common/registry/registrable.py b/kag/common/registry/registrable.py new file mode 100644 index 00000000..7e0bbacd --- /dev/null +++ b/kag/common/registry/registrable.py @@ -0,0 +1,911 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import inspect +import importlib +import logging +import functools +import collections +import traceback + +from pathlib import Path +from pyhocon import ConfigTree, ConfigFactory +from pyhocon.exceptions import ConfigMissingException +from copy import deepcopy +from collections import defaultdict +from typing import ( + TypeVar, + Type, + Callable, + Dict, + List, + Optional, + Tuple, + Union, + cast, + Any, + get_origin, + get_args, + Mapping, + Set, + Iterable, +) +from kag.common.registry.lazy import Lazy + + +class ConfigurationError(Exception): + def __init__(self, message: str): + super().__init__() + self.message = message + + def __str__(self): + return self.message + + +logger = logging.getLogger() + +RegistrableType = TypeVar("RegistrableType", bound="Registrable") + + +def str_to_bool(s): + if isinstance(s, bool): + return s + s = s.lower() + if s == "true": + return True + elif s == "false": + return False + elif s == "none": + return None + elif s == "0": + return False + elif s == "1": + return True + else: + raise ValueError(f"not supported string {s}") + + +def auto_setattr(func, self, args, kwargs): + # handle default values + def try_setattr(attr, val): + try: + setattr(self, attr, val) + except Exception as e: + logger.warning( + f"set attribute {attr} of type {type(self)} error, info: {e}" + ) + + attrs, varargs, varkw, defaults = (inspect.getfullargspec(func))[:4] + if defaults: + for attr, val in zip(reversed(attrs), reversed(defaults)): + try_setattr(attr, val) + # handle positional arguments + positional_attrs = attrs[1:] + for attr, val in zip(positional_attrs, args): + try_setattr(attr, val) + + if kwargs: + for attr, val in kwargs.items(): + try_setattr(attr, val) + + +def autoargs(func): + """A decorator which automatically assign the inputs of the function to self PRIOR to executing + the function.""" + + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + auto_setattr(func, self, args=args, 
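A note on the `Lazy` wrapper defined above, before the `Registrable` helpers continue: when given a class it routes through `from_config`, and when given a plain callable it simply defers the call until `construct` supplies the missing pieces. A minimal sketch with a hypothetical class (the interesting `Lazy[...]` annotation handling lives in `construct_arg` further below):

```python
from kag.common.registry import Lazy


class Greeter:
    def __init__(self, name: str, prefix: str = "Hello"):
        self.name, self.prefix = name, prefix


# Wrapping a plain callable defers construction until kwargs are known:
lazy_greeter = Lazy(lambda **kw: Greeter(**kw))
g = lazy_greeter.construct(name="KAG", prefix="Hi")
assert (g.prefix, g.name) == ("Hi", "KAG")
```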
kwargs=kwargs) + + try: + ret = func(self, *args, **kwargs) + except TypeError as e: + raise TypeError( + "call %s.%s failed, details:%s" + % (type(self).__name__, func.__name__, str(e)) + ) + + return ret + + return wrapper + + +def can_accept_arg(obj, arg: str) -> bool: + """ + Checks whether the provided obj takes a certain arg. + If it's a class, we're really checking whether its constructor does. + If it's a function or method, we're checking the object itself. + Otherwise, we raise an error. + """ + if inspect.isclass(obj): + signature = inspect.signature(obj.__init__) + elif inspect.ismethod(obj) or inspect.isfunction(obj): + signature = inspect.signature(obj) + else: + raise ConfigurationError(f"object {obj} is not callable") + return arg in signature.parameters + + +def can_accept_kwargs(obj) -> bool: + """ + Checks whether a provided object takes in any positional arguments. + Similar to accept_arg, we do this for both the __init__ function of + the class or a function / method + Otherwise, we raise an error + """ + if inspect.isclass(obj): + signature = inspect.signature(obj.__init__) + elif inspect.ismethod(obj) or inspect.isfunction(obj): + signature = inspect.signature(obj) + else: + raise ConfigurationError(f"object {obj} is not callable") + return any( + p.kind == inspect.Parameter.VAR_KEYWORD # type: ignore + for p in signature.parameters.values() + ) + + +def can_construct_from_config(type_: Type) -> bool: + if type_ in [str, int, float, bool]: + return True + origin = getattr(type_, "__origin__", None) + if origin == Lazy: + return True + elif origin: + if hasattr(type_, "from_config"): + return True + args = getattr(type_, "__args__") + return all(can_construct_from_config(arg) for arg in args) + + return hasattr(type_, "from_config") + + +def remove_optional(annotation: type) -> type: + """ + Remove Optional[X](alias of Union[T, None]) annotations by filtering out NoneType from Union[X, NoneType]. + """ + origin = get_origin(annotation) + args = get_args(annotation) + + if origin == Union: + return Union[tuple([arg for arg in args if arg != type(None)])] # noqa + else: + return annotation + + +def extract_parameters( + cls: Type[RegistrableType], + constructor: Union[ + Callable[..., RegistrableType], Callable[[RegistrableType], None] + ] = None, +) -> Dict[str, Any]: + """ + Extracts the parameters from the constructor of a class, excluding any variable positional parameters. + + Args: + cls (Type[RegistrableType]): The class whose constructor parameters are to be extracted. + constructor (Union[Callable[..., RegistrableType], Callable[[RegistrableType], None]], optional): The constructor method to inspect. Defaults to cls.__init__. + + Returns: + Dict[str, Any]: A dictionary containing the parameters of the constructor, excluding any variable positional parameters. 
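Stepping back to `str_to_bool` above: `bool(str)` is truthy for any nonempty string, so config values like `"false"` are coerced explicitly. Its behavior in brief:

```python
from kag.common.registry.registrable import str_to_bool

assert str_to_bool("false") is False   # bool("false") would be True
assert str_to_bool("TRUE") is True     # comparison is case-insensitive
assert str_to_bool("none") is None
assert str_to_bool("0") is False and str_to_bool("1") is True
# str_to_bool("maybe") raises ValueError("not supported string maybe")
```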
+ """ + if constructor is None: + constructor = cls.__init__ + if isinstance(constructor, str): + constructor = getattr(cls, constructor) + signature = inspect.signature(constructor) + parameters = dict(signature.parameters) + + var_positional_key = None + for param in parameters.values(): + if param.kind == param.VAR_POSITIONAL: + var_positional_key = param.name + break + if var_positional_key: + del parameters[var_positional_key] + return parameters + + +def create_kwargs( + constructor: Callable[..., RegistrableType], + cls: Type[RegistrableType], + actual_params: ConfigTree, +) -> Tuple[bool, Dict[str, Any]]: + """ + Given some class, a `Params` object, and potentially other keyword arguments, + create a dict of keyword args suitable for passing to the class's constructor. + + The function does this by finding the class's constructor, matching the constructor + arguments to entries in the `params` object, and instantiating values for the parameters + using the type annotation and possibly a from_config method. + + """ + # Get the signature of the constructor. + + kwargs: Dict[str, Any] = {} + + formal_parameters = extract_parameters(cls, constructor) + accepts_kwargs = False + + # Iterate over all the constructor parameters and their annotations. + for param_name, param in formal_parameters.items(): + if param_name == "self": + continue + if param.kind == param.VAR_KEYWORD: + # if constructor takes **kwargs, we will put all the remaining params to kwargs + accepts_kwargs = True + continue + + # annotation = remove_optional(param.annotation) + constructed_arg = pop_and_construct_arg( + cls.__name__, + param_name, + param.annotation, + param.default, + actual_params, + ) + if constructed_arg is not param.default: + kwargs[param_name] = constructed_arg + + # If we just ended up constructing the default value for the parameter, we can just omit it. + # Leaving it in can cause issues with **kwargs in some corner cases, where you might end up + # with multiple values for a single parameter (e.g., the default value gives you lazy=False + # for a dataset reader inside **kwargs, but a particular dataset reader actually hard-codes + # lazy=True - the superclass sees both lazy=True and lazy=False in its constructor). + # if constructor accepts kwargs, put remainder params to kwargs + if accepts_kwargs: + kwargs.update(actual_params) + return accepts_kwargs, kwargs + + +def pop_and_construct_arg( + class_name: str, + argument_name: str, + annotation: Type, + default: Any, + actual_params: ConfigTree, +) -> Any: + annotation = remove_optional(annotation) + popped_params = ( + actual_params.pop(argument_name, default) + if default != inspect.Parameter.empty + else actual_params.pop(argument_name) + ) + if popped_params is None: + return None + + return construct_arg( + class_name, + argument_name, + popped_params, + annotation, + default, + ) + + +def construct_arg( + class_name: str, + argument_name: str, + popped_params: Any, + annotation: Type, + default: Any, +) -> Any: + origin = get_origin(annotation) + args = get_args(annotation) + + optional = default != inspect.Parameter.empty + # annotation is subclass of Registrable + if hasattr(annotation, "from_config"): + if popped_params is default: + return default + elif popped_params is not None: + # If `popped_params` has already been instantiated, use this object directly. 
+ if isinstance(popped_params, annotation): + return popped_params + return annotation.from_config(ConfigFactory.from_dict(popped_params)) + elif not optional: + # Not optional and not supplied, that's an error! + raise ConfigurationError(f"expected key {argument_name} for {class_name}") + else: + return default + + # If the parameter type is a Python primitive, just pop it off + # using the correct casting pop_xyz operation. + elif annotation == int: + if type(popped_params) in {int, bool, str}: + return annotation(popped_params) + else: + raise TypeError(f"Expected {argument_name} to be a {annotation.__name__}.") + elif annotation == bool: + if type(popped_params) in {int, bool}: + return annotation(popped_params) + # string likes 'true', 'false', 'none' can be convert to bool correctly + # NOTE: bool(str) will always return True for nonempty str. + elif type(popped_params) == str: + return str_to_bool(popped_params) + + elif annotation == str: + # Strings are special because we allow casting from Path to str. + if type(popped_params) == str or isinstance(popped_params, Path): + return str(popped_params) # type: ignore + else: + raise TypeError(f"Expected {argument_name} to be a string.") + elif annotation == float: + # Floats are special because in Python, you can put an int wherever you can put a float. + # https://mypy.readthedocs.io/en/stable/duck_type_compatibility.html + if type(popped_params) in {int, float, str}: + return popped_params + else: + raise TypeError(f"Expected {argument_name} to be numeric.") + + elif annotation == ConfigTree: + if isinstance(popped_params, ConfigTree): + return popped_params + elif type(popped_params) in {collections.abc.Mapping, Mapping, Dict, dict}: + return ConfigFactory.from_dict(popped_params) + else: + raise TypeError(f"Expected {argument_name} to be Dict.") + # This is special logic for handling types like Dict[str, TokenIndexer], + # List[TokenIndexer], Tuple[TokenIndexer, Tokenizer], and Set[TokenIndexer], + # which it creates by instantiating each value from_config and returning the resulting structure. + elif ( + origin in {collections.abc.Mapping, Mapping, Dict, dict} + and len(args) == 2 + and can_construct_from_config(args[-1]) + ): + value_cls = annotation.__args__[-1] + + value_dict = {} + + for key, value_params in popped_params.items(): + value_dict[key] = construct_arg( + str(value_cls), + argument_name + "." + key, + value_params, + value_cls, + inspect.Parameter.empty, + ) + + return value_dict + + elif origin in (Tuple, tuple) and all( + can_construct_from_config(arg) for arg in args + ): + value_list = [] + + for i, (value_cls, value_params) in enumerate( + zip(annotation.__args__, popped_params) + ): + value = construct_arg( + str(value_cls), + argument_name + f".{i}", + value_params, + value_cls, + inspect.Parameter.empty, + ) + value_list.append(value) + + return tuple(value_list) + + elif origin in (Set, set) and len(args) == 1 and can_construct_from_config(args[0]): + value_cls = annotation.__args__[0] + + value_set = set() + + for i, value_params in enumerate(popped_params): + value = construct_arg( + str(value_cls), + argument_name + f".{i}", + value_params, + value_cls, + inspect.Parameter.empty, + ) + value_set.add(value) + + return value_set + + elif origin == Union: + # Storing this so we can recover it later if we need to. + backup_params = deepcopy(popped_params) + + # We'll try each of the given types in the union sequentially, returning the first one that + # succeeds. 
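The container branches above mean annotations like `Dict[str, SomeRegistrable]` are constructed value-by-value from nested config. A self-contained sketch with hypothetical classes:

```python
from typing import Dict
from kag.common.registry import Registrable


class Tool(Registrable):
    def __init__(self, power: int = 1):
        self.power = power


@Tool.register("hammer")
class Hammer(Tool):
    pass


class Workshop(Registrable):
    def __init__(self, tools: Dict[str, Tool]):
        self.tools = tools


# Each value under `tools` is built via Tool.from_config, keyed by name:
shop = Workshop.from_config({"tools": {"main": {"type": "hammer", "power": 3}}})
assert shop.tools["main"].power == 3
```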
+ all_err_msg = [] + for arg_annotation in args: + try: + return construct_arg( + str(arg_annotation), + argument_name, + popped_params, + arg_annotation, + default, + ) + except ( + ValueError, + TypeError, + ConfigurationError, + AttributeError, + ConfigMissingException, + ) as e: + # Our attempt to construct the argument may have modified popped_params, so we + # restore it here. + + popped_params = deepcopy(backup_params) + err_msg = f" Exception caught for constructing {arg_annotation}: {e}\n{traceback.format_exc()}" + all_err_msg.append(err_msg) + # If none of them succeeded, we crash. + info_separatpr = f"{'='*40}\n" + info = ( + f"Failed to construct argument {argument_name} with type {annotation}, details:\n" + f"{'='*80}" + f"\n{info_separatpr.join(all_err_msg)}" + ) + + raise ConfigurationError(info) + elif origin == Lazy: + if popped_params is default: + return default + + value_cls = args[0] + + def constructor(**kwargs): + return value_cls.from_config(params=deepcopy(popped_params), **kwargs) + + return Lazy(constructor, deepcopy(popped_params)) # type: ignore + + # For any other kind of iterable, we will just assume that a list is good enough, and treat + # it the same as List. This condition needs to be at the end, so we don't catch other kinds + # of Iterables with this branch. + elif ( + origin in {collections.abc.Iterable, Iterable, List, list} + and len(args) == 1 + and can_construct_from_config(args[0]) + ): + value_cls = annotation.__args__[0] + + value_list = [] + + for i, value_params in enumerate(popped_params): + value = construct_arg( + str(value_cls), + argument_name + f".{i}", + value_params, + value_cls, + inspect.Parameter.empty, + ) + value_list.append(value) + + return value_list + + else: + return popped_params + + +class Registrable: + """ + This class is motivated by the original work: + https://github.com/allenai/allennlp/blob/main/allennlp/common/from_params.py + """ + + _registry: Dict[Type, Dict[str, Tuple[Type, Optional[str]]]] = defaultdict(dict) + default_implementation: Optional[str] = None + NonParams = [] + + @autoargs + def __init__(self, **kwargs): + pass + + @classmethod + def register( + cls: Type[RegistrableType], + name: str, + constructor: str = None, + exist_ok: bool = True, + as_default=False, + ): + registry = Registrable._registry[cls] + if as_default: + cls.default_implementation = name + + def add_subclass_to_registry(subclass: Type[RegistrableType]): + # Add to registry, raise an error if key has already been used. + if name in registry: + if exist_ok: + message = ( + f"{name} of class {subclass} has already been registered as {registry[name][0].__name__}, but " + f"exist_ok=True, so overwriting with {cls.__name__}" + ) + logger.info(message) + else: + message = ( + f"Cannot register {name} as {cls.__name__}; " + f"name already in use for {registry[name][0].__name__}" + ) + raise ConfigurationError(message) + if inspect.isclass(subclass): + # not wrapped. + if not hasattr(subclass.__init__, "__wrapped__"): + subclass.__init__ = autoargs(subclass.__init__) + + registry[name] = (subclass, constructor) + + return subclass + + return add_subclass_to_registry + + @classmethod + def by_name( + cls: Type[RegistrableType], name: str + ) -> Callable[..., RegistrableType]: + """ + Returns a callable function that constructs an argument of the registered class. Because + you can register particular functions as constructors for specific names, this isn't + necessarily the `__init__` method of some class. 
+ """ + subclass, constructor = cls.resolve_class_name(name) + if not constructor: + return subclass + else: + return getattr(subclass, constructor) + + @classmethod + def resolve_class_name( + cls: Type[RegistrableType], name: str + ) -> Tuple[Type[RegistrableType], Optional[str]]: + if name in Registrable._registry[cls]: + subclass, constructor = Registrable._registry[cls][name] + return subclass, constructor + elif "." in name: + # This might be a fully qualified class name, so we'll try importing its "module" + # and finding it there. + parts = name.split(".") + submodule = ".".join(parts[:-1]) + class_name = parts[-1] + + try: + module = importlib.import_module(submodule) + except ModuleNotFoundError: + raise ConfigurationError( + f"tried to interpret {name} as a path to a class " + f"but unable to import module {submodule}" + ) + + try: + subclass = getattr(module, class_name) + constructor = None + return subclass, constructor + except AttributeError: + raise ConfigurationError( + f"tried to interpret {name} as a path to a class " + f"but unable to find class {class_name} in {submodule}" + ) + + else: + # is not a qualified class name + raise ConfigurationError( + f"{name} is not a registered name for {cls.__name__}. " + "You probably need to use the --include-package flag " + "to load your custom code. Alternatively, you can specify your choices " + """using fully-qualified paths, e.g. {"model": "my_module.models.MyModel"} """ + "in which case they will be automatically imported correctly." + ) + + @classmethod + def list_all_registered(cls, with_leaf_classes: bool = False) -> List[str]: + registered = set() + for k, v in Registrable._registry.items(): + registered.add(k) + if with_leaf_classes: + if isinstance(v, dict): + for _, register_cls in v.items(): + registered.add(register_cls[0]) + return sorted(list(registered), key=lambda x: (x.__module__, x.__name__)) + + @classmethod + def list_available(cls) -> List[str]: + """List default first if it exists""" + keys = list(Registrable._registry[cls].keys()) + default = cls.default_implementation + + if default is None: + return keys + elif default not in keys: + raise ConfigurationError( + f"Default implementation {default} is not registered" + ) + else: + return [default] + [k for k in keys if k != default] + + @classmethod + def list_available_with_detail(cls) -> Dict: + """List default first if it exists""" + register_dict = Registrable._registry[cls] + availables = {} + for k, v in register_dict.items(): + params = extract_parameters(v[0], v[1]) + required_params = [] + optional_params = [] + sample_config = {"type": k} + for arg_name, arg_def in params.items(): + if arg_name.strip() == "self": + continue + annotation = arg_def.annotation + if annotation == inspect.Parameter.empty: + annotation = None + default = arg_def.default + required = default == inspect.Parameter.empty + # if default == inspect.Parameter.empty: + # default = None + if required: + arg_info = ( + f"{arg_name}: {annotation.__name__ if annotation else 'Any'}" + ) + required_params.append(arg_info) + else: + arg_info = f"{arg_name}: {annotation.__name__ if annotation else 'Any'} = {default}" + optional_params.append(arg_info) + if required: + sample_config[arg_name] = f"Your {arg_name} config" + else: + sample_config[arg_name] = default + + # if default != None: + # sample_config[arg_name] = default + + if v[1] is None or v[1] == "__init__": + constructor_doc_string = inspect.getdoc(getattr(v[0], "__init__")) + else: + constructor_doc_string = 
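`resolve_class_name` gives every registrable two addressing modes: a registered short name, or a fully-qualified dotted path that is imported on the fly. For example, assuming the built-in clients have been imported so their registrations ran:

```python
from kag.interface import LLMClient

# Registered short name:
assert LLMClient.by_name("mock").__name__ == "MockLLMClient"

# Fully-qualified fallback, imported on demand:
cls_ = LLMClient.by_name("kag.common.llm.mock_llm.MockLLMClient")
assert cls_ is LLMClient.by_name("mock")
```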
inspect.getdoc(getattr(v[0], v[1])) + availables[k] = { + "class": f"{v[0].__module__}.{v[0].__name__}", + "doc": inspect.getdoc(v[0]), + "constructor": constructor_doc_string, + "params": { + "required_params": required_params, + "optional_params": optional_params, + }, + # "default_config": default_conf, + "sample_useage": f"{cls.__name__}.from_config({sample_config})", + } + return availables + + @classmethod + def from_config( + cls: Type[RegistrableType], + params: Union[str, Dict, ConfigTree], + constructor_to_call: Callable[..., RegistrableType] = None, + constructor_to_inspect: Union[ + Callable[..., RegistrableType], Callable[[RegistrableType], None] + ] = None, + ) -> RegistrableType: + """ + Instantiate the object via parameters. + The `constructor_to_call` and `constructor_to_inspect` arguments deal with a bit of + redirection that we do. We allow you to register particular `@classmethods` on a class as + the constructor to use for a registered name. This lets you, e.g., have a single + `Vocabulary` class that can be constructed in two different ways, with different names + registered to each constructor. In order to handle this, we need to know not just the class + we're trying to construct (`cls`), but also what method we should inspect to find its + arguments (`constructor_to_inspect`), and what method to call when we're done constructing + arguments (`constructor_to_call`). These two methods are the same when you've used a + `@classmethod` as your constructor, but they are `different` when you use the default + constructor (because you inspect `__init__`, but call `cls()`). + """ + + logger.debug( + f"instantiating class {cls} from params {getattr(params, 'params', params)} " + ) + + if params is None: + return None + + if isinstance(params, str): + params = ConfigFactory.from_dict({"type": params}) + elif isinstance(params, collections.abc.Mapping) and not isinstance( + params, ConfigTree + ): + params = ConfigFactory.from_dict(params) + original_params = deepcopy(params) + if not isinstance(params, ConfigTree): + raise ConfigurationError( + f"from_config was passed a `{params}` object that was not able to convert to `ConfigTree`. " + "This probably indicates malformed parameters." + f"This happened when constructing an object of type {cls}." + ) + + registered_subclasses = Registrable._registry.get(cls) + try: + # instantiate object from base class + if registered_subclasses and not constructor_to_call: + as_registrable = cast(Type[Registrable], cls) + default_choice = as_registrable.default_implementation + # call with BaseClass.from_prams, should use `type` to point out which subclasss to use + choice = params.pop("type", default_choice) + choices = as_registrable.list_available() + # if cls has subclass and choice not found in params, we'll instantiate cls itself + if choice is None: + subclass, constructor_name = cls, None + # invalid choice encountered, raise + elif choice not in choices: + message = ( + f"{choice} not in acceptable choices for type: {choices}. " + "You should make sure the class is correctly registerd. " + ) + raise ConfigurationError(message) + + else: + subclass, constructor_name = as_registrable.resolve_class_name( + choice + ) + + # See the docstring for an explanation of what's going on here. 
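`list_available_with_detail` turns the registry into self-documentation: for each registered name it reports the implementing class, its docstrings, and which constructor params are required versus optional. A quick way to explore what is registered:

```python
from kag.interface import LLMClient

detail = LLMClient.list_available_with_detail()
for name, info in detail.items():
    print(name, "->", info["class"])
    print("  required:", info["params"]["required_params"])
    print("  optional:", info["params"]["optional_params"])
```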
+                if not constructor_name:
+                    constructor_to_inspect = subclass.__init__
+                    constructor_to_call = subclass  # type: ignore
+                else:
+                    constructor_to_inspect = cast(
+                        Callable[..., RegistrableType],
+                        getattr(subclass, constructor_name),
+                    )
+                    constructor_to_call = constructor_to_inspect
+
+                retyped_subclass = cast(Type[RegistrableType], subclass)
+
+                instant = retyped_subclass.from_config(
+                    params=params,
+                    constructor_to_call=constructor_to_call,
+                    constructor_to_inspect=constructor_to_inspect,
+                )
+
+                setattr(instant, "__register_type__", choice)
+                setattr(instant, "__original_parameters__", original_params)
+            else:
+                # pop unused type declaration
+                register_type = params.pop("type", None)
+
+                if not constructor_to_inspect:
+                    constructor_to_inspect = cls.__init__
+                if not constructor_to_call:
+                    constructor_to_call = cls
+
+                if constructor_to_inspect == object.__init__:
+                    # This class does not have an explicit constructor, so don't give it any kwargs.
+                    # Without this logic, create_kwargs will look at object.__init__ and see that
+                    # it takes *args and **kwargs and look for those.
+                    accepts_kwargs, kwargs = False, {}
+                else:
+                    # This class has a constructor, so create kwargs for it.
+                    constructor_to_inspect = cast(
+                        Callable[..., RegistrableType], constructor_to_inspect
+                    )
+                    accepts_kwargs, kwargs = create_kwargs(
+                        constructor_to_inspect,
+                        cls,
+                        params,
+                    )
+
+                instant = constructor_to_call(**kwargs)  # type: ignore
+                setattr(instant, "__register_type__", register_type)
+                setattr(
+                    instant,
+                    "__constructor_called__",
+                    functools.partial(constructor_to_call, **kwargs),
+                )
+                setattr(instant, "__original_parameters__", original_params)
+                # if the constructor takes **kwargs, they can't be inferred from its signature.
+                # Therefore we record which attrs were created from kwargs so that `to_config`
+                # can correctly restore the configs.
+                if accepts_kwargs:
+                    remaining_kwargs = set(params)
+                    params.clear()
+                    setattr(instant, "__from_config_kwargs__", remaining_kwargs)
+        except Exception as e:
+            logger.warning(f"Failed to initialize class {cls}, info: {e}")
+            raise e
+        if len(params) > 0:
+            logger.warning(
+                f"These params are not used for constructing {cls}:\n{params}"
+            )
+
+        return instant
+
+    def _to_config(self, v):
+        """Iteratively convert v to params."""
+        v_type = type(v)
+        if hasattr(v, "to_config"):
+            params = v.to_config()
+        elif v_type in {collections.abc.Mapping, Mapping, Dict, dict}:
+            params = {}
+            for subk, subv in v.items():
+                params[subk] = self._to_config(subv)
+        elif v_type in {
+            collections.abc.Iterable,
+            Iterable,
+            List,
+            list,
+            Tuple,
+            tuple,
+            Set,
+            set,
+        }:
+            params = [self._to_config(x) for x in v]
+        else:
+            params = v
+        return params
+
+    def to_config(self) -> ConfigTree:
+        """
+        Convert the object back to params.
+        Note: if the object was not instantiated by `from_config`, it cannot be
+        converted back.
+        """
+        # The user can modify the object after instantiation, so directly returning
+        # the original params may be inaccurate.
+        config = {}
+
+        if hasattr(self, "__register_type__") and self.__register_type__:
+            config["type"] = self.__register_type__
+
+        for k, v in self.__constructor_called__.keywords.items():
+            if k in self.NonParams:
+                continue
+            # we don't directly use the value stored in __constructor_called__.keywords, because
+            # the value could be a Lazy object, which can't convert to params. Instead, we use
+            # attrs of the instance itself.
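+            # This relies on the convention that each constructor argument is stored
+            # on the instance under an attribute of the same name; arguments without
+            # a matching attribute fall back to the recorded keyword value.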
+            if hasattr(self, k):
+                v = getattr(self, k)
+            if hasattr(v, "to_config"):
+                conf = v.to_config()
+            else:
+                conf = self._to_config(v)
+            config[k] = conf
+        return ConfigFactory.from_dict(config)
+
+    def to_config_with_constructor(self, constructor: str = None) -> ConfigTree:
+        """Convert the object back to params.
+        Unlike `to_config`, this function can convert objects that were not instantiated
+        by `from_config`, but it may not always give a correct result. For example, if a
+        class has more than one constructor and the object was instantiated via
+        constructor A, converting it to the params of constructor B will be wrong.
+        Use it with caution.
+        One should always use `from_config` to instantiate the object and `to_config`
+        to convert it back to params.
+        """
+        config = {}
+
+        if hasattr(self, "__register_type__") and self.__register_type__:
+            config["type"] = self.__register_type__
+        if constructor:
+            constructor = getattr(self, constructor)
+        else:
+            constructor = self.__init__
+
+        constructor_params = extract_parameters(type(self), constructor)
+        accepts_kwargs = False
+        for k, v in constructor_params.items():
+            if k in self.NonParams:
+                continue
+
+            if v.kind == v.VAR_KEYWORD:
+                accepts_kwargs = True
+                continue
+            # get the param instance from the corresponding class attr
+            v_instance = getattr(self, v.name, None)
+
+            if hasattr(v_instance, "to_config"):
+                conf = v_instance.to_config()
+            else:
+                conf = self._to_config(v_instance)
+            config[k] = conf
+        if accepts_kwargs:
+            for k in self.__from_config_kwargs__:
+                if hasattr(self, k):
+                    config[k] = getattr(self, k)
+        return ConfigFactory.from_dict(config)
diff --git a/kag/common/registry/utils.py b/kag/common/registry/utils.py
new file mode 100644
index 00000000..247d5845
--- /dev/null
+++ b/kag/common/registry/utils.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+# Copyright 2023 OpenSPG Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied.
+
+import os
+import sys
+import importlib
+import pkgutil
+from pathlib import Path
+from typing import Union
+
+
+def append_python_path(path: Union[os.PathLike, str]) -> None:
+    """
+    Append the given path to `sys.path`.
+    """
+    # In some environments, such as TC, it fails when sys.path contains a relative path, such as ".".
+    path = Path(path).resolve()
+    path = str(path)
+    sys.path.append(path)
+
+
+def import_modules_from_path(path: str) -> None:
+    """
+    Import all submodules under the given package.
+    Users can point this at their custom packages to have their custom
+    classes loaded and registered.
+    """
+    path = os.path.abspath(os.path.normpath(path))
+    importlib.invalidate_caches()
+    tmp = path.rsplit("/", 1)
+    if len(tmp) == 1:
+        module_path = "."
+        package_name = tmp[0]
+    else:
+        module_path, package_name = tmp
+    append_python_path(module_path)
+    # Import at top level
+    module = importlib.import_module(package_name)
+    path = list(getattr(module, "__path__", []))
+    path_string = "" if not path else path[0]
+    # walk_packages only finds immediate children, so we need to recurse.
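+    # Note: the path handling above splits on "/", so this helper assumes
+    # POSIX-style paths. Each recursive call below re-enters this function
+    # once per immediate child package.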
+    for module_finder, name, _ in pkgutil.walk_packages(path):
+        # Sometimes when you import third-party libraries that are on your path,
+        # `pkgutil.walk_packages` returns those too, so we need to skip them.
+        if path_string and module_finder.path != path_string:
+            continue
+        subpackage = f"{path_string}/{name}"
+
+        import_modules_from_path(subpackage)
diff --git a/kag/common/reranker/__init__.py b/kag/common/reranker/__init__.py
index a945c8dd..4d9914d1 100644
--- a/kag/common/reranker/__init__.py
+++ b/kag/common/reranker/__init__.py
@@ -13,7 +13,4 @@
 from kag.common.reranker.bge_reranker import BGEReranker
 from kag.common.reranker.reranker import Reranker
 
-__all__ = [
-    "BGEReranker",
-    "Reranker"
-]
+__all__ = ["BGEReranker", "Reranker"]
diff --git a/kag/common/reranker/bge_reranker.py b/kag/common/reranker/bge_reranker.py
index 45a63615..e74cb022 100644
--- a/kag/common/reranker/bge_reranker.py
+++ b/kag/common/reranker/bge_reranker.py
@@ -20,60 +20,61 @@
 def rrf_score(length, r: int = 1):
     """
     Calculates RRF (Reciprocal Rank Fusion) scores.
-
+
     This function generates a score sequence of the given length, where each score is calculated
     based on the index according to the formula 1/(r+i).
     RRF is a method used in information retrieval and data analysis, and this function provides
     a way to generate weights based on document indices.
-
+
     Parameters:
     length: int, the length of the score sequence, i.e., the number of scores to generate.
     r: int, optional, default is 1. Controls the starting index of the scores. Increasing the value of r
        shifts the emphasis towards later scores.
-
+
     Returns:
     numpy.ndarray, an array containing the scores calculated according to the given formula.
     """
     return np.array([1 / (r + i) for i in range(length)])
 
-
 class BGEReranker(Reranker):
     """
     BGEReranker class is a subclass of Reranker that reranks given queries and passages.
-
+
     This class uses the FlagReranker model from FlagEmbedding to score and reorder passages.
-
+
     Args:
         model_path (str): Path to the FlagReranker model.
        use_fp16 (bool): Whether to use half-precision floating-point numbers for computation. Default is True.
     """
+
     def __init__(self, model_path: str, use_fp16: bool = True):
         from FlagEmbedding import FlagReranker
+
         self.model_path = model_path
         self.model = FlagReranker(self.model_path, use_fp16=use_fp16)
 
     def rerank(self, queries: List[str], passages: List[str]):
         """
         Reranks given queries and passages.
-
+
         Args:
             queries (List[str]): List of queries.
             passages (List[str]): List of passages, where each passage is a string.
-
+
         Returns:
             new_passages (List[str]): List of passages after reranking.
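+
+        Example (illustrative; the checkpoint path is an assumption):
+            reranker = BGEReranker("/path/to/bge-reranker-large")
+            ordered = reranker.rerank(["what is KAG?"], ["passage a", "passage b"])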
""" # Calculate initial ranking scores for passages rank_scores = rrf_score(len(passages)) passage_scores = np.zeros(len(passages)) + rank_scores - + # For each query, compute passage scores using the model and accumulate them for query in queries: scores = self.model.compute_score([[query, x] for x in passages]) sorted_idx = np.argsort(-np.array(scores)) for rank, passage_id in enumerate(sorted_idx): passage_scores[passage_id] += rank_scores[rank] - + # Perform final sorting of passages based on accumulated scores merged_sorted_idx = np.argsort(-passage_scores) - + new_passages = [passages[x] for x in merged_sorted_idx] - return new_passages \ No newline at end of file + return new_passages diff --git a/kag/common/reranker/reranker.py b/kag/common/reranker/reranker.py index 69b97a25..92e6d968 100644 --- a/kag/common/reranker/reranker.py +++ b/kag/common/reranker/reranker.py @@ -43,4 +43,4 @@ def rerank(self, queries: List[str], passages: List[str]): The function is currently not implemented and raises an exception to indicate this. """ - raise NotImplementedError("rerank not implemented yet.") \ No newline at end of file + raise NotImplementedError("rerank not implemented yet.") diff --git a/kag/common/retriever/kag_retriever.py b/kag/common/retriever/kag_retriever.py deleted file mode 100644 index 4bc19aff..00000000 --- a/kag/common/retriever/kag_retriever.py +++ /dev/null @@ -1,414 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import os -from tenacity import retry, stop_after_attempt - -from kag.common.base.prompt_op import PromptOp -from kag.common.vectorizer import Vectorizer -from knext.graph_algo.client import GraphAlgoClient -from kag.interface.retriever.chunk_retriever_abc import ChunkRetrieverABC -from typing import List, Dict - -import numpy as np -import logging - -from knext.reasoner.client import ReasonerClient -from knext.schema.client import CHUNK_TYPE, OTHER_TYPE -from knext.project.client import ProjectClient -from kag.common.utils import processing_phrases -from knext.search.client import SearchClient -from kag.solver.logic.core_modules.common.schema_utils import SchemaUtils -from kag.solver.logic.core_modules.config import LogicFormConfiguration - -logger = logging.getLogger(__name__) - - -class DefaultRetriever(ChunkRetrieverABC): - """ - KAGRetriever class for retrieving and processing knowledge graph data from a graph database. - - this retriever references the implementation of Hippoag for the combination of dpr & ppr, developer can define your Retriever - - Parameters: - - project_id (str, optional): Project ID to load specific project configurations. - - host_addr (str, optional): host addr to load specific server addr configurations. 
- """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - self.schema_util = SchemaUtils(LogicFormConfiguration(kwargs)) - - self._init_search() - - self.ner_prompt = PromptOp.load(self.biz_scene, "question_ner")(language=self.language, project_id=self.project_id) - self.std_prompt = PromptOp.load(self.biz_scene, "std")(language=self.language) - - self.pagerank_threshold = 0.9 - self.match_threshold = 0.8 - self.pagerank_weight = 0.5 - - self.reranker_model_path = os.getenv("KAG_RETRIEVER_RERANKER_MODEL_PATH") - if self.reranker_model_path: - from kag.common.reranker.reranker import BGEReranker - self.reranker = BGEReranker(self.reranker_model_path, use_fp16=True) - else: - self.reranker = None - - self.with_semantic = True - - def _init_search(self): - self.sc: SearchClient = SearchClient(self.host_addr, self.project_id) - vectorizer_config = eval(os.getenv("KAG_VECTORIZER", "{}")) - if self.host_addr and self.project_id: - config = ProjectClient(host_addr=self.host_addr, project_id=self.project_id).get_config(self.project_id) - vectorizer_config.update(config.get("vectorizer", {})) - - self.vectorizer = Vectorizer.from_config( - vectorizer_config - ) - self.reason: ReasonerClient = ReasonerClient(self.host_addr, self.project_id) - self.graph_algo = GraphAlgoClient(self.host_addr, self.project_id) - - - - - @retry(stop=stop_after_attempt(3)) - def named_entity_recognition(self, query: str): - """ - Perform named entity recognition. - - This method invokes the pre-configured service client (self.llm) to process the input query, - using the named entity recognition (NER) prompt (self.ner_prompt). - - Parameters: - query (str): The text input provided by the user or system for named entity recognition. - - Returns: - The result returned by the service client, with the type and format depending on the used service. - """ - return self.llm_module.invoke({"input": query}, self.ner_prompt) - - @retry(stop=stop_after_attempt(3)) - def named_entity_standardization(self, query: str, entities: List[Dict]): - """ - Entity standardization function. - - This function calls a remote service to process the input query and named entities, - standardizing the entities. This is useful for unifying different representations of the same entity in text, - improving the performance of natural language processing tasks. - - Parameters: - - query: A string containing the query with named entities. - - entities: A list of dictionaries, each containing information about named entities. - - Returns: - - The result of the remote service call, typically standardized named entity information. - """ - return self.llm_module.invoke( - {"input": query, "named_entities": entities}, self.std_prompt - ) - - @staticmethod - def append_official_name(source_entities: List[Dict], entities_with_official_name: List[Dict]): - """ - Appends official names to entities. - - Parameters: - source_entities (List[Dict]): A list of source entities. - entities_with_official_name (List[Dict]): A list of entities with official names. 
- - """ - tmp_dict = {} - for tmp_entity in entities_with_official_name: - name = tmp_entity["entity"] - category = tmp_entity["category"] - official_name = tmp_entity["official_name"] - key = f"{category}{name}" - tmp_dict[key] = official_name - - for tmp_entity in source_entities: - name = tmp_entity["entity"] - category = tmp_entity["category"] - key = f"{category}{name}" - if key in tmp_dict: - official_name = tmp_dict[key] - tmp_entity["official_name"] = official_name - - def calculate_sim_scores(self, query: str, doc_nums: int): - """ - Calculate the vector similarity scores between a query and document chunks. - - Parameters: - query (str): The user's query text. - doc_nums (int): The number of document chunks to return. - - Returns: - dict: A dictionary with keys as document chunk IDs and values as the vector similarity scores. - """ - scores = dict() - try: - query_vector = self.vectorizer.vectorize(query) - top_k = self.sc.search_vector( - label=self.schema_util.get_label_within_prefix(CHUNK_TYPE), - property_key="content", - query_vector=query_vector, - topk=doc_nums - ) - scores = {item["node"]["id"]: item["score"] for item in top_k} - except Exception as e: - logger.error( - f"run calculate_sim_scores failed, info: {e}", exc_info=True - ) - return scores - - def calculate_pagerank_scores(self, start_nodes: List[Dict]): - """ - Calculate and retrieve PageRank scores for the given starting nodes. - - Parameters: - start_nodes (list): A list containing document fragment IDs to be used as starting nodes for the PageRank algorithm. - - Returns: - ppr_doc_scores (dict): A dictionary containing each document fragment ID and its corresponding PageRank score. - - This method uses the PageRank algorithm in the graph store to compute scores for document fragments. If `start_nodes` is empty, - it returns an empty dictionary. Otherwise, it attempts to retrieve PageRank scores from the graph store and converts the result - into a dictionary format where keys are document fragment IDs and values are their respective PageRank scores. Any exceptions, - such as failures in running `run_pagerank_igraph_chunk`, are logged. - """ - scores = dict() - if len(start_nodes) != 0: - try: - scores = self.graph_algo.calculate_pagerank_scores( - self.schema_util.get_label_within_prefix(CHUNK_TYPE), - start_nodes - ) - except Exception as e: - logger.error( - f"run calculate_pagerank_scores failed, info: {e}, start_nodes: {start_nodes}", exc_info=True - ) - return scores - - def match_entities(self, queries: Dict[str, str], top_k: int = 1): - """ - Match entities based on the provided queries. - - :param queries: A dictionary containing keywords and their labels. - :param top_k: The number of top results to return. Default is 1. - :return: A tuple containing a list of matched entities and their scores. 
- """ - matched_entities = [] - matched_entities_scores = [] - for query, query_type in queries.items(): - query = processing_phrases(query) - if query_type not in self.schema_util.node_en_zh.keys(): - query_type = self.schema_util.get_label_within_prefix(OTHER_TYPE) - else: - query_type = self.schema_util.get_label_within_prefix(query_type) - typed_nodes = self.sc.search_vector( - label=query_type, - property_key="name", - query_vector=self.vectorizer.vectorize(query), - topk=top_k, - ) - if query_type != self.schema_util.get_label_within_prefix(OTHER_TYPE): - nontyped_nodes = self.sc.search_vector( - label=self.schema_util.get_label_within_prefix(OTHER_TYPE), - property_key="name", - query_vector=self.vectorizer.vectorize(query), - topk=top_k, - ) - else: - nontyped_nodes = typed_nodes - - if len(typed_nodes) == 0 and len(nontyped_nodes) != 0: - matched_entities.append( - {"name": nontyped_nodes[0]["node"]["name"], "type": OTHER_TYPE} - ) - matched_entities_scores.append(nontyped_nodes[0]["score"]) - elif len(typed_nodes) != 0 and len(nontyped_nodes) != 0: - if typed_nodes[0]["score"] > 0.8: - matched_entities.append( - {"name": typed_nodes[0]["node"]["name"], "type": query_type} - ) - matched_entities_scores.append(typed_nodes[0]["score"]) - else: - matched_entities.append( - {"name": nontyped_nodes[0]["node"]["name"], "type": OTHER_TYPE} - ) - matched_entities_scores.append(nontyped_nodes[0]["score"]) - matched_entities.append( - {"name": typed_nodes[0]["node"]["name"], "type": query_type} - ) - matched_entities_scores.append(typed_nodes[0]["score"]) - elif len(typed_nodes) != 0 and len(nontyped_nodes) == 0: - if typed_nodes[0]["score"] > 0.8: - matched_entities.append( - {"name": typed_nodes[0]["node"]["name"], "type": query_type} - ) - matched_entities_scores.append(typed_nodes[0]["score"]) - - if not matched_entities: - logger.info(f"No entities matched for {queries}") - return matched_entities, matched_entities_scores - - def calculate_combined_scores(self, sim_scores: Dict[str, float], pagerank_scores: Dict[str, float]): - """ - Calculate and return the combined scores that integrate both similarity scores and PageRank scores. - - Parameters: - sim_scores (Dict[str, float]): A dictionary containing similarity scores, where keys are identifiers and values are scores. - pagerank_scores (Dict[str, float]): A dictionary containing PageRank scores, where keys are identifiers and values are scores. - - Returns: - Dict[str, float]: A dictionary containing the combined scores, where keys are identifiers and values are the combined scores. - """ - def min_max_normalize(x): - if len(x) == 0: - return [] - if np.max(x) - np.min(x) > 0: - return (x - np.min(x)) / (np.max(x) - np.min(x)) - else: - return x - np.min(x) - - all_keys = set(pagerank_scores.keys()).union(set(sim_scores.keys())) - for key in all_keys: - sim_scores.setdefault(key, 0.0) - pagerank_scores.setdefault(key, 0.0) - sim_scores = dict(zip(sim_scores.keys(), min_max_normalize( - np.array(list(sim_scores.values())) - ))) - pagerank_scores = dict(zip(pagerank_scores.keys(), min_max_normalize( - np.array(list(pagerank_scores.values())) - ))) - combined_scores = dict() - for key in pagerank_scores.keys(): - combined_scores[key] = (sim_scores[key] * (1 - self.pagerank_weight) + - pagerank_scores[key] * self.pagerank_weight - ) - return combined_scores - - def recall_docs(self, query: str, top_k: int = 5, **kwargs): - """ - Recall relevant documents based on the query string. 
- - Parameters: - - query (str): The user's query string. - - top_k (int, optional): The number of documents to return, default is 5. - - Keyword Arguments: - - kwargs: Additional keyword arguments. - - Returns: - - list: A list containing the top_k most relevant documents. - """ - assert isinstance(query, str), "Query must be a string" - - chunk_nums = top_k * 20 - if chunk_nums == 0: - return [] - - ner_list = self.named_entity_recognition(query) - print(ner_list) - if self.with_semantic: - std_ner_list = self.named_entity_standardization(query, ner_list) - self.append_official_name(ner_list, std_ner_list) - - entities = {} - for item in ner_list: - entity = item.get("entity", "") - category = item.get("category", "") - official_name = item.get("official_name", "") - if not entity or not (category or official_name): - continue - if category.lower() in ["works", "person", "other"]: - entities[entity] = category - else: - entities[entity] = official_name or category - - sim_scores = self.calculate_sim_scores(query, chunk_nums) - matched_entities, matched_scores = self.match_entities(entities) - pagerank_scores = self.calculate_pagerank_scores(matched_entities) - - if not matched_entities: - combined_scores = sim_scores - elif matched_entities and np.min(matched_scores) > self.pagerank_threshold: - combined_scores = pagerank_scores - else: - combined_scores = self.calculate_combined_scores(sim_scores, pagerank_scores) - sorted_scores = sorted( - combined_scores.items(), key=lambda item: item[1], reverse=True - ) - logger.debug(f"sorted_scores: {sorted_scores}") - - return self.get_all_docs_by_id(query, sorted_scores, top_k) - - def get_all_docs_by_id(self, query: str, doc_ids: list, top_k: int): - """ - Retrieve a list of documents based on their IDs. - - Parameters: - - query (str): The query string for text matching. - - doc_ids (list): A list of document IDs to retrieve documents. - - top_k (int): The maximum number of documents to return. - - Returns: - - list: A list of matched documents. - """ - matched_docs = [] - hits_docs = set() - counter = 0 - for doc_id in doc_ids: - if counter == top_k: - break - if isinstance(doc_id, tuple): - doc_score = doc_id[1] - doc_id = doc_id[0] - else: - doc_score = doc_ids[doc_id] - counter += 1 - node = self.reason.query_node(label=self.schema_util.get_label_within_prefix(CHUNK_TYPE), id_value=doc_id) - node_dict = dict(node.items()) - matched_docs.append(f"#{node_dict['name']}#{node_dict['content']}#{doc_score}") - hits_docs.add(node_dict['name']) - try: - text_matched = self.sc.search_text(query, [self.schema_util.get_label_within_prefix(CHUNK_TYPE)], topk=1) - if text_matched: - for item in text_matched: - title = item["node"]["name"] - if title not in hits_docs: - if len(matched_docs) > 0: - matched_docs.pop() - else: - logger.warning(f"{query} matched docs is empty") - matched_docs.append(f'#{item["node"]["name"]}#{item["node"]["content"]}#{item["score"]}') - break - except Exception as e: - logger.warning(f"{query} query chunk failed: {e}", exc_info=True) - logger.debug(f"matched_docs: {matched_docs}") - return matched_docs - - def rerank_docs(self, queries: List[str], passages: List[str]): - """ - Re-ranks the given passages based on the provided queries. - - Parameters: - - queries (List[str]): A list of queries. - - passages (List[str]): A list of passages. - - Returns: - - List[str]: A re-ranked list of passages. 
- """ - if self.reranker is None: - return passages - return self.reranker.rerank(queries, passages) diff --git a/kag/common/retriever/retriever.py b/kag/common/retriever/retriever.py deleted file mode 100644 index e125248b..00000000 --- a/kag/common/retriever/retriever.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import io -import json -from pathlib import Path -from abc import ABC, abstractmethod -from typing import Any, Union, Iterable, Tuple - -from typing import Dict -import logging - -logger = logging.getLogger(__name__) - -Item = Dict[str, Any] -RetrievalResult = Iterable[Tuple[Item, float]] - - -class Retriever(ABC): - """ - Retriever indexing a collection of items and supports fast retrieving of the - desired items given a query. - """ - - @classmethod - def from_config(cls, config: Union[str, Path, Dict[str, Any]]) -> "Retriever": - """ - Create retriever from `config`. - - If `config` is a string or path, it will be loaded as a dictionary depending - on its file extension. Currently, the following formats are supported: - - * .json: JSON - * .json5: JSON with comments support - * .yaml: YAML - - :param config: retriever config - :type config: str, Path or Dict[str, Any] - :return: retriever instance - :rtype: Retriever - """ - from kag.common.utils import dynamic_import_class - - if isinstance(config, (str, Path)): - config_path = config - if not isinstance(config_path, Path): - config_path = Path(config_path) - if config_path.name.endswith(".yaml"): - import yaml - - with io.open(config_path, "r", encoding="utf-8") as fin: - config = yaml.safe_load(fin) - elif config_path.name.endswith(".json5"): - import json5 - - with io.open(config_path, "r", encoding="utf-8") as fin: - config = json5.load(fin) - elif config_path.name.endswith(".json"): - with io.open(config_path, "r", encoding="utf-8") as fin: - config = json.load(fin) - else: - message = "only .json, .json5 and .yaml are supported currently; " - message += "can not load retriever config from %r" % str(config_path) - raise RuntimeError(message) - elif isinstance(config, dict): - pass - else: - message = "only str, Path and dict are supported; " - message += "invalid retriever config: %r" % (config,) - raise RuntimeError(message) - - class_name = config.get("retriever") - if class_name is None: - message = "retriever class name is not specified" - raise RuntimeError(message) - retriever_class = dynamic_import_class(class_name, "retriever") - if not issubclass(retriever_class, Retriever): - message = "class %r is not a retriever class" % (class_name,) - raise RuntimeError(message) - retriever = retriever_class._from_config(config) - return retriever - - @classmethod - @abstractmethod - def _from_config(cls, config: Dict[str, Any]) -> "Retriever": - """ - Create retriever from `config`. This method is supposed to be implemented - by derived classes. 
- - :param config: retriever config - :type config: Dict[str, Any] - :return: retriever instance - :rtype: Retriever - """ - message = "abstract method _from_config is not implemented" - raise NotImplementedError(message) - - def index(self, items: Union[Item, Iterable[Item]]) -> None: - """ - Add one or more items to the index of the retriever. - - NOTE: This method may not be supported by the retriever. - - :param items: items to index - :type items: Item or Iterable[Item] - """ - message = "method index is not supported by the retriever" - raise RuntimeError(message) - - @abstractmethod - def retrieve( - self, queries: Union[str, Iterable[str]], top_k: int = 10 - ) -> Union[RetrievalResult, Iterable[RetrievalResult]]: - """ - Retrieve items for the given query or queries. - - :param queries: queries to retrieve - :type queries: str or Iterable[str] - :param int top_k: how many most related items to return for each query, default to 10 - :return: retrieval results of the queries - :rtype: RetrievalResult or Iterable[RetrievalResult] - """ - message = "abstract method retrieve is not implemented" - raise NotImplementedError(message) - - diff --git a/kag/common/sharding_info.py b/kag/common/sharding_info.py new file mode 100644 index 00000000..08d7c4cf --- /dev/null +++ b/kag/common/sharding_info.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- +from kag.common.registry import Registrable + + +class ShardingInfo(Registrable): + """ + A class representing sharding information for distributed computing. + + This class provides methods to manage and query sharding information across + multiple machines, instances, and processes. It inherits from the `Registrable` + class. + + Attributes: + machine_id (int): The ID of the current machine. Default is 0. + machine_count (int): The total number of machines. Default is 1. + instance_id (int): The ID of the current instance. Default is 0. + instance_count (int): The total number of instances. Default is 1. + process_id (int): The ID of the current process. Default is 0. + process_count (int): The total number of processes. Default is 1. + shard_id (int, optional): The ID of the current shard. Default is None. + shard_count (int, optional): The total number of shards. Default is None. + shard_by_machine (bool): Whether to shard by machine. Default is True. + shard_by_instance (bool): Whether to shard by instance. Default is True. + shard_by_process (bool): Whether to shard by process. Default is True. + """ + + def __init__( + self, + machine_id: int = 0, + machine_count: int = 1, + instance_id: int = 0, + instance_count: int = 1, + process_id: int = 0, + process_count: int = 1, + shard_id: int = None, + shard_count: int = None, + ): + """ + Initializes a new instance of the ShardingInfo class. + + Args: + machine_id (int): The ID of the current machine. Default is 0. + machine_count (int): The total number of machines. Default is 1. + instance_id (int): The ID of the current instance. Default is 0. + instance_count (int): The total number of instances. Default is 1. + process_id (int): The ID of the current process. Default is 0. + process_count (int): The total number of processes. Default is 1. + shard_id (int, optional): The ID of the current shard. Default is None. + shard_count (int, optional): The total number of shards. Default is None. 
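+
+        Example (illustrative): with 2 machines and 2 processes per machine,
+        machine 1 / process 0 is rank 2 in a world of size 4:
+            info = ShardingInfo(machine_id=1, machine_count=2, process_id=0, process_count=2)
+            info.get_rank()        # 2
+            info.get_world_size()  # 4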
+ """ + self.instance_id = instance_id + self.instance_count = instance_count + self.machine_id = machine_id + self.machine_count = machine_count + self.process_id = process_id + self.process_count = process_count + self.shard_id = shard_id + self.shard_count = shard_count + + self.shard_by_machine = True + self.shard_by_instance = True + self.shard_by_process = True + + def shard_by( + self, machine: bool = True, instance: bool = True, process: bool = True + ): + """ + Configures the sharding strategy by specifying whether to shard by machine, + instance, or process. + + Args: + machine (bool): Whether to shard by machine. Default is True. + instance (bool): Whether to shard by instance. Default is True. + process (bool): Whether to shard by process. Default is True. + """ + self.shard_by_machine = machine + self.shard_by_instance = instance + self.shard_by_process = process + + def get_rank(self): + """ + Returns the rank of the current shard based on the configured sharding strategy. + + Returns: + int: The rank of the current shard. + """ + if self.shard_id is not None: + return self.shard_id + if self.shard_by_machine: + machine_id = self.machine_id + else: + machine_id = 0 + if self.shard_by_instance: + instance_id, instance_count = self.instance_id, self.instance_count + else: + instance_id, instance_count = 0, 1 + if self.shard_by_process: + process_id, process_count = self.process_id, self.process_count + else: + process_id, process_count = 0, 1 + + return process_count * (machine_id * instance_count + instance_id) + process_id + + def get_world_size(self): + """ + Returns the total number of shards in the world based on the configured sharding strategy. + + Returns: + int: The total number of shards. + """ + if self.shard_count is not None: + return self.shard_count + world_size = 1 + if self.shard_by_machine: + world_size *= self.machine_count + if self.shard_by_instance: + world_size *= self.instance_count + if self.shard_by_process: + world_size *= self.process_count + return world_size + + def get_sharding_range(self, total: int): + """ + Returns the range of indices that the current shard is responsible for. + + Args: + total (int): The total number of items to be sharded. + + Returns: + Tuple[int, int]: A tuple containing the start and end indices of the range. + """ + rank = self.get_rank() + world_size = self.get_world_size() + if total % world_size == 0: + workload = total // world_size + else: + workload = total // world_size + 1 + start = workload * rank + end = min(total, workload * (rank + 1)) + return start, end + + @property + def is_master_process(self): + """ + Checks if the current process is the master process. + + Returns: + bool: True if the current process is the master process, False otherwise. + """ + return self.process_id == 0 + + @property + def is_master_instance(self): + """ + Checks if the current instance is the master instance. + + Returns: + bool: True if the current instance is the master instance, False otherwise. + """ + return self.instance_id == 0 + + @property + def is_master_machine(self): + """ + Checks if the current machine is the master machine. + + Returns: + bool: True if the current machine is the master machine, False otherwise. + """ + return self.machine_id == 0 + + def __str__(self): + """ + Returns a string representation of the ShardingInfo object. + + Returns: + str: A string containing the rank, world size, and other sharding details. 
+ """ + content = ( + f"ShardingInfo: rank={self.get_rank()}, world_size={self.get_world_size()}, " + f"machine: {self.machine_id}/{self.machine_count}, " + f"instance: {self.instance_id}/{self.instance_count}, " + f"process: {self.process_id}/{self.process_count}" + ) + return content + + __repr__ = __str__ + + def copy(self): + """ + Creates a copy of the current ShardingInfo object. + + Returns: + ShardingInfo: A new instance of ShardingInfo with the same attributes. + """ + return ShardingInfo( + self.machine_id, + self.machine_count, + self.instance_id, + self.instance_count, + self.process_id, + self.process_count, + self.shard_id, + self.shard_count, + ) + + +ShardingInfo.register("base")(ShardingInfo) diff --git a/kag/common/utils.py b/kag/common/utils.py index 2a6f5ac0..c7c98924 100644 --- a/kag/common/utils.py +++ b/kag/common/utils.py @@ -12,51 +12,30 @@ import re import sys import json -from typing import Type,Tuple -import inspect +import hashlib import os -from pathlib import Path +import tempfile +import requests import importlib +from typing import Tuple +from pathlib import Path + from shutil import copystat, copy2 from typing import Any, Union from jinja2 import Environment, FileSystemLoader, Template from stat import S_IWUSR as OWNER_WRITE_PERMISSION +from tenacity import retry, stop_after_attempt - -def _register(root, path, files, class_type): - relative_path = os.path.relpath(path, root) - module_prefix = relative_path.replace(".", "").replace("/", ".") - module_prefix = module_prefix + "." if module_prefix else "" - for file_name in files: - if file_name.endswith(".py"): - module_name = module_prefix + os.path.splitext(file_name)[0] - import importlib - - module = importlib.import_module(module_name) - classes = inspect.getmembers(module, inspect.isclass) - for class_name, class_obj in classes: - if ( - issubclass(class_obj, class_type) - and inspect.getmodule(class_obj) == module - ): - - class_type.register( - name=class_name, - local_path=os.path.join(path, file_name), - module_path=module_name, - )(class_obj) - - -def register_from_package(path: str, class_type: Type) -> None: - """ - Register all classes under the given package. - Only registered classes can be recognized by kag. 
- """ - if not append_python_path(path): - return - for root, dirs, files in os.walk(path): - _register(path, root, files, class_type) - class_type._has_registered = True +reset = "\033[0m" +bold = "\033[1m" +underline = "\033[4m" +red = "\033[31m" +green = "\033[32m" +yellow = "\033[33m" +blue = "\033[34m" +magenta = "\033[35m" +cyan = "\033[36m" +white = "\033[37m" def append_python_path(path: str) -> bool: @@ -70,6 +49,7 @@ def append_python_path(path: str) -> bool: return True return False + def render_template( root_dir: Union[str, os.PathLike], file: Union[str, os.PathLike], **kwargs: Any ) -> None: @@ -82,7 +62,6 @@ def render_template( if path_obj.suffix == ".tmpl": path_obj.rename(render_path) - render_path.write_text(content, "utf8") @@ -113,7 +92,7 @@ def copyfile(src: Path, dst: Path, **kwargs): _make_writable(dst) if dst.suffix != ".tmpl": return - render_template('/', dst, **kwargs) + render_template("/", dst, **kwargs) def remove_files_except(path, file, new_file): @@ -137,7 +116,6 @@ def load_json(content): try: return json.loads(content) except json.JSONDecodeError as e: - substr = content[: e.colno - 1] return json.loads(substr) @@ -194,8 +172,7 @@ def processing_phrases(phrase): def to_camel_case(phrase): s = processing_phrases(phrase).replace(" ", "_") return "".join( - word.capitalize() if i != 0 else word - for i, word in enumerate(s.split("_")) + word.capitalize() if i != 0 else word for i, word in enumerate(s.split("_")) ) @@ -203,3 +180,98 @@ def to_snake_case(name): words = re.findall("[A-Za-z][a-z0-9]*", name) result = "_".join(words).lower() return result + + +def get_vector_field_name(property_key: str): + name = f"{property_key}_vector" + name = to_snake_case(name) + return "_" + name + + +def split_list_into_n_parts(lst, n): + length = len(lst) + part_size = length // n + seg = [x * part_size for x in range(n)] + seg.append(min(length, part_size * n)) + + remainder = length % n + + result = [] + + # 分割列表 + start = 0 + for i in range(n): + # 计算当前份的元素数量 + if i < remainder: + end = start + part_size + 1 + else: + end = start + part_size + + # 添加当前份到结果列表 + result.append(lst[start:end]) + + # 更新起始位置 + start = end + + return result + + +def generate_hash_id(value): + """ + Generates a hash ID and an abstracted version of the input value. + + If the input value is a dictionary, it sorts the dictionary items and abstracts the dictionary. + If the input value is not a dictionary, it abstracts the value directly. + + Args: + value: The input value to be hashed and abstracted. + + Returns: + Tuple[str, Any]: A tuple containing the hash ID and the abstracted value. + """ + if isinstance(value, dict): + sorted_items = sorted(value.items()) + key = str(sorted_items) + else: + key = value + if isinstance(key, str): + key = key.encode("utf-8") + hasher = hashlib.sha256() + hasher.update(key) + + return hasher.hexdigest() + + +@retry(stop=stop_after_attempt(3)) +def download_from_http(url: str, dest: str = None) -> str: + """Downloads a file from an HTTP URL and saves it to a temporary directory. + + This function uses the requests library to download a file from the specified + HTTP URL and saves it to the system's temporary directory. After the download + is complete, it returns the local path of the downloaded file. + + Args: + url (str): The HTTP URL of the file to be downloaded. + + Returns: + str: The local path of the downloaded file. 
+ + """ + + # Send an HTTP GET request to download the file + response = requests.get(url, stream=True) + response.raise_for_status() # Check if the request was successful + + if dest is None: + # Create a temporary file + temp_dir = tempfile.gettempdir() + temp_file_path = os.path.join(temp_dir, os.path.basename(url)) + dest = temp_file_path + + with open(dest, "wb") as temp_file: + # Write the downloaded content to the temporary file + for chunk in response.iter_content(chunk_size=1024**2): + temp_file.write(chunk) + + # Return the path of the temporary file + return temp_file.name diff --git a/kag/common/vectorize_model/__init__.py b/kag/common/vectorize_model/__init__.py new file mode 100644 index 00000000..1af8cfd3 --- /dev/null +++ b/kag/common/vectorize_model/__init__.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +from kag.common.vectorize_model.local_bge_model import ( + LocalBGEVectorizeModel, + LocalBGEM3VectorizeModel, +) +from kag.common.vectorize_model.openai_model import OpenAIVectorizeModel +from kag.common.vectorize_model.mock_model import MockVectorizeModel +from kag.common.vectorize_model.vectorize_model_config_checker import ( + VectorizeModelConfigChecker, +) + + +__all__ = [ + "LocalBGEM3VectorizeModel", + "LocalBGEVectorizeModel", + "OpenAIVectorizeModel", + "MockVectorizeModel", + "VectorizeModelConfigChecker", +] diff --git a/kag/common/vectorize_model/local_bge_model.py b/kag/common/vectorize_model/local_bge_model.py new file mode 100644 index 00000000..b87d9d32 --- /dev/null +++ b/kag/common/vectorize_model/local_bge_model.py @@ -0,0 +1,201 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import os +import logging +import threading +from typing import Union, Iterable +from kag.interface import VectorizeModelABC, EmbeddingVector + +logger = logging.getLogger() + + +LOCAL_MODEL_MAP = {} + + +@VectorizeModelABC.register("bge") +class LocalBGEVectorizeModel(VectorizeModelABC): + """ + A class that extends the VectorizeModelABC base class. + It invokes local BGE embedding models to convert texts into embedding vectors. + """ + + _LOCK = threading.Lock() + + def __init__( + self, + path: str, + url: str = None, + query_instruction_for_retrieval: str = None, + vector_dimensions: int = None, + ): + """ + Initializes the LocalBGEVectorizeModel instance. + + Args: + path (str): The path to the local BGE model. + url (str, optional): The URL to download the model if not found locally. Defaults to None. + query_instruction_for_retrieval (str, optional): The query instruction for retrieval. Defaults to None. 
+ vector_dimensions (int, optional): The number of dimensions for the embedding vectors. Defaults to None. + """ + super().__init__(vector_dimensions) + self.model_path = os.path.expanduser(path) + self.url = url + config_path = os.path.join(self.model_path, "config.json") + if not os.path.isfile(config_path): + if url is None: + message = f"model not found at {path!r}, nor model url specified" + raise RuntimeError(message) + logger.info("Model file not found in path, start downloading...") + self._download_model(self.model_path, self.url) + default_chinese_query_instruction_for_retrieval = "为这个句子生成表示以用于向量检索:" + default_english_query_instruction_for_retrieval = ( + "Represent this sentence for searching relevant passages:" + ) + if "BAAI/bge-base-zh-v1.5" in path: + default_query_instruction_for_retrieval = ( + default_chinese_query_instruction_for_retrieval + ) + else: + default_query_instruction_for_retrieval = ( + default_english_query_instruction_for_retrieval + ) + + if query_instruction_for_retrieval: + self.query_instruction_for_retrieval = query_instruction_for_retrieval + else: + self.query_instruction_for_retrieval = ( + default_query_instruction_for_retrieval + ) + with LocalBGEVectorizeModel._LOCK: + if self.model_path in LOCAL_MODEL_MAP: + logger.info("Found existing model, reuse.") + model = LOCAL_MODEL_MAP[self.model_path] + else: + model = self._load_model(self.model_path) + LOCAL_MODEL_MAP[self.model_path] = model + self.model = model + + def _load_model(self, path): + """ + Loads the BGE model from the specified path. + + Args: + path (str): The path to the BGE model. + + Returns: + FlagModel: The loaded BGE model. + """ + # We need to import sklearn at first, otherwise sklearn will fail on macOS with m chip. + import sklearn # noqa + from FlagEmbedding import FlagModel + + logger.info( + f"Loading FlagModel from {path!r} with query_instruction_for_retrieval={self.query_instruction_for_retrieval!r}" + ) + model = FlagModel( + path, + query_instruction_for_retrieval=self.query_instruction_for_retrieval, + use_fp16=False, + ) + return model + + def vectorize( + self, texts: Union[str, Iterable[str]] + ) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: + """ + Vectorizes text(s) into embedding vector(s). + + Args: + texts (Union[str, Iterable[str]]): The text or texts to vectorize. + + Returns: + Union[EmbeddingVector, Iterable[EmbeddingVector]]: The embedding vector(s) of the text(s). + """ + + result = self.model.encode(texts) + return result.tolist() + + +@VectorizeModelABC.register("bge_m3") +class LocalBGEM3VectorizeModel(VectorizeModelABC): + """ + A class that extends the VectorizeModelABC base class. + It invokes local BGE-M3 embedding models to convert texts into embedding vectors. + """ + + _LOCK = threading.Lock() + + def __init__( + self, + path: str, + url: str = None, + vector_dimensions: int = None, + ): + """ + Initializes the LocalBGEM3VectorizeModel instance. + + Args: + path (str): The path to the local BGE-M3 model. + url (str, optional): The URL to download the model if not found locally. Defaults to None. + vector_dimensions (int, optional): The number of dimensions for the embedding vectors. Defaults to None. 
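+
+        Example config (illustrative; the local path is an assumption):
+            {"type": "bge_m3", "path": "~/models/bge-m3"}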
+ """ + super().__init__(vector_dimensions) + self.url = url + self.model_path = os.path.expanduser(path) + config_path = os.path.join(self.model_path, "config.json") + if not os.path.isfile(config_path): + if url is None: + message = f"model not found at {path!r}, nor model url specified" + raise RuntimeError(message) + self._download_model(path, url) + with LocalBGEM3VectorizeModel._LOCK: + if self.model_path in LOCAL_MODEL_MAP: + logger.info("Found existing model, reuse.") + model = LOCAL_MODEL_MAP[self.model_path] + else: + model = self._load_model(self.model_path) + LOCAL_MODEL_MAP[self.model_path] = model + self.model = model + + def _load_model(self, path): + """ + Loads the BGE-M3 model from the specified path. + + Args: + path (str): The path to the BGE-M3 model. + + Returns: + BGEM3FlagModel: The loaded BGE-M3 model. + """ + # We need to import sklearn at first, otherwise sklearn will fail on macOS with m chip. + + import sklearn # noqa + from FlagEmbedding import BGEM3FlagModel + + logger.info(f"Loading BGEM3FlagModel from {path!r}") + model = BGEM3FlagModel(path, use_fp16=False) + return model + + def vectorize( + self, texts: Union[str, Iterable[str]] + ) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: + """ + Vectorizes text(s) into embedding vector(s). + + Args: + texts (Union[str, Iterable[str]]): The text or texts to vectorize. + + Returns: + Union[EmbeddingVector, Iterable[EmbeddingVector]]: The embedding vector(s) of the text(s). + """ + result = self.model.encode(texts)["dense_vecs"] + return result.tolist() diff --git a/kag/common/vectorize_model/mock_model.py b/kag/common/vectorize_model/mock_model.py new file mode 100644 index 00000000..a930b576 --- /dev/null +++ b/kag/common/vectorize_model/mock_model.py @@ -0,0 +1,51 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import numpy as np +from typing import Union, Iterable +from kag.interface import VectorizeModelABC, EmbeddingVector + + +@VectorizeModelABC.register("mock") +class MockVectorizeModel(VectorizeModelABC): + """ + A mock implementation of the VectorizeModelABC class, used for testing purposes. + + This class provides a method to generate random embedding vectors for given texts. + """ + + def __init__( + self, + vector_dimensions: int = None, + ): + """ + Initializes the MockVectorizeModel instance. + + Args: + vector_dimensions (int, optional): The number of dimensions for the embedding vectors. Defaults to None. + """ + super().__init__(vector_dimensions) + + def vectorize( + self, texts: Union[str, Iterable[str]] + ) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: + """ + Generates random embedding vectors for the given texts. + + Args: + texts (Union[str, Iterable[str]]): The text or texts to vectorize. + + Returns: + Union[EmbeddingVector, Iterable[EmbeddingVector]]: The embedding vector(s) of the text(s). 
+ """ + if isinstance(texts, str): + return np.random.rand(self._vector_dimensions).tolist() + else: + return np.random.rand(len(texts), self._vector_dimensions).tolist() diff --git a/kag/common/vectorize_model/openai_model.py b/kag/common/vectorize_model/openai_model.py new file mode 100644 index 00000000..ab26860c --- /dev/null +++ b/kag/common/vectorize_model/openai_model.py @@ -0,0 +1,62 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +from typing import Union, Iterable +from openai import OpenAI +from kag.interface import VectorizeModelABC, EmbeddingVector + + +@VectorizeModelABC.register("openai") +class OpenAIVectorizeModel(VectorizeModelABC): + """ + A class that extends the VectorizeModelABC base class. + It invokes OpenAI or OpenAI-compatible embedding services to convert texts into embedding vectors. + """ + + def __init__( + self, + model: str = "text-embedding-3-small", + api_key: str = "", + base_url: str = "", + vector_dimensions: int = None, + ): + """ + Initializes the OpenAIVectorizeModel instance. + + Args: + model (str, optional): The model to use for embedding. Defaults to "text-embedding-3-small". + api_key (str, optional): The API key for accessing the OpenAI service. Defaults to "". + base_url (str, optional): The base URL for the OpenAI service. Defaults to "". + vector_dimensions (int, optional): The number of dimensions for the embedding vectors. Defaults to None. + """ + super().__init__(vector_dimensions) + self.client = OpenAI(api_key=api_key, base_url=base_url) + + def vectorize( + self, texts: Union[str, Iterable[str]] + ) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: + """ + Vectorizes a text string into an embedding vector or multiple text strings into multiple embedding vectors. + + Args: + texts (Union[str, Iterable[str]]): The text or texts to vectorize. + + Returns: + Union[EmbeddingVector, Iterable[EmbeddingVector]]: The embedding vector(s) of the text(s). + """ + results = self.client.embeddings.create(input=texts, model=self.model) + results = [item.embedding for item in results.data] + if isinstance(texts, str): + assert len(results) == 1 + return results[0] + else: + assert len(results) == len(texts) + return results diff --git a/kag/common/vectorize_model/vectorize_model_config_checker.py b/kag/common/vectorize_model/vectorize_model_config_checker.py new file mode 100644 index 00000000..2932d64c --- /dev/null +++ b/kag/common/vectorize_model/vectorize_model_config_checker.py @@ -0,0 +1,47 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import json + + +class VectorizeModelConfigChecker: + """ + A class that checks whether the vectorizer configuration is valid. 
+ + This class provides a method to validate the vectorizer configuration and return the embedding vector dimensions if valid. + """ + + def check(self, vectorizer_config: str) -> int: + """ + Checks the vectorizer configuration. + + If the configuration is valid, it returns the actual embedding vector dimensions. + If the configuration is invalid, it raises a RuntimeError exception. + + Args: + vectorizer_config (str): The vectorizer configuration to be checked. + + Returns: + int: The embedding vector dimensions. + + Raises: + RuntimeError: If the configuration is invalid. + """ + try: + config = json.loads(vectorizer_config) + from kag.interface import VectorizeModelABC + + vectorizer = VectorizeModelABC.from_config(config) + res = vectorizer.vectorize("hello") + return len(res) + except Exception as ex: + message = "invalid vectorizer config: %s" % str(ex) + raise RuntimeError(message) from ex diff --git a/kag/common/vectorizer/__init__.py b/kag/common/vectorizer/__init__.py deleted file mode 100644 index b95190e0..00000000 --- a/kag/common/vectorizer/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -from kag.common.vectorizer.local_bge_m3_vectorizer import LocalBGEM3Vectorizer -from kag.common.vectorizer.local_bge_vectorizer import LocalBGEVectorizer -from kag.common.vectorizer.openai_vectorizer import OpenAIVectorizer -from kag.common.vectorizer.vectorizer import Vectorizer -from kag.common.vectorizer.vectorizer_config_checker import VectorizerConfigChecker - - -__all__ = [ - "LocalBGEM3Vectorizer", - "LocalBGEVectorizer", - "OpenAIVectorizer", - "Vectorizer", - "VectorizerConfigChecker", -] diff --git a/kag/common/vectorizer/local_bge_m3_vectorizer.py b/kag/common/vectorizer/local_bge_m3_vectorizer.py deleted file mode 100644 index 75b0179b..00000000 --- a/kag/common/vectorizer/local_bge_m3_vectorizer.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import io -import os -import threading -import tarfile -import requests -from typing import Any, Union, Iterable, Dict -from kag.common.vectorizer.vectorizer import Vectorizer - - -EmbeddingVector = Iterable[float] - - -class LocalBGEM3Vectorizer(Vectorizer): - """ - Invoke local bge-m3 embedding models to turn texts into embedding vectors. 
- """ - - _local_model_map = {} - _lock = threading.Lock() - - def __init__(self, config: Dict[str, Any]): - super().__init__(config) - path = config.get("path") - if path is None: - message = "model path is required" - raise RuntimeError(message) - url = config.get("url") - path = os.path.expanduser(path) - config_path = os.path.join(path, "config.json") - if not os.path.isfile(config_path): - if url is None: - message = f"model not found at {path!r}, nor model url specified" - raise RuntimeError(message) - self._download_model(path, url) - self._path = path - self._url = url - with self._lock: - if path in self._local_model_map: - self._model = self._local_model_map[path] - else: - self._model = self._load_model(path) - self._local_model_map[path] = self._model - - @classmethod - def _from_config(cls, config: Dict[str, Any]) -> Vectorizer: - """ - Create vectorizer from `config`. - - :param config: vectorizer config - :type config: Dict[str, Any] - :return: vectorizer instance - :rtype: Vectorizer - """ - vectorizer = cls(config) - return vectorizer - - def _download_model(self, path, url): - res = requests.get(url) - with io.BytesIO(res.content) as fileobj: - with tarfile.open(fileobj=fileobj) as tar: - tar.extractall(path=path) - config_path = os.path.join(path, "config.json") - if not os.path.isfile(config_path): - message = f"model config not found at {config_path!r}, url {url!r} specified an invalid model" - raise RuntimeError(message) - - def _load_model(self, path): - # We need to import sklearn at first, otherwise sklearn will fail on macOS with m chip. - import sklearn - from FlagEmbedding import BGEM3FlagModel - - print(f"Loading BGEM3FlagModel from {path!r}") - model = BGEM3FlagModel(path, use_fp16=True) - return model - - def vectorize(self, texts: Union[str, Iterable[str]]) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: - """ - Vectorize a text string into an embedding vector or multiple text strings into - multiple embedding vectors. - - :param texts: texts to vectorize - :type texts: str or Iterable[str] - :return: embedding vectors of the texts - :rtype: EmbeddingVector or Iterable[EmbeddingVector] - """ - result = self._model.encode(texts)["dense_vecs"] - return result.tolist() diff --git a/kag/common/vectorizer/local_bge_vectorizer.py b/kag/common/vectorizer/local_bge_vectorizer.py deleted file mode 100644 index 0869df15..00000000 --- a/kag/common/vectorizer/local_bge_vectorizer.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import io -import os -import threading -import tarfile -import requests -from typing import Any, Union, Iterable, Dict -from kag.common.vectorizer.vectorizer import Vectorizer - - -EmbeddingVector = Iterable[float] - - -class LocalBGEVectorizer(Vectorizer): - """ - Invoke local bge embedding models to turn texts into embedding vectors. 
- """ - - _local_model_map = {} - _lock = threading.Lock() - - def __init__(self, config: Dict[str, Any]): - super().__init__(config) - path = config.get("path") - if path is None: - message = "model path is required" - raise RuntimeError(message) - url = config.get("url") - path = os.path.expanduser(path) - config_path = os.path.join(path, "config.json") - if not os.path.isfile(config_path): - if url is None: - message = f"model not found at {path!r}, nor model url specified" - raise RuntimeError(message) - self._download_model(path, url) - default_chinese_query_instruction_for_retrieval = "为这个句子生成表示以用于向量检索:" - default_english_query_instruction_for_retrieval = "Represent this sentence for searching relevant passages:" - if "BAAI/bge-base-zh-v1.5" in path: - default_query_instruction_for_retrieval = default_chinese_query_instruction_for_retrieval - else: - default_query_instruction_for_retrieval = default_english_query_instruction_for_retrieval - query_instruction_for_retrieval = config.get("query_instruction_for_retrieval", default_query_instruction_for_retrieval) - self._path = path - self._url = url - self._query_instruction_for_retrieval = query_instruction_for_retrieval - with self._lock: - if path in self._local_model_map: - self._model = self._local_model_map[path] - else: - self._model = self._load_model(path) - self._local_model_map[path] = self._model - - @classmethod - def _from_config(cls, config: Dict[str, Any]) -> Vectorizer: - """ - Create vectorizer from `config`. - - :param config: vectorizer config - :type config: Dict[str, Any] - :return: vectorizer instance - :rtype: Vectorizer - """ - vectorizer = cls(config) - return vectorizer - - def _download_model(self, path, url): - res = requests.get(url) - with io.BytesIO(res.content) as fileobj: - with tarfile.open(fileobj=fileobj) as tar: - tar.extractall(path=path) - config_path = os.path.join(path, "config.json") - if not os.path.isfile(config_path): - message = f"model config not found at {config_path!r}, url {url!r} specified an invalid model" - raise RuntimeError(message) - - def _load_model(self, path): - # We need to import sklearn at first, otherwise sklearn will fail on macOS with m chip. - import sklearn - from FlagEmbedding import FlagModel - - print(f"Loading FlagModel from {path!r} with query_instruction_for_retrieval={self._query_instruction_for_retrieval!r}") - model = FlagModel(path, - query_instruction_for_retrieval=self._query_instruction_for_retrieval, - use_fp16=True) - return model - - def vectorize(self, texts: Union[str, Iterable[str]]) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: - """ - Vectorize a text string into an embedding vector or multiple text strings into - multiple embedding vectors. - - :param texts: texts to vectorize - :type texts: str or Iterable[str] - :return: embedding vectors of the texts - :rtype: EmbeddingVector or Iterable[EmbeddingVector] - """ - result = self._model.encode(texts) - return result.tolist() diff --git a/kag/common/vectorizer/openai_vectorizer.py b/kag/common/vectorizer/openai_vectorizer.py deleted file mode 100644 index 13894b7a..00000000 --- a/kag/common/vectorizer/openai_vectorizer.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -from typing import Any, Union, Iterable, Dict -from openai import OpenAI -from kag.common.vectorizer.vectorizer import Vectorizer - - -EmbeddingVector = Iterable[float] - - -class OpenAIVectorizer(Vectorizer): - """ - Invoke OpenAI or OpenAI-compatible embedding services to turn texts into embedding vectors. - """ - - def __init__(self, config: Dict[str, Any]): - super().__init__(config) - self.model = config.get("model","text-embedding-3-small") - self.api_key = config.get("api_key") - self.base_url = config.get("base_url") - if not self.api_key: - raise ValueError("OpenAI API key is not set") - self.client = OpenAI(api_key=self.api_key, base_url=self.base_url) - - @classmethod - def _from_config(cls, config: Dict[str, Any]) -> Vectorizer: - """ - Create vectorizer from `config`. - - :param config: vectorizer config - :type config: Dict[str, Any] - :return: vectorizer instance - :rtype: Vectorizer - """ - vectorizer = cls(config) - return vectorizer - - def vectorize(self, texts: Union[str, Iterable[str]]) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: - """ - Vectorize a text string into an embedding vector or multiple text strings into - multiple embedding vectors. - - :param texts: texts to vectorize - :type texts: str or Iterable[str] - :return: embedding vectors of the texts - :rtype: EmbeddingVector or Iterable[EmbeddingVector] - """ - results = self.client.embeddings.create(input=texts, model=self.model) - results = [item.embedding for item in results.data] - if isinstance(texts, str): - assert len(results) == 1 - return results[0] - else: - assert len(results) == len(texts) - return results diff --git a/kag/common/vectorizer/vectorizer.py b/kag/common/vectorizer/vectorizer.py deleted file mode 100644 index 3a32123d..00000000 --- a/kag/common/vectorizer/vectorizer.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import io -import json -from pathlib import Path -from abc import ABC, abstractmethod -from typing import Any, Union, Iterable, Optional, Dict - -EmbeddingVector = Iterable[float] - - -class Vectorizer(ABC): - """ - Vectorizer turns texts into embedding vectors. - """ - - def __init__(self, config: Dict[str, Any]): - self._config = config - self._vector_dimensions = None - - @classmethod - def from_config(cls, config: Union[str, Path, Dict[str, Any]]) -> "Vectorizer": - """ - Create vectorizer from `config`. - - If `config` is a string or path, it will be loaded as a dictionary depending - on its file extension. 
Currently, the following formats are supported: - - * .json: JSON - * .json5: JSON with comments support - * .yaml: YAML - - :param config: vectorizer config - :type config: str, Path or Dict[str, Any] - :return: vectorizer instance - :rtype: Vectorizer - """ - from kag.common.utils import dynamic_import_class - - if isinstance(config, (str, Path)): - config_path = config - if not isinstance(config_path, Path): - config_path = Path(config_path) - if config_path.name.endswith(".yaml"): - import yaml - - with io.open(config_path, "r", encoding="utf-8") as fin: - config = yaml.safe_load(fin) - elif config_path.name.endswith(".json5"): - import json5 - - with io.open(config_path, "r", encoding="utf-8") as fin: - config = json5.load(fin) - elif config_path.name.endswith(".json"): - with io.open(config_path, "r", encoding="utf-8") as fin: - config = json.load(fin) - else: - message = "only .json, .json5 and .yaml are supported currently; " - message += "can not load vectorizer config from %r" % str(config_path) - raise RuntimeError(message) - elif isinstance(config, dict): - pass - else: - message = "only str, Path and dict are supported; " - message += "invalid vectorizer config: %r" % (config,) - raise RuntimeError(message) - - class_name = config.get("vectorizer") - if class_name is None: - message = "vectorizer class name is not specified" - raise RuntimeError(message) - vectorizer_class = dynamic_import_class(class_name, "vectorizer") - if not issubclass(vectorizer_class, Vectorizer): - message = "class %r is not a vectorizer class" % (class_name,) - raise RuntimeError(message) - vectorizer = vectorizer_class._from_config(config) - return vectorizer - - @classmethod - @abstractmethod - def _from_config(cls, config: Dict[str, Any]) -> "Vectorizer": - """ - Create vectorizer from `config`. This method is supposed to be implemented - by derived classes. - - :param config: vectorizer config - :type config: Dict[str, Any] - :return: vectorizer instance - :rtype: Vectorizer - """ - message = "abstract method _from_config is not implemented" - raise NotImplementedError(message) - - def _get_vector_dimensions(self, config: Dict[str, Any]) -> Optional[int]: - """ - Get embedding vector dimensions from `config`. - - * If vector dimensions is not specified in `config`, return None. - - * If vector dimensions is specified in `config` but not a positive integer, - raise an exception. - - :param config: vectorizer config - :type config: Dict[str, Any] - :return: embedding vector dimensions or None - :rtype: Optional[int] - """ - value = config.get("vector_dimensions") - if value is None: - return None - if isinstance(value, str): - try: - value = int(value) - except ValueError as ex: - message = "vector_dimensions must be integer; " - message += "%r is invalid" % (value,) - raise RuntimeError(message) from ex - if not isinstance(value, int) or value <= 0: - message = "vector_dimensions must be positive-integer; " - message += "%r is invalid" % (value,) - raise RuntimeError(message) - return value - - @property - def vector_dimensions(self): - """ - Dimension of generated embedding vectors. - """ - if self._vector_dimensions is not None: - return self._vector_dimensions - try: - example_input = "This is a test." 
-            example_vector = self.vectorize(example_input)
-        except Exception as ex:
-            message = "the embedding service is not available"
-            raise RuntimeError(message) from ex
-        value = self._get_vector_dimensions(self._config)
-        if value is not None and value != len(example_vector):
-            message = "invalid 'vector_dimensions', specified %d; " % value
-            message += "but the actual generated embedding vector is of %d dimensions" % len(example_vector)
-            raise RuntimeError(message)
-        self._vector_dimensions = len(example_vector)
-        return self._vector_dimensions
-
-    @abstractmethod
-    def vectorize(self, texts: Union[str, Iterable[str]]) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]:
-        """
-        Vectorize a text string into an embedding vector or multiple text strings into
-        multiple embedding vectors.
-
-        :param texts: texts to vectorize
-        :type texts: str or Iterable[str]
-        :return: embedding vectors of the texts
-        :rtype: EmbeddingVector or Iterable[EmbeddingVector]
-        """
-        message = "abstract method vectorize is not implemented"
-        raise NotImplementedError(message)
diff --git a/kag/common/vectorizer/vectorizer_config_checker.py b/kag/common/vectorizer/vectorizer_config_checker.py
deleted file mode 100644
index 2177c25d..00000000
--- a/kag/common/vectorizer/vectorizer_config_checker.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright 2023 OpenSPG Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied.
-
-import json
-from kag.common.vectorizer.vectorizer import Vectorizer
-
-
-class VectorizerConfigChecker(object):
-    """
-    Check whether the vectorizer config is valid.
-    """
-
-    def check(self, vectorizer_config: str) -> int:
-        """
-        Check the vectorizer config.
-
-        * If the config is valid, return the actual embedding vector dimensions.
-
-        * If the config is invalid, raise a RuntimeError exception.
-
-        :param vectorizer_config: vectorizer config
-        :type vectorizer_config: str
-        :return: embedding vector dimensions
-        :rtype: int
-        :raises RuntimeError: if the config is invalid
-        """
-        try:
-            config = json.loads(vectorizer_config)
-            vectorizer = Vectorizer.from_config(config)
-            vector_dimensions = vectorizer.vector_dimensions
-            return vector_dimensions
-        except Exception as ex:
-            message = "invalid vectorizer config: %s" % str(ex)
-            raise RuntimeError(message) from ex
diff --git a/kag/examples/2wiki/.gitignore b/kag/examples/2wiki/.gitignore
new file mode 100644
index 00000000..5c920f23
--- /dev/null
+++ b/kag/examples/2wiki/.gitignore
@@ -0,0 +1,3 @@
+ckpt/
+/solver/2wiki_res_*.json
+/solver/2wiki_metrics_*.json
diff --git a/kag/examples/2wiki/README.md b/kag/examples/2wiki/README.md
new file mode 100644
index 00000000..5f9c214a
--- /dev/null
+++ b/kag/examples/2wiki/README.md
@@ -0,0 +1,69 @@
+# KAG Example: TwoWiki
+
+[2WikiMultiHopQA](https://arxiv.org/abs/2011.01060) is a multi-hop QA dataset
+for comprehensive evaluation of reasoning steps. It's used by [KAG](https://arxiv.org/abs/2409.13731)
+and [HippoRAG](https://arxiv.org/abs/2405.14831) for multi-hop question answering
+performance evaluation.
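+
+Each record in the corpus file pairs a passage title with its text. As a
+minimal sketch, you can inspect the bundled sample corpus like this (the
+``title``/``text`` field names are illustrative; check the sample file for
+the exact schema):
+
+```python
+import json
+
+with open("builder/data/2wiki_sub_corpus.json") as f:
+    corpus = json.load(f)  # a list of {"title": ..., "text": ...} records
+
+print(corpus[0]["title"])
+```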
+
+Here we demonstrate how to build a knowledge graph for the 2WikiMultiHopQA dataset,
+generate answers to the evaluation questions with KAG and calculate EM and F1
+metrics of the KAG generated answers compared to the ground-truth answers.
+
+## Steps to reproduce
+
+1. Follow the Quick Start guide of KAG to install the OpenSPG server and KAG.
+
+   The following steps assume the Python virtual environment with KAG installed
+   is activated and the current directory is [2wiki](.).
+
+2. (Optional) Update [indexer.py](./builder/indexer.py) and [evaFor2wiki.py](./solver/evaFor2wiki.py)
+   to use the larger dataset. You may want to skip this step the first time and
+   use the small dataset to get started quickly.
+
+3. Update the ``openie_llm``, ``chat_llm`` and ``vectorize_model`` configurations
+   in [kag_config.yaml](./kag_config.yaml) properly.
+
+4. Restore the KAG project.
+
+   ```bash
+   knext project restore --host_addr http://127.0.0.1:8887 --proj_path .
+   ```
+
+5. Commit the schema.
+
+   ```bash
+   knext schema commit
+   ```
+
+6. Execute [indexer.py](./builder/indexer.py) in the [builder](./builder) directory to build the knowledge graph.
+
+   ```bash
+   cd builder && python indexer.py && cd ..
+   ```
+
+7. Execute [evaFor2wiki.py](./solver/evaFor2wiki.py) in the [solver](./solver) directory
+   to generate the answers and calculate the EM and F1 metrics.
+
+   ```bash
+   cd solver && python evaFor2wiki.py && cd ..
+   ```
+
+   The generated answers are saved to ``./solver/2wiki_res_*.json``.
+
+   The calculated EM and F1 metrics are saved to ``./solver/2wiki_metrics_*.json``.
+
+8. (Optional) To delete checkpoints, execute the following commands.
+
+   ```bash
+   rm -rf ./builder/ckpt
+   rm -rf ./solver/ckpt
+   ```
+
+   To delete the KAG project and its knowledge graph, execute a command similar
+   to the following, replacing the OpenSPG server address and KAG project id
+   with the actual values.
+
+   ```bash
+   curl http://127.0.0.1:8887/project/api/delete?projectId=1
+   ```
+
+9. (Optional) Restart from Step 2 and try the larger dataset.
diff --git a/kag/examples/2wiki/builder/__init__.py b/kag/examples/2wiki/builder/__init__.py
index 94be39bc..7a018e7c 100644
--- a/kag/examples/2wiki/builder/__init__.py
+++ b/kag/examples/2wiki/builder/__init__.py
@@ -11,4 +11,4 @@
 
 """
 Builder Dir.
-"""
\ No newline at end of file
+"""
diff --git a/kag/examples/2wiki/builder/data/__init__.py b/kag/examples/2wiki/builder/data/__init__.py
index 6a8637b9..59bacd4d 100644
--- a/kag/examples/2wiki/builder/data/__init__.py
+++ b/kag/examples/2wiki/builder/data/__init__.py
@@ -11,4 +11,4 @@
 
 """
 Place the files to be used for building the index in this directory.
-"""
\ No newline at end of file
+"""
diff --git a/kag/examples/2wiki/builder/indexer.py b/kag/examples/2wiki/builder/indexer.py
index 8f687ec1..67332d01 100644
--- a/kag/examples/2wiki/builder/indexer.py
+++ b/kag/examples/2wiki/builder/indexer.py
@@ -8,74 +8,27 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 # or implied.
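+
+# NOTE: pipeline assembly is now driven by the ``kag_builder_pipeline``
+# section of kag_config.yaml rather than hand-wired components; see
+# buildKB() below.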
-import json
-import logging
 import os
-from typing import Type, List
+import logging
+from kag.common.registry import import_modules_from_path
 
-from kag.builder.component import KGWriter
-from kag.builder.component.extractor import KAGExtractor
-from kag.builder.component.splitter import LengthSplitter
-from kag.builder.component.vectorizer.batch_vectorizer import BatchVectorizer
-from kag.builder.model.chunk import Chunk
-from kag.examples.utils import generate_hash_id
-from kag.interface.builder.reader_abc import SourceReaderABC
-from knext.common.base.runnable import Input, Output
-from knext.builder.builder_chain_abc import BuilderChainABC
+from kag.builder.runner import BuilderChainRunner
 
 logger = logging.getLogger(__name__)
 
 
-class TwowikiCorpusReader(SourceReaderABC):
-    @property
-    def input_types(self) -> Type[Input]:
-        """The type of input this Runnable object accepts specified as a type annotation."""
-        return str
-
-    @property
-    def output_types(self) -> Type[Output]:
-        """The type of output this Runnable object produces specified as a type annotation."""
-        return Chunk
-
-    def invoke(self, input: str, **kwargs) -> List[Output]:
-        if os.path.exists(str(input)):
-            with open(input, "r") as f:
-                corpus = json.load(f)
-        else:
-            corpus = json.loads(input)
-        chunks = []
-
-        for idx, item in enumerate(corpus):
-            chunk = Chunk(
-                id=generate_hash_id(item['text']),
-                name=item['title'],
-                content=item['text'],
-            )
-            chunks.append(chunk)
-        return chunks
-
-
-class TwowikiBuilderChain(BuilderChainABC):
-    def build(self, **kwargs):
-        source = TwowikiCorpusReader()
-        splitter = LengthSplitter(split_length=2000)
-        extractor = KAGExtractor()
-        vectorizer = BatchVectorizer()
-        sink = KGWriter()
-
-        return source >> splitter >> extractor >> vectorizer >> sink
+def buildKB(file_path):
+    from kag.common.conf import KAG_CONFIG
 
+    runner = BuilderChainRunner.from_config(
+        KAG_CONFIG.all_config["kag_builder_pipeline"]
+    )
+    runner.invoke(file_path)
 
-def buildKB(corpusFilePath):
-    TwowikiBuilderChain().invoke(file_path=corpusFilePath, max_workers=20)
+    logger.info(f"\n\nbuildKB successfully for {file_path}\n\n")
 
-    logger.info(f"\n\nbuildKB successfully for {corpusFilePath}\n\n")
 
+if __name__ == "__main__":
+    import_modules_from_path(".")
+    dir_path = os.path.dirname(__file__)
+    file_path = os.path.join(dir_path, "data/2wiki_sub_corpus.json")
 
-if __name__ == '__main__':
-    filePath = "./data/2wiki_sub_corpus.json"
-    # filePath = "./data/2wiki_corpus.json"
-    corpusFilePath = os.path.join(
-        os.path.abspath(os.path.dirname(__file__)), filePath
-    )
-    buildKB(corpusFilePath)
+    buildKB(file_path)
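+
+# To index the full corpus instead of the bundled sample, point buildKB at the
+# larger file (sketch; the filename mirrors the path commented out in the
+# previous version of this script):
+#     buildKB(os.path.join(dir_path, "data/2wiki_corpus.json"))
diff --git a/kag/examples/2wiki/builder/prompt/ner.py b/kag/examples/2wiki/builder/prompt/ner.py
deleted file mode 100644
index cf5aa897..00000000
--- a/kag/examples/2wiki/builder/prompt/ner.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2023 OpenSPG Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied.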
- -import json -from string import Template -from typing import List, Optional - -from kag.common.base.prompt_op import PromptOp -from knext.schema.client import SchemaClient - - -class OpenIENERPrompt(PromptOp): - - template_en = """ - { - "instruction": "You're a very effective entity extraction system. Please extract all the entities that are important for knowledge build and question, along with type, category and a brief description of the entity. The description of the entity is based on your OWN KNOWLEDGE AND UNDERSTANDING and does not need to be limited to the context. the entity's category belongs taxonomically to one of the items defined by schema, please also output the category. Note: Type refers to a specific, well-defined classification, such as Professor, Actor, while category is a broader group or class that may contain more than one type, such as Person, Works. Return an empty list if the entity type does not exist. Please respond in the format of a JSON string.You can refer to the example for extraction.", - "schema": $schema, - "example": [ - { - "input": "The Rezort\nThe Rezort is a 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger.\n It stars Dougray Scott, Jessica De Gouw and Martin McCann.\n After humanity wins a devastating war against zombies, the few remaining undead are kept on a secure island, where they are hunted for sport.\n When something goes wrong with the island's security, the guests must face the possibility of a new outbreak.", - "output": [ - { - "entity": "The Rezort", - "type": "Movie", - "category": "Works", - "description": "A 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger." - }, - { - "entity": "2015", - "type": "Year", - "category": "Date", - "description": "The year the movie 'The Rezort' was released." - }, - { - "entity": "British", - "type": "Nationality", - "category": "GeographicLocation", - "description": "Great Britain, the island that includes England, Scotland, and Wales." - }, - { - "entity": "Steve Barker", - "type": "Director", - "category": "Person", - "description": "Steve Barker is an English film director and screenwriter." - }, - { - "entity": "Paul Gerstenberger", - "type": "Writer", - "category": "Person", - "description": "Paul is a writer and producer, known for The Rezort (2015), Primeval (2007) and House of Anubis (2011)." - }, - { - "entity": "Dougray Scott", - "type": "Actor", - "category": "Person", - "description": "Stephen Dougray Scott (born 26 November 1965) is a Scottish actor." - }, - { - "entity": "Jessica De Gouw", - "type": "Actor", - "category": "Person", - "description": "Jessica Elise De Gouw (born 15 February 1988) is an Australian actress. " - }, - { - "entity": "Martin McCann", - "type": "Actor", - "category": "Person", - "description": "Martin McCann is an actor from Northern Ireland. 
In 2020, he was listed as number 48 on The Irish Times list of Ireland's greatest film actors" - } - ] - } - ], - "input": "$input" -} - """ - - template_zh = template_en - - def __init__( - self, language: Optional[str] = "en", **kwargs - ): - super().__init__(language, **kwargs) - self.schema = SchemaClient(project_id=self.project_id).extract_types() - self.template = Template(self.template).safe_substitute(schema=self.schema) - - @property - def template_variables(self) -> List[str]: - return ["input"] - - def parse_response(self, response: str, **kwargs): - rsp = response - if isinstance(rsp, str): - rsp = json.loads(rsp) - if isinstance(rsp, dict) and "output" in rsp: - rsp = rsp["output"] - if isinstance(rsp, dict) and "named_entities" in rsp: - entities = rsp["named_entities"] - else: - entities = rsp - - return entities diff --git a/kag/examples/2wiki/builder/prompt/std.py b/kag/examples/2wiki/builder/prompt/std.py deleted file mode 100644 index 1dfcfaaa..00000000 --- a/kag/examples/2wiki/builder/prompt/std.py +++ /dev/null @@ -1,114 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import json -from typing import Optional, List - -from kag.common.base.prompt_op import PromptOp - - -class OpenIEEntitystandardizationdPrompt(PromptOp): - template_en = """ -{ - "instruction": "The `input` field contains a user provided context. The `named_entities` field contains extracted named entities from the context, which may be unclear abbreviations, aliases, or slang. To eliminate ambiguity, please attempt to provide the official names of these entities based on the context and your own knowledge. Note that entities with the same meaning can only have ONE official name. Please respond in the format of a single JSONArray string without any explanation, as shown in the `output` field of the provided example.", - "example": { - "input": "American History\nWhen did the political party that favored harsh punishment of southern states after the Civil War, gain control of the House? 
Republicans regained control of the chamber they had lost in the 2006 midterm elections.", - "named_entities": [ - {"entity": "American", "category": "GeographicLocation"}, - {"entity": "political party", "category": "Organization"}, - {"entity": "southern states", "category": "GeographicLocation"}, - {"entity": "Civil War", "category": "Keyword"}, - {"entity": "House", "category": "Organization"}, - {"entity": "Republicans", "category": "Organization"}, - {"entity": "chamber", "category": "Organization"}, - {"entity": "2006 midterm elections", "category": "Date"} - ], - "output": [ - { - "entity": "American", - "category": "GeographicLocation", - "official_name": "United States of America" - }, - { - "entity": "political party", - "category": "Organization", - "official_name": "Radical Republicans" - }, - { - "entity": "southern states", - "category": "GeographicLocation", - "official_name": "Confederacy" - }, - { - "entity": "Civil War", - "category": "Keyword", - "official_name": "American Civil War" - }, - { - "entity": "House", - "category": "Organization", - "official_name": "United States House of Representatives" - }, - { - "entity": "Republicans", - "category": "Organization", - "official_name": "Republican Party" - }, - { - "entity": "chamber", - "category": "Organization", - "official_name": "United States House of Representatives" - }, - { - "entity": "midterm elections", - "category": "Date", - "official_name": "United States midterm elections" - } - ] - }, - "input": "$input", - "named_entities": $named_entities -} - """ - - template_zh = """""" - - def __init__(self, language: Optional[str] = "en"): - super().__init__(language) - - @property - def template_variables(self) -> List[str]: - return ["input", "named_entities"] - - def parse_response(self, response: str, **kwargs): - - rsp = response - if isinstance(rsp, str): - rsp = json.loads(rsp) - if isinstance(rsp, dict) and "output" in rsp: - rsp = rsp["output"] - if isinstance(rsp, dict) and "named_entities" in rsp: - standardized_entity = rsp["named_entities"] - else: - standardized_entity = rsp - entities_with_offical_name = set() - merged = [] - entities = kwargs.get("named_entities", []) - for entity in standardized_entity: - merged.append(entity) - entities_with_offical_name.add(entity["entity"]) - # in case llm ignores some entities - for entity in entities: - if entity["entity"] not in entities_with_offical_name: - entity["official_name"] = entity["entity"] - merged.append(entity) - return merged diff --git a/kag/examples/2wiki/builder/prompt/triple.py b/kag/examples/2wiki/builder/prompt/triple.py deleted file mode 100644 index 9e375e2c..00000000 --- a/kag/examples/2wiki/builder/prompt/triple.py +++ /dev/null @@ -1,177 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import json -from typing import Optional, List - -from kag.common.base.prompt_op import PromptOp - - -class OpenIETriplePrompt(PromptOp): - template_en = """ -{ - "instruction": "You are an expert specializing in carrying out open information extraction (OpenIE). 
Please extract any possible relations (including subject, predicate, object) from the given text, and list them following the json format {\"triples\": [[\"subject\", \"predicate\", \"object\"]]}\n. If there are none, do not list them.\n.\n\nPay attention to the following requirements:\n- Each triple should contain at least one, but preferably two, of the named entities in the entity_list.\n- Clearly resolve pronouns to their specific names to maintain clarity.", - "entity_list": $entity_list, - "input": "$input", - "example": { - "input": "The Rezort\nThe Rezort is a 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger.\n It stars Dougray Scott, Jessica De Gouw and Martin McCann.\n After humanity wins a devastating war against zombies, the few remaining undead are kept on a secure island, where they are hunted for sport.\n When something goes wrong with the island's security, the guests must face the possibility of a new outbreak.", - "entity_list": [ - { - "entity": "The Rezort", - "category": "Works" - }, - { - "entity": "2015", - "category": "Others" - }, - { - "entity": "British", - "category": "GeographicLocation" - }, - { - "entity": "Steve Barker", - "category": "Person" - }, - { - "entity": "Paul Gerstenberger", - "category": "Person" - }, - { - "entity": "Dougray Scott", - "category": "Person" - }, - { - "entity": "Jessica De Gouw", - "category": "Person" - }, - { - "entity": "Martin McCann", - "category": "Person" - }, - { - "entity": "zombies", - "category": "Creature" - }, - { - "entity": "zombie horror film", - "category": "Concept" - }, - { - "entity": "humanity", - "category": "Concept" - }, - { - "entity": "secure island", - "category": "GeographicLocation" - } - ], - "output": [ - [ - "The Rezort", - "is", - "zombie horror film" - ], - [ - "The Rezort", - "publish at", - "2015" - ], - [ - "The Rezort", - "released", - "British" - ], - [ - "The Rezort", - "is directed by", - "Steve Barker" - ], - [ - "The Rezort", - "is written by", - "Paul Gerstenberger" - ], - [ - "The Rezort", - "stars", - "Dougray Scott" - ], - [ - "The Rezort", - "stars", - "Jessica De Gouw" - ], - [ - "The Rezort", - "stars", - "Martin McCann" - ], - [ - "humanity", - "wins", - "a devastating war against zombies" - ], - [ - "the few remaining undead", - "are kept on", - "a secure island" - ], - [ - "they", - "are hunted for", - "sport" - ], - [ - "something", - "goes wrong with", - "the island's security" - ], - [ - "the guests", - "must face", - "the possibility of a new outbreak" - ] - ] - } -} - """ - - def __init__(self, language: Optional[str] = "en"): - super().__init__(language) - - @property - def template_variables(self) -> List[str]: - return ["entity_list", "input"] - - def parse_response(self, response: str, **kwargs): - rsp = response - if isinstance(rsp, str): - rsp = json.loads(rsp) - if isinstance(rsp, dict) and "output" in rsp: - rsp = rsp["output"] - if isinstance(rsp, dict) and "triples" in rsp: - triples = rsp["triples"] - else: - triples = rsp - - standardized_triples = [] - for triple in triples: - if isinstance(triple, list): - standardized_triples.append(triple) - elif isinstance(triple, dict): - s = triple.get("subject") - p = triple.get("predicate") - o = triple.get("object") - if s and p and o: - standardized_triples.append([s, p, o]) - - return standardized_triples diff --git a/kag/examples/2wiki/kag_config.cfg b/kag/examples/2wiki/kag_config.cfg deleted file mode 100644 index 55ded269..00000000 --- a/kag/examples/2wiki/kag_config.cfg 
+++ /dev/null @@ -1,27 +0,0 @@ -[project] -namespace = TwoWiki -host_addr = http://127.0.0.1:8887 -id = 11 - -[vectorizer] -vectorizer = kag.common.vectorizer.OpenAIVectorizer -model = bge-m3 -api_key = EMPTY -base_url = http://127.0.0.1:11434/v1 -vector_dimensions = 1024 - -[llm] -client_type = maas -base_url = https://api.deepseek.com/ -api_key = put your deepseek api key here -model = deepseek-chat - -[log] -level = INFO - -[qa] -force_chunk_retriever = True - -[prompt] -language = en -biz_scene = default \ No newline at end of file diff --git a/kag/examples/2wiki/kag_config.yaml b/kag/examples/2wiki/kag_config.yaml new file mode 100644 index 00000000..ac2c8110 --- /dev/null +++ b/kag/examples/2wiki/kag_config.yaml @@ -0,0 +1,126 @@ +#------------project configuration start----------------# +openie_llm: &openie_llm + api_key: key + base_url: https://api.deepseek.com + model: deepseek-chat + type: maas + +chat_llm: &chat_llm + api_key: key + base_url: https://api.deepseek.com + model: deepseek-chat + type: maas + +vectorize_model: &vectorize_model + api_key: key + base_url: https://api.siliconflow.cn/v1/ + model: BAAI/bge-m3 + type: openai + vector_dimensions: 1024 +vectorizer: *vectorize_model + +log: + level: INFO + +project: + biz_scene: default + host_addr: http://127.0.0.1:8887 + id: '7' + language: en + namespace: TwoWiki +#------------project configuration end----------------# + +#------------kag-builder configuration start----------------# +kag_builder_pipeline: + chain: + type: unstructured_builder_chain # kag.builder.default_chain.DefaultUnstructuredBuilderChain + extractor: + type: schema_free_extractor # kag.builder.component.extractor.schema_free_extractor.SchemaFreeExtractor + llm: *openie_llm + ner_prompt: + type: default_ner # kag.builder.prompt.default.ner.OpenIENERPrompt + std_prompt: + type: default_std # kag.builder.prompt.default.std.OpenIEEntitystandardizationdPrompt + triple_prompt: + type: default_triple # kag.builder.prompt.default.triple.OpenIETriplePrompt + reader: + type: dict_reader # kag.builder.component.reader.dict_reader.DictReader + post_processor: + type: kag_post_processor # kag.builder.component.postprocessor.kag_postprocessor.KAGPostProcessor + similarity_threshold: 0.9 + splitter: + type: length_splitter # kag.builder.component.splitter.length_splitter.LengthSplitter + split_length: 100000 + window_length: 0 + vectorizer: + type: batch_vectorizer # kag.builder.component.vectorizer.batch_vectorizer.BatchVectorizer + vectorize_model: *vectorize_model + writer: + type: kg_writer # kag.builder.component.writer.kg_writer.KGWriter + num_threads_per_chain: 1 + num_chains: 16 + scanner: + type: 2wiki_dataset_scanner # kag.builder.component.scanner.dataset_scanner.MusiqueCorpusScanner +#------------kag-builder configuration end----------------# + +#------------kag-solver configuration start----------------# +search_api: &search_api + type: openspg_search_api #kag.solver.tools.search_api.impl.openspg_search_api.OpenSPGSearchAPI + +graph_api: &graph_api + type: openspg_graph_api #kag.solver.tools.graph_api.impl.openspg_graph_api.OpenSPGGraphApi + +exact_kg_retriever: &exact_kg_retriever + type: default_exact_kg_retriever # kag.solver.retriever.impl.default_exact_kg_retriever.DefaultExactKgRetriever + el_num: 5 + llm_client: *chat_llm + search_api: *search_api + graph_api: *graph_api + +fuzzy_kg_retriever: &fuzzy_kg_retriever + type: default_fuzzy_kg_retriever # kag.solver.retriever.impl.default_fuzzy_kg_retriever.DefaultFuzzyKgRetriever + el_num: 5 + 
vectorize_model: *vectorize_model
+  llm_client: *chat_llm
+  search_api: *search_api
+  graph_api: *graph_api
+
+chunk_retriever: &chunk_retriever
+  type: default_chunk_retriever # kag.solver.retriever.impl.default_chunk_retriever.DefaultChunkRetriever
+  llm_client: *chat_llm
+  recall_num: 10
+  rerank_topk: 10
+
+kag_solver_pipeline:
+  memory:
+    type: default_memory # kag.solver.implementation.default_memory.DefaultMemory
+    llm_client: *chat_llm
+  max_iterations: 3
+  reasoner:
+    type: default_reasoner # kag.solver.implementation.default_reasoner.DefaultReasoner
+    llm_client: *chat_llm
+    lf_planner:
+      type: default_lf_planner # kag.solver.plan.default_lf_planner.DefaultLFPlanner
+      llm_client: *chat_llm
+      vectorize_model: *vectorize_model
+    lf_executor:
+      type: default_lf_executor # kag.solver.execute.default_lf_executor.DefaultLFExecutor
+      llm_client: *chat_llm
+      force_chunk_retriever: true
+      exact_kg_retriever: *exact_kg_retriever
+      fuzzy_kg_retriever: *fuzzy_kg_retriever
+      chunk_retriever: *chunk_retriever
+      merger:
+        type: default_lf_sub_query_res_merger # kag.solver.execute.default_sub_query_merger.DefaultLFSubQueryResMerger
+        vectorize_model: *vectorize_model
+        chunk_retriever: *chunk_retriever
+  generator:
+    type: default_generator # kag.solver.implementation.default_generator.DefaultGenerator
+    llm_client: *chat_llm
+    generate_prompt:
+      type: resp_simple # kag/examples/2wiki/solver/prompt/resp_generator.py
+  reflector:
+    type: default_reflector # kag.solver.implementation.default_reflector.DefaultReflector
+    llm_client: *chat_llm
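+
+# A minimal usage sketch of this solver section (mirrors solver/evaFor2wiki.py;
+# assumes init_env() has already loaded this file into KAG_CONFIG):
+#
+#     from kag.common.conf import KAG_CONFIG
+#     from kag.solver.logic.solver_pipeline import SolverPipeline
+#
+#     pipeline = SolverPipeline.from_config(KAG_CONFIG.all_config["kag_solver_pipeline"])
+#     answer, trace_log = pipeline.run("When did Lothair Ii's mother die?")
+
+#------------kag-solver configuration end----------------#
diff --git a/kag/examples/2wiki/reasoner/__init__.py b/kag/examples/2wiki/reasoner/__init__.py
index a0c4032b..8b8a3c91 100644
--- a/kag/examples/2wiki/reasoner/__init__.py
+++ b/kag/examples/2wiki/reasoner/__init__.py
@@ -17,4 +17,4 @@
 MATCH (s:DEFAULT.Company)
 RETURN s.id, s.address
 ```
-"""
\ No newline at end of file
+"""
diff --git a/kag/examples/2wiki/schema/__init__.py b/kag/examples/2wiki/schema/__init__.py
index ef3dde6d..8ac86acc 100644
--- a/kag/examples/2wiki/schema/__init__.py
+++ b/kag/examples/2wiki/schema/__init__.py
@@ -15,4 +15,4 @@
 
 You can execute `kag schema commit` to commit your schema to SPG server.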
-""" \ No newline at end of file +""" diff --git a/kag/examples/2wiki/solver/evaFor2wiki.py b/kag/examples/2wiki/solver/evaFor2wiki.py index d6a533b1..76cd4a4f 100644 --- a/kag/examples/2wiki/solver/evaFor2wiki.py +++ b/kag/examples/2wiki/solver/evaFor2wiki.py @@ -7,27 +7,29 @@ from tqdm import tqdm from kag.common.benchmarks.evaluate import Evaluate -from kag.common.env import init_kag_config from kag.solver.logic.solver_pipeline import SolverPipeline +from kag.common.conf import KAG_CONFIG +from kag.common.registry import import_modules_from_path + +from kag.common.checkpointer import CheckpointerManager logger = logging.getLogger(__name__) class EvaFor2wiki: - """ init for kag client """ - def __init__(self, configFilePath): - self.configFilePath = configFilePath - init_kag_config(self.configFilePath) + + def __init__(self): + pass """ qa from knowledge base, """ + def qa(self, query): - # CA - resp = SolverPipeline() + resp = SolverPipeline.from_config(KAG_CONFIG.all_config["kag_solver_pipeline"]) answer, traceLog = resp.run(query) logger.info(f"\n\nso the answer for '{query}' is: {answer}\n\n") @@ -37,19 +39,29 @@ def qa(self, query): parallel qa from knowledge base and getBenchmarks(em, f1, answer_similarity) """ + def parallelQaAndEvaluate( self, qaFilePath, resFilePath, threadNum=1, upperLimit=10 ): + ckpt = CheckpointerManager.get_checkpointer( + {"type": "zodb", "ckpt_dir": "ckpt"} + ) + def process_sample(data): try: sample_idx, sample = data sample_id = sample["_id"] question = sample["question"] gold = sample["answer"] - prediction, traceLog = self.qa(question) - - evaObj = Evaluate() - metrics = evaObj.getBenchMark([prediction], [gold]) + if question in ckpt: + print(f"found existing answer to question: {question}") + prediction, traceLog = ckpt.read_from_ckpt(question) + else: + prediction, traceLog = self.qa(question) + ckpt.write_to_ckpt(question, (prediction, traceLog)) + + evalObj = Evaluate() + metrics = evalObj.getBenchMark([prediction], [gold]) return sample_idx, sample_id, prediction, metrics, traceLog except Exception as e: import traceback @@ -104,30 +116,28 @@ def process_sample(data): res_metrics[item_key] = item_value / total_metrics["processNum"] else: res_metrics[item_key] = total_metrics["processNum"] + CheckpointerManager.close() return res_metrics if __name__ == "__main__": - configFilePath = os.path.join( - os.path.abspath(os.path.dirname(__file__)), "../kag_config.cfg" - ) - evalObj = EvaFor2wiki(configFilePath=configFilePath) + import_modules_from_path("./prompt") + evalObj = EvaFor2wiki() + start_time = time.time() filePath = "./data/2wiki_qa_sub.json" - # filePath = "./data/2wiki_qa.json" - qaFilePath = os.path.join( - os.path.abspath(os.path.dirname(__file__)), filePath - ) - start_time = time.time() + evalObj.qa("When did Lothair Ii's mother die?") + + qaFilePath = os.path.join(os.path.abspath(os.path.dirname(__file__)), filePath) resFilePath = os.path.join( - os.path.abspath(os.path.dirname(__file__)), f"2wiki_qa_res_{start_time}.json" + os.path.abspath(os.path.dirname(__file__)), f"2wiki_res_{start_time}.json" ) total_metrics = evalObj.parallelQaAndEvaluate( - qaFilePath, resFilePath, threadNum=20, upperLimit=1000 + qaFilePath, resFilePath, threadNum=20, upperLimit=10000 ) - total_metrics['cost'] = time.time() - start_time + + total_metrics["cost"] = time.time() - start_time with open(f"./2wiki_metrics_{start_time}.json", "w") as f: json.dump(total_metrics, f) - print(total_metrics) diff --git a/kag/examples/2wiki/solver/prompt/__init__.py 
b/kag/examples/2wiki/solver/prompt/__init__.py index dadd42a3..dfa931cd 100644 --- a/kag/examples/2wiki/solver/prompt/__init__.py +++ b/kag/examples/2wiki/solver/prompt/__init__.py @@ -11,4 +11,4 @@ """ Place the prompts to be used for solving problems in this directory. -""" \ No newline at end of file +""" diff --git a/kag/examples/2wiki/solver/prompt/resp_generator.py b/kag/examples/2wiki/solver/prompt/resp_generator.py index 70e96cc9..cb8d76ab 100644 --- a/kag/examples/2wiki/solver/prompt/resp_generator.py +++ b/kag/examples/2wiki/solver/prompt/resp_generator.py @@ -3,26 +3,26 @@ from typing import List import logging -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC logger = logging.getLogger(__name__) -class RespGenerator(PromptOp): - template_zh = "基于给定的引用信息回答问题。" \ - "\n只输出答案,不需要输出额外的信息。" \ - "\n给定的引用信息:'$memory'\n问题:'$instruction'" - template_en = "Answer the question based on the given reference." \ - "\nOnly give me the answer and do not output any other words." \ - "\nThe following are given reference:'$memory'\nQuestion: '$instruction'" - - def __init__(self, language: str): - super().__init__(language) +@PromptABC.register("resp_simple") +class RespGenerator(PromptABC): + template_zh = ( + "基于给定的引用信息回答问题。" "\n只输出答案,不需要输出额外的信息。" "\n给定的引用信息:'$memory'\n问题:'$instruction'" + ) + template_en = ( + "Answer the question based on the given reference." + "\nOnly give me the answer and do not output any other words." + "\nThe following are given reference:'$memory'\nQuestion: '$instruction'" + ) @property def template_variables(self) -> List[str]: return ["memory", "instruction"] def parse_response(self, response: str, **kwargs): - logger.debug('推理器判别:{}'.format(response)) + logger.debug("推理器判别:{}".format(response)) return response diff --git a/kag/examples/README.md b/kag/examples/README.md index 6587ce7f..3e629a64 100644 --- a/kag/examples/README.md +++ b/kag/examples/README.md @@ -11,14 +11,14 @@ Create your new knext project from knext cli tool. host_addr = http://localhost:8887 [vectorizer] - vectorizer = kag.common.vectorizer.OpenAIVectorizer + type = openai model = bge-m3 api_key = EMPTY base_url = http://127.0.0.1:11434/v1 vector_dimensions = 1024 [llm] - client_type = ollama + type = ollama base_url = http://localhost:11434/api/generate model = llama3.1 @@ -130,7 +130,6 @@ Create your new knext project from knext cli tool. pass def qa(self, query): - # CA resp = SolverPipeline() answer, trace_log = resp.run(query) diff --git a/kag/examples/2wiki/builder/prompt/__init__.py b/kag/examples/baike/builder/__init__.py similarity index 86% rename from kag/examples/2wiki/builder/prompt/__init__.py rename to kag/examples/baike/builder/__init__.py index 247bb44c..7a018e7c 100644 --- a/kag/examples/2wiki/builder/prompt/__init__.py +++ b/kag/examples/baike/builder/__init__.py @@ -10,5 +10,5 @@ # or implied. """ -Place the prompts to be used for building the index in this directory. -""" \ No newline at end of file +Builder Dir. 
+""" diff --git "a/kag/examples/baike/builder/data/\345\221\250\346\230\237\351\251\260\347\231\276\347\247\221.txt" "b/kag/examples/baike/builder/data/\345\221\250\346\230\237\351\251\260\347\231\276\347\247\221.txt" new file mode 100644 index 00000000..dd363aac --- /dev/null +++ "b/kag/examples/baike/builder/data/\345\221\250\346\230\237\351\251\260\347\231\276\347\247\221.txt" @@ -0,0 +1,29 @@ +周星驰(Stephen Chow),1962年6月22日出生于中国香港,祖籍浙江宁波 [178],华语影视男演员、导演、编剧、监制、制片人、出品人、主持人、国家一级演员、西南民族大学客座教授 [82]、中国人民大学教授 [82]。 +1980年,成为丽的电视台特约演员,从而进入演艺圈 [32]。1981年,出演荧幕处女作《IQ成熟时》 [161]。1988年,将演艺事业的重心转向大银幕,在电影《霹雳先锋》中首次担任男主角 [91]。1990年,凭借喜剧片《一本漫画闯天涯》确立其无厘头的表演风格 [137],之后又凭借喜剧动作片《赌圣》、喜剧片《逃学威龙》两度打破香港电影票房纪录 [1] [142]。1993年上映的古装喜剧片《唐伯虎点秋香》使得周星驰第四次拿到香港电影年度票房冠军 [136]。1994年,周星驰开始转型,他首度出任导演的电影作品是《国产凌凌漆》 [94]。1995年,主演的喜剧爱情片《大话西游》成为周星驰后现代电影的代表作 [92]。2001年,自导自演的喜剧片《少林足球》打破香港电影票房纪录 [24]。2003年,成为美国《时代周刊》封面人物 [4]。2013年,执导古装电影《西游·降魔篇》,该片以2.18亿美元的票房成绩打破华语电影在全球的票房纪录 [5]。2016年,担任科幻喜剧片《美人鱼》的导演、编剧、制片人 [6-7],该片创下中国内地影史单片票房纪录 [81] [84]。 +作为演员,他获得过第21届香港电影金像奖最佳男主角奖,亚太电影节最佳男主角奖等奖项 [3] [139],入选“中国电影百年百位优秀演员”以及“中国电影百年名人堂” [85] [138]。作为导演,他先后获得第21届香港电影金像奖最佳导演奖、第42届台湾电影金马奖最佳导演等奖项 [3] [80]。 +早年经历 +1962年,周星驰出生在香港九龙的贫民区 [200]。母亲凌宝儿引用《滕王阁序》中的诗句“雄州雾列,俊采星驰”给儿子取名周星驰 [200]。周星驰有一个姐姐和一个妹妹,一家人住在一间狭窄的木板房里,全家一个月的生活费是50元 [200]。尽管家境贫寒,但母亲凌宝儿还是尽可能给周星驰买体面的衣服 [126]。周星驰7岁时,父母离异 [8]。周星驰从小就沉默寡言,不爱说话 [8]。父母离婚后,他变得愈发沉默寡言 [180]。周星驰的爱好是看电影以及TVB电视剧,他尤其爱看李小龙的电影。他是因为迷上李小龙才走上电影之路 [8]。为了学习功夫,周星驰还拜了李小龙的授业师兄黄淳梁为师 [180]。他曾经热衷于用炒热的绿豆练习铁砂掌,而且还只用右手练 [126]。周星驰童年时做过很多工作,包括帮老人摆地摊卖指甲钳,去酒楼推着滑轮车卖虾饺,到五金厂打工,在尖沙咀骑自行车兜售报纸 [200]。 +周星驰中学就读于香港圣玛利奥英文书院 [159]。他在学校的成绩不好,除了语文,其它科目成绩都不行,但是周星驰的老师曾经称赞他画画不错 [8]。周星驰16岁时,利用暑假时间卖过点心、眼镜和电器 [142]。中学毕业后,他做过两个月的办公室助理 [126]。后来,他通过姐姐的朋友认识了梁朝伟,两个人都有一个明星梦 [8]。 +演艺经历 +初涉影视 +1980年,周星驰成为丽的电视台的特约演员,从而正式进入演艺圈 [32] [160]。1981年,出演个人首部电视剧《IQ成熟时》 [161]。1982年,周星驰报考了第11期无线电视艺员训练班,但没有考上;同年,他在戚美珍的介绍下进入第11期无线电视艺员训练班夜训班学习 [126]。 +1983年,从无线电视艺员训练班毕业后,周星驰正式成为无线电视台的签约艺员 [126],他被指派担任儿童节目《430穿梭机》的主持人,并且还在节目中与龙炳基共同主演单元剧《黑白僵尸》 [126] [142]。为了实现当演员的梦想,周星驰在主持电视节目之余努力寻找跑龙套的机会。他在跑龙套期间常常跟导演争取展现自己的机会 [142]。终于,他在武侠剧《射雕英雄传》中获得了一个宋兵乙的龙套角色,该片也是他首次和吴孟达合作的作品 [126];周星驰为了剧中角色花了不少心血,在导演的设计中这个角色是被人一掌打死,但他认为这样不真实,于是给角色设计了反抗的动作,但是并没有被导演采纳 [142];之后,他还在时装剧《北斗双雄》中扮演一个问题少年 [126]。 +多面演绎 +1986年,周星驰被调入无线电视台戏剧组;同年,他在单元情景剧《哥哥的女友》中饰演可爱活泼又略带羞涩的潘家伟,这也是他第一次在情景剧中担任男主角;之后,他还在温兆伦、郭晋安等人主演的电视剧中跑龙套。 [126] +1988年,周星驰与万梓良、李美凤共同出演动作片《捕风汉子》,该片是他出演的第一部电影,因为这部电影,他结识了香港演员万梓良。万梓良欣赏周星驰的演技,于是他们之后又合作了时装商战剧《他来自江湖》 [126];其后,周星驰得到电影公司老板李修贤的赏识,在电影《霹雳先锋》中饰演一个浪荡江湖的小弟 [128],该片是周星驰首次在电影中担任男主角 [91],他也凭借该片获得第25届台湾电影金马奖最佳男配角奖、第8届香港电影金像奖最佳配角奖提名 [9] [107]。《霹雳先锋》上映的这一年也是其电视剧播出最密集的一年,他先后有《梦边缘》、《刑警本色》、《大都会》、《斗气一族》等六部电视剧在TVB播出,其无厘头的搞笑方式通过电视剧引起了香港人的注意 [91]。1989年,周星驰和罗慧娟、吴孟达共同出演了TVB古装武侠剧《盖世豪侠》 [126]。 +事业上升 +1990年5月24日,主演的动作喜剧片《咖喱辣椒》上映,周星驰以即兴发挥的方式完成了与张学友的配戏,并且受到广泛好评 [181]。在接下来的几部电影中,周星驰延续了《咖喱辣椒》的表演方式 [181]。周星驰的成名作是喜剧电影《赌圣》,该片不仅让周星驰发挥了无厘头的表演风格,同时也为他所表演的小人物开拓了成型的道路 [134]。周星驰的表演,在传统的英雄人物塑造之外,又添加了戏谑的成分 [91]。该片在香港地区收获4132万港元的票房 [2],不仅获得香港年度票房排行榜冠军 [137],还打破了香港地区的票房纪录 [142],周星驰凭借该片获得第10届香港电影金像奖最佳男主角奖提名 [10]。这一年,他还通过喜剧片《一本漫画闯天涯》确立其无厘头的表演风格 [137],该片也成为周星驰风格形成的重要转折点 [33]。7月,主演时装喜剧《孖仔孖心肝》,周星驰在剧中饰演性格爽朗且具有正义感的王利就 [184]。同年,他还主演了喜剧片《赌侠》,在片中饰演身负多项特异功能绝技的阿星 [78],该片在香港地区的最终票房达到4034万港元,位列香港年度票房排行榜第二名 [11]。1990年的华语片香港票房前十名中有三部是周星驰主演的电影,其中冠军《赌圣》、亚军《赌侠》票房都超过了4000万港元 [14]。 +1991年2月,在爱情喜剧片《整蛊专家》中饰演整人专家古晶 [12],作为无厘头电影,该片无论主题、故事情节还是造型都呈现出一种卡通画的夸张 [185],影片在香港上映后最终票房为3138.8万港元 [186];7月,在喜剧片《逃学威龙》中改变小混混的银幕形象,饰演飞虎队队长周星星 [79],该片在香港地区上映后以4382万港元的票房成绩获得香港年度票房冠军 [11],并打破香港地区的票房纪录 [1];8月,主演喜剧科幻片《赌侠2上海滩赌圣》,该片在香港的票房达到3186万港元;同年,周星驰与成龙、周润发并称为“双周一成” [13]。 
+1992年1月,主演的喜剧片《家有喜事》在香港上映后最终票房为4899万港元,获得香港年度票房排行榜亚军 [11];4月,主演《逃学威龙》系列电影的第二部《逃学威龙2》,在片中饰演以个人身份到学校协助曹达华的交通警察周星星 [187];7月,在古装喜剧片《审死官》中饰演口才和能力出色的状师宋世杰 [188],该片以4988万港元的票房成绩获得香港电影年度票房冠军 [2],并再度打破票房纪录 [200],周星驰亦凭借该片获得第37届亚太电影节最佳男主角奖以及第12届香港电影金像奖最佳男主角提名 [16] [137];9月,与林青霞、李嘉欣共同主演武侠喜剧片《鹿鼎记Ⅱ:神龙教》,在片中饰演护送建宁公主嫁到云南的韦小宝 [202],该片在香港地区的最终票房达到3658万港元 [186],位列香港十大卖座电影第五名 [186];这一年,他还出演了古装电影《武状元苏乞儿》,他所饰演的主人公苏灿在即将考取武状元之际,遭遇歹人设计陷害,被打断全身经脉,沦落成了乞丐 [189-190],该片是他出演的第一部悲喜剧风格的影片 [205],周星驰通过具有特色的表演表现出了苏灿在顺意时期的豪爽性格,与影片后半部分惨淡的遭遇形成鲜明的对比 [205]。在1992年香港年度票房排行榜中,排名前五名的电影全部由周星驰主演,且每一部电影的票房都超过3600万港元。周星驰在这个时期成为了一个符号,而其独特的表演风格则被称为“无厘头文化” [14]。 +1993年,与巩俐、郑佩佩共同主演古装喜剧片《唐伯虎点秋香》,在片中饰演天资聪慧、诗画双绝的江南四大才子之首唐伯虎 [15],影片通过解构手法对经典文本进行重新解读,历史上风流倜傥、才华横溢的才子唐伯虎在片中被周星驰塑造成游手好闲、吊儿郎当的痞子模样 [68]。该片在香港取得4017万港元的票房,获得香港年度票房排行榜冠军 [2]。这一年,他还相继主演了逃学威龙系列电影的第三部《逃学威龙3:龙过鸡年》以及武侠喜剧片《济公》 [174-175]。 +自导自演 +1994年到1999年是周星驰的转型阶段。这时期的周星驰不再满足于无厘头式的创作,而试图在影片中融入更多正剧和悲剧的成分 [135]。1994年,周星驰第一次担任导演,推出个人首部自编自导自演的电影《国产凌凌漆》 [94] [99];该片在香港地区票房达到3752万港元,在香港年度票房排行榜上排名第三 [11];周星驰在片中饰演手持杀猪刀、不走寻常路的特工阿漆 [94],并凭借该片获得第14届香港电影金像奖最佳男主角提名 [17];该片对美国谍战影片007系列进行了戏拟的创作,周星驰所扮演的人物和《007》系列影片中的特工以相同的方式出场,但是同样的镜头却给观众完全不一样的视觉感受 [68]。这一年,他还在自导自演的喜剧动作片《破坏之王》中饰演快餐小子阿星 [100];此外,他还出演了古装喜剧片《九品芝麻官之白面包青天》,在片中饰演候补知县包龙星 [18]。 +1995年1月21日,主演的喜剧片《大话西游之月光宝盒》在中国香港上映 [96],他在片饰演对白晶晶一见钟情的至尊宝 [95];为了演好片中角色,周星驰在导演刘镇伟的建议下专门去看了金·凯瑞主演的电影《变相怪杰》 [70]。随后他又主演了《大话西游》系列电影的下部《大话西游之大圣娶亲》,在片中饰演为寻找紫霞仙子而回到五百年前的至尊宝 [98],并凭借该片获得第1届香港电影金紫荆奖最佳男主角奖、第15届香港电影金像奖最佳男主角提名 [22] [120]。《大话西游》的加长纪念版于2017年在中国内地重映后票房突破1.3亿元,成为首部票房破亿元的华语重映影片 [97]。《大话西游》是周星驰的转型之作,尽管影片在票房上没有达到投资方的预期,但经过VCD等媒介的传播后逐步在中国内地走红 [19] [93]。该片对传统电影进行了解构,成为周星驰后现代风格的代表作之一 [92];通过这部电影,周星驰不仅第一次尝试创作电影,还第一次自己开公司投资拍摄电影。该片也被外界看作是周星驰在电影创作上的分水岭,自此之后周星驰电影不再单纯依靠搞笑 [91];同年,主演科幻片《百变星君》,在片中饰演学业无成却挥金如土的富豪之子李泽星 [20],该片上映后以3533万港元的票房成绩,位列香港年度票房排行榜第三位 [11]。 +1996年,自导自演科幻喜剧片《大内密探零零发》 [194];片中有一场当众扇周星驰耳光的戏是背对着镜头,这场戏周星驰完全可以使用替身,但他仍坚持自己完成 [192];周星驰在影片的情节设置上借鉴了《奇门遁甲》等作品,并重新解构了部分内容 [191],此外他还发挥个人风格,设计了很多有创造性的情节 [193];该片的票房达到3605万港元,位列香港年度票房排行榜第三位 [11];同年,担任喜剧动作片《食神》的导演、编剧、主演 [196],在片中饰演在饮食界享有盛名的史蒂芬·周 [195],该片上映后以4086万港币的票房成绩位列香港年度票房排行榜第二位 [11],此外,影片还被威尼斯国际电影节选为观摩影片 [21];周星驰通过这部电影向导演、制片人转型,并陆续推出了《少林足球》、《功夫》、《长江七号》等影片 [131]。 +1997年,主演喜剧片《97家有喜事》,在片中饰演个性反叛、不修边幅的老恭,该片在香港地区收获4044万港元的票房,获得香港年度票房亚军 [11];之后,他又相继主演了喜剧片《算死草》、贺岁片《行运一条龙》等作品 [162] [165]。 +1999年,周星驰自导自演了带有自传性质的喜剧片《喜剧之王》,在片中饰演虽屡遭失败但仍不气馁的群众演员尹天仇 [197],并在表演上回归到了卓别林式的喜剧风格 [23]。该片表达了周星驰喜剧演员生涯的心路历程,片中“我是一个演员”的台词更是周星驰对于其表演道路的总结 [85]。该片是周星驰主导的第一部影片 [181],它让周星驰从一个演员转变成职业导演 [140]。在《喜剧之王》中,周星驰突破编剧和导演所给予的表演空间,把更多的个人想法融入到作品中。影片在延续周星驰演员时期喜剧风格的同时,也有了更深的文化内涵 [181]。该片在香港上映以后以2984万港元的票房成绩获得香港年度票房冠军 [2]。这一年,周星驰还与张家辉、吴君如合作主演了喜剧片《千王之王2000》,在片中饰演千王之王黄师虎 [158]。 +2001年,周星驰自导自演了喜剧片《少林足球》 [167]。在影片拍摄期间,周星驰因为练习踢球而双腿臃肿,导致他两天无法下床 [200]。《少林足球》是周星驰第一次完全掌控的电影 [176],他将特效、功夫以及足球结合在一起,并借由陈国坤身穿的黄色连体服完成致敬李小龙的愿望 [148]。他在片中饰演的阿星虽然以拾荒为生,但对武术极度痴迷,在足球教练明峰的说服下加入少林足球队,努力实现自己的人生价值 [68]。周星驰在片中突破以往风格,表演也变得内敛 [167],他凭借该片获得第21届香港电影金像奖最佳导演奖、最佳男主角奖以及杰出青年导演奖 [3],而该片亦获得第21届香港电影金像奖最佳电影奖、日本电影蓝丝带奖最佳外语片等奖项 [3] [26-27],并被美国《时代周刊》选为“世界史上25部最佳体育电影之一” [176]。该片在香港地区的最终票房达到6073万港币,不仅获得香港年度票房冠军,还打破了香港地区票房纪录 [24-25]。 +2003年,周星驰成为美国《时代周刊》的封面人物,并入选该杂志评出的“29位亚洲英雄” [4]。2004年,担任喜剧动作片《功夫》的导演、编剧兼主演,该片在全球的总票房达到1.05亿美元,在香港以6127万港元的票房成绩打破香港地区的票房纪录,并创下华语电影在北美上映的单厅票房纪录 [28],获得第24届香港电影金像奖最佳影片奖、第42届台湾电影金马奖最佳剧情片奖、第63届美国金球奖最佳外语片提名、第59届英国电影学院奖最佳非英语片等奖项 [47] [56] [105] [154],而周星驰个人则凭借该片获得第42届台湾电影金马奖最佳导演奖 [47]。《功夫》是周星驰面向国际推出的一部作品,他在片中淡化了原来夸张的演艺方式,通过故事、画面和人物性格来表达想要阐述的东西 [181]。 +2005年,在中国电影表演艺术学会举办的评选活动中,周星驰被选为“中国电影百年百位优秀演员”之一 [85];同年,入选“中国电影百年名人堂” [138] +2008年,自导自演科幻题材的电影《长江7号》,该片是周星驰的转型之作,他摒弃了无厘头,转而走悲剧路线 
[206],该片在香港上映以后以5140万港元的票房成绩获得香港电影年度票房冠军,在中国内地则收获了2.02亿元的票房,获得中国内地上半年票房冠军 [164]。2010年,担任动画电影《长江7号爱地球》的制片人以及编剧 [163]。 + diff --git "a/kag/examples/baike/builder/data/\345\221\250\346\235\260\344\274\246\347\231\276\347\247\221.txt" "b/kag/examples/baike/builder/data/\345\221\250\346\235\260\344\274\246\347\231\276\347\247\221.txt" new file mode 100644 index 00000000..2b9acb82 --- /dev/null +++ "b/kag/examples/baike/builder/data/\345\221\250\346\235\260\344\274\246\347\231\276\347\247\221.txt" @@ -0,0 +1,12 @@ +周杰伦(Jay Chou),1979年1月18日出生于台湾省新北市,祖籍福建省永春县,华语流行乐男歌手、音乐人、演员、导演、编剧,毕业于淡江中学。 +2000年,发行个人首张音乐专辑《Jay》 [26]。2001年,凭借专辑《范特西》奠定其融合中西方音乐的风格 [16]。2002年,举行“The One”世界巡回演唱会 [1]。2003年,成为美国《时代》杂志封面人物 [2];同年,发行音乐专辑《叶惠美》 [21],该专辑获得第15届台湾金曲奖最佳流行音乐演唱专辑奖 [23]。2004年,发行音乐专辑《七里香》 [29],该专辑在全亚洲的首月销量达到300万张 [316];同年,获得世界音乐大奖中国区最畅销艺人奖 [320]。2005年,主演个人首部电影《头文字D》 [314],并凭借该片获得第25届香港电影金像奖和第42届台湾电影金马奖的最佳新演员奖 [3] [315]。2006年起,他连续三年获得世界音乐大奖中国区最畅销艺人奖 [4]。 +2007年,自编自导爱情电影《不能说的秘密》 [321],同年,成立杰威尔音乐有限公司 [10]。2008年,凭借歌曲《青花瓷》获得第19届台湾金曲奖最佳作曲人奖 [292]。2009年,入选美国CNN“25位亚洲最具影响力人物” [6];同年,凭借专辑《魔杰座》获得第20届台湾金曲奖最佳国语男歌手奖 [7]。2010年,入选美国《Fast Company》杂志评出的“全球百大创意人物”。2011年,凭借专辑《跨时代》获得第22届台湾金曲奖最佳国语男歌手奖 [294]。2012年,登上福布斯中国名人榜榜首 [8]。2014年,发行个人首张数字音乐专辑《哎呦,不错哦》 [295]。2023年,凭借专辑《最伟大的作品》成为首位获得国际唱片业协会“全球畅销专辑榜”冠军的华语歌手 [287]。 +周杰伦出生于台湾省新北市,祖籍福建省泉州市永春县 [13]。4岁的时候,母亲叶惠美把他送到淡江山叶幼儿音乐班学习钢琴。初中二年级时,父母因性格不合离婚,周杰伦归母亲叶惠美抚养。中考时,没有考上普通高中,同年,因为擅长钢琴而被淡江中学第一届音乐班录取。高中毕业以后,两次报考台北大学音乐系均没有被录取,于是开始在一家餐馆打工。1997年9月,周杰伦在母亲的鼓励下报名参加了台北星光电视台的娱乐节目《超级新人王》 [26],并在节目中邀请他人演唱了自己独立创作的歌曲《梦有翅膀》;当主持人吴宗宪看到这首歌曲的曲谱后,便邀请周杰伦到阿尔发音乐公司担任音乐助理。当时,全唱片公司只有四位员工,包括周杰伦和创作歌词的作词人方文山 [367]。1998年,周杰伦创作了歌曲《眼泪知道》,公司把这首歌曲给到刘德华后被退歌,后为张惠妹创作的歌曲《忍者》(后收录于周杰伦个人音乐专辑《范特西》中)也被退回 [14]。2000年,音乐人杨峻荣在听到周杰伦独立创作的歌曲《可爱女人》的卡带后便立刻被吸引,并在吴宗宪的支持下争取了2000万元新台币经费来力捧周杰伦 [367]。 +2000年,周杰伦在杨峻荣的推荐下开始演唱自己创作的歌曲 [367];11月7日,发行个人首张音乐专辑《Jay》 [26],并包办专辑全部歌曲的作曲、和声编写以及监制工作 [368],该专辑融合了R&B、嘻哈等多种音乐风格 [369],发行于21世纪元年的当口,周杰伦亦在当时流行音乐疲软之际凭借R&B的音乐风格掀起了一股狂热的R&B潮流 [370],周杰伦尝试着把高难度的西班牙式弦乐演奏表现在了专辑的许多歌曲中 [369],使得整张专辑的意境十分逼近电影配乐 [371],发行后获得IFPI香港唱片销量大奖十大销量国语唱片奖 [372],其中的主打歌曲《星晴》获得第24届十大中文金曲优秀国语歌曲金奖 [15],而他也凭借该专辑在华语乐坛受到关注,并在次年举办的第12届台湾金曲奖颁奖典礼上凭借该专辑获得最佳流行音乐演唱专辑奖 [361]、入围最佳专辑制作人奖 [372],凭借专辑中的歌曲《可爱女人》提名最佳作曲人奖 [371]。 +2001年9月,周杰伦发行个人第二张音乐专辑《范特西》 [26],他除了担任专辑的制作人外,还包办了专辑中所有歌曲的作曲,该专辑是周杰伦确立其音乐风格的作品 [16],其中不仅囊括了抒情R&B歌曲,周杰伦还扩展想象空间,将摇滚、加快版Rap、日本民族风味的音乐风格融入到了该专辑中 [373],专辑中结合中西方音乐元素的主打歌曲《双截棍》成为饶舌歌曲的代表作之一,而该专辑的发行也让周杰伦打开了东南亚地区的音乐市场 [16],并于次年凭借该专辑获得第13届台湾金曲奖最佳专辑制作人奖、最佳流行音乐专辑奖 [241],以及香港唱片销量大奖颁奖典礼十大销量国语唱片等奖项,周杰伦亦凭借专辑中的歌曲《爱在西元前》获得第13届台湾金曲奖最佳作曲人奖 [228];10月,为李玟创作融合中西方音乐元素的歌曲《刀马旦》 [325];12月24日,发行个人音乐EP《范特西plus》,收录了他在桃园巨蛋演唱会上演唱的《你比从前快乐》《世界末日》等歌曲;同年,获得第19届十大劲歌金曲颁奖典礼最受欢迎唱作歌星金奖、叱咤乐坛流行榜颁奖典礼叱咤乐坛生力军男歌手金奖等奖项。 +2002年,参演个人首部电视剧《星情花园》;2月,在新加坡新达城国际会议展览中心举行演唱会;7月,发行个人第三张音乐专辑《八度空间》 [26] [317],除了包办专辑中所有歌曲的作曲外,他还担任专辑的制作人 [17],该专辑以节奏蓝调风格的歌曲为主,并获得g-music风云榜白金音乐奖十大金碟奖、华语流行乐传媒大奖十大华语唱片奖、新加坡金曲奖大奖年度最畅销男歌手专辑奖等奖项 [18];9月28日,在台北体育场举行“The One”演唱会;12月12日至16日,在香港体育馆举行5场“The One”演唱会;12月25日,在美国拉斯维加斯举办“The One”演唱会;同年,获得第1届MTV日本音乐录影带大奖亚洲最杰出艺人奖、第2届全球华语歌曲排行榜最受欢迎创作歌手奖和最佳制作人奖 [350]、第9届新加坡金曲奖亚太最受推崇男歌手奖等奖项 [19]。 +2003年2月,成为美国《时代周刊》亚洲版的封面人物 [2];3月,在第3届音乐风云榜上获得港台年度最佳唱作人奖、年度风云大奖等奖项,其演唱的歌曲《暗号》则获得港台年度十大金曲奖 [236];5月17日,在马来西亚吉隆坡默迪卡体育场举行“The One”演唱会;7月16日,他的歌曲《以父之名》在亚洲超过50家电台首播,预计有8亿人同时收听,而该曲首播的当日也被这些电台定为“周杰伦日” [20];7月31日,发行个人第四张音乐专辑《叶惠美》 [21] [26],他不仅包办了专辑所有歌曲的作曲,还担任专辑的制作人和造型师 [21],该专辑发行首月在亚洲的销量突破200万张 [22],并于次年获得第15届台湾金曲奖最佳流行音乐演唱专辑奖、第4届全球华语歌曲排行榜年度最受欢迎专辑等奖项 [23-24],专辑主打歌曲《东风破》也是周杰伦具有代表性的中国风作品之一,而他亦凭借该曲获得第4届华语音乐传媒大奖最佳作曲人奖;9月12日,在北京工人体育场举行“The One”演唱会;11月13日,发行个人音乐EP《寻找周杰伦》 [25],该EP收录了周杰伦为同名电影《寻找周杰伦》创作的两首歌曲《轨迹》《断了的弦》 
[25];12月12日,在上海体育场举办“The One”演唱会,并演唱了变奏版的《双截棍》、加长版的《爷爷泡的茶》等歌曲;同年,客串出演的电影处女作《寻找周杰伦》上映 [90]。 +2004年1月21日,首次登上中央电视台春节联欢晚会的舞台,并演唱歌曲《龙拳》 [27-28];3月,在第4届音乐风云榜上获得台湾地区最受欢迎男歌手奖、年度风云大奖、年度港台及海外华人最佳制作人等奖项 [326];8月3日,发行融合嘻哈、R&B、古典音乐等风格的音乐专辑《七里香》 [29] [289],该专辑是一张带有浓重东方抒情摇滚风格的音乐作品,并维持了周杰伦一贯的高格调,风格也更加统一,周杰伦也在被流行乐坛一再忽略或曲解的本土文化中调动着民乐所有灵性的想象力,展现了其独树一帜的音乐才华 [29],专辑发行当月在全亚洲的首月销量便突破了300万张 [316],而专辑同名主打歌曲《七里香》则获得了第27届十大中文金曲十大金曲奖、优秀流行国语歌曲奖金奖,以及第5届全球华语歌曲排行榜年度25大金曲奖等奖项 [30],他亦凭借该专辑获得了第16届世界音乐大奖中国区最畅销艺人奖等多个音乐奖项 [320];10月起,在中国台湾省台北市、中国香港、美国洛杉矶、蒙特维尔等地举行“无与伦比”世界巡回演唱会 [374]。 +2005年1月11日,在第11届全球华语榜中榜颁奖盛典上获得港台最佳男歌手奖、港台最受欢迎男歌手奖、港台最佳创作歌手奖等奖项 [31];4月,凭借专辑《七里香》入围第16届台湾金曲奖最佳国语男演唱人奖、最佳流行音乐演唱专辑奖,凭借歌曲《七里香》入围第16届台湾金曲奖最佳作曲人奖;6月23日,由其担任男主角主演的电影《头文字D》上映 [91],他在该片中饰演藤原拓海 [314] [347],这也是他主演的个人首部电影 [314],他也凭借该片获得第42届台湾电影金马奖最佳新演员奖 [3]、第25届香港电影金像奖最佳新演员奖 [315];7月1日,在上海体育场举行“无与伦比巡回演唱会” [32];7月9日,在北京工人体育场举行“无与伦比巡回演唱会” [33]。8月31日,在日本发行个人首张精选专辑《Initial J》 [327],该专辑收录了周杰伦为电影《头文字D》演唱的主题曲《一路向北》和《飘移》 [34];11月1日,发行个人第六张音乐专辑《11月的萧邦》 [296],并包办了专辑中所有歌曲的作曲以及专辑的造型设计 [35],该专辑发行后以4.28%的销售份额获得台湾G-MUSIC年终排行榜冠军;同年,其创作的歌曲《蜗牛》入选“上海中学生爱国主义歌曲推荐目录” [328]。 +2006年1月11日,在第12届全球华语榜中榜颁奖盛典上获得最佳男歌手奖、最佳创作歌手奖、最受欢迎男歌手奖,并凭借歌曲《夜曲》及其MV分别获得年度最佳歌曲奖、最受欢迎音乐录影带奖 [234];1月20日,发行个人音乐EP《霍元甲》 [329],同名主打歌曲《霍元甲》是李连杰主演的同名电影《霍元甲》的主题曲 [36];1月23日,在第28届十大中文金曲颁奖典礼上获得了优秀流行歌手大奖、全年最高销量歌手大奖男歌手奖 [246];2月5日至6日,在日本东京举行演唱会;9月,发行个人第七张音乐专辑《依然范特西》 [290],该专辑延续了周杰伦以往的音乐风格,并融合了中国风、说唱等音乐风格,其中与费玉清合唱的中国风歌曲《千里之外》获得第13届全球华语音乐榜中榜年度最佳歌曲奖、第29届十大中文金曲全国最受欢迎中文歌曲奖等奖项 [37-38],该专辑发行后以5.34%的销售份额位列台湾五大唱片排行榜第一位 [39],并获得中华音乐人交流协会年度十大优良专辑奖、IFPI香港唱片销量大奖最高销量国语唱片奖等奖项 [40];12月,发行个人音乐EP《黄金甲》 [330],该专辑获得IFPI香港唱片销量大奖十大畅销国语唱片奖 [332];同年,获得世界音乐大奖中国区最畅销艺人奖 [4];12月14日,主演的古装动作片《满城尽带黄金甲》在中国内地上映 [331],他在片中饰演武功超群的二王子元杰,并凭借该片获得第16届上海影评人奖最佳男演员奖,而他为该片创作并演唱的主题曲《菊花台》则获得了第26届香港电影金像奖最佳原创电影歌曲奖 [92] [220]。 + diff --git "a/kag/examples/baike/builder/data/\345\221\250\346\266\246\345\217\221\347\231\276\347\247\221.txt" "b/kag/examples/baike/builder/data/\345\221\250\346\266\246\345\217\221\347\231\276\347\247\221.txt" new file mode 100644 index 00000000..356d270e --- /dev/null +++ "b/kag/examples/baike/builder/data/\345\221\250\346\266\246\345\217\221\347\231\276\347\247\221.txt" @@ -0,0 +1,8 @@ +周润发(Chow Yun Fat),1955年5月18日出生于中国香港南丫岛,籍贯广东省江门市开平市 [1],华语影视男演员、摄影家,国家一级演员。 +1976年,出演个人首部电影《投胎人》 [2]。1980年,主演民国剧《上海滩》获得关注 [3]。1985年,凭借电影《等待黎明》获得第22届台湾电影金马奖最佳男主角奖 [114] [123]。1986年,主演的动作片《英雄本色》获得该年度香港电影票房冠军 [124],他凭借该片获得第6届香港电影金像奖最佳男主角奖 [125]。1988年,凭借电影《龙虎风云》获得第7届香港电影金像奖最佳男主角奖 [5]。1989年,主演剧情片《赌神》、动作片《喋血双雄》 [126-127]。1990年,凭借电影《阿郎的故事》获得第9届香港电影金像奖最佳男主角奖 [6]。1991年,主演的剧情片《纵横四海》成为其代表作 [7]。1998年,开始前往美国好莱坞发展 [9]。2000年,主演的剧情片《卧虎藏龙》在国际获得广泛关注 [10]。2003年,获颁特区政府银紫荆星章 [11]。 +2005年,被评为“中国电影百年百位优秀演员”之一 [12]。2007年-2010年间,相继主演《姨妈的后现代生活》《让子弹飞》等多部电影 [13-14] [134]。2011年,凭借电影《孔子》获得第14届中国电影华表奖优秀境外华裔男演员奖 [128]。2012年,获得第15届上海国际电影节华语电影杰出贡献奖 [130];此后,相继主演《铜雀台》《澳门风云》《寒战2》等电影 [129] [131-132]。2018年,主演的剧情片《无双》成为该年度国庆档首部票房突破10亿元的电影 [146],他亦凭借该片获得第14届中美电影节最佳男主角奖 [89]。2023年,主演剧情片《别叫我“赌神”》 [145];同年,获得第28届釜山国际电影节亚洲电影人奖 [142]。 +周润发出生于香港南丫岛的一个农村家庭,籍贯广东省江门市开平市 [1],他的父亲周容允是出海打渔的船员,常年漂泊海上,周润发的母亲种菜养鸡,也经常到别人家里帮佣 [18]。因生活清苦,周润发从小帮母亲打零工贴补家用。童年时父亲因为好赌而输光月薪,因为父亲的薪水都输光了,发薪水时,母亲只能去父亲公司领回一桶油 [19],周润发因此从小厌恶赌博 [20]。1965年,即周润发10岁时,因为家庭生活困难,妈妈带着孩子们去位于九龙的外婆家居住,妈妈去当工人养家糊口 [18]。周润发读到中学三年级的时候,父亲积劳成疾,一病不起,家里再也没有能力供他继续上学,他便过早地踏入社会,寻找工作。在从事演艺工作之前,周润发一直生活在社会底层,仅他干过的职业,就有商行侍役、电子厂童工、酒店服务员、邮差、照相器材售货员等等 [21]。1973年,18岁的周润发在报纸上看到无线电视演员训练班(TVB)的征人广告,便与朋友一同去应征 [22],得到担任考官之一的钟景辉的赏识 [23];之后,他考入了无线电视台第三期艺员训练班,与吴孟达、林岭东是同学。入行之初,周润发跑过一段时间的龙套,并出演了《民间传奇》《红楼梦》《毕业后》等剧集中的配角角色 [147]。 
+1974年,周润发顺利从训练班毕业,经过短暂的龙套生涯,周润发就得到了演出的机会。1975年,出演的由萧笙、李惠民、陈宇超联合执导的古装爱情剧《红楼梦》播出,周润发在剧中饰演蒋玉函 [24]。1976年,出演的由张森执导的剧情电影《投胎人》上映 [2];4月,与刘志荣、黄杏秀合作出演的喜剧《新苏小妹三难新郎》播出,在剧中饰演王安石之子王雱 [25];9月,与余安安、林建明联袂主演的剧情片《池女》首映;同年,与张午郎、黄杏秀合作主演的剧情电影《捞家邪牌姑爷仔》上映 [25]。 +1977年,与汪明荃、南红合作出演了家庭剧《家变》,周润发在剧中饰演廉政公署人员何严明;同年,出演动作犯罪电影《入册》 [2]。1978年1月,与廖咏湘领衔主演的剧情电影《爱欲狂潮》首映 [25];8月,出演的歌舞电视剧《青春热潮》播出;随后,与刘嘉玲、吴孟达等人合作出演的恐怖惊悚剧《幻海奇情》首播;同年,与赛祝娟领衔主演的剧情电影《O女》上映,周润发在电影中饰演澳门富家子管厌平。 +1979年,与任达华领衔主演了剧情类电视剧《有楼收租》,周润发在该剧中饰演黑社会成员阿龙 [25];随后,其主演的悬疑剧《龙潭群英》首播;同年,与廖伟雄、郑裕玲、欧阳佩珊等人联袂主演了家庭爱情剧《网中人》,周润发在剧中饰演中大工商管理系毕业生程纬。 +1980年,主演了由余允抗执导的犯罪喜剧片,在片中饰演年少气盛的阿杰 [2];3月,与吕良伟、赵雅芝领衔主演的民国剧《上海滩》在无线电视台播出,周润发在剧中饰演矛盾、犹豫、冷酷的许文强 [3];4月,与郑裕玲、任达华合作主演的家庭类电视剧《亲情》首播,周润发在剧中饰演石启泰的次子石晖 [26];同年,主演了由王天林执导的动作喜剧电影《懵女大贼傻侦探》,周润发在电影中饰演杀手梁标 [2];随后,领衔主演了喜剧动作片《金榜英雄》(又名《系咁先》),在片中饰演刚从警校毕业的朱嘉华 [2]。 diff --git a/kag/examples/baike/builder/indexer.py b/kag/examples/baike/builder/indexer.py new file mode 100644 index 00000000..e95956dd --- /dev/null +++ b/kag/examples/baike/builder/indexer.py @@ -0,0 +1,32 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import logging +from kag.common.registry import import_modules_from_path + +from kag.builder.runner import BuilderChainRunner + +logger = logging.getLogger(__name__) + + +def buildKB(file_path): + from kag.common.conf import KAG_CONFIG + + runner = BuilderChainRunner.from_config(KAG_CONFIG.all_config["kag_builder_pipeline"]) + runner.invoke(file_path) + + logger.info(f"\n\nbuildKB successfully for {file_path}\n\n") + + +if __name__ == "__main__": + import_modules_from_path(".") + file_path = "./data/" + + buildKB(file_path) diff --git a/kag/examples/baike/kag_config.yaml b/kag/examples/baike/kag_config.yaml new file mode 100644 index 00000000..e4ebca6d --- /dev/null +++ b/kag/examples/baike/kag_config.yaml @@ -0,0 +1,128 @@ +#------------project configuration start----------------# +openie_llm: &openie_llm + api_key: key + base_url: https://api.deepseek.com + model: deepseek-chat + type: maas + +chat_llm: &chat_llm + api_key: key + base_url: https://api.deepseek.com + model: deepseek-chat + type: maas + +vectorize_model: &vectorize_model + api_key: key + base_url: https://api.siliconflow.cn/v1/ + model: BAAI/bge-m3 + type: openai + vector_dimensions: 1024 +vectorizer: *vectorize_model + +log: + level: INFO + +project: + biz_scene: default + host_addr: http://127.0.0.1:8887 + id: '7' + language: zh + namespace: BaiKe +#------------project configuration end----------------# + +#------------kag-builder configuration start----------------# +kag_builder_pipeline: + chain: + type: unstructured_builder_chain # kag.builder.default_chain.DefaultUnstructuredBuilderChain + extractor: + type: schema_constraint_extractor # kag.builder.component.extractor.schema_constraint_extractor.SchemaConstraintExtractor + llm: *openie_llm + ner_prompt: + type: spg_entity # kag.builder.prompt.spg_prompt.SPGEntityPrompt + event_prompt: + type: spg_event # kag.builder.prompt.spg_prompt.SPGEventPrompt + std_prompt: + type: default_std # kag.builder.prompt.default.std.OpenIEEntitystandardizationdPrompt + relation_prompt: + type: spg_relation # 
kag.builder.prompt.spg_prompt.SPGRelationPrompt + reader: + type: txt_reader # kag.builder.component.reader.txt_reader.TXTReader + post_processor: + type: kag_post_processor # kag.builder.component.postprocessor.kag_postprocessor.KAGPostProcessor + similarity_threshold: 0.9 + splitter: + type: length_splitter # kag.builder.component.splitter.length_splitter.LengthSplitter + split_length: 300 + window_length: 0 + vectorizer: + type: batch_vectorizer # kag.builder.component.vectorizer.batch_vectorizer.BatchVectorizer + vectorize_model: *vectorize_model + writer: + type: kg_writer # kag.builder.component.writer.kg_writer.KGWriter + num_threads_per_chain: 2 + num_chains: 4 + scanner: + type: dir_file_scanner # kag.builder.component.scanner.directory_scanner.DirectoryScanner +#------------kag-builder configuration end----------------# + +#------------kag-solver configuration start----------------# +search_api: &search_api + type: openspg_search_api # kag.solver.tools.search_api.impl.openspg_search_api.OpenSPGSearchAPI + +graph_api: &graph_api + type: openspg_graph_api # kag.solver.tools.graph_api.impl.openspg_graph_api.OpenSPGGraphApi + +exact_kg_retriever: &exact_kg_retriever + type: default_exact_kg_retriever # kag.solver.retriever.impl.default_exact_kg_retriever.DefaultExactKgRetriever + el_num: 5 + llm_client: *chat_llm + search_api: *search_api + graph_api: *graph_api + +fuzzy_kg_retriever: &fuzzy_kg_retriever + type: default_fuzzy_kg_retriever # kag.solver.retriever.impl.default_fuzzy_kg_retriever.DefaultFuzzyKgRetriever + el_num: 5 + vectorize_model: *vectorize_model + llm_client: *chat_llm + search_api: *search_api + graph_api: *graph_api + +chunk_retriever: &chunk_retriever + type: default_chunk_retriever # kag.solver.retriever.impl.default_chunk_retriever.DefaultChunkRetriever + llm_client: *chat_llm + recall_num: 10 + rerank_topk: 10 + +kag_solver_pipeline: + memory: + type: default_memory # kag.solver.implementation.default_memory.DefaultMemory + llm_client: *chat_llm + max_iterations: 3 + reasoner: + type: default_reasoner # kag.solver.implementation.default_reasoner.DefaultReasoner + llm_client: *chat_llm + lf_planner: + type: default_lf_planner # kag.solver.plan.default_lf_planner.DefaultLFPlanner + llm_client: *chat_llm + vectorize_model: *vectorize_model + lf_executor: + type: default_lf_executor # kag.solver.execute.default_lf_executor.DefaultLFExecutor + llm_client: *chat_llm + force_chunk_retriever: true + exact_kg_retriever: *exact_kg_retriever + fuzzy_kg_retriever: *fuzzy_kg_retriever + chunk_retriever: *chunk_retriever + merger: + type: default_lf_sub_query_res_merger # kag.solver.execute.default_sub_query_merger.DefaultLFSubQueryResMerger + vectorize_model: *vectorize_model + chunk_retriever: *chunk_retriever + generator: + type: default_generator # kag.solver.implementation.default_generator.DefaultGenerator + llm_client: *chat_llm + generate_prompt: + type: default_resp_generator # kag.solver.prompt.default.resp_generator.RespGenerator + reflector: + type: default_reflector # kag.solver.implementation.default_reflector.DefaultReflector + llm_client: *chat_llm + +#------------kag-solver configuration end----------------# diff --git a/kag/examples/baike/schema/BaiKe.schema b/kag/examples/baike/schema/BaiKe.schema new file mode 100644 index 00000000..23747bc9 --- /dev/null +++ b/kag/examples/baike/schema/BaiKe.schema @@ -0,0 +1,141 @@ +namespace BaiKe + +Chunk(文本块): EntityType + desc: A chunk refers to a segment of text.
+ properties: + content(内容): Text + index: TextAndVector + +ArtificialObject(人造物体): EntityType + desc: a human-made entity that does not occur naturally. + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Astronomy(天文学): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Building(建筑): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Creature(生物): EntityType + desc: generally refers to any living being, especially animals + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Concept(概念): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Date(日期): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +GeographicLocation(地理位置): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Keyword(关键词): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Medicine(药物): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + + +NaturalScience(自然科学): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Organization(组织机构): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Person(人物): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + job(工作): Text + constraint: MultiValue + +Transport(运输): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Works(作品): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + publisTime(发行时间): Date + authors(作者): Person + desc: authors of work, such as director, actor, lyricist, composer and singer + constraint: MultiValue + +BaikeEvent(事件): EventType + properties: + subject(主体): Person + participants(参与者): Person + desc: the participants of event, such as subject and objects + constraint: MultiValue + time(时间): Date + location(地点): GeographicLocation + abstract(摘要): Text + index: TextAndVector + semanticType(事件语义类型): Text + desc: a more specific and clearly defined type, such as Professor or Actor for the Person type + index: Text + +Others(其他): EntityType + desc: Entities that does not belong to any other type + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +SemanticConcept(语义概念): EntityType + properties: + desc(内容): Text + index: Text diff --git a/kag/solver/logic/core_modules/op_executor/op_output/__init__.py b/kag/examples/baike/solver/__init__.py similarity index 100% rename from kag/solver/logic/core_modules/op_executor/op_output/__init__.py rename to kag/examples/baike/solver/__init__.py diff --git a/kag/examples/baike/solver/eval.py b/kag/examples/baike/solver/eval.py new file mode 100644 index 00000000..0ca82830 --- /dev/null +++ b/kag/examples/baike/solver/eval.py @@ -0,0 +1,36 @@ +import json +import logging +import os +import time +from concurrent.futures import ThreadPoolExecutor, as_completed + +from tqdm import tqdm + +from kag.common.benchmarks.evaluate import Evaluate +from kag.solver.logic.solver_pipeline import SolverPipeline +from kag.common.conf import 
KAG_CONFIG +from kag.common.registry import import_modules_from_path + +from kag.common.checkpointer import CheckpointerManager + + +def qa(query): + resp = SolverPipeline.from_config(KAG_CONFIG.all_config["kag_solver_pipeline"]) + answer, traceLog = resp.run(query) + + print(f"\n\nso the answer for '{query}' is: {answer}\n\n") # + print(traceLog) + return answer, traceLog + + +if __name__ == "__main__": + import_modules_from_path("./prompt") + queries = [ + "周星驰的姓名有何含义?", + "周星驰和万梓良有什么关系", + "周星驰在首部自编自导自演的电影中,票房达到多少,他在其中扮演什么角色", + "周杰伦曾经为哪些自己出演的电影创作主题曲?", + "周杰伦在春晚上演唱过什么歌曲?是在哪一年", + ] + for q in queries: + qa(q) diff --git a/kag/examples/hotpotqa/builder/prompt/__init__.py b/kag/examples/baike/solver/prompt/__init__.py similarity index 86% rename from kag/examples/hotpotqa/builder/prompt/__init__.py rename to kag/examples/baike/solver/prompt/__init__.py index 247bb44c..dfa931cd 100644 --- a/kag/examples/hotpotqa/builder/prompt/__init__.py +++ b/kag/examples/baike/solver/prompt/__init__.py @@ -10,5 +10,5 @@ # or implied. """ -Place the prompts to be used for building the index in this directory. -""" \ No newline at end of file +Place the prompts to be used for solving problems in this directory. +""" diff --git a/kag/examples/baike/solver/prompt/resp_generator.py b/kag/examples/baike/solver/prompt/resp_generator.py new file mode 100644 index 00000000..cb8d76ab --- /dev/null +++ b/kag/examples/baike/solver/prompt/resp_generator.py @@ -0,0 +1,28 @@ +import re +from string import Template +from typing import List +import logging + +from kag.interface import PromptABC + +logger = logging.getLogger(__name__) + + +@PromptABC.register("resp_simple") +class RespGenerator(PromptABC): + template_zh = ( + "基于给定的引用信息回答问题。" "\n只输出答案,不需要输出额外的信息。" "\n给定的引用信息:'$memory'\n问题:'$instruction'" + ) + template_en = ( + "Answer the question based on the given reference." + "\nOnly give me the answer and do not output any other words." + "\nThe following are given reference:'$memory'\nQuestion: '$instruction'" + ) + + @property + def template_variables(self) -> List[str]: + return ["memory", "instruction"] + + def parse_response(self, response: str, **kwargs): + logger.debug("推理器判别:{}".format(response)) + return response diff --git a/kag/examples/csqa/.gitignore b/kag/examples/csqa/.gitignore new file mode 100644 index 00000000..50e414ac --- /dev/null +++ b/kag/examples/csqa/.gitignore @@ -0,0 +1,3 @@ +ckpt/ +/cs.jsonl +/solver/data/csqa_kag_answers.json diff --git a/kag/examples/csqa/README.md b/kag/examples/csqa/README.md new file mode 100644 index 00000000..ee1cca95 --- /dev/null +++ b/kag/examples/csqa/README.md @@ -0,0 +1,84 @@ +# KAG Example: CSQA + +The [UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain/tree/main) +``cs.jsonl`` dataset contains 10 documents in Computer Science and +100 questions with their answers about those documents. + +Here we demonstrate how to build a knowledge graph for those documents, +generate answers to those questions with KAG and compare KAG generated +answers with those from other RAG systems. + +## Steps to reproduce + +1. Follow the Quick Start guide of KAG to install the OpenSPG server and KAG. + + The following steps assume the Python virtual environment with KAG installed + is activated and the current directory is [csqa](.). + +2. 
(Optional) Download [UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain/tree/main) + ``cs.jsonl`` and execute [generate_data.py](./generate_data.py) to generate data files in + [./builder/data](./builder/data) and [./solver/data](./solver/data). Since the generated files + were committed, this step is optional. + + ```bash + python generate_data.py + ``` + +3. Update the ``openie_llm``, ``chat_llm`` and ``vectorize_model`` configurations + in [kag_config.yaml](./kag_config.yaml) properly. + The ``splitter`` and ``num_threads_per_chain`` configurations may also be updated + to match other systems. + +4. Restore the KAG project. + + ```bash + knext project restore --host_addr http://127.0.0.1:8887 --proj_path . + ``` + +5. Commit the schema. + + ```bash + knext schema commit + ``` + +6. Execute [indexer.py](./builder/indexer.py) in the [builder](./builder) directory to build the knowledge graph. + + ```bash + cd builder && python indexer.py && cd .. + ``` + +7. Execute [eval.py](./solver/eval.py) in the [solver](./solver) directory to generate the answers. + + ```bash + cd solver && python eval.py && cd .. + ``` + + The results are saved to ``./solver/data/csqa_kag_answers.json``. + +8. (Optional) Follow the LightRAG [Reproduce](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#reproduce) + steps to generate answers to the questions and save the results to + [./solver/data/csqa_lightrag_answers.json](./solver/data/csqa_lightrag_answers.json). + Since a copy was committed, this step is optional. + +9. Update the LLM configurations in [summarization_metrics.py](./solver/summarization_metrics.py) + and [factual_correctness.py](./solver/factual_correctness.py) + and execute them to get the metrics. + + ```bash + python ./solver/summarization_metrics.py + python ./solver/factual_correctness.py + ``` + +10. (Optional) To delete checkpoints, execute the following commands. + + ```bash + rm -rf ./builder/ckpt + rm -rf ./solver/ckpt + ``` + + To delete the KAG project and related knowledge graph, execute a command similar to the following, replacing the OpenSPG server address and KAG project id with the actual values. + + ```bash + curl http://127.0.0.1:8887/project/api/delete?projectId=1 + ``` diff --git a/kag/examples/musique/builder/prompt/__init__.py b/kag/examples/csqa/builder/__init__.py similarity index 86% rename from kag/examples/musique/builder/prompt/__init__.py rename to kag/examples/csqa/builder/__init__.py index 247bb44c..7a018e7c 100644 --- a/kag/examples/musique/builder/prompt/__init__.py +++ b/kag/examples/csqa/builder/__init__.py @@ -10,5 +10,5 @@ # or implied. """ -Place the prompts to be used for building the index in this directory. -""" \ No newline at end of file +Builder Dir. +""" diff --git a/kag/examples/csqa/builder/data/__init__.py b/kag/examples/csqa/builder/data/__init__.py new file mode 100644 index 00000000..59bacd4d --- /dev/null +++ b/kag/examples/csqa/builder/data/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +""" +Place the files to be used for building the index in this directory.
+""" diff --git a/kag/examples/csqa/builder/data/guide_to_java.txt b/kag/examples/csqa/builder/data/guide_to_java.txt new file mode 100644 index 00000000..66839a40 --- /dev/null +++ b/kag/examples/csqa/builder/data/guide_to_java.txt @@ -0,0 +1,9536 @@ +Guide to Java + +James T. Streib and Takako SomaUndergraduate Topics in Computer ScienceGuide to Java2014A Concise Introduction to Programming10.1007/978-1-4471-6317-6© Springer-Verlag London 2014 + +Undergraduate Topics in Computer Science + +Series EditorIan Mackie + +Undergraduate Topics in Computer Science (UTiCS) delivers high-quality instructional content for undergraduates studying in all areas of computing and information science. From core foundational and theoretical material to final-year topics and applications, UTiCS books take a fresh, concise, and modern approach and are ideal for self-study or for a one- or two-semester course. The texts are all authored by established experts in their fields, reviewed by an international advisory board, and contain numerous examples and problems. Many include fully worked solutions. + +For further volumes: http://​www.​springer.​com/​series/​7592 + +James T. Streib and Takako Soma + +Guide to JavaA Concise Introduction to Programming + +James T. Streib + +Department of Computer Science, Illinois College, Jacksonville, IL, USA + +Takako Soma + +Department of Computer Science, Illinois College, Jacksonville, IL, USA + +ISSN 1863-7310e-ISSN 2197-1781 + +ISBN 978-1-4471-6316-9e-ISBN 978-1-4471-6317-6 + +Springer London Heidelberg New York Dordrecht + +Library of Congress Control Number: 2014931850 + +© Springer-Verlag London 2014 + +Undergraduate Topics in Computer Science + +This work is subject to copyright. All rights are reserved by the Publisher, whether the whole or part of the material is concerned, specifically the rights of translation, reprinting, reuse of illustrations, recitation, broadcasting, reproduction on microfilms or in any other physical way, and transmission or information storage and retrieval, electronic adaptation, computer software, or by similar or dissimilar methodology now known or hereafter developed. Exempted from this legal reservation are brief excerpts in connection with reviews or scholarly analysis or material supplied specifically for the purpose of being entered and executed on a computer system, for exclusive use by the purchaser of the work. Duplication of this publication or parts thereof is permitted only under the provisions of the Copyright Law of the Publisher's location, in its current version, and permission for use must always be obtained from Springer. Permissions for use may be obtained through RightsLink at the Copyright Clearance Center. Violations are liable to prosecution under the respective Copyright Law. + +The use of general descriptive names, registered names, trademarks, service marks, etc. in this publication does not imply, even in the absence of a specific statement, that such names are exempt from the relevant protective laws and regulations and therefore free for general use. + +While the advice and information in this book are believed to be true and accurate at the date of publication, neither the authors nor the editors nor the publisher can accept any legal responsibility for any errors or omissions that may be made. The publisher makes no warranty, express or implied, with respect to the material contained herein. 
+ +Printed on acid-free paper + +Springer is part of Springer Science+Business Media (www.springer.com) + +Preface + +## Purpose + +The purpose of this text is to help the reader learn very quickly how to program using the Java programming language. This is accomplished by concentrating on the fundamentals, providing plenty of illustrations and examples, and using visual contour diagrams to illustrate the object-oriented semantics of the language. + +## Comparison to Other Texts + +There are a number of texts on the Java programming language. Some of these texts provide plenty of examples and are very comprehensive, but unfortunately they sometimes seem to cover too many details, which might make it difficult for a beginning programmer to discern which points are the most relevant. There are also other texts that attempt to provide a shortened introduction to the language, but it seems that these texts might not provide the necessary examples and illustrations and might be better suited for readers who have previous programming experience. + +## Need + +This text attempts to fill the gap between the above two types of books. First, it provides plenty of examples and concentrates primarily on the fundamentals of the Java programming language so that the reader can stay focused on the key concepts. Second, by concentrating on the fundamentals, it allows the text to be more concise and yet still accessible to readers who have no prior programming experience. The result is that the reader can learn the Java programming language very quickly and also have a good foundation to learn more complex topics later. + +## Features of This Text + +This text provides many examples and illustrations. It further has an early introduction to object-oriented programming and uses contour diagrams to illustrate various object-oriented concepts. The contour model was originally developed by John B. Johnson [1]. The model was elaborated on by Organick, Forsythe, and Plummer to illustrate subprograms, parameter passing, and recursion in procedural and functional languages [2]. The model seems quite adaptable to newer programming methodologies such as object-oriented programming as illustrated in a paper by the authors of this text [3]. As discussed in that paper, it was shown that the use of contour diagrams can be an effective tool in helping one learn object-oriented concepts in the Java programming language. By acquiring a good working model of objects, there is less chance of possible misconceptions. + +In many paragraphs of the text, questions are asked of the reader to help them interact with the material and think about the subject matter just presented. Hopefully the reader will take a few moments to try to answer these questions on their own before proceeding to the answer that immediately follows. To help further reinforce concepts, each chapter has one or more complete programs to illustrate many of the concepts presented and also to help readers learn how to write programs on their own. In addition, for review and practice, there are summaries and exercises provided at the end of each chapter. Further, in the appendices at the end of the text, there are answers to selected exercises and a glossary of important terms. 
A summary of these features is listed below: + + * Stresses the fundamentals + + * Provides many examples and illustrations + + * Has an early introduction to objects + + * Uses contour diagrams to illustrate object-oriented concepts + + * Asks readers questions to help them interact with the material + + * Has one or more complete programs in every chapter + + * Provides chapter summaries + + * Includes exercises at the end of each chapter, with selected answers in an appendix + + * Has a glossary of important terms + +## Overview of the Chapters + +This text first allows the reader to understand a simple program with the appropriate input, processing, and output, followed by an early introduction to objects. It then looks at selection and iteration structures followed by more object-oriented concepts. Next, strings and arrays are examined. This is followed by recursion, inheritance and polymorphism, and elementary files. The appendices include information on graphical input/output, exception processing, Javadoc, a glossary, and answers to selected exercises. Lastly there are references and useful websites and an index. The following provides a brief synopsis of the chapters and appendices: + + * Chapter 1 provides an introduction to variables, input/output, and arithmetic operations. + + * Chapter 2 introduces objects and contour diagrams. + + * Chapter 3 explains selection structures. + + * Chapter 4 shows how iteration structures work. + + * Chapter 5 revisits object-oriented concepts. + + * Chapter 6 introduces string variables and processing. + + * Chapter 7 illustrates arrays and array processing. + + * Chapter 8 examines recursion. + + * Chapter 9 explores inheritance and polymorphism. + + * Chapter 10 discusses elementary files. + + * Appendix A gives an introduction to graphical input/output. + + * Appendix B discusses elementary exception processing. + + * Appendix C presents the basics of Javadoc. + + * Appendix D lists a glossary of key terms. + + * Appendix E provides answers to selected exercises. + +## Scope + +As mentioned previously, this text concentrates on the fundamentals of the Java programming language such as input/output, object-oriented programming, arithmetic and logic instructions, control structures, strings, arrays including elementary sorting and searching, recursion, and files. As a result, it might not cover all the details that are found in some other texts, and if necessary, these topics can be supplemented by the instructor or reader, or covered in a subsequent text and/or second semester course. + +## Audience + +This text is intended primarily for readers who have not had any previous programming experience; however, this does not preclude its use by others who have programmed previously. It can serve as a text in an introductory programming course, as an introduction to a second language in a practicum course, as a supplement in a course on the concepts of programming languages, or as a self-study guide in either academe or industry. Although no prior programming is assumed, it is recommended that readers have the equivalent of an introduction to functions course that includes trigonometry which will help with problem solving and understanding the examples presented in the text. + +## Acknowledgments + +The authors would like to thank the reviewers Mark E. Bollman of Albion College, James W. Chaffee of the University of Iowa, Naomi E. Hahn of Illinois College, Carroll W. Morrow of Augustana College, and Curt M. White of DePaul University. 
Also, the authors would like to acknowledge the students of Illinois College who have read and used various sections of the text in the classroom. On a personal note, James Streib would like to acknowledge his father William J. Streib for their numerous conversations, and thank his wife Kimberly A. Streib and son Daniel M. Streib for their continued patience. Takako Soma would like to thank her family and friends, near and far. + +Note that Java is a registered trademark of Oracle and/or its affiliates and that Windows is a registered trademark of Microsoft Corporation in the United States and/or other countries. + +## Feedback + +The possibility of errors exist in any text, therefore any corrections, comments, or suggestions are welcome and can be sent to the authors via the e-mail addresses below. In addition to copies of the complete programs presented in the text, any significant corrections can be found at the website below. + +Website: http://​www.​jtstreib.​com/​GuideJavaProgram​ming.​html + +James T. Streib + +Takako Soma + +October 21, 2013 + +Contents + +1 Variables, Input/​Output, and Arithmetic 1 + +1.​1 Introduction 1 + +1.​2 Java Skeleton 5 + +1.​3 Variables and Constants 6 + +1.​4 Assignment Statements 10 + +1.​5 Output 13 + +1.​6 Input 20 + +1.​7 Arithmetic Statements 22 + +1.​8 Comments 29 + +1.​9 Program Design 30 + +1.​10 Complete Program:​ Implementing a Simple Program 33 + +1.​11 Summary 36 + +1.​12 Exercises (Items Marked with an * Have Solutions in Appendix E) 36 + +2 Objects:​ An Introduction 39 + +2.​1 Introduction 39 + +2.​2 Classes and Objects 40 + +2.​3 Public and Private Data Members 41 + +2.​4 Value-Returning Methods 42 + +2.​5 Void Methods and Parameters 42 + +2.​6 Creating Objects and Invoking Methods 44 + +2.​7 Contour Diagrams 45 + +2.​8 Constructors 50 + +2.​9 Multiple Objects and Classes 53 + +2.​10 Universal Modeling Language (UML) Class Diagrams 60 + +2.​11 Complete Program:​ Implementing a Simple Class and Client Program 62 + +2.​12 Summary 63 + +2.​13 Exercises (Items Marked with an * Have Solutions in Appendix E) 65 + +3 Selection Structures 69 + +3.​1 Introduction 69 + +3.​2 If-Then Structure 69 + +3.​3 If-Then-Else Structure 75 + +3.​4 Nested If Structures 78 + +3.​4.​1 If-Then-Else-If Structure 78 + +3.​4.​2 If-Then-If Structure 80 + +3.​4.​3 Dangling Else Problem 82 + +3.​5 Logical Operators 86 + +3.​6 Case Structure 93 + +3.​7 Complete Programs:​ Implementing Selection Structures 98 + +3.​7.​1 Simple Program 98 + +3.​7.​2 Program with Objects 101 + +3.​8 Summary 103 + +3.​9 Exercises (Items Marked with an * Have Solutions in Appendix E) 103 + +4 Iteration Structures 107 + +4.​1 Introduction 107 + +4.​2 Pretest Indefinite Loop Structure 108 + +4.​2.​1 Count-Controlled Indefinite Iteration Structure 109 + +4.​2.​2 Sentinel Controlled Loop 116 + +4.​3 Posttest Indefinite Loop Structure 120 + +4.​4 Definite Iteration Loop Structure 124 + +4.​5 Nested Iteration Structures 127 + +4.​6 Potential Problems 129 + +4.​7 Complete Programs:​ Implementing Iteration Structures 130 + +4.​7.​1 Simple Program 131 + +4.​7.​2 Program with Objects 133 + +4.​8 Summary 138 + +4.​9 Exercises (Items Marked with an * Have Solutions in Appendix E) 138 + +5 Objects:​ Revisited 143 + +5.​1 Sending an Object to a Method 143 + +5.​2 Returning an Object from a Method 146 + +5.​3 Overloaded Constructors and Methods 148 + +5.​4 Use of the Reserved Word this 153 + +5.​5 Class Constants, Variables, and Methods 157 + +5.​5.​1 Local, Instance, and Class Constants 157 + +5.​5.​2 
Local, Instance, and Class Variables 162 + +5.5.3 Class Methods 165 + +5.6 Complete Programs: Implementing Objects 167 + +5.6.1 Program Focusing on Overloaded Methods 167 + +5.6.2 Program Focusing on Class Data Members and Class Methods 175 + +5.7 Summary 179 + +5.8 Exercises (Items Marked with an * Have Solutions in Appendix E) 179 + +6 Strings 185 + +6.1 Introduction 185 + +6.2 String Class 185 + +6.3 String Concatenation 186 + +6.4 Methods in String Class 188 + +6.4.1 The length Method 188 + +6.4.2 The indexOf Method 188 + +6.4.3 The substring Method 189 + +6.4.4 Comparison of Two String Objects 191 + +6.4.5 The equalsIgnoreCase Method 194 + +6.4.6 The charAt Method 195 + +6.5 The toString Method 196 + +6.6 Complete Program: Implementing String Objects 198 + +6.7 Summary 200 + +6.8 Exercises (Items Marked with an * Have Solutions in Appendix E) 201 + +7 Arrays 203 + +7.1 Introduction 203 + +7.2 Array Declaration 203 + +7.3 Array Access 205 + +7.4 Input, Output, Simple Processing, and Methods 206 + +7.4.1 Input 207 + +7.4.2 Output 210 + +7.4.3 Simple Processing 211 + +7.4.4 Passing an Array to and from a Method 212 + +7.5 Reversing an Array 213 + +7.6 Searching an Array 218 + +7.6.1 Sequential Search 218 + +7.6.2 Binary Search 219 + +7.6.3 Elementary Analysis 221 + +7.7 Sorting an Array 221 + +7.7.1 Simplified Bubble Sort 222 + +7.7.2 Modified Bubble Sort 224 + +7.8 Two-Dimensional Arrays 225 + +7.8.1 Declaration, Creation, and Initialization 226 + +7.8.2 Input and Output 228 + +7.8.3 Processing Data 229 + +7.8.4 Passing a Two-Dimensional Array to and from a Method 232 + +7.8.5 Asymmetrical Two-Dimensional Arrays 234 + +7.9 Arrays of Objects 236 + +7.10 Complete Program: Implementing an Array 238 + +7.11 Summary 242 + +7.12 Exercises (Items Marked with an * Have Solutions in Appendix E) 242 + +8 Recursion 245 + +8.1 Introduction 245 + +8.2 The Power Function 245 + +8.3 Stack Frames 253 + +8.4 Fibonacci Numbers 254 + +8.5 Complete Program: Implementing Recursion 264 + +8.6 Summary 266 + +8.7 Exercises (Items Marked with an * Have Solutions in Appendix E) 266 + +9 Objects: Inheritance and Polymorphism 267 + +9.1 Inheritance 267 + +9.2 Protected Variables and Methods 276 + +9.3 Abstract Classes 277 + +9.4 Polymorphism 278 + +9.5 Complete Program: Implementing Inheritance and Polymorphism 284 + +9.6 Summary 288 + +9.7 Exercises (Items Marked with an * Have Solutions in Appendix E) 289 + +10 Elementary File Input and Output 293 + +10.1 Introduction 293 + +10.2 File Input 293 + +10.3 File Output 298 + +10.4 File Input and Output Using an Array 300 + +10.5 Specifying the File Location 303 + +10.6 Complete Programs: Implementing File Input and Output 305 + +10.6.1 Matrix Multiplication 305 + +10.6.2 Sorting Data in a File 307 + +10.7 Summary 309 + +10.8 Exercises (Items Marked with an * Have Solutions in Appendix E) 309 + +Appendix A Simple Graphical Input and Output 311 + +A.1 Message Dialog Boxes 311 + +A.2 Input Dialog Boxes 312 + +A.3 Converting String Input from Input Dialog Boxes to Numbers 314 + +A.4 Confirmation Dialog Boxes 316 + +A.5 Option Dialog Boxes 317 + +Appendix B Exceptions 321 + +B.1 Exception Class and Error Class 321 + +B.2 Handling an Exception 322 + +B.3 Throwing Exceptions and Multiple catch Blocks 325 + +B.4 Checked and Unchecked Exceptions 330 + +Appendix C Javadoc Comments 335 + +C.1 Javadoc 335 + +C.2 More Javadoc Tags 338 + +C.3 Generating Javadoc Documentation from a Command Line 339
+ +Appendix D Glossary 341 + +Appendix E Answers to Selected Exercises 345 + +References and Useful Websites 353 + +Index 355 +James T. Streib and Takako Soma, Undergraduate Topics in Computer Science, Guide to Java: A Concise Introduction to Programming, 2014, 10.1007/978-1-4471-6317-6_1 + +© Springer-Verlag London 2014 + +# 1. Variables, Input/Output, and Arithmetic + +James T. Streib1 and Takako Soma1 + +(1) + +Department of Computer Science, Illinois College, Jacksonville, IL, USA + +Abstract + +In addition to an introduction to hardware and software concepts, including the concept of compiling, interpreting, and executing a program, this chapter provides an initial skeleton program from which to create subsequent programs. An introduction to variables, constants, assignment statements, arithmetic operations, and simple input/output using the keyboard and monitor is also provided. Further, there is a discussion concerning errors, comments, and program design. A simple complete program is included at the end of the chapter. + +## 1.1 Introduction + +As many readers may already know from using applications software such as word processing, a computer system is composed of two major parts: hardware and software. The hardware is the physical computer that includes five basic components: the central processing unit (CPU), the random access memory (RAM) or just memory for short, input (typically a keyboard), output (typically a monitor), and storage (often a disk) as shown in Fig. 1.1. + +Fig. 1.1 + +Computer hardware + +In order for computer hardware to perform, it is necessary that it have software. Essentially, software (often called a program) is the set of instructions that tells the computer what to do and when to do it. A program is typically loaded from storage into the computer's RAM for subsequent execution in the computer's CPU. As the program executes or runs, it will typically ask the user to input data, which will also be stored in RAM; the program will then process the data, and various results will be output to the monitor. This input, process, output sequence is sometimes abbreviated as IPO. + +The only type of instruction a computer can actually understand is low-level machine language, where different types of CPUs can have different machine languages. Machine language is made up of ones and zeros, which makes programming in machine language very tedious and error prone. An alternative to using machine language is assembly language, which is also a low-level language that uses mnemonics (or abbreviations) and is easier to use than ones and zeros [4]. However, if the only language that the computer can directly understand is machine language, how does the computer understand assembly language? The answer is that the assembly language is converted into machine language by another program called an assembler (see Fig. 1.2). Note that there is a one-to-one correspondence between assembly language and machine language, and for every assembly language instruction, there is typically only one machine language instruction. However, even though assembly language is easier to program in than machine language, different types of CPUs can also have different types of assembly languages, so the assembly language of one machine can be different from that of another machine. + +Fig. 1.2 + +Assemblers and compilers + +The solution to making programming easier and allowing programs to be used on different machines is through the use of high-level languages, which are more English-like and math-like.
One of the first high-level programming languages was FORTRAN (FORmula TRANslation), which was developed in the early 1950s to help solve mathematical problems. There have been a number of high-level languages developed since that time to meet the needs of many different users. Some of these include COBOL (COmmon Business Oriented Language) developed in the 1950s for the business world, BASIC (Beginners All-purpose Symbolic Instruction Code) developed in the 1960s for beginning programmers, Pascal in the 1970s previously used for teaching computer science students, C in the 1970s for systems programming, and C++ in the 1980s for object-oriented programming. + +The program needed to convert or translate a high-level language to a low-level language is either a compiler or an interpreter. Although there is a one-to-one correspondence between assembly language and machine language, there is a one-to-many correspondence between a high-level language and a low-level language. This means that for one high-level language instruction, there can be many low-level assembly or machine language instructions. Even though different CPUs need different compilers or interpreters to convert a particular high-level language into the appropriate machine language, compilers and interpreters allow the same high-level language to be used on different CPUs. + +The difference between a compiler and an interpreter is that a compiler will translate the high-level language instructions for the entire program to the corresponding machine language for subsequent execution, whereas an interpreter will translate and then execute each instruction one at a time. Further, a compiler might translate directly to machine language, or it might translate the high-level language to assembly language, and then let an assembler convert the assembly language program to machine language as shown in Fig. 1.2. Once the machine language is created, it is subsequently loaded into the computer's RAM and executed by the CPU. + +As mentioned above, an interpreter works slightly differently than a compiler. Instead of converting an entire high-level program into machine language all at once and then executing the machine language, an interpreter converts one line of the high-level program to machine language and then immediately executes the machine language instructions before proceeding with the converting and executing of the next high-level instruction (see Fig. 1.3). The result is that compiler-generated code executes faster than interpreted code because the program does not need to be converted each time it is executed. However, interpreters might be more convenient in an education or development environment because of the many modifications that are made to a program, which would require the program to be converted each time a change is made. + +Fig. 1.3 + +Compilers and interpreters + +The Java programming language was developed at Sun Microsystems (which is now a subsidiary of Oracle Corporation) and was released in 1995. The intent of the language was for portability on the World Wide Web. It does not contain some of the features of C++ (such as operator overloading and multiple inheritance, where overloading and inheritance will be discussed in Chaps. 5 and 9), so it is an easier language to learn. Object-Oriented Programming (OOP) is a programming methodology that makes it more convenient to reuse software as will be discussed further in Chaps. 2, 5, and 9.
Although no prior programming experience is necessary to learn Java in this text, programmers with experience in C or C++ will recognize a number of similarities between Java and these languages. Conversely, programmers learning Java first will also notice a number of similarities should they subsequently learn C or C++. The reason for this similarity between these languages is that both Java and C++ are based on C. + +Java is somewhat unique in that it uses both a compiler and an interpreter to convert the high-level instructions to machine language. A compiler is used to convert the Java instructions into an intermediate-level language known as bytecode, and then the bytecode is converted into machine language using an interpreter. The advantage of using both a compiler and an interpreter is that most of the translation process can be done by the compiler, and when bytecode is sent to different types of machines, it can be translated by an interpreter into the machine language of the particular type of machine the code needs to be run on (see Fig. 1.4). Note that just as there can be a one-to-many relationship between high-level and low-level instructions, there can be a one-to-many relationship between Java and bytecode. However, unlike the one-to-one relationship between assembly language and machine language, there can be a one-to-many relationship between bytecode and machine language, depending on the machine for which the bytecode is being interpreted. + +Fig. 1.4 + +Java instructions, bytecode, and machine language + +When learning a new programming language, one should distinguish between the syntax and the semantics of a program. Simply stated, the syntax is the grammar of the language, and the semantics is the meaning or what each instruction does. To explain further, syntax is the spelling of the individual words, where the semicolons go, and so on. If mistakes are made, the compiler will detect what are known as syntax errors, generate messages to the programmer, and the program will not be compiled or executed. Although syntax is very important, there is a tendency for first-time programmers to spend too much time learning syntax to avoid syntax errors. However, there must be equal time spent on semantics to ensure that the program does what the programmer intended it to do. Even though there might not be any syntax errors, there can be what are called execution errors or run-time errors, such as division by zero. When these types of errors occur, the appropriate error messages are generated and execution stops. Even worse, there can also be logic errors, which are mistakes in the logic of the program and the program does not do what was intended. The unfortunate aspect of logic errors is that they do not produce any error messages which can make them extremely difficult to find and fix. The process of finding and fixing logic errors is known as debugging. When learning to program, one must be attentive not only to the syntax of the language but also to the semantics of the language. Both are stressed in this text, and with time and practice, a beginning programmer can get better at both. + +## 1.2 Java Skeleton + +Probably the best way to understand a programming language is to start right away with a sample program. Although the following program does not do anything, it will serve as a skeleton to add instructions in the future and provide a starting point to understand the basic layout of a Java program. At first the program in Fig. 
1.5 might look a bit intimidating, but examining and discussing each of the statements should help one understand it better. Although some of the descriptions discussed below might be a little advanced and confusing now, it helps to realize that each of the words in the program has an important purpose and each of them will be discussed later in detail throughout the text. As one learns more about Java and starts to fill in the skeleton with other instructions, it will become less intimidating. + +Fig. 1.5 + +Java skeleton program + +The first line in the program begins with the reserved word class. A reserved word is one that has a special meaning in a program and cannot have its meaning changed by the programmer nor used for identifiers (or names) of packages, classes, variables, or methods. A package is like a folder in which classes can be stored. A class is a definition of a group of objects that includes data members (places to store data) and methods (places to put the program logic). Although classes and objects will be discussed further in Chap. 2, for now think of a class as a blueprint for a house and the houses built from the blueprint as objects. The word Skeleton is the name of the class that is provided by the programmer. Usually class names begin with a capital letter. Braces are used to identify blocks of code and data and require matching opening and closing braces. The entire definition of the class, Skeleton, should be placed between the first opening brace and the last closing brace. + +This class has one method definition starting on the second line. Typically the method is indented to improve the readability of the program. The first three words in the second line are reserved words. The word public is one of the access or visibility modifiers, which will also be discussed further in Chap. 2. The main method is always defined using public visibility, so that the program can be executed by the interpreter. The word static means this is a class method, and the main method is always declared static so that it can be executed without creating an object of the class as will be discussed further in Chap. 5. The word void means that main is a non-value-returning method as will be discussed further in Chap. 2. Next, the word main is the name of the method. When a program is run, the system will search for the main method and start executing instructions in the main method first. Inside the parentheses after the name of the method, parameters are listed along with their types to allow the method to receive values as will be discussed further in Chap. 2. The main method has a parameter called args which is an array of type String, and the square brackets indicate that args is an array, where strings and arrays will be discussed further in Chaps. 6 and 7, respectively. The definition of the main method starts with an opening brace and ends with a closing brace. Inside the braces, a sequence of instructions would be placed. + +For now, the method does not have any instructions other than a comment line. Comments will not be compiled and executed when the program is run. They are used to make programs easier for other programmers to understand. Comments can start with // symbols and continue to the end of the line, or be placed between /* and */ symbols. The // symbols are used for a single-line comment, and /* and */ are used when the comments run over multiple lines.
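+Since Fig. 1.5 itself is not reproduced here, the following sketch is reconstructed from the description above (a class named Skeleton whose main method contains only a comment), so its exact formatting may differ slightly from the original figure:
+
+class Skeleton {
+    public static void main(String[] args) {
+        // instructions will be placed here (sketch of Fig. 1.5 as described in the text)
+    }
+}
+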
The above program should compile without any syntax errors and run without any execution errors, except it does not do anything. + +Again the above description should give the reader some insight into the meaning of various words in the skeleton program. As should be noticed, there were several references to subsequent chapters. What might be helpful to the reader is to return to this section later after reading the subsequent chapters and see that the above is more understandable. For now it should be understood that each of the words has a particular meaning and that the program serves as a skeleton in which to insert code as will be done in the following sections. + +## 1.3 Variables and Constants + +One of the first things that need to be added to the skeleton are memory locations so that data can be stored, and another name for a memory location is a variable. Since the contents of the memory location can vary, just as a variable in mathematics, these two terms can be used interchangeably. + +In order to understand variables and how data is stored in memory, it is oftentimes very helpful to draw a picture of the memory location. A memory location can be thought of as a mailbox that has two main parts. One part is the contents, which includes the letters that are inside the mailbox, and the other is the address of the mailbox as shown in Fig. 1.6. + +Fig. 1.6 + +Representation of memory + +The address of the mailbox is usually a number, like the address of a memory location in a computer. At the machine language level, the address is in ones and zeros, just like the machine language instructions mentioned in the first section of this chapter. However, using numbers to represent the address of a memory location can be quite confusing, especially if there are hundreds of memory locations in a program. Instead it is helpful to use characters to form names, called symbolic addressing, to make it easier to remember what data is stored in what memory location as shown in Fig. 1.7. In this example, the name number is used to describe the contents of the corresponding memory location. This is one of the primary advantages of using assembly language over machine language, and this is also true of all high-level languages including Java. + +Fig. 1.7 + +Using names for memory locations + +Instead of a three-dimensional representation of a mailbox to represent a memory location, it is much easier to draw a two-dimensional representation. Further, instead of using ones and zeros to represent the contents of the memory location, it is easier to use the decimal number system to represent values as follows: + +Although not as crucial in high-level languages (like Java) as low-level languages (machine and assembly languages), it is important to remember that a memory location has two features: its address and its contents. In Java, the programmer is typically concerned about its contents. + +Given the above representation of variables, how are they actually created or declared? When a variable is declared, there are two things that must be done. First, a variable needs to be given a name so that it can be referred to by various instructions in the program, and second, the type of data that will be stored in the memory location needs to be indicated. The reason for this is that although all the data is stored as ones and zeros as discussed above, different types of data are stored using different combinations of ones and zeros. 
A single one or zero is called a binary digit (abbreviated as a bit), and a group of 8 bits is called a byte. Typically the more bytes that make up a memory location, the larger the number that can be stored in the location. Although how the data is actually stored is beyond the scope of this text, Table 1.1 shows some of the types of data, the size, and the range of values that can be stored for each type. + +Table 1.1 + +Data types + +Type | Size | Range + +---|---|--- + +byte | 1 byte | −128 to 127 + +short | 2 bytes | −32,768 to 32,767 + +int | 4 bytes | −2,147,483,648 to 2,147,483,647 + +long | 8 bytes | −9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + +float | 4 bytes | −3.40282347 × 10^38 to 3.40282347 × 10^38 + +double | 8 bytes | −1.79769313486231570 × 10^308 to 1.79769313486231570 × 10^308 + +char | 2 bytes | one character + +String | 2 or more bytes | one or more characters + +Typically the types int, double, char, and String are the ones that are used the most frequently. For example, should one want to declare a variable named number and have it store an integer, it would be declared as follows: + +int number; + +First the type is indicated, in this case int for integer, and then the identifier or name of the variable number is given, followed by a semicolon. The name of the variable can be almost anything except for a reserved word, but there are certain rules that need to be followed as well as some suggestions that should be followed. The length of the variable name can be from 1 to any number of characters long. Further, the variable name can be composed of letters, numbers, underscores _, and dollar signs $, but must begin with a letter. Also, the variable name is case sensitive, meaning that cat, Cat, and CAT are separate variable names and correspond to separate memory locations. + +Typically a variable name should not be too long, because long names can be difficult to read, but by the same token, it should not be too short either, for it could become difficult to remember what it represents. For example, if the letter n were used instead of number, then it might not be clear whether n stood for name, number, or numeral. Exceptions to this are variables from a mathematical expression. For example, the variables x, y, and z are commonly used to represent the points of a Cartesian coordinate system, or i, j, or k are used for loop control variables as will be discussed in Chap. 4. Although most of the time this text will avoid the use of shorter names, on occasion shorter names might be used to save space or for the sake of simplicity to concentrate on other aspects of a code segment. If a variable name is too long, it can be difficult to read, as in the following: numberofcatsanddogs. Common practice in Java is not to capitalize the first letter of a variable but to capitalize the first letter in all subsequent words, as in numberOfCatsAndDogs. Notice that it is a little easier to read that way. Also on occasion, abbreviations can be used, such as num instead of number, but be sure to use good abbreviations, and this text will occasionally show some of the more commonly used ones. + +Variables of other types can be declared as well, such as a variable of type float or double. Although numbers of type float take up less space in the computer's memory, they are less precise and can sometimes cause inaccuracy in calculations. Even though they take up more memory, this text will use double variables to alleviate some possible problems later.
For example, should one want to declare a variable to hold a double precision value, it would be declared as follows: + double average; + Further, it could contain a value and would look like the following: + Notice that instead of showing the number zero as an integer, it is represented as a real number with a decimal point, to indicate its type as a double. + All of the types given in Table 1.1, other than the String type, are known as primitive data types, meaning that when they are declared, the memory needed to store the associated data is allocated at that time. However, a String data type is a reference data type. When a variable of type String is declared, the memory allocated is not used to store the data, but rather only to store a reference to the data. String data types are unique in that although they are technically objects, they can be used syntactically as if they were primitive data types. + The first part of this text will use strings in a very limited capacity. An understanding of strings is much easier once one has had an introduction to, and practice with, objects, so a full description of how string objects are created and manipulated is presented in Chap. 6. However, for now, this text will represent strings "as if" they were primitive data types, and the following shows a character primitive data type and a simplified view of the string data type. For example, a character and a string could be declared as follows: + char initial; String name; + and would be represented with values as follows, respectively: + Note that the char data type is represented using single quotation marks and that the String is represented using double quotation marks. Although a character could be represented as a String of length one, it is usually better to use the char data type. Further, there are also ways to extract a single char type from a String data type. Again, a full description will be deferred until Chap. 6. + In contrast to variables, a constant can be declared so that its value cannot be changed. Although not nearly as useful as variables, constants have their place in a program when a value does not need to be changed, nor should it be changed. For example, if an integer N always needs to remain a 7, then it could be declared as follows, where the use of the reserved word final indicates that N is a constant: + final int N = 7; + Typically constant names are declared in all capital letters to help other programmers distinguish them from variables. As another example, suppose a number like PI needs only two digits after the decimal point; it could then be declared as follows: + final double PI = 3.14; + Although the use of constants might not be readily apparent at this time, it will become clearer in subsequent examples after discussing assignment statements in the next section. + ## 1.4 Assignment Statements + In the previous section, all the drawings of the memory locations had values in them. How did those values get there? By default, Java technically initializes variables that are declared as part of a class to 0 for type int and 0.0 for type double; likewise, such char variables are initialized to the null character '\u0000', and String variables are initialized to null, as will be discussed further in Chap. 6. However, this does not apply to the local variables declared in this chapter, and although default values can be helpful in some instances, in many other languages variables do not have a default value.
The variables in such languages contain whatever was in that memory location from the last time it was used, which could be interpreted as junk to another program, cause logic errors, and be difficult to debug. Variables with unknown initial values are said to be indeterminate. As a result, many programmers do not rely on default values and assume instead that the initial values of variables are indeterminate, which will also be the assumption of this text. So instead of initially showing an integer variable with the number 0 in it, this text will show the variable as indeterminate with a dashed line in it as shown below: + Does this mean that all variables need to be initialized to some value? Not necessarily. As will be seen, only those variables that need an initial value for subsequent processing should be initialized. Initializing a variable to a value when it does not need to be initialized could be confusing to other programmers reading the code, as will be discussed later in this chapter and in Chap. 4 on iteration structures. + So if a variable is assumed not to be initialized, how does one initialize a variable to a value such as 0, or any other value for that matter, such as 5? After a variable is declared, it can be given a value in an assignment statement using an assignment symbol. The assignment symbol is the equal sign. However, when one first starts to use the equal sign, one must remember that it does not mean that the variable on the left is "equal to" the value on the right, but rather that the value on the right is copied into or assigned to the variable on the left. Again, this is best shown by way of an example: + int number; + number = 5; + After the variable number is declared as type int, the second statement indicates that the integer 5 is assigned or copied into the variable number, and the memory location would then appear as follows: + Again, the assignment statement is not really saying that number is equal to 5 or equals 5, but rather that the variable number is assigned a 5 or takes on the value of 5. Although it is tempting to say that number equals 5, and even though most people will understand what is meant, try to avoid saying it, and there will be less difficulty in the future, as shown in Sect. 1.7 on arithmetic statements. + Note that it is possible to combine the previous two statements into one statement as shown below. It looks similar to the definition of a constant in the previous section but without the word final in the statement: + int number = 5; + The above syntax is perfectly legal and saves a line when writing a program. However, when first learning a language, it helps to reinforce the distinction between the declaration of a variable and the assignment of a value to a variable. Of course, if one's instructor does not mind the above shortcut, or if one is studying this text independently and likes the shortcut, then go ahead and use it. However, this text will use the previous two-line method, at least for the next few chapters, to help reinforce the distinction between the declaration of a variable and the assignment of a value to a variable. + Continuing, what if one wanted to take the contents of number and copy it into another memory location named answer? For example, consider the following code segment: + int number, answer; + number = 5; + answer = number; + After both number and answer have been declared in the first line, the variable number is then assigned the value 5 in the second line, and answer will still be indeterminate.
The memory locations would look as follows: + The third line then takes a copy of the contents of number and places it into the memory location answer as shown below: + Note that the assignment statement does not remove the 5 from number and put it into answer, but rather it takes a copy of the 5 and puts it into answer. The original 5 in number does not disappear. Why does it copy and not move the value? The reason is that it is actually faster for the computer to copy the value and not take the time to delete the original. This is a fundamental concept in most computer languages and will become more important later in the writing of subsequent programs. + Again, the important point to notice is that the copying of values is from right to left, not left to right. This sometimes causes confusion among beginning programmers, possibly because they are used to reading from left to right. The reason why Java and many earlier languages copy from right to left is that they mimic the assembly languages of many machines. Ideally it would be nice if languages used an arrow to show how values are copied as shown below: + However, most keyboards do not have an arrow character, so an equal sign is used. Just remember that values are copied from right to left, and there should not be any problems. + Assigning variables of type double is similar to the above and will not be shown here; however, a couple of points need to be made concerning assigning variables of different types. For example, what would happen if a variable of type int were assigned to a variable of type double as shown below? + int number; + double result; + number = 5; + result = number; + As before, the contents of the memory locations after the assignment of 5 to number would be as follows: + Then, when the next assignment statement is executed, the int value of 5 would be copied, converted to a double value of 5.0, and assigned to result as follows: + Would the value in number be converted to a 5.0? The answer is no, as shown above, because only the variable to the left of the assignment symbol is altered by an assignment statement. The 5 in number is not converted, but rather when it is copied, it is converted to the proper type so that it can be assigned to result. + If an int value can be stored in a variable of type double, is the reverse true? The answer is no, because, for example, how could the number 5.7 be stored as an integer without the fractional part? A way around this problem is to use a typecast operator. A typecast operator allows a value of one type to be converted to another type. In the case below, the typecast operator (int) converts the double value in number to type int so it can be assigned to result. As before, the value in number would not change and would still contain a 5.7. However, what happens to the fractional part? The result is that it is truncated and a 5 is stored in result: + double number; + int result; + number = 5.7; + result = (int) number; + What if the value needed to be rounded instead? Fortunately, Java has the Math class, which contains a method named round. A method is somewhat like a function in mathematics. The name of the class, Math, is followed by a period and the name of the method, round. Parentheses are placed after the method name and contain the argument, number, which is sent to the method.
The code segment from above is rewritten below: + +double number; + +int result; + +number = 5.7; + +result = (int) Math.round(number); + +Unfortunately, when the round method is sent a value of type double, it returns a value of type long, but the typecast operator (int) can again be used to convert the value of type long to type int. Since number contains 5.7, the variable result would contain a 6. Again, the value in number would not change and would still contain a 5.7. Of course if the precision of the type double is needed, the better solution would be to change the type of result to double to preserve the fractional part of number. The round method is one of the many methods available in the Math class which is discussed in more detail in Sect. 1.7 on arithmetic statements. + +## 1.5 Output + +Unless a program performs some type of output, it is not particularly useful. Output can be of many forms including output to a screen, a printer, a disk, or even some form of movement such as a robot on an assembly line. In this section, only output to a screen will be considered. Although there are several ways to output data to the screen, this section will examine the simplest of them to get started. More advanced methods of output will be examined in Chap.​ 10 and Appendix A, and one can jump to these locations and learn these methods if one is reading this text independently or at the discretion of one's instructor. However, this text will use the methods introduced in this chapter for the sake of simplicity. + +One of the more common first programs written when learning a new language is the infamous "Hello World!" program. The advantage of this program is to make sure that one is writing a program correctly and using the compiler properly. This program can be written as shown in Fig. 1.8. + +Fig. 1.8 + +Hello World! + +This program looks very similar to the original Skeleton program in Sect. 1.2, except that the class name has been changed from Skeleton to Output and the comment line has been replaced with the System.out.println("Hello World!"); statement. This statement outputs the string contained within the double quotation marks to the monitor. Java uses System.out to refer to console output and the standard output device by default is the monitor. To perform console output, one simply uses the println method to display a primitive value or a string to the monitor. The println method is part of the Java Application Programming Interface (API) which is a predefined set of classes that can be used in any Java program. The classes and methods in the Java API provide a variety of fundamental services that are not part of the language itself. + +The method name println is often pronounced as "print line," even though it is not spelled that way. The print portion of println causes the information in the parentheses to be output to the computer screen, and then the ln portion of println causes the cursor on the screen to move down to the next line. In this case, the only information in the parentheses is the string "Hello World!". Of course, the statement is terminated with a semicolon just as the declaration statements and assignment statements were in Sects. 1.3 and 1.4, respectively. Go ahead and try typing in this program on your computer using the IDE (Integrated Development Environment) installed in your lab, home computer, or place of employment and then compile and execute the program. 
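Based on the description above, the program in Fig. 1.8 would look essentially as follows (a sketch using the class name Output mentioned in the text): + class Output { + public static void main(String[] args) { + System.out.println("Hello World!"); + } + }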
Provided there are no syntax errors, the output should appear similar to the following, where the underscore represents the location of the cursor on the screen: + Hello World! + _ + Notice that the quotation marks are not output to the screen and the cursor appears on the next line. Also note that the cursor might not appear on the screen, since there is no input as of yet, but in this example, it still serves to illustrate where any subsequent output would appear. However, what would happen should one leave off the ln portion of the println, as shown below? + System.out.print("Hello World!"); + Given the previous description concerning the println above, the output would be as follows: + Hello World!_ + At first glance, this does not appear to be much different from the original sample output. However, if one looks carefully, note the location of the cursor. It is not on the second line but rather at the end of the string. The statement outputs the string to the screen, but with the absence of the ln, the cursor does not move down to the next line. In fact, if the cursor does not show up on the screen, one might not notice the difference. Even though it might not be detected on the screen, it is important to know where the cursor is located, so that subsequent output is correct. For example, what if one split the string so that it appears on two separate lines? This can be accomplished by using two separate System.out.println statements as follows: + System.out.println("Hello"); + System.out.println("World!"); + As one might suspect, the output would appear as follows: + Hello + World! + _ + The string "Hello" is output and the cursor moves down to the next line. Then, the string "World!" is output, and again the cursor moves down to the next line in preparation for the subsequent line to be output. However, what if one accidentally used two separate System.out.print statements instead? + System.out.print("Hello"); + System.out.print("World!"); + The output would appear as given below: + HelloWorld!_ + Note that this output appears similar to using a single System.out.print statement as shown previously. Why are they similar? After the first System.out.print statement output the word Hello, the cursor stayed on the same line and did not move to the second line. So when the second System.out.print was executed, the word World! was output on the same line, and since there was no ln in the second statement, the cursor stayed on the same line. One might also notice that there is no space between the two words. Why did this happen? Since there is no space at the end of the first string within the double quotes, nor a space at the beginning of the second string, no space appeared in the output. + Although this is similar to the example using the single System.out.print, could it be changed to mimic the first example in this section? The answer is yes, as in the following example: + System.out.print("Hello "); + System.out.print("World!"); + System.out.println(); + In this case, the word Hello followed by a space would be output, and then the word World! would be output. The last line would output nothing, because there is no string in the parentheses, but the ln would cause the cursor to move down to the next line as shown below: + Hello World! + _ + Although the above three-line code segment produces the same output as the original single-line statement, why would one want to use this latter example? Usually one would not, and the single line is preferable to using multiple lines.
However, there are instances where one needs to break up an output line into multiple lines for the sake of convenience as will be illustrated in the next section on input and in Chap.​ 3 on selection statements. + +As a further example of formatting output, what if one wanted to output the following with a blank line between the two words and the cursor at the bottom? + +Hello + +World! + +_ + +The following code segment would accomplish this task: + +System.out.println("Hello"); + +System.out.println(); + +System.out.println("World!"); + +The first statement outputs the word Hello and moves the cursor down to the second line. The second statement does not output anything, so the ln of the System.out.println statement causes the cursor to move down to the third line and the blank line to appear on output. Lastly, the word World! is output and the cursor moves down to the fourth line. What if one wanted to output two blank lines, would the following code segment work? + +System.out.print("Hello"); + +System.out.println(); + +System.out.println(); + +System.out.println("World!"); + +At first glance, it might appear to work, but look carefully. Notice that the first statement does not contain a println but rather only a print. The result would be exactly the same as the previous code segment since the first statement outputs the word Hello, but does not move the cursor down to the next line on the screen. The second statement is a System.out.println, and it moves the cursor down from the first line to the second line of output. The second System.out.println creates a single blank line. + +Unfortunately, this is a mistake that is sometimes made by beginning Java programmers, where they assume that anytime there is a System.out.println(); a blank line is produced. The only time a blank line is produced is when there is not a preceding System.out.print statement. This is yet another reason why one should tend to avoid using the System.out.print statement unless under special circumstances, again discussed in the next section and Chap.​ 3. The correct code segment to produce two blank lines is given below. Note that the first statement is a System.out.println: + +System.out.println("Hello"); + +System.out.println(); + +System.out.println(); + +System.out.println("World!"); + +Although the above code segments are useful for outputting strings and formatting output, how does one output integers and real numbers? Combining the information learned in the previous two sections, one can then have a program as shown in Fig. 1.9. + +Fig. 1.9 + +Outputting an int precision number + +This program declares the variable num to be of type int, assigns the value 5 to num, and then outputs the contents of the variable num. Note that the variable num is not enclosed in quotation marks, so the word num is not output, but rather the contents of the variable num are output. Unfortunately, only the integer 5 would be output to the screen which would not be very useful. Instead, it is helpful to output some other information for the user to identify and understand the information on the screen. + +The output statement in the program in Fig. 1.9 can be modified to include the string "The number is " followed by a plus sign prior to the variable num as shown in Fig. 1.10. A plus sign between two strings or between a string and any other type of data means concatenation. In other words, the string "The number is " and the contents of num are output as if they are one string. 
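Combining the pieces so far, the essential statements in the body of the program in Fig. 1.10 would be along the following lines (a sketch assuming the variable name num used in the text): + int num; + num = 5; + System.out.println("The number is " + num);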
It should be noted that one needs to be careful when only two integers are separated by a plus sign, because then it means addition, as will be discussed in Sect. 1.7. However, provided a string or a concatenated string appears to the left, the item to the right of the plus sign will be concatenated instead of added. Note that there is a space within the quotes at the end of the string so that the contents of the variable num are separated from the word is in the string. The result is that the output of this program would appear as follows: + The number is 5 + _ + Fig. 1.10 + Outputting an int precision number with description of output + What happens if one outputs a number of type double using the same format shown in Fig. 1.10? For example, Fig. 1.11 outputs the contents of the variable num of type double. + Fig. 1.11 + Outputting a double precision number without formatting + As will be discussed further in Sect. 1.7, the / means division, and num will take on the value of one third. When the above program is compiled and executed, the screen displays + The number is 0.3333333333333333 + Although using high precision is necessary during computation, it may not be needed when a number of type double is displayed. How can one limit the number of digits after the decimal point in a floating-point number? A predefined method in the Java API called printf can be used. The general syntax of the printf method is as follows: + printf(control string, expr, expr,...) + where control string is a string that may consist of substrings and format specifiers, and an expr represents a variable, expression, or constant value. A format specifier indicates how an expr should be displayed. The specifier %d is used for a decimal integer, %f for a floating-point number, %c for a character, and %s for a string. For numbers, the total width and precision can be indicated in a specifier. For example, the specifier %10d outputs an integer value with a width of at least 10. The specifier %10.2f outputs a floating-point number with a width of at least 10, including the decimal point, and with two digits after the decimal point. The width of character and string values can also be indicated. For example, the specifier %3c outputs a single character and adds two spaces before it, and %10s outputs a string with a width of at least 10 characters. If there is more than one expr to be output, they must match the specifiers within the control string in order, number, and type. Using the formatting information described above, the program in Fig. 1.11 can be rewritten as shown in Fig. 1.12. + Fig. 1.12 + Formatting a double precision number + The floating-point number stored in the variable num will be output with two digits after the decimal point. Since a space is included before the specifier in the string after the word is, there will be a space between is and the number as shown below: + The number is 0.33 + Also notice that the printf method does not move the cursor to the next line, just like the print method, so a System.out.println(); statement needs to be added at the end of the program in order to have the same effect as the program in Fig. 1.11. + Some characters cannot simply be included between double quotes for output. In order to output a double quotation mark, two characters, a backslash and a double quote, need to be used: \". The following statement + System.out.println("He said \"Hello\"."); + will output + He said "Hello". + Similarly, a backslash can be output by placing an extra backslash in front of one as shown below: + System.out.println("How to output a backslash, \\"); + This will produce an output of + How to output a backslash, \
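Tying the formatting ideas of this section together, a short sketch (with illustrative values, not taken from the text's figures) combines a width specifier, a precision specifier, and a trailing println: + int count; + double price; + count = 42; + price = 3.14159; + System.out.printf("count = %5d, price = %8.2f", count, price); + System.out.println(); + Here %5d right-justifies the integer in a field of five characters, and %8.2f rounds the value to two digits after the decimal point in a field of eight characters, so the output would be count =    42, price =     3.14 with the cursor then moved down to the next line.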
## 1.6 Input + The ability to declare variables, assign values to them, and output strings and variables is very important but does not allow for many useful programs. As it stands, anytime one wants to change the output of a program, one has to edit and recompile the program before executing it. What is needed is a way to input data into a program. As with output, input can come from a variety of sources such as the keyboard, the mouse, a disk, or even sensors such as those that might be on a robot on an assembly line. Although other methods for input can be found in Chap. 10 and Appendix A, this section will deal with the simplest form of input. + As in the last section, it is best to start with a simple example, based on the previous program in Fig. 1.10 and modified as shown in Fig. 1.13. Although the description of the first few lines of the following program might be a little complicated due to the nature of input in Java, the actual statements that perform the input are less complicated, as will be seen shortly. + Fig. 1.13 + Program to input an integer + Notice the addition of the import statement in the first line. The import statement is added in order to use a predefined method for input. All the predefined classes and methods in the Java API are organized into packages, and the import statement identifies those packages that will be used in a program. For example, the following statement imports the Scanner class of the java.util package: + import java.util.Scanner; + A second option uses an asterisk to indicate that any class inside the package might be used in the program. Thus, the statement + import java.util.*; + allows any of the classes in the java.util package to be referenced in the program. The second option is used in the program shown in Fig. 1.13. + Recall that when the System.out.println, System.out.print, and System.out.printf statements were used in the previous section for output, the java.lang package, which includes the System class, was not imported at the beginning of the program. This is because the java.lang package, which includes the System and Math classes, is used extensively, and it is automatically imported into all Java programs. + Returning to Fig. 1.13, in order for input to work properly, one needs a place to store the data entered. The first statement in the body of the main method declares the variable num as type int. The next statement is the declaration of the variable scanner of type Scanner as shown below: + Scanner scanner; + Scanner is not a primitive data type like int or double, but rather it is a class. As discussed briefly at the beginning of Sect. 1.2, and as will be discussed further in Chap. 2, a class is like a set of blueprints for a building. The following statement + scanner = new Scanner(System.in); + creates a new instance of the Scanner class, or in other words a Scanner object. This can be thought of as how an individual building might be constructed from a set of blueprints. Java uses System.in to refer to the standard input device, which is the keyboard. Unlike output, input is not directly supported in Java; however, the Scanner class can be used to create an object to get input from the keyboard.
The above statement then assigns a reference to the new object to the variable scanner. Again, although this might be a little confusing at this point, the important thing is to be sure to include the import statement and the above two statements in any program that needs to input data. + The next statement below shows how the Scanner object is used to scan the input for the next integer. The method nextInt will make the system wait until an integer is entered from the keyboard, and then the integer input is assigned to the variable num: + num = scanner.nextInt(); + The last statement in the program is the same as before, where the value of num is output to the computer screen. However, if one were to enter, compile, and run this program as given, the result might be a little confusing. The reason is that there would only be a blinking cursor on the screen as the system waits for input, and there would be no indication of what should be input without looking at the program. To solve this problem, it is usually best to provide a prompt to let the user know what should be input. A prompt is just a message output to the user to help them understand what is expected to be input. The program in Fig. 1.14 includes a prompt just prior to the input. + Fig. 1.14 + Prompting a user to input a number + As can be seen, the prompt is nothing more than the output of a string to indicate what the program is expecting in terms of input. Notice that a System.out.print statement is used so that the input appears on the same line as the prompt. Further, a prompt should be formatted well. Note that there is a space after the colon so that the cursor is separated from the prompt. After the data is entered and the user presses the enter key, the cursor moves to the next line. + Furthermore, a prompt should be user friendly. A user-friendly prompt is one that clearly describes what the user should input, as in the case above where it asks for an integer. A user-friendly prompt can be polite, such as "Please enter a number: ", but typically a prompt should avoid words like "I" and "you", as in "I would like you to...", since the computer is a machine, not a human. + Now would be a good time to enter, compile, and run the program in Fig. 1.14 to see how it works. The results should be similar to the following: + Enter an integer: 5 + The integer is 5 + _ + In addition to nextInt, the method nextDouble reads a number of type double, the method next reads a word of type String that ends prior to a space, and the method nextLine reads an entire line of text of type String, including all the spaces, until the user presses the enter or return key. All of these methods work similarly to the method nextInt.
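For reference, a complete version of the program in Fig. 1.14 can be sketched from the description above (the class name Input is assumed here for illustration): + import java.util.Scanner; + class Input { + public static void main(String[] args) { + int num; + Scanner scanner; + scanner = new Scanner(System.in); + System.out.print("Enter an integer: "); + num = scanner.nextInt(); + System.out.println("The integer is " + num); + } + }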
## 1.7 Arithmetic Statements + The ability to input data, copy data from one memory location to another, and output data is fundamental to almost every computer program. However, unless there is the capability to manipulate and process data to convert it into information that can be output and used, the power of the computer has hardly been tapped. One of the first things computers were used for, and continue to be used for, is arithmetic computation, which is the subject of this section. + The four basic operations of arithmetic, namely addition, subtraction, multiplication, and division, can be accomplished in Java by the use of the binary operators +, -, *, and /, respectively. The word binary in this case does not refer to the binary number system, but rather indicates that these operators have two operands (such as variables and constants) that are manipulated by the operators. As before, the best way to illustrate this is through an example. Consider the following code segment: + int num1, num2, sum; + num1 = 5; + num2 = 7; + sum = num1 + num2; + After the variables num1 and num2 have been assigned the values 5 and 7, respectively, the contents of the memory locations would appear as follows: + What occurs next is that the expression on the right side of the last assignment statement is evaluated. The contents of num1 are brought into the CPU, and then the contents of num2 are added to them in the CPU. Once the expression on the right side of the assignment symbol has been evaluated, the result of the expression in the CPU is then copied into the variable to the left of the assignment symbol. As in Sect. 1.4, the copying goes from right to left, so the expression is always on the right side of the equal sign and there can only be one variable on the left side. The results of this evaluation and assignment can be seen below: + Of course, the values for num1 and num2 in the above segment could have been input from the keyboard, and the result in sum could be output to the screen, but for now simple assignment statements are used to initialize num1 and num2, and the value of sum is not output, to keep the segment simple. The examples that follow will use this same pattern; however, a complete program using input and output will be shown in Sect. 1.10. + Similar statements can be written using subtraction, multiplication, and division, and examples incorporating these operators will follow later in this section. Still, a few comments need to be made about mixing variables of different types. As shown above, when two variables of the same type are used, the result is of that type. However, should one or both of these operands be of type double, then the result will also be of type double. For example, if num1 is of type int and num2 is of type double, then the result of the expression would be of type double. Of course, if the result of the expression is of type double, then it could not be assigned to the variable sum of type int. Either the round method would need to be used or the type of sum would need to be changed to double. + There is also a unique aspect to the division operation depending on the types of its operands. As with the other operators, if either or both of the operands are of type double, then the result of the division is also of type double. So, for example, 7.0 divided by 2 would be 3.5. If both operands are of type int, the result will of course be of type int. Although this does not pose a problem with the other arithmetic operators, the result of division in ordinary arithmetic often has a fractional component, and one would write it as 3½, 3.5, or possibly 3 with a remainder of 1. However, if the result of the division operation in Java is of type int, the fractional part is discarded and the result is simply 3.
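The difference between integer division and floating-point division can be verified with two quick statements (a check added here for illustration): + System.out.println(7 / 2); // outputs 3 + System.out.println(7.0 / 2); // outputs 3.5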
Although one does not get the fractional part with integer division, what if one wanted to determine the remainder? That can be done with the mod operator, which is represented by the percent sign, %. To illustrate, consider the following code segment, where all variables are of type int: + int num1, num2, quotient, remainder; + num1 = 7; + num2 = 2; + quotient = num1 / num2; + remainder = num1 % num2; + Upon completion of the segment, the respective memory locations would contain the following: + Although it is relatively easy to create some simple instructions that contain only one operator, what about expressions with more than one operator? In that case, an awareness of the precedence of the various operators is needed. The precedence in Java is the same as in mathematics, on a calculator, or in a spreadsheet application program. First, the multiplication and division operators have precedence over addition and subtraction. For example, given the following code segment, what are the contents of answer? + int answer, x, y, z; + x = 2; + y = 3; + z = 4; + answer = x + y * z; + Unfortunately, if one guessed 20, that would be wrong. Remember that multiplication has precedence over addition, so the result of the multiplication of y and z, which contain 3 and 4, would be 12, and adding the contents of x, which is 2, gives 14. + However, what if one wanted to perform the addition first? As in arithmetic, one can always use parentheses to override the precedence of the operators, so that + answer = (x + y) * z; + would result in answer containing a 20. If there is more than one set of parentheses, then the innermost nested ones are evaluated first, and if the parentheses are not nested, the parentheses are evaluated from left to right. In fact, if there is a tie of any sort, such as two addition symbols, or an addition symbol and a subtraction symbol, the order is also from left to right. + Given all this information, what would be the answers in the following segment? + int answer1, answer2, x, y, z; + x = 3; + y = 4; + z = 5; + answer1 = x - y + 6 / z; + answer2 = (x * (y + 2)) % 2 - 1; + First, note that there are some constants in the mathematical expressions on the right side of the assignment statements, and this is perfectly acceptable. In the first expression, the 6 / z is evaluated first, and the result would be 1. After that, which operation is performed second? Since there is a tie in precedence between the subtraction and the addition, and the subtraction is on the left, it is performed first, where 3 minus 4 is -1. Lastly, the 1 from the division is added to the -1 from the subtraction, so the answer is 0. + In the second expression, which operation is performed first? Since there are nested parentheses, the y + 2 is performed first, with an answer of 6. Then the 3 in x is multiplied by the 6 for a value of 18. Then the 18 is divided by 2, where the remainder is 0, and lastly the 1 is subtracted from the 0 for a final answer of -1. + When trying to evaluate expressions, it is sometimes helpful to draw a line underneath each of the sub-expressions to help one remember which parts of the expression have been evaluated and what their respective values are. For example, in the first expression above, it would appear as follows: + Since parentheses override the order of precedence, why can't one just use parentheses all of the time and avoid having to remember the order of precedence? One could do that, but the resulting expressions would have an inordinate number of parentheses, and they could be quite difficult to read. Further, since the precedence rules in most languages are fairly similar and most programmers use parentheses sparingly, it is to one's advantage to learn and use them correctly. For further practice, see the exercises at the end of this chapter.
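As a quick check of the evaluations above, the two expressions can also be printed directly (a small test snippet added for illustration, not one of the text's figures): + int x, y, z; + x = 3; + y = 4; + z = 5; + System.out.println(x - y + 6 / z); // outputs 0 + System.out.println((x * (y + 2)) % 2 - 1); // outputs -1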
Just as there are binary operators that have two operands, there also exist unary operators that have only one operand. The two most common are the plus sign and the minus sign, where the latter is used more frequently, as in the following example: + z = -x + y; + The thing to remember about unary operators is that they have a higher priority than binary operators. So in the above statement, the negative of the value contained in x is added to the value in y, and the result is placed in the variable z. Should one want to negate the entire quantity, then parentheses would need to be used, as in the following example, where the values in x and y are added together first, then negated, and the result placed in z. + z = -(x + y); + There are of course other arithmetic expressions to be learned, including how the contents of a variable can be incremented or decremented by 1 or more. There are a couple of ways to do this, and the method that is applicable in most programming languages will be examined first. One way is to first get the contents of a variable, add or subtract 1, and then copy the new number back to the variable as follows: + int x, y; + x = 0; + y = 0; + x = x + 1; + y = y - 1; + At first the fourth and fifth statements above might appear unusual to the beginning programmer. The fourth statement seems to be saying that x is equal to x + 1, which would be impossible in algebra. How could a value in x be equal to itself plus 1? The answer is that it cannot. The reason why this might look unusual is that one might be mistaking the equal sign in Java for an equal sign in algebra, which it is not. If one recalls from Sect. 1.4, the equal sign in Java is the assignment symbol, which takes a copy of the result on the right side and places it in the variable on the left. + In this case, the value in x, which is a 0 as shown above, plus a 1 is 1, and that is the value placed into x. So prior to the execution of the fourth statement, the value in x is a 0, and after the execution of the fourth statement, the value in x is a 1. The same sort of process occurs with the statement using subtraction, where the final value in y would be a -1. Also note that since both variables appear on the right side of the assignment symbol, they must be initialized to some value and should not be indeterminate. At first these statements might be a little confusing, but with time they become second nature. Statements like these are often used to increment and decrement variables that are used as counters and will be discussed in detail in Chap. 4. + Since these operations are fairly commonplace, the languages C, C++, and Java have shortcuts for them as follows: + ++x; x++; + --y; y--; + These operators are very convenient. The operators on the left side work the same way as those on the right when they are used as standalone statements. The style on the right is seen more often and will be used again extensively in Chap. 4. However, when used as part of a larger expression, the two styles have entirely different meanings. For example, consider the following two statements: + a = ++x; b = y++; + If x and y originally contain a 2, their respective memory locations would initially appear as follows: + At first it might seem that all four variables would contain a 3, but that would be incorrect.
When the ++ (or --) appears prior to a variable, the increment is performed before the assignment or any other operation that might be in the expression. On the other hand, if the ++ (or again --) appears after the variable, then any other operations are performed first, including the assignment operation. The result is that in the example on the left, the value of x is incremented by 1, which makes x contain a 3, and then the new value of x is assigned to a, which then also contains a 3. In the example on the right, the value in the variable y, which is a 2, is first assigned to b. Then the value in y is incremented to 3, and the value in b is still a 2, as shown below: + As mentioned above, as standalone operators, the ++ and -- can be fairly useful and easy to use, and this text will use them more frequently in Chap. 4. However, the simpler initial approach, such as x = x + 1;, is common in almost all languages, so this text will tend to use it initially to help reinforce how an expression like this works. Further, when these operators are used in more complicated expressions, their use becomes much more difficult to understand, and it is for this reason that this text will tend to avoid the use of the ++ or -- operators in this fashion. However, be aware that intermediate and advanced texts often use these operators more frequently in complicated expressions, so one needs to know how they work and also be careful when reading code containing them.
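The difference can be confirmed with a short snippet (added here for illustration, using the variable names from the example above): + int x, y, a, b; + x = 2; + y = 2; + a = ++x; + b = y++; + System.out.println(a + " " + x); // outputs 3 3 + System.out.println(b + " " + y); // outputs 2 3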
As shown at the beginning of this section, when two variables are added together, the sum is often stored in a third variable. However, similar to counting, when a constant such as a 1 is added to a variable in the process of trying to find a total, one variable is added to another variable. For example, consider the following segment: + int total, num; + total = 0; + num = 5; + total = total + num; + where the initial contents of the respective memory locations would appear as follows: + As with incrementing by 1 previously, it might look a little odd to see the variable total on both sides of the equal sign. Again, the equal sign does not mean equality but assignment, where the expression on the right is evaluated first and the result is then stored in the variable on the left. Also, since the variable total appears on both sides of the assignment symbol, it needs to be initialized with a value prior to the statement. After the 0 and 5 are added together, the result is then placed back into total as follows: + Just as the increment operation has a shortcut, the ability to find a total also has a shortcut. This shortcut is as follows and has the same effect as the instruction above: + total += num; + Similar shortcuts can also be used with the subtraction, multiplication, and division operators, but they are used less frequently than the one for addition. As with the previous shortcuts, these appear only in languages like C, C++, and Java and not in all languages. Likewise, since they do not appear in all languages and do not illustrate as readily how values can be totaled, this text will tend not to use these shortcuts as often. + Although all the basic arithmetic operations are available in the Java programming language, there are a number of other functions that would be helpful to have available. In addition to the constants PI and E for pi and e, respectively, many extra functions are provided in the Math class. Along with the round method previously introduced in Sect. 1.4, some of the other methods include square root, the power function, and the trigonometric functions. These methods, along with some others, are shown in Table 1.2. + Table 1.2 + Various methods in the Math class + Method | Function performed | Arguments | Value returned + ---|---|---|--- + cos(x) | cosine | double (in radians) | double + pow(x,y) | x to the power of y | double | double + round(x) | round | float (or double) | int (or long) + sin(x) | sine | double (in radians) | double + sqrt(x) | square root | double | double + tan(x) | tangent | double (in radians) | double + toDegrees(x) | convert radians to degrees | double | double + toRadians(x) | convert degrees to radians | double | double + To illustrate a few of these functions, examine the program segment in Fig. 1.15. + Fig. 1.15 + Sample Math class constants and methods + The methods should be fairly straightforward given their descriptive names and the reader's requisite mathematical background. After execution of the segment, the answers stored in the variables power, sqRoot, sine, and cosine would be 8.0, 2.0, 0.0, and -1.0, respectively. Note that the value in z is in terms of PI, because the trigonometric functions work with radians instead of degrees. If the initial value in z were in degrees, the method toRadians could be used. + ## 1.8 Comments + Although comments were discussed briefly in Sect. 1.2, there are a few more items that should be discussed. As mentioned previously, a comment is either preceded by two slashes //, in which case the remainder of the line is considered a comment by the compiler, or begins with a slash and an asterisk /* and ends with an asterisk and a slash */, which allows a comment to extend over multiple lines in a program. Single-line comments are helpful in explaining an individual line or multiple lines of code. Although a single-line comment can be placed off to the right-hand side of the statement it is describing, the line can sometimes get crowded once code is indented, as shown in subsequent chapters. As a result, this text will usually place comments just prior to the line of code or code segment being documented. For example, the following comment helps the reader of the program understand what the subsequent statement accomplishes: + // calculate the area of a rectangle + areaRect = base * height; + Multiple-line comments are also helpful for creating what are called headings at the beginning of programs and methods in class definitions. The format of these headings can vary in different computer courses and companies, so be sure to determine the local requirements. An example of one such heading might be as follows: + /* name: your name + class : cs 1xx + prog : one + date : mm/dd/yyyy + */ + Once filled with the corresponding information, this heading identifies the author of the program, the class it was written for, the program number, and the date written. As can be seen, comments are good for documenting what various sections of code do in a program and for identifying who wrote a program, among other things. Having comments within a program explaining what the program does is known as internal documentation, whereas having explanations that appear in manuals (whether online or in physical form) is known as external documentation. Internal documentation tends to be more specific and is helpful to programmers, whereas external documentation tends to be more general and is useful to users, customers, and managers who may not understand programming.
Although at first some of the simpler programs will not appear to need comments, it becomes imperative to include comments as programs become larger and more complex. If the original programmer is on vacation or is no longer with the company, documentation is essential to help other programmers understand how the program works. Although many of the programs written in a first programming course might not be too complex, it is helpful to include comments to gain practice in good commenting techniques. To that end, the complete programs at the end of each chapter will include comments to help the reader understand the program and learn some commenting techniques. + There is also another way to document a program, using Javadoc. This technique is very useful with larger programs that have many classes and methods, and an introduction is presented in Appendix C. Again, many computer science departments and computer science professors have different documentation standards, as do many different companies. Although they share some commonalities, there can also be a number of differences. Find out what your professor's or company's standards are and be sure to follow them closely. + ## 1.9 Program Design + When writing a program for the first time, there is a tendency to want to just start keying the program into the computer and get it to work. Initially this method appears to work fairly well with the small programs at the beginning of a text and of a course. As mentioned previously, many beginning programmers focus primarily on the syntax of their program, and they want to avoid getting syntax errors. However, as problems get more complex, they become more difficult to solve, and programs written this way will tend to have not only more syntax errors but also complicated logic errors, which are more difficult to correct since no error messages are provided. + As an analogy, an individual might be able to build a small storage shed by just sawing and nailing some lumber together without worrying about the overall design of the project. However, with a larger project such as a house, an apartment building, or an office building, that methodology would not be sufficient. Instead there are many other people who must be consulted, including the original customer who wants the building built, the architects who work with the customer, the contractors, and the carpenters. The same holds true in the world of programming, which involves customers, users, and managers. + What are needed are various strategies and tools to help write programs correctly. Just as blueprints and plans are used by the architect in the above example, there are techniques that can be used by analysts, software engineers, and programmers. Although the complete process for developing software might not be needed initially with smaller programs, it does not hurt to practice the various techniques on smaller programs to gain familiarity, so that when one advances to more difficult projects, one is comfortable with many of the techniques. Although the following techniques are used primarily with non-object-oriented programs, they can be augmented with the object-oriented design techniques introduced in the next chapter and used in larger programs. + There are many different methodologies, and a number of stages within the various methodologies, for solving problems that can be found in different texts, but upon closer examination they are all rather similar.
They tend to include at least four stages, and they are usually comparable to the following: + 1. + Analysis + 2. + Design + 3. + Implementation + 4. + Maintenance + The analysis stage is where the needs of the user or customer are first determined. Questions concerning the form and quantity of the input, the type of processing that needs to be done, the storage requirements of the data, and the type of output needed are asked and clarified at this stage. This would be similar to a customer in a construction project trying to determine what type of building should be built. In a first-semester programming class, this stage may or may not be included. Sometimes a professor might have already completed the analysis stage and included what is needed in the programming assignment. However, at other times, a professor might require this stage, and the student will need to ask a number of questions. This might be especially true when working on a team project in a senior capstone course. + The design stage is where a project begins to take shape. It is similar to the architect creating a set of blueprints and models for the user to examine, because changes are much easier to make on paper or with the model than once the construction of the building has started. Various tools such as UML diagrams (discussed in the next chapter) and pseudocode (discussed later in this section) are used by analysts, software engineers, and programmers to help design the program. Again, it is much easier to make changes during the design phase than once the programming has begun. + The implementation stage is where the code is actually written and compiled, and errors are corrected. Once the code is free of syntax errors, it is thoroughly tested. This includes testing the various components of the program to be sure each section is working properly. If not, then the code needs to be debugged to correct any logic errors. In addition to the various components, the entire program needs to be tested to ensure that all the components work together as planned. Sometimes errors are a result of not following the design, whereas at other times it is not necessarily the code but rather the design itself that has the error, in which case one has to go back and correct the error in the design. The result is that the stages above are not steps that need to be rigorously adhered to in order, but rather one may need to return to a previous stage for clarification or to fix a possible error. + Although it is tempting to jump directly to the implementation stage, this tendency should be avoided. It is important to take the time to properly design the algorithm first before starting to key in a program. An algorithm is a step-by-step sequence of instructions, not necessarily implemented on a computer. Once an algorithm is implemented in a specific language, it is then a program. By taking the time to design a well-thought-out algorithm, there will be fewer logic errors in the program. Although it might seem to take longer to include the design stage, the savings will be more than made up for by less time spent debugging logic errors later. + The maintenance stage is where all the modifications and updates take place. In an industrial-strength program, more time is spent in the maintenance phase than in all three of the preceding stages. This is because once a program is up and running, there can be numerous changes that need to be made over the lifetime of a program.
This is another reason why a program should be designed well, in order to facilitate modifications later in the life of the program. Unfortunately, beginning programmers do not often experience this stage of a program, because once the concepts are learned from one programming assignment, the program is often not used again, and another program is assigned to introduce the next set of concepts. However, in some upper-level courses, the assignments get longer, existing programs might be modified and reused, and students get to have some experience with the maintenance stage of programs. Regardless, it helps even beginning students to design well-thought-out programs to gain practice, in the event that a professor decides it might be easier to modify an existing program rather than design a new program from scratch, as is done in the real world. + One technique that can help during the design stage is the use of pseudocode. Pseudocode is a combination of English and a programming language. Since it is not really a programming language, this is the reason for its name, "pseudo" code. The advantage of using pseudocode is that one can concentrate on the logic of an algorithm and not worry about the syntax of a particular programming language. In fact, well-written pseudocode should be understood by any programmer regardless of the programming language that they use, and they should be able to convert the pseudocode into their particular programming language. However, there can be many different versions and levels of detail included in pseudocode, so it is best to check with one's instructor or company whether there are any preferences or standards that are employed. In this text, when pseudocode is used, it will be written with as much detail as possible so as not to be ambiguous and to help with the translation into Java. As a simple example, consider the following pseudocode on the left and the corresponding Java statement on the right (using the rectangle calculation from Sect. 1.8 for illustration): + areaRect ← base × height          areaRect = base * height; + Note first that an arrow is used instead of an equal sign to indicate an assignment statement. This helps illustrate the direction of assignment, since some languages use symbols other than an equal sign to illustrate assignment. Also notice that a mathematical symbol is used instead of an asterisk to illustrate multiplication. Lastly, a semicolon is not used, since not all other languages use them to terminate statements. The result is that the pseudocode is more generic and helps in the translation to other languages, not just the Java programming language. Again, this is just one sample of pseudocode, so be sure to check your local guidelines and requirements. + Even when all attempts to write a logically correct program are followed, the possibility of logic errors still exists. When this occurs, a programmer should not start to randomly alter code in the hope that the error might be fixed. Although this might work occasionally with smaller programs, it rarely works as programs become larger and more complex. Instead, one should look for patterns in the output in an attempt to isolate the problem. Further, one needs to carefully check the program by walking through the code to ensure that it is doing what was originally intended. To assist in this process, many IDEs include debuggers that can trace the contents of various memory locations to help locate a logic error. However, do not rely on the debugger alone to help correct the problem, but rather use it as a tool to assist in tracing the logic of the program.
Note first that an arrow is used instead of an equal sign to indicate an assignment statement. This helps illustrate the direction of the assignment, since some languages use symbols other than an equal sign for assignment. Also notice that a mathematical symbol is used instead of an asterisk to illustrate multiplication. Lastly, a semicolon is not used, since not all languages use them to terminate statements. The result is that the pseudocode is more generic and helps in the translation to other languages, not just the Java programming language. Again, this is just one sample of pseudocode, so be sure to check your local guidelines and requirements.

Even when all attempts to write a logically correct program are followed, the possibility of logic errors still exists. When this occurs, a programmer should not start to randomly alter code in the hope that the error might be fixed. Although this might work occasionally with smaller programs, it rarely works as programs become larger and more complex. Instead, one should look for patterns in the output in an attempt to isolate the problem. Further, one needs to carefully check the program by walking through the code to ensure that it is doing what was originally intended. To assist in this process, many IDEs include debuggers that can trace the contents of various memory locations to help locate a logic error. However, do not rely on the debugger alone to correct the problem, but rather use it as a tool to assist in tracing the logic of the program. If a debugger is not available, well-placed output statements at critical points in the program can help in the debugging process. In the end, it is the programmer reading the code carefully to see what the code is actually doing, rather than what one thinks it is doing, that will ultimately fix logic errors in a program.

## 1.10 Complete Program: Implementing a Simple Program

Combining all the material from Chap. 1, one can now write a simple program to prompt for and input various numbers, perform a wide variety of calculations, and output answers as needed. In this section, a program that calculates the two roots of a quadratic equation ax² + bx + c = 0 will be developed and implemented. As might be recalled from mathematics, the two roots are defined as follows:

x = (-b + √(b² - 4ac)) / (2a)

and

x = (-b - √(b² - 4ac)) / (2a)

Problem statement: Write a program to calculate the two roots of a quadratic equation. Assume that a ≠ 0 and that the relationship b² ≥ 4ac holds, so there will be real number solutions for x.

Once a problem statement has been given, the requirements can be determined by analyzing the problem. The program will:

* Prompt a user to enter values for a, b, and c

* Compute the two roots

* Display the two roots

During the design stage, pseudocode can be used to outline the program. At this point, one does not need to be concerned with the details of the implementation, such as the name of the class or the parameters in the main method; the pseudocode simply lists the steps that need to be taken to accomplish the task. The following is the pseudocode for a program calculating the two roots of a quadratic equation:

* declare a, b, c, root1, root2

* input (a)

* input (b)

* input (c)

* root1 ← (-b + √(b² - 4ac)) / (2a)

* root2 ← (-b - √(b² - 4ac)) / (2a)

* output (root1, root2)

Observe in the formulas for the roots that the expression under the square root sign is called the discriminant and is used in calculating both roots. Therefore, the square root of the discriminant can be calculated prior to the computation of root1 and root2, so that it does not need to be calculated twice. The augmented pseudocode is

* declare a, b, c, root1, root2, sqrtDiscr

* input (a)

* input (b)

* input (c)

* sqrtDiscr ← √(b² - 4ac)

* root1 ← (-b + sqrtDiscr) / (2a)

* root2 ← (-b - sqrtDiscr) / (2a)

* output (root1, root2)

After the design phase comes the implementation phase. Consider the following program that is derived from the pseudocode above:
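The program itself appeared as a figure that is not reproduced here. The following sketch is consistent with the pseudocode above and the sample input and output given below; the class name QuadraticRoots and the use of the Scanner class for input are assumptions:

import java.util.Scanner;

class QuadraticRoots {
    public static void main(String[] args) {
        double a, b, c, root1, root2, sqrtDiscr;
        Scanner scanner = new Scanner(System.in);
        // input section: prompt for and input the three coefficients
        System.out.print("Enter a: ");
        a = scanner.nextDouble();
        System.out.print("Enter b: ");
        b = scanner.nextDouble();
        System.out.print("Enter c: ");
        c = scanner.nextDouble();
        // calculate the square root of the discriminant once, then both roots
        sqrtDiscr = Math.sqrt(Math.pow(b, 2) - 4.0 * a * c);
        root1 = (-b + sqrtDiscr) / (2.0 * a);
        root2 = (-b - sqrtDiscr) / (2.0 * a);
        // output section: a blank line first, then the equation and its roots
        System.out.println();
        System.out.println("Two roots of the equation, "
                + a + "*x*x + " + b + "*x + " + c + " = 0, are");
        System.out.printf("%.2f and %.2f.%n", root1, root2);
    }
}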
Observe the formula for the discriminant used in calculating root1 and root2. The methods sqrt and pow are defined in the Math class and are used to calculate the square root of the discriminant and the number b raised to the power of 2. All the parentheses are necessary to obtain the answer, which is accurate to at least two decimal places. In the output section of the program, println is called at the beginning in order to have a blank line between the input and the output. The specifiers for root1 and root2 do not include the width, to avoid any extra space before the roots are output, since an extra space is already included in the string. Given the above program, sample input and output are shown below:

Enter a: 2.0

Enter b: -5.0

Enter c: -3.0

Two roots of the equation, 2.0*x*x + -5.0*x + -3.0 = 0, are

3.00 and -0.50.

## 1.11 Summary

* Machine language and assembly language are low-level languages, where the former uses ones and zeros and the latter uses mnemonics. High-level languages are more English-like, where C, C++, and Java are examples of high-level languages.

* Compilers convert the entire high-level language program into machine language before executing the machine language program, whereas interpreters convert a high-level language program one instruction at a time and then execute only the corresponding machine language instructions before converting the next high-level instruction.

* Java is a hybrid system, where the Java instructions are converted into an intermediate language called bytecode using a compiler, and then the bytecode is converted into machine language using an interpreter.

* System.out.print leaves the cursor on the same line, whereas System.out.println moves the cursor to the next line.

* Just because there are no arguments in a System.out.println, it does not mean a blank line is output. A blank line is output with a System.out.println only when there are no preceding System.out.print statements.

* Remember that multiplication and division have a higher precedence than addition and subtraction and that unary operators have an even higher precedence.

* Parentheses can override any operator precedence, where the innermost nested parentheses have the highest precedence. It is also good practice not to use unnecessary parentheses.

* Whenever there is a tie at any level of precedence, the operators or parentheses are evaluated from left to right.

* The ++ and −− operators are an easy shortcut when used as standalone statements. However, great care must be taken when they are used in assignment statements or with other operators. In that case, if the ++ or −− precedes a variable, it is performed first, but if it appears after the operand, it is performed last.

## 1.12 Exercises (Items Marked with an * Have Solutions in Appendix E)

1. Indicate whether the following statements are syntactically correct or incorrect. If incorrect, indicate what is wrong with the statement:

A. integer num1, num2;

*B. double num3;

C. 7.8 = num3; (Assume that the variable num3 has been declared correctly.)

*D. int j;
j = 5.5;

2. Assume the following declaration and initialization of variables:

int i, j;
double d;
i = 1;
j = 5;
d = 2.34;

Determine the value for each of the following expressions, or explain why it is not a valid expression:

*A. i / j;

B. j + d;

C. Math.pow(j);

D. i - j * d

E. i + d * (j * 3 - 2) / 4

3. Assuming the following declaration and initialization of variables,

int i;
double d;
i = 3;
d = 2.34;

determine the value assigned to the variable in each of the following assignment statements, or explain why it is not a valid assignment statement:

A. i = d;

*B. d = i + d;

C. d = Math.pow(5, Math.sqrt(Math.pow(i, 2)));

4. Implement each of the following statements in the Java language:

A. Declare a variable weight of type double.

*B. Declare a constant EULER_NUMBER of type double and assign it the value 2.7182.

5. Given the following Java program, what will be output to the screen? Be sure to line everything up properly. Use an underscore to represent a blank and the words "blank line" to represent a blank line:

class OutputTest {
    public static void main(String[] args) {
        System.out.println("alpha ");
        System.out.println();
        System.out.print(" beta");
        System.out.println(" gamma");
    }
}

*6. Write code to output the following pattern:

** **
** **
****
****
****
****
** **
** **
*7. After the following statements are executed, what is stored in value1, value2, and value3?

int value1 = 5;
int value2 = 9;
int value3 = 4;
value1 = value2;
value2 = value3;
value3 = value1;

8. Write an equivalent Java assignment statement for each of these mathematical expressions.

A.

*B.

C.

9. Write a complete program to prompt for and input a number, and then compute 2 to the power of the number that was input. The form of the input and output can be found below, and as always be careful with the vertical and horizontal spacing.

* Input and Output:

* Enter the number: 4.0

* Two to the power of 4.0 is 16.0.

James T. Streib and Takako Soma, Guide to Java: A Concise Introduction to Programming, Undergraduate Topics in Computer Science, DOI 10.1007/978-1-4471-6317-6_2, © Springer-Verlag London 2014

# 2. Objects: An Introduction

James T. Streib and Takako Soma
Department of Computer Science, Illinois College, Jacksonville, IL, USA

Abstract

This chapter introduces classes and objects. Public and private data members along with value-returning methods and void methods with parameters are discussed. How objects are created and how methods are invoked are illustrated using contour diagrams. Contours help the reader gain a better understanding of object-oriented concepts by providing a visual representation of objects. Constructors are introduced along with multiple objects and classes. Lastly, UML (Unified Modeling Language) class diagrams are illustrated, and a complete program implementing a simple class and client program is provided.

## 2.1 Introduction

Having written a complete Java program in the preceding chapter, one should have a basic understanding of how a program works. However, as programs get larger, they can become very difficult to modify. It would be similar to trying to write a paper or book as just one long paragraph without any chapters, sections, or paragraphs. To help make a program easier to modify and maintain, it can be broken up into sections, much like a book is divided up into chapters. Further, if a section of a book needed to be referred to many times, instead of repeating that section over and over again, it could be placed in an appendix, and then the appendix could be referred to as necessary. Similarly, if a section of a program needs to be used again, the program can be broken up into subprograms. Instead of having to rewrite the code, a program can just call the same subprogram repeatedly, thus saving the time spent rewriting the code and saving memory as well.

However, what if the repeated code is only slightly different from the code that has been previously written? One could rewrite the code again with only slight modifications, but the chance of making mistakes would increase. There would also be time wasted rewriting existing code and memory wasted to store it.

Instead of the above scenario, the programming methodology called object-oriented programming (OOP) could be used. OOP allows programmers to identify the common memory locations and code and then create what is known as a class. Then, as variations of the class are needed, they can be made based on the original class. This allows for the reuse of the software that was initially created in the original class, and the new classes are just variations on the theme of the original class.
A class is essentially a definition of an object or group of objects. For example, in the real world, the drawings, plans, or blueprints for a house are a definition for a single house or a group of houses. Although blueprints could be drawn up for a single custom-built house, many times there might be a set of master blueprints for a group of houses. A subdivision could be built with houses that are all very similar but have various subtle differences so that they do not all look the same. For example, some houses might be built with different color siding, with windows in different locations, with one- or two-car garages, and so on. The reason for doing this is to keep the cost of the individual houses reasonable. Should a major change in the blueprint need to be made for all the houses, then only the master blueprints would need to be changed. However, if a change only needs to be made to some of the houses, such as only those houses that have fireplaces, then only the individual supplement that contains the plans for fireplaces would need to be changed. This idea is called inheritance and will be explored further in Chap. 9. However, before learning more about that topic, the fundamentals of object-oriented programming must be discussed first.

## 2.2 Classes and Objects

In object-oriented terminology, the master blueprint would be called the class definition, and an actual house would be an instance of that class, or what is known as an object, as shown in Fig. 2.1. This can be a source of confusion for some beginning programmers, who sometimes use the words class and object interchangeably. However, if one keeps the distinction between the plans or blueprints as the class and the individual houses as instances of the class, or the objects themselves, it makes the learning of object-oriented programming easier in the long run.

Fig. 2.1

Classes and objects using blueprints and houses

Although a class can be placed in the same file right before or after the class that contains the main program, it is often placed into a separate file. This eventually helps when there are a number of different classes and when there is more than one programmer working on a project. However, this text will show classes immediately after the main program in order to save space.

As with the initial skeleton of the main program in Chap. 1, the introduction of classes will also start with an empty class called Number as shown below:

class Number {
}

As can be seen, a class is somewhat similar to the main program except it is much simpler. As before, the word class is a reserved word, Number is the name of the class, and the opening and closing braces indicate the body of the class.

## 2.3 Public and Private Data Members

As before, an empty class is not very useful until code is added to it. Two of the most important items in a class definition are its data members and methods. A data member is similar to the declaration of a variable in the previous chapter. An important difference is that data members need to be declared using one of the access modifiers, public or private. A public data member is one that can be seen and used by an object of the class in which it was declared but can also be used outside the object, such as by the main program. A private data member is one that can only be seen or used within an object of the class and cannot be used externally, such as by the main program.
As shown below, the variable or data member x is declared as private, and the data members y and z are declared as public.

class Number {
    private int x;
    public int y, z;
}

At first, one might be tempted to declare all data members as public to allow for easy access from the calling program. However, this would contradict the reason one creates a class in the first place. One of the important aspects of OOP is data encapsulation. This means that the data in an instance of a class is encapsulated within the object and not directly accessible from the outside world. For example, in an automobile there are various parts which are inaccessible when one is driving, such as the fuel tank. However, through a gauge on the dashboard, one can tell whether there is fuel in the fuel tank. This is similar to public and private data members, where in many instances one does not want the main program having direct access to the data members. So although it is possible to declare data members as public, they will most often be declared as private.

If a data member is not directly accessible when it is declared as private, how does one gain access to it? The answer is through a method, specifically a public method, which can indirectly allow access to private data members. Although methods are sometimes declared as private, for now most of the methods will be declared as public. If a method just accesses and examines the contents of a data member, it is known as an accessor. Should a method alter a data member, it is known as a mutator. An accessor method is often used to get the contents of a data member, and a mutator is often used to set the contents of a data member. In particular, an accessor method is implemented as a value-returning method, and a mutator as a void method, as discussed in the next two sections.

## 2.4 Value-Returning Methods

First, consider a method that returns the contents of a private integer data member x as follows:

public int getX() {
    return x;
}

The word public means that the method can be accessed from the main program. If the data member is private, then the method invoked from the main program to access the data member is declared as public. (How the method is invoked will be discussed shortly.) The word int is the type of the value that will be returned to the main program. The name of the method is getX, and it is used in the main program to invoke the method. The area inside the opening and closing parentheses () is known as the parameter list and is used for sending information to the method. Since this method is an accessor and not a mutator, there is no information being sent to the method, so the parameter list is empty. The opening and closing braces {} indicate the body of the method that contains the instructions, just as in the main program. The return instruction followed by the variable x indicates what value will be returned to the main program. Although there can be more than one return statement in a method, it is good programming practice to include only one return statement, typically as the last statement in the method, as will be discussed later in Chap. 3. Returning to the automobile example, the getX accessor method is somewhat like the fuel gauge on the dash panel of a car that displays the amount of fuel in the fuel tank.
## 2.5 Void Methods and Parameters

As an example of a void method, consider the following:

public void setX(int a) {
    x = a;
}

As with the value-returning method, the void method is also public so it can be invoked from the main program. The word void indicates that the method will not return a value. Similarly, setX is the name of the method that will be used when invoking the method from the main program, as will be discussed in the next section.

Unlike the previous method, this method has a parameter (sometimes called a formal parameter) between the parentheses. Notice that it looks similar to a variable declaration, and in a sense, it is like a variable declaration with a type and a variable name. However, what is unique about a parameter is that it can accept a value from the calling program. This is accomplished through an invoking statement, where there is another variable or constant called an argument (sometimes called an actual parameter), and the value of the argument is passed to the parameter. This is not unlike how the value on the right side of an assignment symbol = is copied into the variable on the left side. This copying of a value from an argument to a parameter is known as pass-by-value; in other words, this type of parameter is known as a value parameter. A value parameter provides one-way communication from the main program to the method. Other programming languages have additional parameter passing mechanisms that provide two-way communication, but Java has only value parameters, which makes the task of learning parameters a little easier. A visual example of how this works will be demonstrated in the section on contour diagrams later in this chapter. Lastly, the only statement in the method is x = a; which is a simple assignment statement that takes a copy of the contents of the parameter a and copies it into the data member x, as discussed in Chap. 1.

A question that might be asked is where the data member x is declared, since it does not appear in either of the two methods. If a variable is used by only one of the two methods, it should be declared locally in that method, but if the value in the variable is needed in both methods, it should be declared as a data member in the class. If a variable is declared in a method, it is sometimes referred to as a local variable, since only that method has access to it. However, if a variable is declared as a data member, it is sometimes referred to as a global variable, since it is accessible by all the methods in the object. In this example, since the variable x is used by both methods, it is declared as a data member so that both methods have access to it. To illustrate a complete class using both the data member x and the two methods above, the class definition of Number is shown in Fig. 2.2.

Fig. 2.2

Number class
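Since the figure itself is not reproduced here, assembling the private data member with the two methods discussed above yields the following sketch of the class, with the mutator listed before the accessor in keeping with the convention described next:

class Number {
    private int x;            // data member: accessible by both methods

    // mutator: copies the value of the parameter into the data member
    public void setX(int a) {
        x = a;
    }

    // accessor: returns the current value of the data member
    public int getX() {
        return x;
    }
}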
Unlike the previous skeleton, the new class Number above contains only the private data member x. Also, the order of the methods is irrelevant. Sometimes the methods are put in alphabetical order, but this text will typically list the mutators first, followed by the accessors, and then order them alphabetically within each group. The use of comments and line spacing helps with the readability of the class, although they will sometimes be omitted to save space in this text.

## 2.6 Creating Objects and Invoking Methods

Given the discussion of classes and methods in the previous sections, how are instances of classes created and the methods invoked? The best way is to show an example of a complete main program. Using the skeleton program from Chap. 1 with the appropriate code added, consider the program in Fig. 2.3.

Fig. 2.3

Invoking program

Note that there are two variables named y and z declared as type int, but there is also a variable named num that is declared as type Number. Just as different variables can be declared as primitive data types, variables can also be declared as a type of a class. Similar to the primitive types, the contents of the class variables are initially indeterminate. In order to create a new instance of a class, in other words a new object, the new operator must be used, and then a reference to the new object is typically placed into a variable. The statement num = new Number(); performs these two tasks. First, a new object is created via the new Number() section of the statement. Then a reference to that new object is placed in the variable num through the assignment symbol =. It is important to remember that simply declaring a variable is not sufficient to create an object; rather, after the variable is declared, a new object must be created and then assigned to the variable. A shorter way of doing this is as follows:

Number num = new Number();

Although this technique might occasionally be used later in the text to save space, for now the two statements as shown below will be used to reinforce the concepts of variable declaration, object creation, and the assignment of references to variables.

Number num;
num = new Number();

This also reinforces the idea concerning the separate declaration and assignment of variables presented in Chap. 1. If one's instructor prefers using a single statement, or if one is reading this text independently and wants to use just one statement, then of course do so.

## 2.7 Contour Diagrams

As indicated in the preface, contour diagrams are a very useful mechanism to visualize how objects, data members, methods, and parameters work. By building a good working visual model of objects, there will be less of a chance of having misconceptions about how objects work. Building a solid foundation in the fundamental concepts also makes it easier to understand more complex ideas in the future.

The purpose of using contours is not only to show the data members, similar to the variables that were drawn in Chap. 1, but also to show the scope of where the data members are accessible. The scope of a local variable is the method where it is declared, and the scope of a data member is all of the methods in the object.

Although not required, it is also helpful to include the type of the variable in the contour to avoid confusion among the many different types of variables. In addition to the variables, contours can also show how parameters are represented in the methods. Lastly, contours show the dynamic or changing nature of a program as it executes.

As before, it is helpful to start with an example. The program from Fig. 2.3 is combined with the class from Fig. 2.2 to create Fig. 2.4, with each line numbered in a comment to the right for convenience in the description that follows.

Fig. 2.4

Invoking program and Number class
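Figure 2.4 is likewise not reproduced here; however, the line numbers cited in the walkthrough that follows allow it to be reconstructed approximately as shown below. The exact output statement on Line 9 is an assumption:

class Invoke {                                 // Line 1
    public static void main(String[] args) {  // Line 2
        int y, z;                              // Line 3
        Number num;                            // Line 4
        y = 5;                                 // Line 5
        num = new Number();                    // Line 6
        num.setX(y);                           // Line 7
        z = num.getX();                        // Line 8
        System.out.println(z);                 // Line 9
    }                                          // Line 10
}                                              // Line 11

class Number {                                 // Line 12
    private int x;                             // Line 13
    public void setX(int a) {                  // Line 14
        x = a;                                 // Line 15
    }                                          // Line 16
    public int getX() {                        // Line 17
        return x;                              // Line 18
    }                                          // Line 19
}                                              // Line 20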
The contour diagram in Fig. 2.5 shows the state of execution just prior to the execution of Line 5 in the main program.

Fig. 2.5

State of execution just prior to Line 5

The outer contour represents the class Invoke, and the inner contour around the boxes shows the scope of the variables in the main program. Although the contours do not indicate much presently, the use of the contours will become clear shortly. Further, note that although technically the Invoke contour should be drawn for each of the following figures, it is not very useful at this time and will not be drawn for the rest of this chapter in order to simplify the drawings. However, it will be reintroduced and discussed further in Chap. 5.

Continuing, the first column of boxes on the left indicates the names of the variables, and the boxes in the middle indicate the types of the variables, where y and z are of type int, and num is of type Number. Lastly, the boxes on the right indicate the current contents of the variables. Note that the state of execution is just prior to Line 5, not after its execution. Although data members of an object are automatically initialized by the system, local variables such as y and z are not; in either case, this text will continue to assume that the variables do not contain an initial value and are indeterminate, as discussed in Chap. 1.

Although rather simplistic here, once Line 5 is executed, the variable y contains the value 5. Figure 2.6 shows the state of execution just prior to the execution of Line 6 and also does not show the outer contour for the Invoke class.

Fig. 2.6

State of execution just prior to Line 6

However, when Line 6 is executed, things start to get interesting. Just as the Invoke contour was drawn in Fig. 2.5, when a new instance of the Number class is created, a new corresponding contour is also created. Although, as mentioned previously, the contour for Invoke is not very useful at this time, the contour for Number is necessary for the following discussion. Note that there is one data member in the class, and it is shown within the Number contour. Once the instance is created, a reference to the object is assigned to the variable num. This reference is illustrated as an arrow in the contour diagram, where the arrow points to the new contour and the tail of the arrow is placed in the variable num. Figure 2.7 shows the state of execution just prior to Line 7 in main.

Fig. 2.7

State of execution just prior to Line 7

The next line to be executed is Line 7, which invokes the method setX. Prior to having the flow of control go from Line 7 to Line 15 in the setX method, a number of things need to occur. Just as when a new object is created and a corresponding contour is drawn, the same holds true when a method is invoked. Since the method is part of the instance of the class Number, this is where the corresponding contour appears. A convenient way of remembering this is that whenever there is a dot in the invocation of a method, one needs to follow the reference or arrow to the corresponding contour. With the instruction num.setX(y); one just starts with the variable num, then follows the arrow to the Number contour, and then within the Number contour creates another contour for the setX method, as shown in Fig. 2.8, which illustrates the state of execution just prior to Line 15 in setX.

Fig. 2.8

State of execution just prior to Line 15

Note that the contour setX has a memory location associated with it for the parameter a. As mentioned in Sect. 2.5, a parameter is essentially a variable that takes on the value of the corresponding argument. Since the value contained in the variable y, which is used as an argument in the main program, is a 5, the corresponding parameter takes on a copy of that same value, similar to an assignment statement.
This also illustrates why parameters in Java are called value parameters, because they merely take on the value of the corresponding argument. Note that an argument and the corresponding parameter can have the same name or different names. In this example, the argument y and the parameter a have different names, illustrating that the two do not have to be the same. Then, when Line 15 is executed, Fig. 2.9 shows the state of execution just prior to Line 16 in setX.

Fig. 2.9

State of execution just prior to Line 16

Note that Line 15 is the assignment statement x = a; where the contents of the parameter a will be copied into the variable x. However, notice that the parameter a is inside the contour for setX and the variable x is in the contour for the object or instance of Number. Is it okay for the contents of a to be assigned to x? The answer is yes. The reason is that when executing a statement that contains a variable, the system first looks for the variable within the innermost contour. If it is found, it uses that variable or parameter. If it is not, then the system looks at the variables contained within the next most encompassing contour. If the variable is found there, it is used; however, if the variable is not found in any encompassing contour, then a syntax error will be generated at compile time. It is very important to note that although the system will look outward at any encompassing contour, it cannot look into another contour.

Another way of looking at this is to say that the scope of the variable a includes only the method setX, whereas the scope of the variable x includes both the object num and the method setX. The word scope is just a way of expressing in which objects and methods a variable is accessible. Problems can occur when there are two variables of the same name, and examples will be illustrated later in Chap. 5, but for now this text will use different variable names to avoid this difficulty.

Although Line 16 is not an instruction, it does represent the end of the method setX. When the method is done executing, control is transferred back to the main program. Since setX is a void method, control is transferred back to the line just after the one that invoked the method. The result is that Fig. 2.10 represents the state of execution just prior to Line 8 in the main program.

Fig. 2.10

State of execution just prior to Line 8

Note that the contour for the setX method is shaded light gray. The reason for this is to indicate that the contour is deallocated, where the memory locations associated with the method are no longer accessible. Although the contour can be, and often is, simply erased as shown in Fig. 2.11, it is sometimes helpful to show the contour as shaded prior to erasing it so that the contents of the memory locations can still be seen by others. Although shading a contour might be difficult when drawing a contour by hand, an alternative is to just very lightly cross it out while still allowing its contents to be seen.

Fig. 2.11

State of execution just prior to Line 8 (alternative)

So what happens when Line 8 is executed? In a manner similar to, but somewhat different from, the invocation of the void method setX, the value-returning method getX is invoked, and the state of execution just prior to Line 18 is shown in Fig. 2.12.

Fig. 2.12

State of execution just prior to Line 18

Note that there are no memory locations allocated in the contour for getX.
The reason for this is that there are no parameters in the parameter list, nor are there any local variables declared within the method, as will be discussed later. As a result, no memory locations are allocated within the contour. So what happens when the return x; statement is executed? Since there is no variable declared by the name x in the getX contour, the system looks outside the contour to find the variable x in the Number contour. The number 5 in the variable x is the value returned to the main program. Since this is a value-returning method, control does not return to the line after the one that invoked the method, but rather control is returned to the same line from which it was invoked, so that the value returned can be assigned to a variable or possibly output. When the return is executed, control is transferred back to Line 8, where the number 5 is assigned to the variable z in the main program.

Figure 2.13 shows the state of execution just prior to Line 9, with the contour for getX shaded as discussed previously. Alternatively, the contour for getX need not be shaded, or even drawn, as shown in Fig. 2.14.

Fig. 2.13

State of execution just prior to Line 9

Fig. 2.14

State of execution just prior to Line 9 (alternative)

Since Line 9 is just a print statement and does not contribute to the understanding of objects, the state of execution after Line 9 is not shown here. Although almost every contour was drawn to illustrate the intricate details in the preceding example, this will not always be the case. In the future, some of the more simplistic contours might be skipped, but should they be needed, they will be drawn in order to explain a particular concept, as in the next section on constructors.

## 2.8 Constructors

When a new object is created, it is sometimes nice to have the various private data members initialized to specific values. This is convenient and allows variables to have default values in case a programmer forgets to initialize them. The mechanism needed to accomplish this task is known as a constructor. A constructor is a special method that is automatically invoked once, at the time an object is created via the new instruction. It looks similar to other methods, but instead of having its own unique name as determined by the programmer, it has the same name as the class. Although this can be confusing at first, it helps to remember that when a new object of a class like Number is created, the method that serves as the constructor for the class has the same name, Number, and does not have a return type. Again, it is best to show an example. In this case the constructor initializes the data member x to the default value 0, again assuming that the initial value of variables is indeterminate, as discussed in Chap. 1.

public Number() {
    x = 0;
}

Including the above constructor, the previous class would look as shown in Fig. 2.15, where typically constructors are located after the data members but prior to all the other methods.

Fig. 2.15

The Number class with a constructor

Using the first 11 lines of the main program in Fig. 2.4 and replacing Lines 12 through 20 with the code from Fig. 2.15, the program in Fig. 2.16 is the revised version of Fig. 2.4 that now incorporates a constructor. Instead of walking through the entire program as was done in the last section, only the first few lines of the program will be executed to illustrate how a constructor works.

Fig. 2.16

Invoking program and Number class with a constructor
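The figure is not reproduced here; combining the first 11 lines of the sketch of Fig. 2.4 with the class from Fig. 2.15 as described above gives approximately the following program. As before, the output statement on Line 9 is an assumption:

class Invoke {                                 // Line 1
    public static void main(String[] args) {  // Line 2
        int y, z;                              // Line 3
        Number num;                            // Line 4
        y = 5;                                 // Line 5
        num = new Number();                    // Line 6
        num.setX(y);                           // Line 7
        z = num.getX();                        // Line 8
        System.out.println(z);                 // Line 9
    }                                          // Line 10
}                                              // Line 11

class Number {                                 // Line 12
    private int x;                             // Line 13
    public Number() {                          // Line 14: constructor
        x = 0;                                 // Line 15
    }                                          // Line 16
    public void setX(int a) {                  // Line 17
        x = a;                                 // Line 18
    }                                          // Line 19
    public int getX() {                        // Line 20
        return x;                              // Line 21
    }                                          // Line 22
}                                              // Line 23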
After executing Line 5, the contour in Fig. 2.17 shows the state of execution just prior to the execution of Line 6 in the main program. If the contour looks familiar, it is because it is the same contour that appeared previously in Fig. 2.6.

Fig. 2.17

State of execution just prior to Line 6

However, what happens when Line 6 is executed is different from the previous program. As before, a contour is created for an instance of the Number class, which contains the variable x. Recall from the discussion above that a constructor is automatically executed when a new instance of an object is created. As a result, a contour is also created for the constructor, as shown in Fig. 2.18, which shows the state of execution just prior to Line 15 in the constructor for the class Number.

Fig. 2.18

State of execution just prior to Line 15

Notice that the contour is empty, since there are no local variables or parameters, as was the case previously with the getX() method. Also note that there is no arrow pointing to the contour either. That is because while the constructor is executing, the reference to the object has not yet been assigned to the variable num.

After Line 15 is executed, the state of execution looks as shown in Fig. 2.19. Notice that the variable x has been initialized to 0. Since there is not a variable named x in the constructor, the system looks outside to find the variable x in the class Number, similar to the setX method as discussed previously. Once Line 16 is finished, the contour for the constructor is deallocated and shaded in gray. The flow of control then returns back to Line 6 in the main program, and the reference to the object is assigned to the variable num as shown in Fig. 2.20.

Fig. 2.19

State of execution just prior to Line 16

Fig. 2.20

State of execution just prior to Line 7

The program then continues to execute Line 7 just as it did previously, where the only difference is that the variable x has been initialized to the number 0 instead of being indeterminate. Although the initialization could have been accomplished by invoking the setX method with an argument of 0, the advantage of using a constructor is that a programmer does not need to explicitly invoke a method and does not run the risk of forgetting to do so, which under some circumstances might cause a logic error. Although this is a simple example, as programs become more complicated, the role of a constructor will become more important. When one begins to learn more about data structures in later courses, the role of the constructor as a mere initializer will diminish, and it will take on roles more befitting its name as a constructor. For now, it is good practice to use constructors when possible to gain more familiarity and become more comfortable with their use and function.

## 2.9 Multiple Objects and Classes

Is it possible to have more than one instance of a class, or more than one class? The answer is yes, and this section will address some of the issues with multiple objects and classes. For example, if one wanted to have two instances of the preceding Number class, the program could be written as in Fig. 2.21. In the interest of simplifying the contours, the number of variables has been reduced in this example. For instance, instead of using local variables as arguments as done in the previous section, constants are used as arguments in Lines 6 and 7. Also, note that the values returned from getX are not stored in variables, but rather simply output, as shown in Lines 8 and 9. Again, these shortcuts are not generally encouraged, but they do save some space in the contour diagrams and hopefully help the reader see the points currently under consideration more clearly.

Fig. 2.21

Program to create multiple instances of the same class
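Figure 2.21 is not reproduced here; based on the line numbers referenced in the discussion below, it can be reconstructed approximately as follows. The constant passed to num2 on Line 7 and the exact form of the println statements are assumptions:

class Invoke {                                 // Line 1
    public static void main(String[] args) {  // Line 2
        Number num1, num2;                     // Line 3
        num1 = new Number();                   // Line 4
        num2 = new Number();                   // Line 5
        num1.setX(5);                          // Line 6: constant argument
        num2.setX(10);                         // Line 7: constant assumed
        System.out.println(num1.getX());       // Line 8
        System.out.println(num2.getX());       // Line 9
    }                                          // Line 10
}                                              // Line 11

class Number {                                 // Line 12
    private int x;                             // Line 13
    public Number() {                          // Line 14
        x = 0;                                 // Line 15
    }                                          // Line 16
    public void setX(int a) {                  // Line 17
        x = a;                                 // Line 18
    }                                          // Line 19
    public int getX() {                        // Line 20
        return x;                              // Line 21
    }                                          // Line 22
}                                              // Line 23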
Notice that there are now two variables of type Number on Line 3. As before, it is helpful to use contour diagrams to assist in the understanding of the code. In this case, only the first part of the code will be executed, and the remainder of the code is left as an exercise at the end of the chapter. Figure 2.22 shows the state of execution after Line 5 but just prior to Line 6.

Fig. 2.22

State of execution after creating two instances, prior to Line 6

Note that after the constructor has been invoked twice, there are now two instances of the class Number. There are also two variables with the same name, x, but does this cause any problems during the execution of the program? The answer is no, because each variable x is in a different instance of the Number class, where one of the variables is in the object referenced by num1 and the other in the object referenced by num2. Upon completion of Line 6, Fig. 2.23 shows the state of execution after the execution of Line 18, but prior to the execution of Line 19, in the setX method.

Fig. 2.23

State of execution just prior to Line 19

As before, the contents of the parameter a have been placed in the data member x. However, is there any confusion as to where the setX method contour should appear? No, there is not: since the method call was num1.setX(5); the system knows to execute the setX method in the contour referenced by num1. As discussed previously in Sect. 2.7, an easy way of reading the code num1.setX(5); is to first go to the variable name in the contour, in this case num1, and when there is a dot after the variable name in the code, follow the corresponding reference or arrow to the appropriate contour. In other words, a dot in the line of code corresponds to a reference or arrow in the contour diagram. After following the reference to the corresponding contour diagram, the contour for the method setX is created. This also reinforces that it is very important to create the initial object contour and corresponding reference correctly when the new instruction is first executed, because all subsequent code depends upon it.

Although the creation of two instances of the same class is fairly straightforward, one must be careful when manipulating the two instances. For example, what if one wanted to take a copy of the integer 5 in the variable x in num1 and put it in the variable x in num2? At first it would seem to be a simple assignment operation from Chap. 1, for example, a = b; to copy an integer from the variable b into the variable a. However, when dealing with objects, the results might not be what one expects. For example, what if one wrote the code num2 = num1? The contents of num1 would be copied into num2, but remember, what exactly is in num1? It is not the integer 5, but rather a reference to the corresponding object that contains the integer 5. What is copied is not the integer 5 but rather the reference, so num2 would point to the same object as num1, and the object that num2 previously referenced would be deallocated, as shown in Fig. 2.24.

Fig. 2.24

Results of num2 = num1;
Given that the simple assignment statement above does not accomplish the intended task, how then could the integer 5 be copied from the x in num1 to the x in num2? Although another technique will be shown later in Chap. 5, for now a temporary variable temp could be used, and the contents of x in num1 could be retrieved using the method getX. Then the corresponding x in num2 could be set with the method setX, as shown in the following code segment:

int temp;
temp = num1.getX();
num2.setX(temp);

Alternatively, the temporary variable might not be used, and the call to getX could be used as an argument for the setX method, as shown in the following shortened segment:

num2.setX(num1.getX());

Here the getX method is invoked first, and then the result returned is used as an argument sent to the setX method. Although the above shortcut works well, for now this text will occasionally use a temporary variable to help make the code a little easier to read.

Just as it is possible to have multiple instances of a single class, it is also possible to have multiple instances of multiple classes. To elaborate further on the Number class and make it a little more interesting, suppose one class is defined with methods to calculate the area of a square, and another class has methods to define and calculate the area of a rectangle. Although it could be argued that a square is just a special case of a rectangle, for now they will be defined as two separate classes, and this will pave the way to help explain the concept of inheritance later in Chap. 9.

The class Square will need a method to set the length of the sides and another to calculate the area of the square. Although the method that calculates the area could also return the area (see Sect. 2.11 for the alternative technique), for now an accessor method will be used to return the area of the square, and all three methods are shown in Fig. 2.25.

Fig. 2.25

Square class

Note that instead of a single data member as in the previous example, there are now two private data members, one for the side and one for the area. Except for the different variable names, note that the constructor, setSide, and getArea methods are similar to the constructor, setX, and getX methods in the previous example. The only real difference is the inclusion of the calcArea method, which calculates the area of the square and is implemented as a void method.

The Rectangle class can be implemented similarly to the Square class. The major difference between these two classes is that with a rectangle, it is possible for the two sides to be of different lengths, so two variables are needed instead of one to represent the sides, in this case sideX and sideY, as shown in Fig. 2.26.

Fig. 2.26

Rectangle class

Notice the use of three variables in the bodies of the constructor and the calcArea method. Also, since the setSide method is modifying more than one side, the body of that method is also changed, but more importantly, the setSide method has two parameters instead of just one. Lastly, the getArea method remains unchanged.

Both classes can now be implemented and used with a main program, as illustrated in Fig. 2.27. As with the last program, and again not generally encouraged, in order to help save space in the contours, note that in Lines 7 and 8 constants are used as arguments, and in Lines 11 and 12 the get methods are located in the println statements.

Fig. 2.27

The main program along with the Square and Rectangle classes
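Figures 2.25, 2.26, and 2.27 are not reproduced here; the following sketch is consistent with the descriptions above. The class name Multiple comes from Exercise 5 at the end of the chapter, while the calls to calcArea in the main method, the parameter names, and the wording of the output are assumptions, and the line numbering of the original figure is not preserved exactly:

class Multiple {
    public static void main(String[] args) {
        Square square;
        Rectangle rect;
        square = new Square();
        rect = new Rectangle();
        square.setSide(2);        // constant argument (Line 7 in the original)
        rect.setSide(3, 4);       // constant arguments (Line 8 in the original)
        square.calcArea();
        rect.calcArea();
        System.out.println("The area of the square is " + square.getArea());
        System.out.println("The area of the rectangle is " + rect.getArea());
    }
}

class Square {
    private int side, area;

    public Square() {
        side = 0;
        area = 0;
    }

    public void setSide(int s) {
        side = s;
    }

    public void calcArea() {      // void method: computes but does not return the area
        area = side * side;
    }

    public int getArea() {        // accessor: returns the previously calculated area
        return area;
    }
}

class Rectangle {
    private int sideX, sideY, area;

    public Rectangle() {
        sideX = 0;
        sideY = 0;
        area = 0;
    }

    public void setSide(int x, int y) {   // two parameters, one per side
        sideX = x;
        sideY = y;
    }

    public void calcArea() {
        area = sideX * sideY;
    }

    public int getArea() {
        return area;
    }
}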
As before, in order to see the difference between instances of multiple classes, it is helpful to walk through the contour diagrams, at least part of the way. The contour in Fig. 2.28 illustrates the state of execution after Line 6 and before the execution of Line 7 in the main program.

Fig. 2.28

State of execution just prior to Line 7

Previously in Fig. 2.22, the two object contours were identical because they were two instances of the same class. However, here in Fig. 2.28 the two object contours are different because they are instances of different classes. After executing Line 7, Fig. 2.29 shows the state of execution just prior to Line 23 in the setSide method.

Fig. 2.29

State of execution prior to Line 23

Is there any confusion as to where the setSide method contour appears? No: since the method call was square.setSide(2); the system knows to execute the setSide method in the Square class, because square is of type Square. Although somewhat different, this is similar to the previous example in Fig. 2.23, where there were two variables of the same name, but in that example there were two instances of the same class. In this case, there are two methods of the same name, but they are in two different classes. As before, an easy way of reading the code and the contour diagram is to go to the variable name, in this case square, and when there is a dot after the variable name in the code, follow the corresponding reference or arrow to the appropriate contour and then create the method contour in the corresponding object contour.

After returning to Line 8 in the main program, the rect.setSide(3, 4); statement is executed, and control is transferred to Line 39 in the corresponding setSide method in the Rectangle class. Figure 2.30 then shows the state of execution just prior to Line 41.

Fig. 2.30

State of execution just prior to Line 41

Note that this time the setSide method contour appears in the Rectangle class contour, and there are two parameters instead of one. Later it will be seen that there can be several methods within a class with the same name; however, they can be distinguished by having a different number, type, or order of the types of parameters. This concept is called method overloading and will be discussed in detail in Chap. 5. In the current example, although there are two methods that have the same name, it is not a problem because the two methods are in different classes. As with the previous example, the completion of the contours is left as an exercise at the end of the chapter.

## 2.10 Unified Modeling Language (UML) Class Diagrams

Whereas contours are helpful in examining how a specific object works, when an application becomes larger and includes several classes, it is helpful to get a better picture of the relationships among the various classes using Unified Modeling Language (UML) diagrams. UML diagrams can help one see not only the relationships between classes but also the relationships among the objects of different classes. UML is a language specifying a graphical notation for describing software designs in an object-oriented style. It gives one an overall view of a complex system more effectively than a Java program, which may provide too much detail. Again, whereas contour diagrams are helpful when trying to understand the execution of a program, UML diagrams are helpful when trying to design a program.
The class definitions and objects discussed in the previous sections can be illustrated using UML class diagrams. Figure 2.31 shows how the Number class in Fig. 2.16 can be displayed using UML class diagram notation.

Fig. 2.31

UML class diagram of the Number class

In the UML class diagram, both data members and methods are included. A class is displayed as a box that includes three sections: the top section gives the class name, the middle section includes the data members for individual objects of the class, and the bottom section includes the methods that can be applied to objects. In this example, the middle section represents the data member x, and the type of the data member is specified by placing a colon : followed by the name of the type. The methods in the Number class include the constructor Number along with the two other methods, a mutator setX and an accessor getX. Methods are denoted in the following format:

methodName(parameterName: parameterType): returnType

Notice that if there is no information being sent to the method, the inside of the parentheses will be empty, and if the method does not return a value, the returnType will not be included. In Fig. 2.31, the type of the return value is specified after the colon, similar to the type of the data members. The parameter list (a: int) for the method setX indicates that information is sent to the method and that the value of a, which is of type int, is assigned to the data member. By having an empty parameter list in the parentheses, the getX method does not accept any information and returns a value of type int, which is the value stored in the data member x.

Similar to contour diagrams, but not as detailed, UML notation can also be used to illustrate objects graphically. In the main method of Fig. 2.16, an object named num is instantiated from the class Number. Then the value 5 is assigned to the data member of the object num through a mutator method. The UML notation for the object after Line 7 is executed is shown in Fig. 2.32.

Fig. 2.32

UML notation for object num of the Number class

In the diagram, the top section gives the object name followed by the class name after the colon, all of which is underlined. The bottom section lists the data members. In this example, the variable x contains the value 5.

## 2.11 Complete Program: Implementing a Simple Class and Client Program

Combining all the material from this chapter, one can now define a simple class and use an instance of the class in a client program. In this section, a program to calculate the area of a circle will be developed.

Problem Statement: Write a program to calculate the area of a circle.

Once a problem statement is given, the requirements can be established by analyzing the problem. The program will:

* Accept a radius from the user

* Compute the area of the circle using the given radius

* Display the area

Next, some further issues can be considered. Since the area of more than one circle may need to be calculated, a class describing a circle should be defined separately from the main program. In the definition of a circle, only the value of the radius, which is the main characteristic of a circle, should be kept. In some circumstances where a calculation is very complex, it might be better to calculate the result just once and invoke a method to get the result each time it is needed, thus saving compute time.
But since the calculation for the area of the circle is not very complex, it can be computed at any time using the value of the radius, and it does not need to be stored in the object.

Having addressed some of the issues, the design of the application can proceed. The definition of the Circle class in UML notation is shown in Fig. 2.33.

Fig. 2.33

UML class diagram of the Circle class

According to the diagram, a Circle object has a data member radius of type double, shown in the middle section, which is the property that characterizes a circle. The behavior of an object is defined by the methods in the bottom section. The first method is a constructor, which performs the initialization of the data members when a new object is created. Each circle can assign a value to its radius by performing the setRadius method, invoke the computeArea method to return its area, and return the value of its radius using the getRadius method.

After the design phase comes the implementation phase. Figure 2.34 contains the code defining the class for a Circle object.

Fig. 2.34

Circle class

A client program to test the functionality of the Circle class is given in Fig. 2.35.

Fig. 2.35

A client program for the Circle class

When the above program is compiled and executed using the sample input of 2.0, the output of the program looks like this:

Enter the radius: 2.0

The area of the circle with a radius of 2.00 cm is 12.57 square cm.

In this example, an object circle was instantiated from the class Circle, and the user provided 2.0 for the value of the radius of the circle. The UML notation for the object circle is shown in Fig. 2.36.

Fig. 2.36

UML notation for the object circle of the Circle class

As before, the top section contains the object name circle followed by the class name Circle after the colon, all of which is underlined. The bottom section lists the data member radius of the object circle. In this example, the variable radius has a value of 2.0.
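Figures 2.34 and 2.35 are not reproduced here; the following sketch is consistent with the UML class diagram of Fig. 2.33 and the sample output above. The client class name CalcAreaCircle comes from Exercise 2 below, while the use of the Scanner class and the parameter name r are assumptions:

import java.util.Scanner;

class CalcAreaCircle {
    public static void main(String[] args) {
        Circle circle;
        double radius;
        Scanner scanner = new Scanner(System.in);
        circle = new Circle();
        // prompt for and input the radius, then store it in the object
        System.out.print("Enter the radius: ");
        radius = scanner.nextDouble();
        circle.setRadius(radius);
        // the area is computed on demand rather than stored in the object
        System.out.printf("The area of the circle with a radius of %.2f cm is %.2f square cm.%n",
                circle.getRadius(), circle.computeArea());
    }
}

class Circle {
    private double radius;

    public Circle() {                  // constructor: initializes the data member
        radius = 0.0;
    }

    public void setRadius(double r) {  // mutator
        radius = r;
    }

    public double computeArea() {      // value-returning method: the area is not stored
        return Math.PI * radius * radius;
    }

    public double getRadius() {        // accessor
        return radius;
    }
}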
## 2.12 Summary

* Remember that a class is like a definition, whereas an instance of a class is an object.

* Private data members and methods can only be accessed internally within an object of a class, whereas public data members and methods can be accessed both internally and externally.

* A value-returning method is used to return a value back to the invoking statement.

* It is best to use only one return statement in a value-returning method and also to place the return statement as the last statement in the method.

* A void method is usually used to set values in an object.

* Arguments in an invoking statement are used to send values to a method, and the corresponding parameters are used to receive those values within the method.

* Each time an object is created or a method is invoked, a corresponding contour should be drawn.

* The new instruction creates a new instance of a class, and the reference to the new instance is often assigned to a variable.

* A constructor is automatically invoked when the new instruction is executed and is often used to initialize data members. Remember that a constructor has the same name as the class and does not have a return type.

## 2.13 Exercises (Items Marked with an * Have Solutions in Appendix E)

1. Indicate whether the following statements using the Circle class in Fig. 2.34 in Sect. 2.11 are syntactically correct or incorrect. If incorrect, indicate what is wrong with the statement:

*A. Circle circle = new circle();

B. Circle circle
Circle = new Circle(5);

*C. circle.getRadius(); (Assume that an object circle has been declared and created correctly.)

D. circle.setRadius("two"); (Assume that an object circle has been declared and created correctly.)

E. circle.setRadius(); (Assume that an object circle has been declared and created correctly.)

2. Draw contour diagrams to show the state of execution prior to the following line numbers of the CalcAreaCircle class in Fig. 2.35 in Sect. 2.11:

A. Line 8

B. Line 12 (assume an input value of 2.0)

3. Draw contour diagrams to show the state of execution prior to Line 8 of the Invoke class in Fig. 2.21 in Sect. 2.9.

4. Answer questions A–D about the following declaration of the class Circle:

*A. Declare and create a variable of type Circle called innerCircle.

B. Write a statement using the setRadius method to change the value of innerCircle's data member, radius, to 10.0.

*C. Write a statement using the getRadius method to output the value of innerCircle's data member, radius, preceded by the phrase "The value of radius is ".

D. Write a statement using the computeCircumference method to output the value of innerCircle's circumference, preceded by the phrase "The value of the circumference is ".

5. Draw contour diagrams to show the state of execution prior to Line 11 of the class Multiple shown in Fig. 2.27 in Sect. 2.9.

6. Write a complete program to calculate the volumes of a cone and a hollow cylinder. The shape of a hollow cylinder is shown below, where r is the radius of the inner cylinder and R is the radius of the outer cylinder.

First, draw a UML diagram similar to Fig. 2.31 for a class named Cone as described below and then write the code to implement the Cone class:

*A. The Cone class has two private data members, radius and height, of type double.

B. Write code for a constructor to set the data members to default values of 0.0.

C. Write code for the accessor methods, getRadius and getHeight, that return the value of the appropriate data member.

*D. Write code for the mutator methods, setRadius and setHeight, that each have one formal parameter which is stored as the value of the data member.

E. Write a method named computeVolume to compute the volume of a cone and return the computed volume to the client. The formula to find the volume of a cone is πr²h/3.

Second, draw a UML diagram similar to Fig. 2.31 for a class named HollowCylinder as described below and then write the code to implement the HollowCylinder class:

F. The HollowCylinder class has three private data members, innerRadius, outerRadius, and height, of type double.

G. Write code for a constructor to set the data members to 0.0.

H. Write code for the accessor methods, getInnerRadius, getOuterRadius, and getHeight, that return the value of the appropriate data member.

I. Write code for the mutator methods, setInnerRadius, setOuterRadius, and setHeight, that each have one formal parameter which is stored as the value of the data member.

J. Write a method named computeVolume to compute the volume of a hollow cylinder and return the computed volume to the client. The formula to find the volume of a hollow cylinder is πh(R² − r²).

Third, write a client program to test the Cone and HollowCylinder classes as defined above. Name this class CalcVolume. The main method should perform the following tasks:
Allow the user to enter a radius of the cone.

L.

Allow the user to enter a height of the cone.

M.

Declare and create a Cone object setting the data members to the values entered by the user.

N.

Allow the user to enter an inner radius of the hollow cylinder.

O.

Allow the user to enter an outer radius of the hollow cylinder.

P.

Allow the user to enter a height of the hollow cylinder.

Q.

Declare and create a HollowCylinder object setting the data members to the values entered by the user.

R.

Output the phrase "The volume of the cone with a radius of XX cm and a height of XX cm is XX cubic cm.", where the XXs are the input values and the value returned from the method.

S.

Output the phrase "The volume of the hollow cylinder with an inner radius of XX cm, an outer radius of XX cm, and a height of XX cm is XX cubic cm.", where the XXs are the input values and the value returned from the method.

  * Here is some sample input and output:

Input for the cone

Enter the radius: 2.0

Enter the height: 3.0

Input for the hollow cylinder

Enter the inner radius: 2.0

Enter the outer radius: 4.0

Enter the height: 3.0

The volume of the cone with a radius of 2.00 cm and

a height of 3.00 cm is 12.57 cubic cm.

The volume of the hollow cylinder with an inner radius

of 2.00 cm, an outer radius of 4.00 cm, and

a height of 3.00 cm is 113.10 cubic cm.

  * Finally, draw a UML diagram similar to Fig. 2.32 for the objects created in the main method.

James T. Streib and Takako Soma, Guide to Java: A Concise Introduction to Programming, Undergraduate Topics in Computer Science, Springer, 2014. DOI 10.1007/978-1-4471-6317-6_3

© Springer-Verlag London 2014

# 3. Selection Structures

James T. Streib and Takako Soma

Department of Computer Science, Illinois College, Jacksonville, IL, USA

Abstract

Selection structures are explained in this chapter using flowcharts, pseudocode, and the corresponding Java code. The if-then, if-then-else, and nested if structures, including if-then-else-if and if-then-if structures, are introduced. The dangling-else problem is also discussed. Logical operators are presented followed by the introduction of the case structure. Two complete programs are provided: one with objects and one without.

## 3.1 Introduction

Chapter 1 showed how to perform input, arithmetic, and output, which are fundamental to many subsequent programs. Chapter 2 introduced elementary object-oriented programming, which allows programs to be designed using objects and methods. Although invoking a method causes a program to branch to another subprogram and this alters the flow of control, the order in which the methods are executed can be determined by examining the code to see the order in which they are invoked. In other words, each time the program is executed, it would have the same order of execution regardless of what was input. What gives software some of its power is the ability to alter the flow of control of a program, so that during different executions of the program with different input, it will behave in a different fashion. This ability is a result of a program being able to use control structures.

The word "structure" is a generic description of statements regardless of the programming language, whereas "statements" are the individual instructions, which can vary from language to language.
Control structures can alter the flow of control of a program and can be classified into two main groups: selection structures and iteration structures. Selection structures, sometimes also called decision structures, allow the program to take two or more different paths based on different conditions, whereas iteration structures, sometimes called repetition structures, allow a program to repeat a part of the code many times. In this chapter, various forms of the selection structures will be examined along with the associated Java statements.

## 3.2 If-Then Structure

The most basic of the selection structures is the if-then structure. If a particular condition is true, the then portion of the structure is executed; otherwise the then portion of the structure is not executed. It is very similar to natural languages, where one might say "If it is hot today, then I'll buy ice cream." If it was actually hot later in the day, then one would buy ice cream; otherwise one would not buy ice cream. Before looking at specific Java code for this example, it is helpful to look at a visual representation using a flowchart. There are many different types of flowcharts, where Fig. 3.1 shows the type of flowchart that will be used in this text.

Fig. 3.1

Flowchart representing an if-then structure

In Fig. 3.1, the diamond shape represents a selection structure and the arrows represent the flow of control. The arrow at the top represents entrance into the selection structure. The statement inside the diamond is a question, and its result is either true or false. The two labeled arrows exiting the diamond represent the flow of control should the condition be true or false. The true branch is known as the then branch, which contains a rectangle representing a statement, and there are no statements in the false branch. The rectangles can be used to hold various statements such as input, output, and assignment statements. In this example, the question is asked "Is it hot?", and if the answer is true, the then or true branch is taken and one would "Buy Ice Cream." Should the answer to the question be false, the false branch is taken and one does not buy ice cream.

However, the example shown in Fig. 3.1 is not very precise for writing a program. It is not clear what is classified as hot, so it might be better to specify a particular temperature. To make it easier to write a program, it would be best to use a variable such as temp for temperature, where temp would first need to be input. It could then be tested in an if-then structure. For example, if it is 90° Fahrenheit or above, the message "Buy Ice Cream" could be output. Although not necessary now, for convenience later a message indicating "End of Program" can also be output as shown in Fig. 3.2.

Fig. 3.2

Flowchart using the variable temp

Specifically, the flowchart in Fig. 3.2 first inputs the value of temp. Next it tests if the value in temp is greater than or equal to 90. If it is true, it outputs the message "Buy Ice Cream", and if it is false, it does not output the message "Buy Ice Cream". In either case, the flow of control continues on to the end of the if-then structure and the message "End of Program" is output.

The comparison between temp and 90 is known as a conditional expression, and the greater than or equal to symbol is known as a relational operator, which could be any of the relational operators that one has previously learned in mathematics.
For example, one could also say temp "greater than" 89, where 90 would still output the message "Buy Ice Cream", and a temp "equal to" 89 would not. However, what if the variable temp was of type double? Then, a temp of 89.5 would cause the message "Buy Ice Cream" to be output, and this might not be what was intended. As a result, it is a good idea not to change what is given and to implement what was originally intended.

Although flowcharts are good for visually depicting the logic of a program, sometimes they are cumbersome to draw. As an alternative to flowcharts, pseudocode can be used to create the logic for a program as discussed previously in Chap. 1. The above flowchart could be implemented in pseudocode as follows:

  * input temp
  * if temp ≥ 90 then
  *     output "Buy Ice Cream"
  * output "End of Program"

After temp is input, the word if indicates an if-then structure. The condition appears between the words if and then, and the word then is optional. If the condition is true, the indented statement immediately following the if is executed, and execution then proceeds to the statement after the then section. Note that the true section of the structure is indented to visually indicate the then section. If the condition is false, control branches or jumps over the indented then section and the last statement is executed.

Given the above flowchart and pseudocode, how could they be implemented in Java? The code would look as shown below:

System.out.print("Enter a temperature: ");
temp=scanner.nextInt();
if(temp >= 90)
   System.out.println("Buy Ice Cream");
System.out.println("End of Program");

The input and output statements should look familiar from Chap. 1. What is new and different is the if-then statement. Note that there are parentheses around the conditional expression and the word then does not appear in the code. Although the word then does not and should not appear in Java, the true section of an if-then statement is still referred to as the then section. Also, just like the pseudocode, it is a good idea to indent the true or then section, but be aware that indenting the code does not affect the flow of control in the program. It is done as a courtesy for other programmers to help improve the readability and maintainability of the code.

Lastly, note that the ≥ symbol has been replaced with the >= symbols. This is because the mathematical symbol ≥ does not exist in the Java programming language and the >= symbols need to be used instead. As one might suspect, some of the other mathematical symbols do not exist in Java either, as indicated in Table 3.1.

Table 3.1

Relational symbols

Mathematical symbol | Java symbol

---|---

> | >

≥ | >=

< | <

≤ | <=

= | ==

≠ | !=

In addition to the "less than or equal to" symbols, notice the "equal to" symbol. Instead of a "single" equal sign, it is represented in Java as a "double" equal sign. The reason for this is to distinguish the check for equality == from the assignment symbol =. Using the wrong symbol is a common mistake for beginning Java programmers, so extra care must be taken when writing a conditional expression in a control structure. Although not as problematic as the "equal to" symbol, notice that the "not equal to" symbol is !=.

To illustrate a complete program that can be keyed into the computer to test the current if-then statement, see Fig. 3.3. This program can also be modified to test subsequent selection statements introduced in this chapter.

Fig. 3.3
Complete program using the if-then statement

It should further be pointed out that syntactically there can be only one statement in the then section of an if statement in Java. But if there can be only one statement in Java, how can more than one statement be placed in the then section? Taking a minute to think about it, a way this problem can be solved has already been presented in Chap. 2. Yes, multiple statements could be placed in a method, and then an invoke statement could be placed in the then section. However, if a method was not being used to solve this problem, how could more than one statement be put into the then section?

With flowcharts and pseudocode, there is no restriction to using only one statement as there is in Java. In a flowchart, additional boxes can be placed in the then branch, and each box represents a new statement. For example, in addition to the message "Buy Ice Cream", the message "Buy Lemonade" could be added as shown in Fig. 3.4.

Fig. 3.4

Flowchart with two statements in the then section

In pseudocode, if more than one statement is needed in the then section, it is simply inserted and indented to visually indicate to the reader that the additional statements are part of the then section and do not belong after the then section, such as in the following:

  * input temp
  * if temp ≥ 90 then
  *     output "Buy Ice Cream"
  *     output "Buy Lemonade"
  * output "End of Program"

However, if one attempted to write the above pseudocode in Java as follows, there would be a logic error:

// *** Caution: Incorrectly Implemented Code ***
System.out.print("Enter a temperature: ");
temp=scanner.nextInt();
if(temp >= 90)
   System.out.println("Buy Ice Cream");
   System.out.println("Buy Lemonade");
System.out.println("End of Program");

Although this might look correct, it is a common error made by beginning programmers. By merely moving the "Buy Lemonade" statement to the left as shown below, there is no change in the logic of the segment, and the true flow of control is made more obvious, where the "Buy Lemonade" message is output regardless of the temperature:

// *** Caution: Incorrectly Implemented Code ***
System.out.print("Enter a temperature: ");
temp=scanner.nextInt();
if(temp >= 90)
   System.out.println("Buy Ice Cream");
System.out.println("Buy Lemonade"); // <--- Unindented
System.out.println("End of Program");

As stated previously, the indentation of the code does not affect the flow of control of the program in Java. So how does one indicate that there is more than one line of code in the then section? The answer is through the use of a compound statement. A compound statement is indicated by the use of opening and closing braces, { and }. For example, the above pseudocode would be correctly implemented as follows:

// *** Correctly Implemented Code ***
System.out.print("Enter a temperature: ");
temp=scanner.nextInt();
if(temp >= 90) {
   System.out.println("Buy Ice Cream");
   System.out.println("Buy Lemonade");
}
System.out.println("End of Program");

The compiler sees the compound statement, which allows more than one statement to be in the then section. Although syntactically to the compiler there is still only one statement, specifically the compound statement, there are now logically two statements in the then section.
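For reference, here is a minimal complete program along the lines of Fig. 3.3, extended with the compound then section; this is only a sketch, and the class name IceCream is an assumption:

import java.util.Scanner;

public class IceCream {
   public static void main(String[] args) {
      // create a Scanner to read the temperature from the keyboard
      Scanner scanner = new Scanner(System.in);
      int temp;
      System.out.print("Enter a temperature: ");
      temp = scanner.nextInt();
      // compound statement: both messages belong to the then section
      if(temp >= 90) {
         System.out.println("Buy Ice Cream");
         System.out.println("Buy Lemonade");
      }
      System.out.println("End of Program");
   }
}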
Notice that the opening brace appears just after the closing parenthesis of the conditional expression and the closing brace lines up with the if statement. Although there are a number of other styles, this text will use the style shown above. However, should one's instructor or place of employment use a different style, be sure to follow it.

## 3.3 If-Then-Else Structure

The if-then structure is helpful when there is something that needs to be done in addition to the normal flow of control. However, what if one wanted to have a program do one thing in one case and another thing in an alternative case? Using a new example, assume that if the number of credit hours input, using the variable credits, is 120 or greater, the program should output the message "Graduate"; otherwise the program should output "Does not graduate".

Is it possible to solve this problem using only if-then structures? The answer is yes, by using two if-then structures in the pseudocode that follows:

  * if credits ≥ 120 then
  *     output "Graduate"
  * if credits < 120 then
  *     output "Does not graduate"

Although this solution works, the problem with this method is that it has to ask two questions. For example, if the number of credit hours is equal to 120, then the message "Graduate" would be output. However, even though the message has already been output, the code still needs to check to see if the number of credit hours is less than 120 and branch around the output "Does not graduate" message. It should be clear that if one of the options is true, the other one is false, so there is no need to check the opposite condition. This can be accomplished with the use of the if-then-else structure. An example of the flowchart for this scenario is shown in Fig. 3.5.

Fig. 3.5

If-then-else structure

Note that unlike the flowchart in the previous section, the false section is no longer empty. Instead, it contains a box to output the message "Does not graduate". The false section of the flowchart is also called the else section. The pseudocode for this flowchart is shown below:

  * input credits
  * if credits ≥ 120 then
  *     output "Graduate"
  * else
  *     output "Does not graduate"
  * output "End of Program"

Notice that the word else lines up with the word if and that the else section of the pseudocode lines up with the then section. The Java code to implement the pseudocode is as follows:

System.out.print("Enter the credit hours: ");
credits=scanner.nextInt();
if(credits >= 120)
   System.out.println("Graduate");
else
   System.out.println("Does not graduate");
System.out.println("End of Program");

As with the pseudocode, notice that the word if and the word else line up and the then and else sections line up. What if there needs to be more than one statement in either the then or else sections? As before with the if-then statement, a compound statement must be used.

It is possible to reverse the above then and else sections, but one needs to be cautious and reverse the conditional expression correctly. What is the opposite of greater than or equal to? Be careful, it is not less than or equal to. If one used less than or equal to, then those students who had exactly 120 credit hours would be listed as not graduating, much to their dismay!
Instead, the opposite of greater than or equal to is simply less than, as shown below:

System.out.print("Enter the credit hours: ");
credits=scanner.nextInt();
if(credits < 120)
   System.out.println("Does not graduate");
else
   System.out.println("Graduate");
System.out.println("End of Program");

Although the above code performs identically to the previous code, why should one be chosen over the other? Unless there is a compelling reason to do otherwise, such as when the original description is unduly confusing, it is usually better to write the code to follow the original specifications as given. However, if either way is acceptable, then code is often written to have the most common occurrence in the then section and the exception in the else section. In the above example, most seniors will probably have 120 credit hours or more at graduation, so using the original code segment is probably the best choice.

When writing if-then structures, it is important to write them so that they not only work correctly but are also efficient in terms of memory utilization. For example, consider the following code segment:

if (a > 0) {
   b = b + 1;
   a = a - b;
   c = c + a;
}
else {
   b = b + 1;
   a = a + b;
   c = c + a;
}

Note that the first and last statements in both the then and else sections are the same. The only statement that is different between the two is the middle statement in each segment. Given that the other statements are the same, why are they duplicated in the then and else sections? The answer is that they should not be, and they can be moved. Not only do they take up more memory, they also present a possible problem when someone attempts to modify the code, where a programmer might accidentally modify a statement in one section and fail to modify the corresponding statement in the other section, which might lead to a subsequent logic error. Although this does not appear to present as much of a problem here in a small code segment, it could be much more serious in larger code segments.

If the duplicate statements are to be consolidated and moved, where should they be relocated? By examining the above code segment, the variable b, modified in the first statement in each segment, is used by the second statement, so that statement should be moved prior to the if statement. Similarly, the variable a used in the last statement is modified by the middle statement, so the last statement should be relocated after the if-then-else statement. In other words, care must be used to ensure that the logic is not altered when moving statements to optimize an if-then-else statement, or any code segment for that matter. Below is the modified code segment that clearly is less cluttered without the braces, uses less memory, and would be easier to modify in the future. The result is that once one has written code that works correctly, one should take the time to ensure that it is also well-written code.

b = b + 1;
if(a > 0)
   a = a - b;
else
   a = a + b;
c = c + a;

Note further that it is also possible to write an if-then structure as an if-then-else with either an empty else section or an empty then section. In both cases, leaving an empty else or then section in Java requires a semicolon in either section, which might lead subsequent programmers to wonder what might have been accidentally left out. Unless there is intent to fill in the empty section in the immediate future, it is best to just write the code simply as an if-then.
If code is written with an empty else section, the else section should be removed. In the case of an empty then section, it is usually best to carefully reverse the conditional expression and again write the code as an if-then. + +## 3.4 Nested If Structures + +If there is only one selection, the if-then is the best choice, and should there be two selections, the if-then-else structure is the obvious choice. But what if there are three or more choices? Sure, a series of if-then structures could be used, but although this "works," it is a very inefficient solution as discussed in the previous section. Instead, a series of if-then-else structures could be nested. There are two ways if-then-else structures can be nested: the subsequent if-then-else statements could be nested in the else section or in the then section of the previous if-then-else. The first form of nesting is called an if-then-else-if structure and the second is called an if-then-if structure. Note that there are no Java statements that correspond to each of these two structures, but rather they can be created fairly easily from a pair of if-then-else statements. Of the two, the former tends to be used more often and will be discussed first. + +### 3.4.1 If-Then-Else-If Structure + +As mentioned above, an if-then-else-if structure is created when an if-then-else is nested in the else section of an if-then-else. Using a new example, assume that the temperature is input in degrees Celsius and messages are to be output as to whether water is in the form of steam, water, or ice. At 100° or greater, water is in the form of steam, and at 0° or less, it is in the form of ice; otherwise it is in its liquid state. As before, it is helpful to view the structure in the form of a flowchart as shown in Fig. 3.6. + +Fig. 3.6 + +Nested if-then-else-if structure + +Notice that the second if statement appears in the else section of the first statement. The dotted lines are not part of the flowchart, but rather are included to help one see that the inner if-then-else is contained in the else section of the outer if-then-else. If the first condition is true, the message "Steam" is output and no further testing is necessary. If the first condition is false, then further testing occurs in the nested if-then-else structure. Given the flowchart in Fig. 3.6, the corresponding pseudocode would appear as follows: + +As with the flowchart, the dashed lines are not part of the pseudocode. Rather, they are included to allow one to see how the inner if-then-else structure is nested in the else portion of the outer if-then-else structure. In particular, note that the nested if and else line up with the output statement in the then section of the outer if-then-else structure. Again, if the first condition is true, the then section is executed and no further testing occurs, but if the first condition is false, the nested if is executed. + +As would be expected, the Java code looks very similar: + +System.out.print("Enter the temperature: "); + +temp=scanner.nextInt(); + +if(temp >= 100) + +System.out.println("Steam"); + +else + +if(temp > 0) + +System.out.println("Water"); + +else + +System.out.println("Ice"); + +System.out.println("End of Program"); + +The dashed lines are not included in the Java code so that one can concentrate on the indentation and the syntax. As with the pseudocode, note how the inner if and else line up with the System.out.println statement in the then section of the outer if statement. 
Since there appears to be more than one statement in the else section of the outer if-then-else structure, does there need to be a pair of braces, { and }, in that section? In other words, does a compound statement need to be used? The answer is no, because an if-then-else statement is syntactically considered to be a single statement. Although it would not cause a syntax error to include the braces, it could cause some programmers to wonder if a second statement was forgotten and not included. Some instructors might not care whether the extra pair of braces is included, but this text will omit them to help the reader get used to this programming style.

Does it matter which test is first? If all the groups are equal, then the answer is no. However, if one of the groups occurs more frequently, then it would be best to put it first so that fewer tests would need to be done. This is especially true when an if statement is inside an iteration structure as will be seen in Chap. 4. What if the middle section occurs more often? This could prove to be a problem at this point, but it will be discussed further in Sect. 3.5 on logical operators.

### 3.4.2 If-Then-If Structure

Since it is possible to nest an if-then-else structure in the else section of an outer if-then-else structure, is it possible to nest an if-then-else structure in the then section of an outer if-then-else structure? The answer is yes, and this type of structure is called an if-then-if structure. Again, there is no Java statement called an if-then-if, but rather this name merely indicates in which section the subsequent if-then-else is nested. The flowchart for an if-then-if that implements the example from the previous section is shown in Fig. 3.7.

Fig. 3.7

Nested if-then-if structure

As before, the dashed lines are not part of the flowchart but help indicate how the if-then-else is nested in the then section of the outer if-then-else. In particular, notice how the relational expression in the first if is changed from ≥100 to >0. The reason is that previously, when temp was 100 or greater, the then section would be executed and the message "Steam" would be output. However, with the if-then-if structure, the then section now contains a nested if and has two groups that need to be further subdivided. The relational expression in the outer if structure is changed to >0, so when temp is zero or less than zero, execution proceeds to the else section. As discussed previously in Sect. 3.2, be careful to write the relational expression properly; otherwise a logic error could occur. After checking for a temperature greater than zero, the nested if checks whether the temperature is greater than or equal to 100, and if so, the message "Steam" is output; otherwise the message "Water" is output. As before, the pseudocode for the nested if-then-if can be found below:

Notice the nested if-then-else in the then section of the outer if-then-else and note the level of indentation. As should be expected, the Java code follows. Again pay attention to the indentation and the absence of braces:

System.out.print("Enter the temperature: ");
temp=scanner.nextInt();
if(temp > 0)
   if(temp >= 100)
      System.out.println("Steam");
   else
      System.out.println("Water");
else
   System.out.println("Ice");
System.out.println("End of Program");

Since the if-then-else-if and the if-then-if structures can perform the same tasks, which is the better choice? In one sense it depends on the circumstances.
If the original specifications are written in such a fashion as to make it easier to implement with one or the other structure, then the most appropriate structure should be used. However, often the original specifications are written in a way that is easier to communicate to other users and programmers, and this tends to be in an if-then-else-if fashion. For example, assume there were equal numbers of coins of different denominations and someone wanted to remove all of the one cent pieces. Ordinarily a person would not try to remove all the other coins to leave only the one cent coins; instead it would be easier to merely remove the one cent coins. If there were subsequent coins to be removed, such as the five cent pieces, they would be the next to be removed, and so on.

This is similar to the previous example, where instead of checking for temperatures above freezing and then checking for temperatures that produce steam or water, it is more natural to check first for the temperatures that are greater than or equal to 100°. In other words, the if-then-else-if structure is often chosen over the if-then-if structure because that is the way people often speak and tend to write specifications. Further, it is helpful to have the program written similarly to the specifications to assist other programmers who might be maintaining and modifying the program in the future. There is yet another reason why the if-then-else structure is used more often than the if-then-if, as discussed in the next section.

### 3.4.3 Dangling Else Problem

The if-then-if structure also suffers from an occasional problem due to the nature of the Java syntax. For example, assume that one wanted to modify the previous temperature example to implement the flowchart in Fig. 3.8, in which only the messages "Steam" and "Ice" are to be output.

Fig. 3.8

"Ice" or "Steam" flowchart

The flowchart can also be implemented as shown in the following pseudocode:

In both cases, what is intended is that if the temperature is greater than or equal to 100, then the first and second if statements are true and the message "Steam" is output. If the temperature is 0 or less than 0, the first if is false and the message "Ice" is output. However, if the temperature is greater than 0 but less than 100, then the first if statement would be true and the second if would be false, and since there is no code in the else section of the second if, no message is output. It would appear that the code for the above could be implemented as follows:

// *** Caution: Incorrectly Implemented Code ***
System.out.print("Enter the temperature: ");
temp=scanner.nextInt();
if(temp > 0)
   if(temp >= 100)
      System.out.println("Steam");
else
   System.out.println("Ice");
System.out.println("End of Program");

However, what appears to be correctly implemented code is not accurately implementing the logic from the flowchart and pseudocode. If the pseudocode follows from the flowchart, and the code follows from the pseudocode, how can this be? The problem is that the pseudocode is relying on indentation to indicate which parts belong in the then and else sections, but recall from Sect. 3.2 that indentation does not affect the flow of control in Java, or in most languages for that matter. This is known as the "dangling else" problem. It might not be clear with which if statement the else is paired.
If the above code segment has the else and the subsequent System.out.println indented, note that the code presents itself entirely differently: + +// *** Caution: Incorrectly Implemented Code *** + +System.out.print("Enter the temperature: "); + +temp=scanner.nextInt(); + +if(temp > 0) + +if(temp >= 100) + +System.out.println("Steam"); + +else // Indented - - - -> + +System.out.println("Ice"); // Indented - - - -> + +System.out.println("End of Program"); + +Instead of the else appearing to belong to the outer if, it now seems to belong to the inner if statement. If indenting doesn't affect the flow of control, which of the above two code segments is correct? The answer is neither, but the second one more accurately represents the flow of control, because an else is always matched with the closest if statement. The result is that the flowchart for the above code segment is as shown in Fig. 3.9. + +Fig. 3.9 + +Flowchart representing the "Dangling Else" problem + +If temp is less than or equal to 0, then nothing is output, and if the temperature is greater than 0, but less than 100, then the message "Ice" is output, which is clearly incorrect. Although indenting is a useful way of indicating flow of control in pseudocode, it is only useful in illustrating the flow of control in Java when it is done properly. If indenting will not help correct the above problem, what can be done to correct the code? There are a couple of solutions. One is to include braces to force the else to match up with the outer if instead of the inner if as shown below: + +// *** Correctly Implemented Code *** + +System.out.print("Enter the temperature: "); + +temp=scanner.nextInt(); + +if(temp > 0) { + +if(temp >= 100) + +System.out.println("Steam"); + +} + +else + +System.out.println("Ice"); + +System.out.println("End of Program"); + +Note that in addition to the braces, the else is moved to the left to line up with the outer if to improve readability. But doesn't the inclusion of braces contradict the suggestion from Sect. 3.2 to not use braces for a single statement and use them only when they are necessary? No, not in this case, because although the if-then structure in Java is only a single statement, the braces are necessary in this case to force the else to match with the proper if statement. + +In fact, some might suggest that braces should always be used to avoid a special case such as this. However, it seems somewhat counterintuitive to use braces everywhere for only a single potential error, since too many braces might clutter up a program and hurt the overall readability. There is another solution and that is to generally avoid the use of the if-then-if structure and instead primarily use the if-then-else-if structure, which does not suffer from this problem, as shown below: + +// *** Correctly Implemented Code *** + +System.out.print("Enter the temperature: "); + +temp=scanner.nextInt(); + +if(temp >= 100) + +System.out.println("Steam"); + +else + +if(temp <= 0) + +System.out.println("Ice"); + +System.out.println("End of Program"); + +Again, does this mean one should never use the if-then-if structure? No, as mentioned previously use the if-then-if structure only when the nature of the problem lends itself to its usage and use extra caution to ensure that the code written actually implements the intended logic. Further, an example of the use of the if-then-if structure is shown in the next two sections. 
However, it might appear that the initial cause of the above problem results from the indentation used in the previous pseudocode. Does this mean that one should not rely on indentation when writing pseudocode and braces should be used to help indicate nesting? The answer is largely left up to the individual, the instructor of a class, or the standards in a company. As long as one is aware of the potential problem, indentation can be used in pseudocode to indicate the flow of control. Also, if one wants to ensure that a mistake does not occur in writing subsequent Java from the pseudocode, then the inclusion of braces in the above instance would provide extra insurance that the pseudocode is not accidentally implemented incorrectly. However, this text will not use braces in pseudocode to save space and help the reader better understand the potential problems.

## 3.5 Logical Operators

Although nested if statements are very useful in the circumstances discussed in the previous section, there are techniques that can make them even more useful. For example, assume that the message needing to be output is the opposite of the example presented in the previous section. If the temperature is greater than 0° and less than 100°, only the message "Water" needs to be output. This could be done with either an if-then-if structure or an if-then-else-if structure, where the former is shown below:

System.out.print("Enter the temperature: ");
temp=scanner.nextInt();
if(temp > 0)
   if(temp < 100)
      System.out.println("Water");
System.out.println("End of Program");

However, does the use of an if-then-if above go against the suggestion in the previous section to use the if-then-else-if? No, not really, because this is one of those cases that lends itself better to the use of the if-then-if. The use of an if-then-else-if would result in an empty then section, which should be avoided as discussed in Sect. 3.3 and as shown below:

System.out.print("Enter the temperature: ");
temp=scanner.nextInt();
if(temp >= 100);
else
   if(temp > 0)
      System.out.println("Water");
System.out.println("End of Program");

Note the semicolon at the end of the first if statement indicating an empty then section, which can be quite confusing. Clearly in this instance the if-then-if structure is a better solution than the if-then-else-if structure. However, by using logic there is an even better solution to this problem, and before presenting the solution, it is best to look over the fundamentals of logic operations.

Logical operators are also known as Boolean operators, which are named after George Boole (1815–1864), an English-born mathematician and logician. The results of Boolean operations are the values true or false, which can be stored in variables of type boolean as shown below:

boolean flag;
flag = true;

Further, any relational or logic operation can be assigned to a boolean variable, and that variable can be used subsequently in an if statement. Although not used as often, it is sometimes helpful to evaluate a relation in one part of a program, store the result in a boolean variable (often called a flag and coded as flag), and then test the flag later in another part of the program.
The result is that both of the following code segments are equivalent: + +if(x == 0) + +System.out.println("x equals 0"); + +flag = x == 0; + +if(flag) + +System.out.println("x equals 0"); + +Although at first the assignment statement of the second segment might look a little strange, if one thinks about it for a minute, the comparison of x == 0 results in either true or false. The true or false is then assigned to the boolean variable flag. Lastly, when the if statement is executed, should the value in flag be true, the then portion of the if is executed. Otherwise the value in flag is false, the then portion is skipped, and any statement that might follow is executed. In the second instance, does the variable flag need to be compared to the Boolean values of true or false? The answer is no, because the variable flag is of type boolean and already contains either the value true or false, so the comparison is unnecessary. Although the first example is more common, again the second is useful to set a flag in one part of a program and test it in another part of a program. + +Continuing, there are three fundamental logic operations called and, or, and not. The first of these three has a value of true when both conditions are true. For example, a graduation requirement for a major in computer science might include that a student takes both a course in calculus and discrete mathematics. If one takes one course but not the other, or takes neither course, then the major will not be complete. This can be represented in the form of a truth table, where all the possible combinations of the two courses are listed on the left side and the result of the and operation is listed on the right in Table 3.2. The variables c and d are used to represent the calculus and discrete mathematics courses, respectively, and the letters T and F are used to represent the values true and false, respectively. Note that result is true only when both c and d are true. + +Table 3.2 + +Truth table for the and operation + +c | d | c and d + +---|---|--- + +F | F | F + +F | T | F + +T | F | F + +T | T | T + +As an example of the or operation, suppose that in order to complete a major in computer science a student must take one of two electives, such as a course in artificial intelligence or a course in computer graphics. If a student takes one course or the other, then the student has fulfilled the requirement. But what if both courses are taken? In the case of the or operation under consideration here, known as an inclusive-or, the results are true when one or the other, or both are true. The result is that a student would have also fulfilled the requirement if both courses were taken. On the other hand, an exclusive-or is true when only one or the other is true, but not both. Although some other languages have both types of or operators, Java only has the inclusive-or as illustrated in the truth table in Table 3.3, where the letter a represents artificial intelligence and the letter c represents computer graphics. As can be seen, if either a or c is true, or both are true, the result is true. If neither is true, the result is false. + +Table 3.3 + +Truth table for the or operation + +a | c | a or c + +---|---|--- + +F | F | F + +F | T | T + +T | F | T + +T | T | T + +The last of the logic operators is the not operator, which when applied to something that is true initially, the result is false and vice versa. 
For example, if one has taken an introduction to computer science course, then the result is true, but if one has not taken the course, the result is false. In Table 3.4 the letter c represents the introduction to computer science course. Since there is only one variable, there are only two entries in the truth table. In fact, to determine the number of entries needed in a truth table, just count the number of variables in the expression and raise 2 to that power. For example, if there were three variables in a logical expression, how many entries would be needed? The answer is 2 raised to the 3rd power, which is equal to 8.

Table 3.4

Truth table for the not operation

c | not c

---|---

F | T

T | F

In Java the and, or, and not operations are represented using the &&, ||, and ! symbols, as shown in Table 3.5.

Table 3.5

Logic operations and Java symbols

Logic operation | Java symbol

---|---

and | &&

or | ||

not | !

Using this information, how can the if-then-if structure presented at the beginning of this section be simplified? Instead of checking first whether temp is greater than 0 and subsequently checking whether temp is less than 100, it would make sense to use the and operation. Although it would be nice to use a range such as 0 < temp < 100 as done in mathematics, note that this would cause a syntax error in Java. Instead, the relation must be written with two separate comparisons, each using the variable temp, as in temp > 0 && temp < 100. The previous if-then-if structure can now be written as follows:

System.out.print("Enter the temperature: ");
temp=scanner.nextInt();
if(temp > 0 && temp < 100)
   System.out.println("Water");
System.out.println("End of Program");

Could the above if statement have been written as if(temp >= 1 && temp <= 99)? Given that the variable temp is of type int in the past couple of examples, the answer is yes. However, what if the variable temp was a double? Then, a temperature such as 0.5° would not be output as "Water," which would be incorrect. Again, as discussed previously in Sect. 3.3, it is usually better to write a program with the proper endpoints and relations, even when programming with integers, to help prevent a possible future logic error should the program be modified later.

Although the basic operations of logic are fairly simple, expressions can become quite complex as the number of operations increases, so extra care must be taken when creating Boolean expressions. For example, suppose someone had originally coded the following if statement with an empty then section to check for a correct battery voltage in order for a system to operate correctly. Further, suppose that one wanted to convert the if-else structure to an if-then structure; how could that be accomplished?

if(voltage < 10.5 || voltage > 14.0);
else
   System.out.println("Correct Voltage");

The message needs to be moved from the else section to the then section. In other words, the message should be output when the condition is true, not when it is false. The simplest way to convert the condition is to add a not operator in front of the conditional expression and remember to remove the semicolon from the end of the if statement, as follows:

if(!(voltage < 10.5 || voltage > 14.0))
   System.out.println("Correct Voltage");

However, one must be careful with the not, because just as arithmetic operators have precedence rules, so too do logical operators.
The not operator has the highest priority, the and operator has the second highest priority, and the or operator has the lowest priority. Further, just as with arithmetic operators, when there is a tie between two operators, the order is from left to right, and parentheses can be used to override any precedence rules, where the expression in the innermost nested parentheses is evaluated first. The order of precedence for logical operators is summarized in Table 3.6.

Table 3.6

Logical operator precedence

Operator | Precedence

---|---

innermost nested ( ) | Highest

! |

&& |

|| |

Tie – left to right | Lowest

As a result, note that when the not is added, there is a set of parentheses around the original logical operator and its operands from the previous if statement, because without them the result would be different. A truth table is a convenient way to prove that the two are different. To simplify the above relations, the Boolean variables a and b are used in the truth table below:

a | b | !a | a || b | !a || b | !(a || b)

---|---|---|---|---|---

F | F | T | F | T | T

F | T | T | T | T | F

T | F | F | T | F | F

T | T | F | T | T | F

Notice that the intermediate columns are shown to help ensure that there are no mistakes, or if one is made, it is easy to see where it occurred. Further, note that comparing the last two columns shows that !a || b is not equal to !(a || b). Specifically, the values in the second and fourth lines down are not equal, and although the other two are equal, it takes only one instance to prove that the two expressions are not equivalent. Further, something like this might be difficult to catch when testing a program. If these particular instances are not tested, a program could subsequently have a logic error and no error message would be generated.

Returning to the if statement, what if one didn't want to have the not symbol in the if statement? Could it be rewritten without the ! symbol? The answer is yes, but again one must be careful when changing a logical expression. Similar to what can be done in arithmetic with a minus sign, the not symbol can be distributed over the terms in the parentheses. Although similar, it is different from arithmetic, and De Morgan's laws must be used, which were formulated by Augustus De Morgan (1806–1871), a British mathematician and logician. Simply stated, if a not is distributed over either an and operator or an or operator, the operands must be complemented. Further, the operators must be changed to an or operator or an and operator, respectively. To help understand these laws better, they are listed in Table 3.7.

Table 3.7

De Morgan's laws

not (a and b) = not a or not b

not (a or b) = not a and not b

To show that the laws are indeed correct, a truth table can be used to prove that they are equal using the techniques shown above, and this is left as an exercise at the end of the chapter. To show how De Morgan's laws can be used in Java in the previous if statement, first the ! symbol is distributed over the operands and then the || operator is changed to an && operator as shown below:

if(!(voltage < 10.5) && !(voltage > 14.0))
   System.out.println("Correct Voltage");

Since there are now two not symbols, the relations can be changed to their opposites, thus eliminating the need for the two not symbols. Of course, one has to be careful to reverse the relationals correctly as has been discussed previously. The final if statement without the !
symbols is shown below: + +if(voltage >= 10.5 && voltage <= 14.0) + +System.out.println("Correct Voltage"); + +Given some of the potential problems above, if a code segment can be written without using logical operators, then generally it is better to do so to avoid the added complexity and the potential for errors. When creating nested if structures, it is helpful not to have the first if contain a logical operator and instead rewrite the if structure to use a simple expression first. For example, in a code segment concerning temperatures, instead of starting with the water range and using an and operator, it is better to start with the steam or ice range which do not require a logical operator. + +Another potential complexity often occurs when some beginning programmers feel compelled to include a logical operator on subsequent if statements. However, this is often unnecessary as shown previously in the temperature example where the first if checks for temperatures of 100° and above. Since the higher temperatures have already been removed by the first if statement, it is not necessary to include the logical operators in the subsequent if statement to check whether the temperatures are below 100°. As a general rule, if the logical operators are necessary or they help to reduce the number of if statements, then they should be included. However, if the code can be written without the use of logical operators, it is best not to include them. An example of when to use or not use logical operators can be found at the beginning of the next section. + +As one writes logical operators with conditional expressions as operands, care must also be taken which conditional expression comes first. For example, the following code segment checks to make sure that i is not equal to 0 and that the results of the division operation are positive before outputting a message. What would happen if both i and total contained a 0? + +if(i != 0 && total / i >= 0 ) + +System.out.println("The average is positive"); + +Since i is equal to 0, the result of the first operand is false. However, does it matter what the results of the second operand are? Since false && false is false and false && true is also false, there is no need to check the second operand. This averts the division by zero error and the then portion of the if statement would not be executed. This is known as a short circuit, where if the first operand of an && operation is false, there is no need to check the second operand. + +So given the above, what would happen if the operands were reversed as follows and the value i and total were still 0? + +if(total / i >= 0 && i != 0) + +System.out.println("The average is positive"); + +At first, it seems to be okay because the if statement is still checking to see if i is not equal to 0. However, although both tests are included in the if statement, recall from the discussion above that the operand on the left is evaluated first. Further if i was not equal to 0, there would not be a problem, but in the instance where i is equal to 0, there would be a division by zero error before the comparison of i to 0 in the second operand. + +A similar problem can occur with the || operator, where if the first operand is true, there is no need to check the second operand. The reason this occurs with both the && and || operators is the result of the underlying machine language generated by the compiler and the interpreter. For a further explanation, see Guide to Assembly Language: A Concise Introduction [4]. 
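A small sketch makes the difference in operand order concrete; the class name ShortCircuit is an assumption. With i equal to 0, the first test below runs safely, while the reversed version, left commented out, would throw an ArithmeticException:

public class ShortCircuit {
   public static void main(String[] args) {
      int i = 0;
      int total = 0;
      // Safe order: i != 0 is false, so the right operand total / i
      // is never evaluated and no division by zero occurs
      if(i != 0 && total / i >= 0)
         System.out.println("The average is positive");
      // Unsafe order: the left operand is evaluated first, so this
      // version would divide by zero when i is 0
      // if(total / i >= 0 && i != 0)
      //    System.out.println("The average is positive");
      System.out.println("End of Program");
   }
}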
Although this short circuit evaluation of statements can be helpful in some instances, it can cause a problem if one is not careful with the order of the operands. So when writing logical operators, in addition to being careful with the precedence of logical operators and De Morgan's laws, one should also be careful with the order of the operands. + +## 3.6 Case Structure + +As can be imagined, if the number of nested if statements becomes too deep, the resulting code might be difficult to read and maintain. For example, consider when a student's quiz score is input and a message is output indicating how well the student performed as implemented in the following code segment: + +Notice the use of an or operation in the first if statement to test for a score of either 9 or 10 and the output of the message "Very Good". Note that an and operator could have been used instead as in if(score >= 9 && score <= 10), but since the range is only two integers, it is probably better represented using an or operator. However, with the last if statement above, it is easier to use the and operator to test for the range of numbers instead of listing out each of the possibilities. Lastly, notice that if the score does not fall between 0 and 10 inclusive, then a message is output indicating that it is an invalid quiz score. + +Although the above code segment works, what if there were more levels of scores to check and corresponding messages to be output? The level of indentation could become quite ungainly and the code might become more difficult to read and modify. Luckily, most languages have what is known as a case structure to help with these situations. In Java this structure is known as the switch statement. A switch statement is like a multi-way if statement. The contents of a simple variable or the result of an expression causes the flow of control to branch to one of the many particular cases, and the corresponding code is then executed. The above nested if-then-else-if structure can be implemented using a switch statement as follows: + +The first thing to be aware of is that the variable score cannot be of type double or float. Although it is possible to use typecast operators with these types, in these instances the use of nested if structures might be a better choice. This is one of the drawbacks of the switch statement, where typically only variables or expressions of type int and char can be used. The second thing to note in the switch statement is that the variable score is not part of a relational expression (using >, >=, etc.) as it can be in an if statement. Instead, the contents of the variable score are compared with each of the case statements that follow. If a match is found, then control is transferred to the corresponding case statement, and the code that follows is executed. For example, if the value in the variable score is a 10, then control is transferred to case 10: and the code that follows is executed. As mentioned above, an expression can be used instead of a variable, and an example of this follows later. + +Syntactically, there is one set of braces which indicate the beginning and end of the entire switch statement; however, note that there are no braces in each of the individual case sections even when there is more than one statement. The reason for this is that at the end of each case section, a break statement is included. The use of the break statement causes the flow of control to be transferred to the end of the switch statement. 
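As a sketch of what such a switch statement could look like: the case 10, case 9, and default sections follow the description above, while the messages for the lower scores are placeholder assumptions, since only "Very Good" and the invalid score message are specified:

switch(score) {
   case 10:
   case 9:
      System.out.println("Very Good");
      break;
   case 8:
   case 7:
      System.out.println("Good");                // placeholder message
      break;
   case 6:
   case 5:
      System.out.println("Average");             // placeholder message
      break;
   case 4:
   case 3:
   case 2:
   case 1:
   case 0:
      System.out.println("Below Average");       // placeholder message
      break;
   default:
      System.out.println("Invalid quiz score");  // placeholder wording
}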
Without it, the flow of control would fall through to the code that follows the next case statement. Although it is legal to write code that does not use a break statement, the need to do so is very rare and is considered to be of poor programming style. Doing so usually makes code difficult to debug or modify and should be avoided. + +The last section of the switch statement is the default statement, which is executed when a matching case is not found. Although a default can be placed anywhere within the switch statement, it is typically placed at the end of the switch statement. It should be noted that switch statements are not required to have a default statement. However, if a switch statement does not have a default statement and the particular value is not found in the cases given, then nothing will be executed in the switch statement, and in the previous example, nothing would be output. Although this might be what was intended, a value that is not part of the data to be processed might cause a logic error later on in the program. As a result, default statements are usually included as a precautionary measure. + +Notice that the default case does not have a break statement. If there were no default statement, then the last case section would not need to have a break statement either. The reason is that upon completion of executing the code in the last case or default, the flow of control will simply fall through to the next statement following the switch statement. Although a break statement could be included, it is not necessary and will not be included in this text. + +With respect to indenting, there are a number of styles that can be followed, but typically the individual case statements are indented three spaces, and the code in each section is lined up after the colons. Again, should one's instructor or place of employment have a different style, be sure to follow that style. + +Also, note that each of the individual possible values of the variable score has its own case statement. Unfortunately a relation cannot be used in the case statements and this is another of the switch statement's drawbacks. However, there are on occasion a few ways around this limitation as will be seen later. + +For example, instead of having quiz scores of 10 through 0, what if the variable score was used to hold an exam score from 100 through 0, where a score of 100 through 90 inclusive was to output a message "Very Good", 89 through 80 was to output the message "Good", and so on? For a nested if structure, the solution is fairly simple. Instead of just checking for one or two integers as in the previous nested if structure, it could be modified to check for a range of integers using an and logical operator as in the following segment: + +Note that each if statement has an && to check for a range of values. However, wasn't it suggested in Sect. 3.5 to avoid this? Yes it was, but in previous examples, such as the temperature example, there were no upper and lower bounds, but in this case there are the bounds of 0 and 100. Although it appears necessary to include a range in each if statement in this example, is there a way that it could be rewritten to avoid having to include an and operator in every if statement? The answer is yes, where an extra if statement can be placed prior to the other if statements. 
The first thing to be aware of is that the variable score cannot be of type double or float. Although it is possible to use typecast operators with these types, in these instances the use of nested if structures might be a better choice. This is one of the drawbacks of the switch statement, where typically only variables or expressions of type int and char can be used. The second thing to note in the switch statement is that the variable score is not part of a relational expression (using >, >=, etc.) as it can be in an if statement. Instead, the contents of the variable score are compared with each of the case statements that follow. If a match is found, then control is transferred to the corresponding case statement, and the code that follows is executed. For example, if the value in the variable score is a 10, then control is transferred to case 10: and the code that follows is executed. As mentioned above, an expression can be used instead of a variable, and an example of this follows later.

Syntactically, there is one set of braces which indicates the beginning and end of the entire switch statement; however, note that there are no braces in each of the individual case sections, even when there is more than one statement. The reason for this is that at the end of each case section, a break statement is included. The use of the break statement causes the flow of control to be transferred to the end of the switch statement. Without it, the flow of control would fall through to the code that follows the next case statement. Although it is legal to write code that does not use a break statement, the need to do so is very rare and is considered to be of poor programming style. Doing so usually makes code difficult to debug or modify and should be avoided.

The last section of the switch statement is the default statement, which is executed when a matching case is not found. Although a default can be placed anywhere within the switch statement, it is typically placed at the end of the switch statement. It should be noted that switch statements are not required to have a default statement. However, if a switch statement does not have a default statement and the particular value is not found in the cases given, then nothing will be executed in the switch statement, and in the previous example, nothing would be output. Although this might be what was intended, a value that is not part of the data to be processed might cause a logic error later on in the program. As a result, default statements are usually included as a precautionary measure.

Notice that the default case does not have a break statement. If there were no default statement, then the last case section would not need to have a break statement either. The reason is that upon completion of executing the code in the last case or default, the flow of control will simply fall through to the next statement following the switch statement. Although a break statement could be included, it is not necessary and will not be included in this text.

With respect to indenting, there are a number of styles that can be followed, but typically the individual case statements are indented three spaces, and the code in each section is lined up after the colons. Again, should one's instructor or place of employment have a different style, be sure to follow that style.

Also, note that each of the individual possible values of the variable score has its own case statement. Unfortunately, a relation cannot be used in the case statements, and this is another of the switch statement's drawbacks. However, there are on occasion a few ways around this limitation, as will be seen later.

For example, instead of having quiz scores of 10 through 0, what if the variable score was used to hold an exam score from 100 through 0, where a score of 100 through 90 inclusive was to output a message "Very Good", 89 through 80 was to output the message "Good", and so on? For a nested if structure, the solution is fairly simple. Instead of just checking for one or two integers as in the previous nested if structure, it could be modified to check for a range of integers using an and logical operator as in the following segment:
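In sketch form, with the messages for the two lowest ranges assumed:

```java
// score is an int exam score, previously input
if(score >= 90 && score <= 100)
   System.out.println("Very Good");
else if(score >= 80 && score <= 89)
   System.out.println("Good");
else if(score >= 70 && score <= 79)
   System.out.println("Fair");
else if(score >= 60 && score <= 69)
   System.out.println("Okay");                 // assumed message
else if(score >= 0 && score <= 59)
   System.out.println("Poor");                 // assumed message
else
   System.out.println("Invalid exam score.");
```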
Note that each if statement has an && to check for a range of values. However, wasn't it suggested in Sect. 3.5 to avoid this? Yes, it was, but in previous examples, such as the temperature example, there were no upper and lower bounds, whereas in this case there are the bounds of 0 and 100. Although it appears necessary to include a range in each if statement in this example, is there a way that it could be rewritten to avoid having to include an and operator in every if statement? The answer is yes, where an extra if statement can be placed prior to the other if statements.

This can be written as an if-then-else-if structure starting with if(score < 0 || score > 100) and with the error message at the beginning, or it can be coded as an if-then-if, which allows the error message to be written at the end. To reflect the preferred order of the switch statement, the latter if structure is chosen.

Note the use of De Morgan's rules in the first if statement, where if(score < 0 || score > 100) becomes if(score >= 0 && score <= 100): the || is replaced with an && and the relations are reversed. With an if statement checking the range of the scores added at the beginning, there is no longer a need to have an and operator in each of the subsequent if statements, which simplifies the code. Also, the extra if at the beginning makes it so the last if statement checking for the range from 0 to 59 can be eliminated, since after all the previous if statements, the only scores left would be in that range. Although an if-then-if is used as the outer if, the last nested if has its own else statement, and therefore the problem of a dangling else is avoided.

As can be seen, the exam score problem can be implemented relatively easily using nested if statements, but how could this be implemented using a switch statement? Does there need to be a separate case for each of the 101 possibilities? Without using an arithmetic expression, the answer would be yes. However, since the messages output are based upon exam scores in multiples of 10, if one thinks about it for a minute, there is a solution to this problem. What if each number is divided by 10? For example, if the score 98 is divided by 10, then the answer appears to be 9.8. But wasn't it said previously that the switch statement can't be used with floating point numbers? The answer is yes. However, recall that an integer divided by an integer is an integer, so the answer above would be just 9, not 9.8. Since each division results in an integer, the control can be transferred to the appropriate case. As another example, what if the value in score is a 70 or 79? Then, 70/10 is 7 and 79/10 is also 7, so in both cases a message of "Fair" could be output.

But what about values that fall outside the range, such as −10 and 110? When divided by 10, they result in −1 and 11, respectively, and would be caught by the default statement. However, what about numbers like −1 and 101? When divided by 10, they would result in 0 and 10, respectively, so clearly this would not work. The solution is similar to the preceding nested if structure as shown below:
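A sketch of this solution follows, again with the messages for the two lowest ranges assumed; its structure matches the description in the next paragraph:

```java
if(score >= 0 && score <= 100)
   switch(score / 10) {
      case 10:
      case 9:  System.out.println("Very Good");
               break;
      case 8:  System.out.println("Good");
               break;
      case 7:  System.out.println("Fair");
               break;
      case 6:  System.out.println("Okay");     // assumed message
               break;
      case 5:
      case 4:
      case 3:
      case 2:
      case 1:
      case 0:  System.out.println("Poor");     // assumed message
   }
else
   System.out.println("Invalid exam score.");
```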
Notice that there are no braces around the switch statement in the then section of the if-then-else statement, because it is syntactically only one statement. Since the value in score is being divided by 10, will the value in score be altered? No, because as discussed in Chap. 1, the variable score is not being assigned a new value. Also notice that there is no default statement, because the error message is part of the else section of the if-then-else statement. Lastly, note that since case 0: is the last statement, the break statement is not included prior to the closing brace of the switch statement.

Given that it appears that the switch statement can solve this problem, when should the switch statement be used instead of nested if statements? Granted, the above solution was helpful in this instance because each of the message categories spanned a multiple of 10. If other problems involve multiples of other particular values, then the switch statement can be just as useful. However, if the categories are not all of the same multiple, then the switch statement might not be as useful, and nested if statements are probably a better solution to the problem.

In general, if statements can work in all instances, and the switch statement has various limitations. If there are only one or two alternatives, then the if-then or if-then-else structures are probably the best choice, because using the switch statement is probably overkill. Likewise, if there are only three or possibly four alternatives, then the if-then-else-if will be used by this text to give the reader practice with using nested if statements. If the problem has five or more alternatives, then the switch statement can be the better choice. However, if the number of cases for each alternative is too large, then nested if statements might again provide the best solution.

## 3.7 Complete Programs: Implementing Selection Structures

The first program in this section is a simple program that does not include objects, whereas the second program incorporates objects to help reinforce concepts learned in Chap. 2.

### 3.7.1 Simple Program

Hurricanes are classified into five categories by the US National Oceanic and Atmospheric Administration (NOAA) based on the speed of the wind as shown below:

| Category | Wind speed (mph) |
|---|---|
| 1 | 74–95 |
| 2 | 96–110 |
| 3 | 111–130 |
| 4 | 131–155 |
| 5 | Over 155 |

In this section, a program that uses selection structures to categorize a hurricane will be developed. As in the past two chapters, this program will be developed step by step. First, the problem that will be solved is:

Problem Statement: Write a program to classify a hurricane.

Once a problem statement is given, the requirements can be established by analyzing the problem. The program will:

* Accept the wind speed of a hurricane from a user

* Determine the category of the hurricane

* Display the category of the hurricane

Because of the nature of the problem, a selection structure will be used. Since there are five alternatives, five separate if statements could be used to check the range of the wind speed. Assuming the wind speed is stored in the variable windSpeed, a possible solution is shown below:

```java
if(windSpeed >= 74 && windSpeed <= 95)
   System.out.println("The hurricane is category 1.");
if(windSpeed >= 96 && windSpeed <= 110)
   System.out.println("The hurricane is category 2.");
if(windSpeed >= 111 && windSpeed <= 130)
   System.out.println("The hurricane is category 3.");
if(windSpeed >= 131 && windSpeed <= 155)
   System.out.println("The hurricane is category 4.");
if(windSpeed > 155)
   System.out.println("The hurricane is category 5.");
```

Is this a good design? The answer is no, because all five conditions will be checked every time the program is run, as was discussed in Sect. 3.3. This means a nested if structure would be a better choice. How can the conditions be nested? One solution keeps the conditions in the same order and nests each subsequent if statement in the else section of the previous one.

Is this a good design? It is better than the first solution, because whenever a condition becomes true, the rest of the conditions will not be checked. However, it is always a good idea to reduce the number of logical operators. The complete code shown below checks the wind speed in reverse order, so that a logical operator is not required in the first if statement nor in the subsequent if statements:
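The following sketch of the complete program assumes the class name and the handling of wind speeds from 0 through 73, which the discussion does not specify:

```java
import java.util.Scanner;

public class HurricaneCategory {                 // class name is an assumption
  public static void main(String[] args) {
    Scanner scanner = new Scanner(System.in);
    int windSpeed;

    System.out.print("Enter the wind speed (mph): ");
    windSpeed = scanner.nextInt();

    if(windSpeed > 155)
      System.out.println("The hurricane is category 5.");
    else
      if(windSpeed >= 131)
        System.out.println("The hurricane is category 4.");
      else
        if(windSpeed >= 111)
          System.out.println("The hurricane is category 3.");
        else
          if(windSpeed >= 96)
            System.out.println("The hurricane is category 2.");
          else
            if(windSpeed >= 74)
              System.out.println("The hurricane is category 1.");
            else
              if(windSpeed >= 0)
                System.out.println("The storm is not a hurricane.");  // assumed handling of 0-73
              else
                System.out.println("Invalid wind speed.");
  }
}
```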
Notice that the code is indented only two spaces instead of three to help conserve space. Although three spaces is preferred, when using a number other than three, be sure to be consistent. When the above program is compiled and executed using the sample input of 125, the output of the program looks like this:

```
Enter the wind speed (mph): 125
The hurricane is category 3.
```

The first two conditions were false, and since the third condition was true, the program found that the hurricane was category 3. The flow of control skipped the rest of the conditions in the nested selection structure and reached the end of the program. The program also checks for an invalid wind speed, which is any negative value. When the program is executed with -50 as the wind speed, the output looks as shown below:

```
Enter the wind speed (mph): -50
Invalid wind speed.
```

### 3.7.2 Program with Objects

How can the concept of objects, discussed in Chap. 2, be incorporated into the program in the previous section? If an object for a hurricane is created, information about a particular hurricane, such as its wind speed and category, can be stored inside of the object, and two hurricanes can be compared. Figure 3.10 contains the code defining the class for a Hurricane object.

Fig. 3.10

Hurricane class
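A sketch of such a class, consistent with the description that follows, is shown below; the setCategory and getCategory methods are named in the text, whereas the data members and the setWindSpeed mutator are assumptions:

```java
public class Hurricane {
  private int windSpeed;                 // assumed data member
  private int category;                  // assumed data member

  public void setWindSpeed(int speed) {  // assumed mutator
    windSpeed = speed;
  }

  // uses the wind speed stored in the object, so no parameters are needed
  public void setCategory() {
    if(windSpeed > 155)
      category = 5;
    else if(windSpeed >= 131)
      category = 4;
    else if(windSpeed >= 111)
      category = 3;
    else if(windSpeed >= 96)
      category = 2;
    else if(windSpeed >= 74)
      category = 1;
    else
      category = 0;                      // assumed: not a hurricane or invalid
  }

  public int getCategory() {
    return category;
  }
}
```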
Notice the setCategory method uses the value of windSpeed, which is stored in the object, to determine the category of the hurricane. As a result, the setCategory method does not require any parameters. In the main program shown in Fig. 3.11, two hurricane objects are created. After a user enters the wind speeds of both hurricanes, the program determines the categories and outputs them. Then, it compares the categories of the two hurricanes to determine the stronger storm.

Fig. 3.11

A client program for Hurricane class

The stronger hurricane can be found by comparing the categories of the two hurricanes. Since the value of the category is stored in each object, it can be retrieved by using an accessor, the getCategory method. When the above program is compiled and executed using the sample input of 100 and 160, the output of the program looks as given below:

```
Enter the wind speed (hurricane1): 100
Enter the wind speed (hurricane2): 160
Hurricane1 is category 2.
Hurricane2 is category 5.
Hurricane2 is stronger.
```

## 3.8 Summary

* The then and else sections of an if statement can syntactically contain only one statement. Should more than one statement need to be included, use a compound statement by putting two or more statements in braces. If there is only one statement in the then or else section, braces are not needed and should not be used.

* Empty then or else sections should be avoided in if-then-else statements, and the code should be rewritten as an if-then.

* When nesting if statements, the if-then-else-if structure tends to be used more often than the if-then-if structure. When using the if-then-if structure, be careful to avoid the dangling else problem.

* Logical operator precedence from highest to lowest is: parentheses (innermost nested first), !, &&, and ||; operators of equal precedence are evaluated left to right.

* De Morgan's laws are: not (a and b) = (not a) or (not b), and not (a or b) = (not a) and (not b).

* The switch statement works well with integer and character data but is not as useful with floating point or double precision data.

* Generally, be sure to include a break statement after every case section, except for the last one, unless there is a default statement at the end.

* Although a default statement is not required in a switch statement, it is usually a good idea to include one at the end, and it does not need a break statement.

* Should there be only one or two alternatives, use an if-then or if-then-else statement, respectively, and avoid the use of a switch statement. If there are three or four alternatives, a switch could be used, but in this text nested if statements will be used. Lastly, if there are five or more alternatives, a switch statement should be used if possible.

## 3.9 Exercises (Items Marked with an * Have Solutions in Appendix E)

1. Given the code segment below, indicate the output for the following initial values of y:

```java
int x = 50;
if(y > 10)
   x = 30;
if(y < 20)
   x = 40;
System.out.println(x);
```

*A. What is the output if the integer variable y contains 10?

B. What is the output if the integer variable y contains 15?

C. What is the output if the integer variable y contains 30?

2. Given the code segment below, indicate the output for the following initial values of x and y:

A. What is the output if the integer variable x contains 10 and y contains −15?

*B. What is the output if the integer variable x contains 100 and y contains 20?

C. What is the output if the integer variable x contains 200 and y contains −100?

3. Given the code segment below, indicate the output for the following initial values of x, y, and z:

A. What is the output if the integer variable x contains 1, y contains 0, and z contains 2?

B. What is the output if the integer variable x contains 0, y contains 1, and z contains −1?

*C. What is the output if the integer variable x contains 1, y contains 2, and z contains 1?

4. Declare a Boolean variable, isEligible, and assign it a value of false.

5. Evaluate each Boolean expression as true or false. Show intermediate steps. Assume int num1 = 5, int num2 = -2, int num3 = 0, boolean flag1 = true, and boolean flag2 = false.

*A. num1 > num2 || flag2

B. num1 < num2 && num3 >= 0

*C. num2 < 0 || flag1 && flag2

D. (num2 < 0 || flag1) && flag2

*E. (num2 < 0 || !flag1) && flag2

F. num1 != 0 && num2 != 0 && num3 != 0

6. Using a truth table, show that the first of De Morgan's laws discussed in Sect. 3.5 is correct.

7. Using a truth table, show that the second of De Morgan's laws discussed in Sect. 3.5 is correct.

*8. Write a code segment to ask a user to enter a number between 1 and 4, and print the name of the class (Freshman, Sophomore, Junior, and Senior) corresponding to the number. Use a case structure.

*9. Repeat the previous exercise using a selection structure instead of a case structure.

10. Write a code segment to ask a user to enter a number between 1 and 12, and print the name of the month corresponding to the number. Use a selection structure.

11. Repeat the previous exercise using a case structure instead of a selection structure.

12. In Sect. 3.5 it was mentioned that a mathematical expression like 0 < temp < 100 would cause a syntax error if used as a condition in an if-then structure in a Java program. Explain why.

13. The dew point temperature is a good indicator of how humid it feels during a hot day. The US National Weather Service (NWS) summarizes the human perception of humidity using the dew point temperatures shown in the table below.

| Dew point temperature (°F) | Human perception |
|---|---|
| 75 or higher | Extremely uncomfortable |
| 70–74 | Very humid |
| 65–69 | Somewhat uncomfortable |
| 60–64 | OK |
| 55–59 | Comfortable |
| 50–54 | Very comfortable |
| 49 or lower | A bit dry |

Write a complete program using a selection structure to output how a person feels for a given dew point temperature. The program should perform the following tasks:

a. Allow the user to enter a dew point temperature.

b. Determine the human perception for a given dew point temperature.

c. Output the corresponding phrase from the table.

Here is some sample input and output:

```
Enter a dew point temperature (F): 55
Comfortable

Enter a dew point temperature (F): 30
A bit dry

Enter a dew point temperature (F): 90
Extremely uncomfortable

Enter a dew point temperature (F): 65
Somewhat uncomfortable
```

14. Repeat the previous exercise using a case structure instead of a selection structure.

15. Write a complete program to compare the temperatures of three different cities and find the hottest city. First, implement a class called Thermometer as described below:

A. Thermometer has one private data member, temperature, of type double.

B. Write code for a constructor to set the data member to the default value of 0.0.

C. Write code for an accessor method, getTemperature, which returns the value of the appropriate data member.

D. Write code for a mutator method, setTemperature, which has one formal parameter and stores it as the value of the data member.

Then, write a client program to test the Thermometer class defined above. Call this class Temperatures. The main method should perform the following tasks:

E. Allow the user to enter the temperatures of three cities.

F. Declare and create three Thermometer objects, setting the instance data member to the values entered by the user.

G. If city1 is the hottest city among the three cities, output a phrase like "City1 is the hottest city."

Here is some sample input and output:

```
Enter the temperature of city1: 93.4
Enter the temperature of city2: 76.1
Enter the temperature of city3: 85.8
City1 is the hottest city.

Enter the temperature of city1: 76.5
Enter the temperature of city2: 85.2
Enter the temperature of city3: 66.9
City2 is the hottest city.
```

© Springer-Verlag London 2014

# 4. Iteration Structures

James T. Streib¹ and Takako Soma¹

(1) Department of Computer Science, Illinois College, Jacksonville, IL, USA

Abstract

This chapter shows how iteration structures work using flowcharts, pseudocode, and Java. It includes pretest indefinite loop structures: both count- and sentinel-controlled while loops. The posttest indefinite do-while loop and the definite iteration for loop are also discussed. Nested loops and potential problems are examined, and complete programs both with and without objects are included.

## 4.1 Introduction

Selection structures, discussed in Chap. 3, allow a program to follow one of two or more paths. Iteration structures, sometimes called repetition structures, allow a program to repeat a section of code many times.
It is this capability to repeat or loop that gives the computer the ability to perform a task over and over again.

Any type of loop will generally have three parts: an initialization, a test, and a change. When performing a repetitive task, one typically does not think about the particular steps of the repetition, but taking a moment to think about the process, one can recognize these three components. For example, if a student needs to do a number of homework problems for a mathematics class, they might count each of the problems, starting with the number one. This can be seen as the initialization phase, which is performed just once. As the student starts to do the first problem, they might look at their notes to see how many problems they need to do, where in this example the student might need to do ten problems. Noticing that the count of one has not passed the number ten, the student realizes the assigned homework is not completed. This is known as the test phase. As the student finishes the first problem, the student then counts to the next number, two, and this act of counting is the change phase of the repetitive process. The student again compares the count to the number of problems to be completed. This process of counting and comparing is the repetitive process of change and test. The process continues until the student has finished the tenth problem and the iterative process stops. Although this detailed analysis is much more than what a person does when performing a repetitive task, it is what the computer needs to do to perform a loop.

In particular, this chapter will examine indefinite and definite loop structures. The first type of loop iterates an unknown number of times, whereas the second type loops a fixed number of times. The first of these two can be divided into what are known as pretest and posttest loop structures, where the former has the test or conditional expression at the beginning of the loop and the latter has the conditional expression at the end of the loop. Since the pretest indefinite loop structure is probably the most versatile, it is discussed first.

## 4.2 Pretest Indefinite Loop Structure

A pretest indefinite loop structure is a loop that has the test or conditional expression at the beginning of the loop and can iterate an indefinite number of times. An indefinite loop structure can also be made to loop a fixed number of times, and this is one of the reasons it is a very useful loop structure. The pretest indefinite loop structure in Java is known as a while loop. The while loop can generically be represented in a flowchart as shown in Fig. 4.1.

Fig. 4.1

Generic while loop

At first glance, the flowchart of the while loop might appear similar to the flowchart for the if structure presented in the last chapter. This might be because of the diamond-shaped conditional expression near the top of the flowchart, but upon closer examination, one should be able to see a number of differences. The first box is for the initialization of a variable, which occurs just once. That is followed by the diamond-shaped box where the test of the variable occurs. Note that like the if structure, there is a true and a false branch, but instead of the true branch going off to the right, it is pointing downward.
Further, note that the two branches do not meet together at the bottom, but instead the false branch goes to the box with the "End of Program" message and the true branch ultimately ends up going back to the test. It is the true branch that forms the actual loop. The first section in the loop is known as the body of the loop. It is here that any task or tasks that need to be performed repetitively can be placed. This can be any sort of input, processing, or output that needs to be performed. The body of the loop can also include nested if structures or even nested loops as will be shown later in this chapter. Lastly, the change to the variable occurs before the flow of control loops back to the test. Although the change can occur anywhere in the loop, it is best to be consistent in its placement, and for now it is the last thing that is done in the loop.

### 4.2.1 Count-Controlled Indefinite Iteration Structure

Although the generic flowchart is fine for understanding the basic layout and concept of a loop, it is helpful to see exactly how the loop performs. In the next flowchart, the initialize, test, and change are replaced with more specific statements. In this case, the loop is known as a count-controlled loop and the variable controlling the loop is sometimes called the Loop Control Variable (LCV). In this example, the LCV will be the variable i as shown in Fig. 4.2.

Fig. 4.2

Count-controlled while loop

To understand the loop, the best thing to do is walk through the logic. First, the variable i in the flowchart is initialized to 1. Then, the variable i is tested to see if it is less than or equal to 3, which is true. The body of the loop is executed for the first time and the value of i is incremented by 1, so that the value of i is equal to 2. The flow of control is returned back to the test, where i is less than or equal to 3. The body of the loop is executed for the second time, and the value of i is incremented to 3. The value is tested again and i is still less than or equal to 3, so the body of the loop is executed for the third time and the value of i is incremented to 4. The next time the value is tested, it is no longer less than or equal to 3, so the false branch is taken and the message "End of Program" is output. In the end, the final value of i is 4 and the body of the loop was executed three times.

As in the previous chapter on if structures, it is nice to examine the pseudocode equivalent of the while structure as seen below:

* i ← 1

* while i ≤ 3 do

* //body of loop

* i ← i + 1

* output "End of Program"

First, note that the while is written as while i ≤ 3 do, where while-do is a common way to describe the while loop structure. Of course, if one wanted to write it as while (i ≤ 3) to make the pseudocode look more like the Java language as will be seen shortly, that is okay. However, it is recommended that whatever style of pseudocode is chosen, it should be consistent. As with if structures, note that the body of the loop, including the increment, is indented approximately three spaces. Lastly, note that the output statement is not in the loop so it is not indented.
As one might suspect, the Java syntax is similar to the pseudocode as shown below:

```java
i = 1;
while(i <= 3) {
   // body of loop
   i = i + 1;
}
System.out.println("End of Program");
```

The first line is the initialization, the second line is the test with the conditional expression in parentheses like an if statement, and the increment of the variable i is inside the compound statement. Note that the statement i++ could be used instead as shown in Chap. 1, and this style is often used in loops. Notice that braces are being used around the comment concerning the body of the loop and also the increment. Are these braces required in this particular code segment? At first the answer might seem to be yes, because there appear to be two statements in the loop. However, recall from Chap. 1 that comments are ignored by the compiler, so technically there is only one statement in the loop and the answer to the question is no. Why then are there braces included in the above segment? The reason is that in addition to the increment, there are usually other statements in the body of the loop. It is uncommon to see only one statement in a while loop, so braces are included in the above example in anticipation of more statements being added later.

What if the user wanted to loop a different number of times other than three? That would require the user to modify and recompile the program, but many users do not have knowledge of programming. To expand upon the above, the value 3 could be changed to an integer variable n, and the value for n could be prompted for and input from the user as shown below:

```java
System.out.print("Enter the number of times to loop: ");
n = scanner.nextInt();
i = 1;
while(i <= n) {
   // body of loop
   i = i + 1;
}
System.out.println("End of Program");
```

If the user entered the value 3, the loop would still iterate three times as it did before. Further, the user now has the option to enter any other number for the value of n which allows the loop to have more versatility. However, what if the user entered a value of 0 instead? One other important thing about a while loop is that it is known as a pretest loop, meaning that the test is at the beginning of the loop. In this particular case, the variable i is initialized to 1 and then the comparison would be performed. Since the 1 in the variable i is not less than or equal to the 0 in the variable n, the result would be false and the body of the loop would not be executed. This is one of the important features about a pretest loop because the body of the loop might be executed anywhere from zero to many times. This is a reason why the while loop is one of the more versatile loops as will be seen below.

As an example of how the while loop structure can be used to solve a problem in the Java language, consider a user who wants to add a series of numbers. If there are a relatively small fixed number of integers to be added, then a loop might not be necessary. Consider the following program that would add three numbers entered by the user:

```java
int num1, num2, num3, total;

System.out.print("Enter an integer to be summed: ");
num1 = scanner.nextInt();
System.out.print("Enter an integer to be summed: ");
num2 = scanner.nextInt();
System.out.print("Enter an integer to be summed: ");
num3 = scanner.nextInt();
total = num1 + num2 + num3;
System.out.println("The total is " + total);
```

Although the above works, what if there were a large number of integers to be added, say 1,000?
The number of variables, prompts, and inputs would be overwhelming when writing the code, and the program would also take up a lot of memory. Returning to the example above where only three numbers need to be added, the number of variables used to store the input could be reduced to one. This would make the task a little easier, but more importantly it paves the way to see how the problem could be solved using a loop.

Using only a single variable num instead of three variables, the first integer could be prompted for, input, and placed into the variable total. The second integer could be input into the variable num and added to the variable total. The same would occur with the third integer, and then the sum in total would be output.

In that version, the three prompts and inputs look the same, but the assignment of the first integer into total makes the first group of statements different from the subsequent ones. The last two groups of statements could be placed in a loop, but the first group could not be placed in the loop. It would be convenient if there did not need to be an exception, so instead of assigning the first value input into total, the variable total could be initialized to zero; thus, the first value input into num could be added to the variable total just as all the other integers are.

The first group is no longer a special case, so it can also be put into a loop that iterates three times. The body of the loop would contain a prompt and input for the integer num, followed by the variable num added to the variable total. However, to allow for the first time num is added, the variable total would need to be initialized to zero prior to the loop. Then each time through the loop, the current value in num could be added to the previous value in the variable total. The first time through the loop, the value in num would be added to the zero in total, the second time to the previous value in total, and so on until the loop terminates, and the final value in the variable total is the sum of all the integers input.
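Assuming, as before, a Scanner object named scanner, the resulting segment might look like the following sketch:

```java
int num, total, i;

total = 0;
i = 1;
while(i <= 3) {
   System.out.print("Enter an integer to be summed: ");
   num = scanner.nextInt();
   total = total + num;
   i = i + 1;
}
System.out.println("The total is " + total);
```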
Notice that the basic loop is the same as the loop presented earlier, with the initialization, test, and change of the variable i. Also note that the variable total is initialized to zero so that the integers input can be summed. Lastly, notice that three statements from the previous code segment are no longer written three times, but rather only once, because the loop will iterate three times and accomplish the same task.

How does one know what belongs inside the loop and what belongs outside the loop? If outside the loop, does it belong before or after the loop? By looking for patterns on a smaller number of items, one should be able to see those items that need to be repeated and those items that need to be executed only once. In the above example, the variables for counting and the total need to be initialized only once, and they should be placed prior to the loop. Since the output of the total needs to occur only once, it should be placed outside and after the loop. Further, since there are three integers to be prompted for, input, and summed, that code should be placed inside the loop. An advantage of the above code segment is that whether just three values were being input or 1,000 values were being input, the only thing that would need to be changed is the number 3 in the while statement. This version of the code is much easier to write than straight-line code and also takes up less memory.

The previous code segment is a significant step forward in utilizing the power of the computer to perform repetitive tasks; however, it can be improved. As it is currently written, if the user wants to input and sum four integers instead of three, the user would have to edit and recompile the program. Since most users are not programmers, is there a way to make this program easier to use? The answer is yes. As before, a prompt and input can be placed prior to the loop to allow the user to input the number of integers to be summed as shown below:

```java
int num, total, i, n;

total = 0;
i = 1;
System.out.print("Enter the # of integers to be summed: ");
n = scanner.nextInt();
while(i <= n) {
   System.out.print("Enter an integer to be summed: ");
   num = scanner.nextInt();
   total = total + num;
   i = i + 1;
}
System.out.println("The total is " + total);
```

Notice the prompt and input of the variable n prior to the while statement, and also notice that the number 3 in the while statement has been changed to the variable n. Again, this makes the program much more useful, since it does not require the user to make changes to the program. For example, if the user started the program and then decided that they did not want to sum any integers, the user could just enter the number 0, and since the while loop is a pretest loop, the user would never be prompted to input any integers. Further, since total was initialized to 0, the message indicating a total of 0 would be output also.

There are of course other tasks that could be added to the above program. For example, what if the user wanted to find the average of the integers entered? How would this be written? Since total needs to be divided by the number of items, one thought is to use the value in the variable i. However, its final value is one more than the number of items entered. Since it was initialized to 1, if three items were input, it would contain the number 4 at the end of the loop. That value could be decremented by one to make it the correct number, but why use the counter when the variable n contains the number of items, which was originally entered by the user? The answer is that the use of the variable n is the better choice, as shown in the following code segment:
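The following sketch reconstructs the segment under discussion; note that it deliberately contains the problems examined next:

```java
int num, total, i, n;
double average;

total = 0;
i = 1;
System.out.print("Enter the # of integers to be summed: ");
n = scanner.nextInt();
while(i <= n) {
   System.out.print("Enter an integer to be summed: ");
   num = scanner.nextInt();
   total = total + num;
   i = i + 1;
}
average = total / n;      // the problems discussed below occur here
System.out.println("The total is " + total);
System.out.println("The average is " + average);
```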
First, notice that average is declared as type double. Also, note that the calculation of the average is outside the loop at the end of the segment, because the average only needs to be calculated once. Offhand, the above segment appears to be fairly good. However, there are a few problems with it. If the program were executed using a 3 for the first prompt and then using the three integers 5, 7, and 8 for the values to be summed and averaged, what would the answer be? Using a calculator, one would say 6.666..., but is this the answer that the program would generate? The answer is no, because the program would output the answer 6.0, which is incorrect. The variable average is of type double, so that is not the problem. However, look carefully at the division on the right side of the assignment symbol. Recall from Chap. 1 that an integer divided by an integer is an integer, which in this case is 6. The assignment of the integer to a variable of type double causes the 6 to be changed to 6.0, which is the number that is output. How can this be corrected? The answer from Chap. 1 is to use a (double) typecast operator on one of the variables involved in the division, which will force the answer to be of type double. Also, it would help to format the output so that it would not be a repeating decimal.

There is another problem with the previous code segment that might not be as readily apparent. What would happen if the user entered a 0 for the number of items to be summed and averaged? As discussed previously, the user would not be prompted for integers to be entered. The problem occurs after the loop in the division statement. The value in n would be a 0, which would cause an execution error, or in other words a run-time error. How could this problem be solved? An if statement could be included so that division would not occur unless the value in n is positive. Should the average message still be output? That would depend on the original specifications. In this case it would not hurt to still output the message, but it would probably be a good idea to ensure that the variable average contained the value 0. The updated program with all of the above changes can be seen below:
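A sketch of the updated program follows; the use of printf to format the average to two decimal places is one possible way to avoid the repeating decimal:

```java
int num, total, i, n;
double average;

total = 0;
i = 1;
System.out.print("Enter the # of integers to be summed: ");
n = scanner.nextInt();
while(i <= n) {
   System.out.print("Enter an integer to be summed: ");
   num = scanner.nextInt();
   total = total + num;
   i = i + 1;
}
if(n > 0)
   average = (double) total / n;   // typecast forces floating point division
else
   average = 0.0;                  // avoids division by zero
System.out.println("The total is " + total);
System.out.printf("The average is %.2f%n", average);
```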
Although typically users will not enter a negative number or the number 0 as the number of items to be summed, programmers need to write programs that work correctly under such circumstances. The old adage "If something can go wrong, it will" applies to software development as well. As a result, these sorts of possibilities should also be addressed in the design and specifications of programs so that they will be taken care of properly when a program is written. This sort of programming is known as robust programming and will be discussed at various points throughout the text. However, at other times it will not be included, to keep the focus on a new concept being introduced and to save space. When encountering an assignment or specifications for a programming project that lack robustness, it is always advisable to check with the user, or the instructor when in a classroom setting.

### 4.2.2 Sentinel Controlled Loop

The use of a prompt in the previous program to indicate how many integers will be entered is better than having the number "hard coded" into the program. A disadvantage of the previous loop structure is that it requires the user to know in advance how many integers will be entered prior to running the program. If the user miscounts the number of integers, the program will not work correctly. For example, if the user overcounts the number of integers, then the user will have one or more extra prompts to enter data and the average will be off, which is unacceptable. If the user undercounts the number of integers, then the user will have leftover data and again the average will be off. In these cases the only real alternative is for the user to restart the program from the beginning. Although this is not much of a problem for a small data set, it is clearly impractical for a large number of data items.

Instead of having the user count all the data items prior to running the program, wouldn't it be useful to have the program do the counting for the user? This can be accomplished using a sentinel controlled loop, or what is sometimes called an End of Data (EOD) loop, which is usually implemented using a while loop. The idea is that the user continues to enter data until a sentinel value or end of data indicator is entered, indicating that the end of data has been reached. The key is that the sentinel or EOD indicator must be a value that is different from the other data values. Using the above example, if only nonnegative integers were entered, then a negative integer such as -1 could be used as a sentinel.

The main disadvantage of this method is that sometimes there is not an acceptable value that can serve as a sentinel, but in those instances where a sentinel is available, the sentinel controlled loop is better than the previous count-controlled loop. Although a count is not necessary to control the loop anymore, a count can be added to the program to help calculate the average as will be seen later.

As always, it is helpful to begin with an example as shown in the following code segment:

```java
System.out.print("Enter a non-negative integer or -1 to stop: ");
num = scanner.nextInt();
while(num != -1) {
   // body of loop
   System.out.print("Enter a non-negative integer or -1 to stop: ");
   num = scanner.nextInt();
}
System.out.println("End of Program");
```

The first thing to notice is that the variable i is no longer controlling the loop. Since the while loop does not need a counter, it is called an indefinite loop structure. Whereas in the previous section one could tell how many times the loop would iterate merely by looking at it, such as looping 3 times or in some cases n times, here the number of times is not readily apparent and the code could loop indefinitely.

At first this loop might appear a little confusing, because the value num is prompted for and input in two places, once outside prior to the loop and another time inside at the end of the loop. However, if one takes a little time to think about the loop, it is not as confusing as it looks. First, the prompt and input outside prior to the loop is sometimes called a priming read. This can be thought of as the initialization section of the loop. The test portion of the loop includes the comparison of the value input into the variable num to the sentinel value of -1. If the value input is equal to the sentinel, then the loop is not executed; otherwise, the data can be processed in the body of the loop. The second prompt and input is the change portion of the loop, where all subsequent values are input. Again, if a subsequent value input is not equal to the sentinel, the value is processed; otherwise, the loop terminates.

A disadvantage of the above loop is that as written, only a value of -1 will terminate the loop. What would happen if the user input a -2? As can be seen, all other negative values would be processed in the body of the loop, which might not be what was intended. Instead, the prompt and test could be rewritten to include all negative numbers as sentinel values as shown below:

```java
System.out.print("Enter a non-negative integer ");
System.out.print("or a negative integer to stop: ");
num = scanner.nextInt();
while(num >= 0) {
   // body of loop
   System.out.print("Enter a non-negative integer ");
   System.out.print("or a negative integer to stop: ");
   num = scanner.nextInt();
}
System.out.println("End of Program");
```

Note that due to the length of the prompts, they are split into separate print statements, and the while statement now checks to see if num is greater than or equal to 0. Again, as long as the sentinel value is not part of the data to be processed, the sentinel controlled loop can prove to be a nice alternative to count-controlled loops.
To help illustrate the usefulness of this loop, the following code segment shows how it can be used to implement the calculation of total in the example from the previous section:

```java
int num, total;

total = 0;
System.out.print("Enter a non-negative integer to be summed ");
System.out.print("or a negative integer to stop: ");
num = scanner.nextInt();
while(num >= 0) {
   total = total + num;
   System.out.print("Enter a non-negative integer to be summed ");
   System.out.print("or a negative integer to stop: ");
   num = scanner.nextInt();
}
System.out.println("The total is " + total);
```

As before, the value of total should be initialized to 0 prior to the loop. Notice that adding num to total is the first line in the body of the loop. Is this correct? At first this might look a little strange, but it is correct. Remember that the priming read will input the first value to be summed. Also, sometimes beginning programmers think there should be an if statement before adding num to total because they think that the sentinel value might be included in the total. However, an if statement is not necessary because the while loop is a pretest loop, and if a sentinel value is input, the loop would terminate.

Can this loop be further expanded to include the calculation of the average as done previously? Yes, but a count will need to be added to the loop so that the total can be divided by the number of integers that are input as shown below:
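A sketch of this version follows (the exact output messages are assumed):

```java
int num, total, i;
double average;

total = 0;
i = 1;
System.out.print("Enter a non-negative integer to be summed ");
System.out.print("or a negative integer to stop: ");
num = scanner.nextInt();
while(num >= 0) {
   i = i + 1;
   total = total + num;
   System.out.print("Enter a non-negative integer to be summed ");
   System.out.print("or a negative integer to stop: ");
   num = scanner.nextInt();
}
if(i - 1 > 0)
   average = (double) total / (i - 1);
else
   average = 0.0;
System.out.println("The total is " + total);
System.out.println("The average is " + average);
```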
First notice that the value of i is initialized to 1 as has been done previously, and again it is incremented at the beginning of the loop prior to when total is calculated. Although the increment could be placed elsewhere, it is usually a good idea to keep all calculations together for ease of reading and modification of the code. Another thing to notice is that the variable i does not appear in the parentheses of the while statement. This again is because it is a sentinel controlled loop and not a count-controlled loop. Further, note the i - 1 in the if statement, because the final value in i is one more than the number of times the loop was executed. Also notice that the total is divided by (i - 1), because without the parentheses the division would be incorrect. However, instead of using i - 1 twice, it might be more convenient to subtract 1 from i and then use just i as shown in the code segment below:

```java
i = i - 1;
if(i > 0)
   average = (double) total / i;
else
   average = 0.0;
```

Although this method works, there is a more convenient way of solving this problem. Even though individuals tend to start counting from the number 1, it is often more helpful to have programs start counting from the number 0. By starting the count from 0, the final value in i will no longer be off by 1 at the end of the segment. This will become even more apparent in Chap. 7 on arrays, because an array actually starts at location 0. With this change, i is initialized to 0 instead of 1, the increment stays in the loop, and at the end of the loop i contains exactly the number of integers entered, so it can be used directly in the division.

So far the count-controlled loop and the sentinel controlled loop have been introduced separately. Is it possible to combine both in one loop? Given the information presented in Sect. 3.5 on logic operations, the answer is yes. For example, what if one wanted to have a sentinel controlled loop that would accept up to a maximum of 10 numbers? In other words, the user could keep entering data until a sentinel value was entered, but if a sentinel value was not entered, the loop would stop after 10 numbers had been entered.

The result is that the tests for the sentinel value and the count would need to occur in the while statement. Looking at a portion of the previous program, an && operator could be added to the while statement, as in while(num >= 0 && i < 10), so that the body of the loop is executed only when both the value in num is not equal to a sentinel value and the count is less than 10.

Note that the test for i is less than 10 instead of less than or equal to 10. This is because the variable i now begins at 0 instead of 1. If the value in num is greater than or equal to 0 and the count is less than 10, then the body of the loop is executed. However, if either the value in num is a sentinel value or the value in i is 10 or greater, then the loop will not be executed.

What if there isn't an acceptable value that can be used as a sentinel value? Another possibility is to repeatedly prompt the user and ask if there is any data to be entered. A prompt asking the user to enter a Y or N, for yes or no, respectively, could be output using a sentinel controlled loop. Then, if there is more data, the user could be prompted to input data for each iteration through the loop as shown below:
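In the sketch below, the variable name reply, the prompt wording, and the processing placeholder are assumptions:

```java
char reply;
int num;

System.out.print("Is there data to be entered? (Y/N): ");
reply = scanner.next().charAt(0);
while(reply == 'Y' || reply == 'y') {
   System.out.print("Enter an integer to be processed: ");
   num = scanner.nextInt();
   // process num here
   System.out.print("Is there more data to be entered? (Y/N): ");
   reply = scanner.next().charAt(0);
}
System.out.println("End of Program");
```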
Note that the while loop checks for either an uppercase Y or a lowercase y to make it convenient for the user. Also, notice that if the user does not respond with either Y or y, it is assumed that the user entered either N or n and the loop terminates. Further, the prompts for more data can be different as necessary, as shown by the inclusion of the word more in the last prompt above. The disadvantage of this program segment is that the user has to enter a character each time before entering the actual data to be processed, but if a suitable sentinel value cannot be found, then this might be the only alternative.

## 4.3 Posttest Indefinite Loop Structure

In addition to the pretest indefinite loop structure of the previous section, Java also has a posttest indefinite loop structure called the do-while structure. Whereas a pretest loop has its test at the beginning and the body of the loop may be executed zero to many times, the posttest loop structure has its test at the end of the loop and the body of the loop will be executed one to many times. In other words, regardless of the result of the test, the body of the posttest loop will be executed at least once. As before, looking at the flowchart is a good place to start, as shown in Fig. 4.3.

Fig. 4.3

Count-controlled do-while loop

It is easy to notice that the test condition is now located at the end of the loop instead of the beginning, thus showing it is a posttest loop structure. The body of the loop is executed while the condition is true, and when it is false, the flow of control falls through to the next statement. The above flowchart can be written in pseudocode as follows:

* i ← 1

* do

* //body of loop

* i ← i + 1

* while i ≤ 3

* output "End of Program"

As with previous pseudocode, the indenting indicates the body of the loop. As should be suspected, the Java code looks similar, as follows:

```java
i = 1;
do {
   // body of loop
   i = i + 1;
} while(i <= 3);
System.out.println("End of Program");
```

Notice the use of a compound statement, the { }, which is not optional within the do-while statement. Even if there is only one statement between the words do and while, a compound statement must be included. However, since the body of a do-while almost always has more than one statement, it is unlikely that one would forget to include the braces.

Modifying the above code segment to prompt the user to enter the number of times to loop, similar to the last section, results in the code segment below:

```java
System.out.print("Enter the number of times to loop: ");
n = scanner.nextInt();
i = 1;
do {
   // body of loop
   i = i + 1;
} while(i <= n);
System.out.println("End of Program");
```

How many times would the body of the loop be executed in the above code segment if the user entered a value of 0 for n? The answer is one. Unlike the answer of zero for the pretest loop structure, the body of the loop is executed at least once with a posttest loop structure, because the comparison is at the end after the body of the loop has been executed. If one did not want the above code to iterate once in the event that someone entered a value of 0 for n, how would the code need to be modified? If one thinks about it, an if statement would need to be added at the beginning of the body of the loop or just prior to the loop to check for a value of zero or a negative number. Of these two choices, the if would be better placed outside the loop so that it does not need to be checked through each iteration of the loop and is executed only once prior to the loop as shown below:

```java
System.out.print("Enter the number of times to loop: ");
n = scanner.nextInt();
if(n >= 1) {
   i = 1;
   do {
      // body of loop
      i = i + 1;
   } while(i <= n);
}
System.out.println("End of Program");
```

Although the above code segment solves the problem of iterating once through the loop when the value of n is 0 or negative, it does appear a little cumbersome with the use of both an if and a do-while statement. The above code segment can be easily implemented using a simple while loop as presented in the previous section and repeated below:

```java
System.out.print("Enter the number of times to loop: ");
n = scanner.nextInt();
i = 1;
while(i <= n) {
   // body of loop
   i = i + 1;
}
System.out.println("End of Program");
```

Clearly, the second example above using only the while loop is simpler than the previous example using an if and a do-while statement. This is not to say that the many examples in the previous section and other problems cannot be implemented using a do-while and an if statement (see the exercises at the end of the chapter). Rather, it is oftentimes simpler to use just the while statement instead. It is for this reason that the while statement tends to be used more often than the do-while statement.

Although in most cases having the test at the beginning is more convenient, there are some special cases where the do-while can be quite useful. For example, assume that for input a user has to input an integer between 0 and 10, inclusive. If the user enters a number outside the range, then the user needs to be re-prompted to input the number again. At first this might seem to be a good application for the if statement, but what if the user continues to enter the wrong number? A single if statement would allow the user only one chance to reenter a correct number. Instead, a loop would be a better choice. The problem could be solved using a while loop, but since the user has to be prompted at least once, the do-while might be a good choice as seen below:

```java
do {
   System.out.print("Enter a number between 0 and 10, inclusive: ");
   number = scanner.nextInt();
} while(number < 0 || number > 10);
```

The above loop provides a simple way to give a user multiple attempts to correct a problem with the input data.
However, a disadvantage of the above loop is that the user might continue on indefinitely entering the wrong number. A solution, sketched below, is to add a counter so that after a certain number of attempts, the loop stops. Then, an if statement after the loop can check the number of attempts and either use a default value or exit the program.
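In the following sketch, the limit of three attempts and the default value of 5 are assumptions for illustration:

```java
int number, attempts;

attempts = 0;
do {
   System.out.print("Enter a number between 0 and 10, inclusive: ");
   number = scanner.nextInt();
   attempts = attempts + 1;
} while((number < 0 || number > 10) && attempts < 3);

if(number < 0 || number > 10) {
   number = 5;                               // assumed default value
   System.out.println("Using the default value of " + number);
}
```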
Another disadvantage of the above code segment is that the subsequent message output is the same as the first one, so the user might not understand what they did incorrectly. If a more detailed message is needed, an if could be added to the body of the loop to check a flag and offer a different message, as shown below:

```java
firstAttempt = true;
do {
   if(firstAttempt)
      firstAttempt = false;
   else
      System.out.println(number + " is an incorrect number");
   System.out.print("Enter a number between 0 and 10, inclusive: ");
   number = scanner.nextInt();
} while(number < 0 || number > 10);
```

Note the firstAttempt flag is set to true prior to the loop in order to indicate the first attempt, and once in the loop, the flag is set to false to indicate subsequent attempts. In the case of a subsequent attempt, a message is output to the user indicating what was input so that they might see what was incorrect. Notice that regardless of whether it was the first attempt or a subsequent attempt, a number needs to be prompted for and input, so the prompt and input statements come after the if statement. However, the use of the flag and if statement might seem a little clumsy, so possibly a while loop could be used instead. The advantage here is that the message in the body of the loop can be different from the initial message used in the priming read as follows:

```java
System.out.print("Enter a number between 0 and 10, inclusive: ");
number = scanner.nextInt();
while(number < 0 || number > 10) {
   System.out.println(number + " is an incorrect number, try again");
   System.out.print("Enter a number between 0 and 10, inclusive: ");
   number = scanner.nextInt();
}
```

As suggested previously, a count could also be added so that after a certain number of attempts, the loop would stop. Again in this case, the pretest loop seems to be a little more appropriate than the posttest loop. In any event, a programmer should analyze the requirements and specifications of the program to be written and use the type of loop that best suits the task at hand.

## 4.4 Definite Iteration Loop Structure

As discussed in Sect. 4.2.1, the while loop can be used as a count-controlled loop. Since loops often need to iterate a fixed number of times, most languages include what is known as a definite iteration loop structure, or what is sometimes called a fixed iteration loop structure. In Java, this is called a for loop, and like the while loop, it is a pretest loop.

The for loop has a flowchart similar to the one shown previously in Fig. 4.2. However, instead of having the initialization and test as separate statements as they are in the while loop, they are included as part of the for loop statement. To help illustrate this in flowchart form, the diamond that has only the test portion of a while loop can be replaced with a rectangle that contains all three parts typically present in a loop (Fig. 4.4).

Fig. 4.4

Definite iteration loop flowchart

Notice that the initialization, test, and change are all located in one rectangle, signifying that all three operations are written in the same statement.

The optional internal arrows illustrate how the flow of control occurs within the statement. Notice that the order of operations is the same as with the previous flowchart for the while statement. The initialization is done just once prior to the loop. The test is done prior to the body of the loop, and the change occurs after the body of the loop.

The pseudocode for the for loop can be written as follows:

* for i ← 1 to 3 incremented by 1 do

* //body of loop

* output "End of Program"

In the for loop, the initialization is indicated as i ← 1, the to 3 is the test, and the change is the incremented by 1. Note that the use of the word do is optional and the body of the loop is indented. As before, the Java code follows:

```java
for(i=1; i<=3; i++)
   // body of loop

System.out.println("End of Program");
```

After the for in the parentheses are the initialization i=1, the test i<=3, and the change or increment i++, all separated by semicolons. Note that the increment is using the shortcut i++, which is common in a for statement. Also notice that there are no braces in this example around the body of the loop, because if there is only one statement, they are unnecessary. Since the change or increment of the variable i is in the for statement itself, it is not uncommon that there might be only one statement in the body of a for loop. However, if there is more than one statement in the body of the loop, the use of a compound statement is necessary. In the above example, it is assumed that the variable i is declared elsewhere, but it is also possible to declare the variable i within the for statement itself by preceding the initialization of i with the word int, as in for(int i=1; i<=3; i++). This is also a fairly common practice and will be used on many occasions in the future.

Note that it is possible to have more than one statement in each of the three sections that are separated by semicolons within the parentheses, where the statements in a section would be separated by commas. This gives the for statement quite a bit of flexibility, but it can become quite confusing and is considered by some to be poor programming practice. Since anything that can be done with a for loop can also be done with a while loop, should such a complex for loop need to be written, the programmer is usually better off writing the loop as a while loop. That being said, when should the for loop be used instead of a while loop? Since the for loop is typically thought of as a fixed iteration structure, it is in those situations where a fixed number of tasks need to be done that the for loop should be used.

As an example of using the for loop, assume that Java did not contain the pow function in the Math class. How could the power function be implemented using iteration? As before, whenever trying to solve a problem using iteration, it helps to write down an example using specific values to see if a pattern can be found, followed by a more general solution. For example, when trying to calculate x^n, where x is the number 2 and n is an integer greater than or equal to zero, then the following is a list of possible results:

* 2^0 = 1

* 2^1 = 1 * 2 = 2

* 2^2 = 1 * 2 * 2 = 4

* 2^3 = 1 * 2 * 2 * 2 = 8

* ...

* 2^n = 1 * 2 * 2 * 2 * ... * 2 (n times)

Further, if x is considered to be a positive nonzero integer in this example, then the above can be rewritten more generally as follows:

* x^0 = 1

* x^1 = 1 * x

* x^2 = 1 * x * x

* x^3 = 1 * x * x * x

* ...
+ + * x n = 1* x * x * x *... * x (n times) + +As stated above, when solving a problem, it is helpful to try and see if there is a pattern present. In the above example, it can be seen that 20 and x 0 are defined to be 1, so that might be a good starting point for initialization. Further, note that for any value of n, there appears to be that number of multiplications present. For example, 23 is 2 multiplied by itself 3 times. This might be useful in the test part of the loop where the loop might need to iterate n times. Further, since the loop will iterate a fixed number of times, this would be a good fit for the for loop. Using this information, the loop skeleton from above can be modified to solve the problem. + +First, four variables will need to be declared, the loop control variable i, variables for both x and n, and a variable for the result which could be named answer as shown below: + +int i,x,n,answer; + +The values for x and n would need to be prompted for and input from the user as in the following: + +System.out.print("Enter a value for x: "); + +x = scanner.nextInt(); + +System.out.print("Enter a value for n: "); + +n = scanner.nextInt(); + +Next, if the loop needs to loop n times, then instead of having the relational expression compare the loop control variable i to 3 as was done previously, couldn't it instead be compared to n? The answer is yes, where the loop would not iterate 3 times, but rather n times. Also note that the answer for x 0 is 1. Further, each line in the definition for x n begins with the number 1, so this might be a good initial value for the variable answer. The result is that the following code segment could implement the power function: + +int i,x,n,answer; + +System.out.print("Enter a value for x: "); + +x = scanner.nextInt(); + +System.out.print("Enter a value for n: "); + +n = scanner.nextInt(); + +answer = 1; + +for(i=1; i<=n; i++) + +answer = answer * x; + +System.out.println(x + " raised to the " + n + " power = " + answer); + +Notice that answer is initialized to 1, that the loop iterates n times, and that each time through the loop answer is multiplied by x. Also note that there is only one statement in the body of the for loop so a compound statement is not used. What would happen if 0 or a negative value were entered for the value of n? The result would be that the initial value 1 in the variable i would not be less than or equal to the value 0 in n. Since the for loop is a pretest loop structure, the loop would not iterate, and the initial value 1 in answer would be output. Could this problem have been solved using a count-controlled while loop? Yes, but since the loop needs to iterate a fixed number of times, the for loop is the better choice. As will be seen later, the for loop will be especially useful with arrays in Chap.​ 7. + +## 4.5 Nested Iteration Structures + +As seen in Sect. 4.3, iteration structures can be nested within selection structures, and the reverse can also occur. Further, iteration structures can also be nested within other iteration structures, and when using nested loops, they require some special considerations. To start, consider the following nested while loops: + +int i,j; + +i = 1; + +while(i <= 3) { + +j = 1; + +while(j <= 2) { + +System.out.println("i = " + i + " j = " + j); + +j = j + 1; + +} + +i = i + 1; + +} + +System.out.println("End of Program"); + +First, notice that the loop control variable for the outer loop is the variable i and the loop control variable for the inner loop is the variable j. 
Although it is okay to reuse the same variable when the loops are not nested, if the same variable is used for both loops in a nested loop, it might cause what is known as an infinite loop, as discussed in the next section. Given the above code segment, how many times will the inner println output its message? The answer is six times. If the outer loop iterates 3 times and the inner loop iterates 2 times, then one can multiply the number of times each loop iterates to get the answer, where 3 times 2 is 6. The output of the above code segment can be seen below:

i = 1 j = 1
i = 1 j = 2
i = 2 j = 1
i = 2 j = 2
i = 3 j = 1
i = 3 j = 2
End of Program

Note that the variable j counts to 2 and then starts over again when the value of i changes. In describing this behavior, it is often said that the inner loop control variable varies more rapidly, while the outer loop control variable varies more slowly. Looking at another segment, how many times would the message generated by the inner println be output in the following example?

int n,count;
System.out.print("Enter a value for n: ");
n = scanner.nextInt();
count = 0;
for(int i=1; i<=n; i++)
    for(int j=1; j<=n; j++)
        System.out.println("count = " + count++);
System.out.println("End of Program");

Although one might answer that it depends on the value in n, one can still give an answer in terms of n. Given the previous example, where the number of times the body of the loop was executed was equal to the number of iterations of the outer loop times the number of iterations of the inner loop, the same principle applies here. The outer loop iterates n times and the inner loop iterates n times, so n times n equals n². As a particular example, if the value of n was 6, then the body of the inner loop would execute 36 times.

First, note that the variables i and j are declared in the for statements. Second, notice that there are no compound statements in either for loop in the above code segment. The reason is that the inner for loop has just one statement in the body of its loop, and the inner for loop is itself just one statement in the body of the outer for loop, so braces are unnecessary. Lastly, note the use of count++, which increments the value of count after it has been output.

At present, the need for nested loops is not as great, but later in Chap. 7 nested loops will be important when data needs to be sorted, for example, in ascending order. Nested loops will also be important when dealing with what are known as two-dimensional arrays.

## 4.6 Potential Problems

There are a number of problems that can occur with loops, some of which have already been alluded to earlier in this chapter. For example, if the relation in the test section of a loop is incorrect, the loop might iterate more or fewer times than was originally intended. The best way to check for this is to walk through the code segment using a number small enough to make the walkthrough easy but big enough so that any pattern in the code can be observed. A good number to test with is the number 3, as has been used frequently in this chapter.

Just as it is important to check that the final number is correct, it is also important to ensure that the initial value is correct. For example, switching from the number 1 to the number 0 as the initial value usually requires a change in the relation in the test, as discussed in Sect. 4.2.

Another consideration is to be sure that the loop control variable is initialized in the first place.
If one forgets to initialize it, the value in the loop control variable would be indeterminate, and the loop might iterate an unknown number of times. Probably a more serious problem is when one forgets to include a change in the body of a loop. Even though the loop control variable has been initialized properly and is tested correctly, if there is no change in the loop, one has what is called an infinite loop, meaning the loop never stops. This can make it seem that the computer is "locked up" and not responding, or the program might ask for input or output messages without stopping.

Other concerns arise when incrementing the loop control variable by a value other than 1, such as counting by 2 and testing for only a particular value instead of a range of values, as in the following code segment:

i = 0;
while(i != 3) {
    // body of loop
    i = i + 2;
}
System.out.println("End of Program");

Notice that the value of i starts with the number 0, then is incremented to 2, and then 4, so the value in i is never equal to the number 3. Although it is okay to increment by values other than 1, it is important that the comparison use a range of numbers, such as <=3, and that the loop iterate the expected number of times.

One might have noticed that the loop control variables used so far have always been integers. A variable of type char can also be used, as will be shown in the next section. Although real numbers can be used, the computer cannot always represent real numbers exactly. For example, the number 0.1 cannot be represented exactly on a computer, because it is a repeating fraction in the binary number system (base 2); only an approximation of 0.1 is stored, and small rounding errors accumulate as the values are added. If one wrote a program such as the following and added the value 0.1 ten times, the result would not be equal to 1.0:

double i;
i = 0.0;
while(i < 1.0) {
    // body of loop
    i = i + 0.1;
}
System.out.println("End of Program");

Instead of looping ten times as might be expected, the above program actually iterates eleven times, because after ten additions the accumulated sum is slightly less than 1.0. Again, real numbers can be used, but it is generally not good practice; a common remedy is sketched below.
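A common remedy, shown here as a minimal sketch, is to control the loop with an integer counter and derive the real value from it only when needed, so that the termination test never depends on accumulated floating-point error:

int count;
double value;
for(count = 0; count < 10; count++) {
    value = count * 0.1;  // derive the real value from the exact integer counter
    // body of loop using value
}
System.out.println("End of Program");

Here the loop is guaranteed to iterate exactly ten times, because the test count < 10 uses only integers.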
As said previously, when writing loops, or any code for that matter, it is important to check programs carefully with smaller data sets and to also test the program thoroughly with actual data on the computer to help avoid the possibility of logic errors.

## 4.7 Complete Programs: Implementing Iteration Structures

As in Chap. 3, the first example does not use objects and the second example includes objects.

### 4.7.1 Simple Program

Using iteration structures and selection structures, one can write programs that are more complex and robust. Suppose that a program needs to be developed to find the average and the highest of the test scores in a course. This program will:

  * Allow a user to enter student exam scores, assuming a score is an integer value between 0 and 100
  * Compute the average and find the highest score
  * Display the average and the highest score

Since there will be more than one score that needs to be processed, instead of storing each score in a different variable, a loop will be used to input them. What kind of loop should be used? Because most likely every class has a different number of students, the number of iterations will not be known in advance. The program could ask the user to enter the number of students before the loop and use a while loop or a for loop. On the other hand, since the range of scores is given, a sentinel value can be easily identified in order to use a sentinel-controlled loop. It is not a good idea to use a do-while loop, because there may be no scores to be processed. Using a sentinel of −1, a pretest indefinite sentinel-controlled loop structure will be used here. When no score is entered, there is no reason to compute an average, find the highest score, or display them. Therefore, in that case the message "No scores were entered." will be output. Finding the average of numbers using a loop was discussed in Sect. 4.2, but what about finding the highest score? Since all of the scores are not saved, the highest value cannot be determined after the loop has terminated by looking at all the data at once. Then, how can the highest score be found as the scores are input? The answer is to keep track of the highest score among the scores entered so far. Assuming all the variables are declared appropriately, the following code finds the highest value entered:

// priming read
System.out.print("Enter a score or -1 to stop: ");
score = scanner.nextInt();
highestScore = score;
// loop to enter scores
while(score != -1) {
    if(highestScore < score)
        highestScore = score;
    System.out.print("Enter a score or -1 to stop: ");
    score = scanner.nextInt();
}

Notice that the first score input is used to initialize the variable highestScore, which keeps the highest value up to that point. If the first score is not −1, then in the loop the score is checked against the highest score. At this point, only one test score has been entered; therefore, the values of score and highestScore are the same, meaning the condition of the if statement is false. If the second value entered is not equal to −1, the body of the loop will be executed again. The second input is compared with the value of highestScore, which at this point holds the first value input. If the condition is false, it means the first value input is greater than the second. If the condition is true, it means the most recent value input is greater than the highest one so far, so highestScore needs to be updated. This process is repeated until the user enters the sentinel value of −1. At the end, the value of highestScore is the largest of all the scores input. The complete program is shown below:
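Since the original listing does not appear here, the following is a minimal reconstruction based on the surrounding description; the class name Scores is an assumption, and the loop is placed in the else section of the if-then-else as described next:

import java.util.Scanner;

public class Scores {
    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);
        int score, count, highestScore;
        double sum;
        count = 0;
        sum = 0.0;
        // priming read
        System.out.print("Enter a score or -1 to stop: ");
        score = scanner.nextInt();
        highestScore = score;
        if(score == -1)
            System.out.println("No scores were entered.");
        else {
            // loop to enter scores
            while(score != -1) {
                count = count + 1;
                sum = sum + score;
                if(highestScore < score)
                    highestScore = score;
                System.out.print("Enter a score or -1 to stop: ");
                score = scanner.nextInt();
            }
            System.out.printf("Average score is %.2f.%n", sum / count);
            System.out.println("The high score is " + highestScore + ".");
        }
    }
}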
First, notice the prompt and input prior to the loop, which is the priming read. It is necessary to determine whether to enter the loop or not by checking the first input value against the sentinel. The prompt and input in the loop determine whether the loop should continue to iterate. As was discussed in Sect. 4.6, it is important to make sure that the loop will eventually terminate to avoid an infinite loop; in this program a sentinel value of −1 will stop the loop. If there are no scores and the user enters −1 at the very beginning, the program will not execute the body of the loop in the else section of the if-then-else, thus ensuring that division by 0 will not occur in the calculation of the average. With the input value of −1 the output is as follows:

Enter a score or -1 to stop: -1
No scores were entered.

With values other than −1, the variable count is incremented by 1 inside the loop body to keep track of the number of scores and is used to find the average. Notice that sum, which holds the total of all the scores, is declared as type double. Although score is of type int, by declaring sum as type double, the result of the calculation sum/count to find the average will be of type double, since it is a double divided by an int. An example of the output with three scores is shown below:

Enter a score or -1 to stop: 88
Enter a score or -1 to stop: 97
Enter a score or -1 to stop: 65
Enter a score or -1 to stop: -1
Average score is 83.33.
The high score is 97.

### 4.7.2 Program with Objects

Next consider an example that involves objects. An object that keeps a distribution of scores for a particular exam is useful for figuring out how many students made a grade of A, B, C, D, or F. The Grades class defines data members, a constructor, and three methods: enterGrade, getNumStudents, and getPercent. The definition of the Grades class is shown below, and the actual implementation of the three methods is discussed shortly:

Since the cutoff for the grade of A is 90, scores between 90 and 100 will receive a grade of A. Scores between 80 and 89 will result in a grade of B because the cutoff for the grade of B is 80, and so on. If a score is outside the range of 0–100, it is simply ignored in the enterGrade method. For example, what happens if the score is 95? Since it is a valid input inside the range of 0–100, the count is incremented by 1 to keep track of the number of scores entered. Then, the counter for the A group is incremented by 1. The enterGrade method shown in Fig. 4.5 is used to distribute the scores entered by the instructor into the correct grade group.

Fig. 4.5

Implementation of enterGrade method

The getNumStudents method in Fig. 4.6 returns the number of scores assigned to a particular grade and is implemented using a switch statement. It takes a grade (A, B, etc.) in a variable of type char as a parameter and returns a value of type int.

Fig. 4.6

Implementation of getNumStudents method

The getPercent method in Fig. 4.7 finds the percentage of scores assigned to a designated grade level and is also implemented using a switch statement. It takes a char value and returns a value of type double. Notice that the value 100.0 of type double is multiplied by the number of scores for the particular grade, which is a value of type int, to make the result of type double. The result is divided by the value of type int stored in count, which results in a percentage of type double. If an invalid character is passed as a parameter, the value −1, which represents an invalid value, is returned. A sketch of these three methods follows.

Fig. 4.7

Implementation of getPercent method
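Since the figures themselves are not reproduced here, the following is a minimal sketch of how the three methods might be implemented from the descriptions above; the counter names numA through numF, and the cutoffs of 70 for C and 60 for D, are assumptions:

class Grades {
    private int count, numA, numB, numC, numD, numF;

    // distribute a valid score into the correct grade group (cf. Fig. 4.5)
    public void enterGrade(int score) {
        if(score >= 0 && score <= 100) {
            count++;
            if(score >= 90)
                numA++;
            else if(score >= 80)
                numB++;
            else if(score >= 70)   // assumed cutoff for C
                numC++;
            else if(score >= 60)   // assumed cutoff for D
                numD++;
            else
                numF++;
        }
    }

    // return the number of scores for a grade (cf. Fig. 4.6)
    public int getNumStudents(char grade) {
        switch(grade) {
            case 'A': return numA;
            case 'B': return numB;
            case 'C': return numC;
            case 'D': return numD;
            case 'F': return numF;
            default:  return -1;  // invalid grade
        }
    }

    // return the percentage of scores for a grade (cf. Fig. 4.7)
    public double getPercent(char grade) {
        switch(grade) {
            case 'A': return 100.0 * numA / count;
            case 'B': return 100.0 * numB / count;
            case 'C': return 100.0 * numC / count;
            case 'D': return 100.0 * numD / count;
            case 'F': return 100.0 * numF / count;
            default:  return -1;  // invalid grade
        }
    }
}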
Like the previous Scores program, the client program using a Grades object outputs the message "No scores were entered." if there were no scores, as shown below:

Enter a score or -1 to stop: -1
No scores were entered.

An example of the output with eight scores is shown below:

The client program will create an object of the Grades class named class1, and each score is processed as it is entered. The exam scores are input using a while loop since the number of scores is indefinite. The result is output using a for loop because the number of lines is known. The table displays the distribution and percent for each grade. The complete client program is shown below:

The first line of the table contains the column titles, which are printed prior to the for loop. The second through fifth lines output the grade, distribution, and percent for the grades A, B, C, and D using a for loop. Notice that the char variable letter is used as the loop control variable in the for loop. It is initialized to 'A' at the beginning of the for loop, and when it is incremented by one, the value of letter is updated to the next character in alphabetical order, such as A to B, and B to C. Because there is a gap between D and F, the information for the grade of F needs to be printed outside the for loop at the end. The conversion characters c, d, and f (as in %c, %d, and %f) are used in the control string of the first printf statement to output the variables of type char, int, and double, respectively, in order to format the table as described in Chap. 1.

## 4.8 Summary

  * The while loop and the do-while loop are known as indefinite iteration loop structures.
  * The for loop is known as a definite or fixed iteration loop structure.
  * The do-while loop is a posttest loop structure and can iterate one to many times.
  * The while loop and the for loop are pretest loops which can iterate zero to many times.
  * The do-while loop must always use a compound statement in the body of the loop, whether there is one statement or there are many statements.
  * The bodies of the for and while loops only need to use a compound statement when there is more than one statement in the body of the loop. If there is only one statement, the compound statement is unnecessary.
  * When nesting loops, be sure to use a different loop control variable for each loop.

## 4.9 Exercises (Items Marked with an * Have Solutions in Appendix E)

1. Identify the syntax errors in the following code segment:

int sum, i;
sum = 0;
i = 0;
while(i >= 0); {
    sum = sum + i;
    i = i + 2;
}

*2. Identify the syntax errors in the following code segment:

int product;
product = 1;
for(i=1, i <= n, i++)
    product = product * i;

*3. Determine the output from the following code segment:

4. Determine the output from the following code segment:

5. Determine the output from the following code segment:

*6. Determine the output from the following code segment:

int i, j;
for(i=1; i<=5; i++) {
    for(j=1; j<=5-i; j++)
        System.out.print(" ");
    for(j=1; j<=2*i; j++)
        System.out.print("*");
    System.out.println();
}

7. Rewrite the following for loop as a

A. while loop

*B. do-while loop

int total, count;
total = 0;
for(count = 1; count <= 40; count+=3) {
    total += count;
}

8. Assuming n is input, rewrite the following while loop as a(n)

*A. for loop

B. if statement and a do-while loop

int total, count, n;
total = 0;
count = 0;
n = 5;
while(count < n) {
    total += count;
    count++;
}

9. A store is having a sale and items are either 30, 50, or 70% off. Assuming all the items priced between $5.00 and $50.00 are on sale, output the following table using nested loops. Using correct formatting, make sure that the output is exactly as shown below:

Original Price | 30% off | 50% off | 70% off
---|---|---|---
$ 5.00 | $ 3.50 | $ 2.50 | $ 1.50
$10.00 | $ 7.00 | $ 5.00 | $ 3.00
$15.00 | $10.50 | $ 7.50 | $ 4.50
$20.00 | $14.00 | $10.00 | $ 6.00
$25.00 | $17.50 | $12.50 | $ 7.50
$30.00 | $21.00 | $15.00 | $ 9.00
$35.00 | $24.50 | $17.50 | $10.50
$40.00 | $28.00 | $20.00 | $12.00
$45.00 | $31.50 | $22.50 | $13.50
$50.00 | $35.00 | $25.00 | $15.00

10. Repeat Exercise 15 in Chap. 3 to allow the user to enter temperatures for any number of cities using the best iteration structure.
11. The Fibonacci sequence is the series of numbers in which each number is found by adding up the two numbers before it, as shown below:

  * 0, 1, 1, 2, 3, 5, 8, 13, 21, 34, ...

Write a complete program to compute the Fibonacci number for an integer.

12. Given two numbers, the largest integer that divides both of them is known as the greatest common divisor. For example, the positive divisors of 36 are 1, 2, 3, 4, 6, 9, 12, 18, and 36, and the positive divisors of 8 are 1, 2, 4, and 8. Thus, the common divisors of 36 and 8 are 1, 2, and 4. It follows that the greatest common divisor of 36 and 8 is 4. Write a complete program to compute the greatest common divisor of two integers.

© Springer-Verlag London 2014

# 5. Objects: Revisited

James T. Streib and Takako Soma

Department of Computer Science, Illinois College, Jacksonville, IL, USA

Abstract

Objects are revisited in this chapter. Sending an object to and returning an object from a method is illustrated using contours. Overloaded constructors and methods are discussed, and the reserved word this is introduced. Local, instance, and class constants and variables, along with class methods, are shown using contour diagrams. Two complete programs, one with a focus on overloaded methods and another with class data members and methods, are included.

Having learned in the previous two chapters about selection and iteration structures, both of which allow for more complex programs, it is time to return to the topic of objects that was introduced in Chap. 2. Objects allow programs to be created in a more modular way that makes complex programs easier to understand. In this chapter, topics such as passing objects to and from a method, constructor and method overloading, class data members and methods, and the use of the reserved word this will be discussed. At first, this chapter will use only simple objects to illustrate these concepts so that the details can more readily be understood, and then more complex examples will be included in the complete programs at the end of the chapter.

## 5.1 Sending an Object to a Method

So far, all that has been discussed is how primitive data types can be sent to a method. However, data is often more complex than just a simple data type, so it would be helpful to have a way to send not just an item or two but rather an entire object to a method. For example, consider a method to determine the length of a line segment. It would need to be sent the two endpoints of the line, each consisting of x and y coordinates, which would require four arguments to be sent to the method. Since each point has two coordinates, this lends itself to the creation of a simple class. Although in Java there is a Point class in the java.awt package, a point is a simple enough concept to help explain the sending of an object to a method that this text will define its own class for a point. Whereas the Java class Point uses integers, the class defined here will use double precision numbers and will be called PointD. Consider the preliminary definition of the class in Fig. 5.1.

Fig. 5.1

Preliminary definition of PointD class

The PointD class definition is fairly simple, with the usual get and set methods, along the lines of the sketch below.
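Since Fig. 5.1 itself is not reproduced here, the following is a minimal sketch of what the preliminary PointD class might look like, based on the get and set methods used in the rest of the chapter:

class PointD {
    private double x, y;

    public PointD() {
        x = 0.0;  // default coordinates at the origin
        y = 0.0;
    }

    public double getX() {
        return x;
    }

    public double getY() {
        return y;
    }

    public void setX(double xp) {
        x = xp;
    }

    public void setY(double yp) {
        y = yp;
    }
}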
However, what will make it more interesting is the introduction of a method which allows an invocation to send an object of type PointD. For this example, assume the existence of a method called distance which will calculate the distance between two points. Since the method will be defined within the PointD class, it can be invoked by an object of type PointD and also take an argument of type PointD. Assuming the existence of two points p1 and p2 of type PointD, the method could be invoked as dist = p1.distance(p2);. What would such a method look like? Recall from algebra that the distance formula for two points (x₀, y₀) and (x₁, y₁) is

d = √((x₁ − x₀)² + (y₁ − y₀)²)

Then the code for the method could be as follows:

public double distance(PointD p) {
    double dist;
    dist = Math.sqrt(Math.pow(x-p.getX(),2)
        + Math.pow(y-p.getY(),2));
    return dist;
}

First, notice that the method returns a value of type double. Second, note that the parameter is not of type double but rather of type PointD. Lastly, although the local variable dist is not strictly required, declaring it makes the subsequent contour diagram easier to follow when illustrating how objects are passed. Using all the information above combined into a complete program, it could appear as shown below:

Utilizing contour diagrams, the passing of objects can easily be illustrated. Note that some steps will be skipped since many of them were discussed thoroughly in Chap. 2. The state of execution prior to Line 11 in the main program would be as shown in Fig. 5.2.

Fig. 5.2

State of execution prior to Line 11

Since the method distance is invoked from p1, the contour for the method appears in the contour referenced by p1 as shown in Fig. 5.3, indicating the state of execution just prior to Line 40 in the distance method.

Fig. 5.3

State of execution prior to Line 40

In addition to the local variable dist, the method also contains a memory location for the parameter p. Note that when passing an object to a method via a parameter, the parameter does not contain the entire object. Rather, since the argument p2 has a reference to an object, the parameter p contains a copy of the reference to that same object. Although a straight arrow could have been drawn directly to the object, it would have covered up some of the information within the contour, so in this example, it is drawn around the contour diagram for the sake of neatness. However, in the future the arrows may be drawn over parts of contours in order to save space. Note that both the argument p2 and the parameter p are pointing to the same contour. When the calculation for dist is performed, the references to x and y are to the ones globally accessible within the object pointed to by p1, whereas the getX and getY methods access the variables in the object referenced by p.

## 5.2 Returning an Object from a Method

If an object can be passed to a method, can an object be returned from a method? The answer is yes, as will be demonstrated in the example that follows. Whereas the previous example returned dist of type double, this example will determine the midpoint of a line segment. The equations to determine the midpoint are as follows:

x_mid = (x₀ + x₁) / 2
y_mid = (y₀ + y₁) / 2

Since the midpoint consists of x and y coordinates, this lends itself to the creation of a method to return an object of type PointD.
The method midPoint below implements the equations above: + +public PointD midPoint(PointD p) { + +PointD mid; + +mid = new PointD(); + +mid.setX( (x+p.getX()) / 2 ); + +mid.setY( (y+p.getY()) / 2 ); + +return mid; + +} + +Notice that in addition to the parameter, the return type is also of type PointD. The method also creates an instance of type PointD and assigns the reference to the variable mid which is also declared of type PointD. The method then calculates the midpoint and sets the x and y coordinates in mid prior to the return of the object to the invoking program. + +This method can be added to class PointD, and in Fig. 5.4, it replaces the previous method distance in order to save space. + +Fig. 5.4 + +Complete program returning an object from a method + +Prior to the execution of Line 11, the contour diagram would look similar to Fig. 5.2 in the previous example, except the variable dist of type double would be replaced with the variable middle of type PointD. After invoking the midPoint method, the contour diagrams would appear similar to the ones shown in Fig. 5.3 in the previous section, except that in addition to the variable middle appearing in the main program, the distance contour would be replaced with the midPoint contour and the variable dist in the contour would be replaced with the variable mid of type PointD which would be indeterminate. However, once the body of the method midPoint is executed, that is when the significant differences can be seen when a new object is created in Line 39. Figure 5.5 illustrates this by showing the state of execution prior to the return statement in Line 42. + +Fig. 5.5 + +Contour just prior to the execution of the return statement in Line 42 + +Notice that in addition to the contour referenced by the parameter p, there is another contour referenced by the local variable mid that contains the coordinates of the midpoint. As with the passing of a reference to an object via a parameter, the entire contour will not be returned to the main program, but rather only the reference to the contour will be returned as illustrated in Fig. 5.6 which shows the state of execution prior to Line 12. + +Fig. 5.6 + +Contour after returning to the main program prior to Line 12 + +Notice that the contour for the method midPoint no longer exists after returning to the main program. However, the value in mid was returned back to the invoking statement on Line 11 and assigned to the variable middle, which now contains the reference to the object containing the midpoint values. When the output statements refer to the getX and getY methods of the appropriate objects, the correct values will be output. + +## 5.3 Overloaded Constructors and Methods + +The constructor in the previous example initializes the variables x and y to 0.0 as a default value. In addition, a constructor could have been created to initialize the instance variables to the values wanted by a programmer as shown in the following: + +public PointD(double xp, double yp) { + +x = xp; + +y = yp; + +} + +A programmer could then initialize x and y via the constructor when the object was created as shown below: + +p1 = new PointD(4.0,4.0); + +The advantage of this method is that a programmer does not need to invoke the setX and setY methods to initialize the variables in the object. Does this mean that the set methods could be deleted from the class definitions? If the values in the variables did not need to change, then yes the set methods could be deleted. 
However, what if after initializing the variables, their values needed to be changed later in the program? Then of course the set methods would need to be retained in the class definition. + +Given the previous constructor and the new constructor above, which of the two is better and which one should be included in the class definition? The answer depends on what needs to be done. For example, if the values are going to be changed often, then the first constructor and the set methods are the best choice, but if the values are going to be set just once, then the second constructor is probably the better choice. + +However, when the class is written, it might not be known which type of constructor would be the best one to include. Wouldn't it be nice to include both constructors and allow the programmer a choice? But further, could this cause a syntax error by having two constructors with the same name? The answer to the first question is yes and the answer to the second question is no. The reason why this would not cause an error is because even though the name of the constructor is the same, the number of parameters is different because the first constructor does not have any parameters and the second one has two parameters. This is known as overloading. In other words, even though constructors have the same name, they can differ by the number of parameters, the types of the parameters, or the order of the different types of parameters. When used carefully, overloading can be a very useful technique. + +Using the knowledge gained from Sect. 5.1, it is also possible to pass an object to a constructor. For example, if an object was already created and a copy of that object was needed, then that object could be passed via a parameter to another constructor to create the copy. Such a constructor would look as shown below: + +public PointD(PointD p) { + +x = p.getX(); + +y = p.getY(); + +} + +Notice that instead of two parameters of type double, there is now only one parameter of type PointD. In the body of the constructor, the coordinates are retrieved from the object sent using the getX and getY methods and placed into the x and y variables of the current object. The result is that if one wanted to create two objects with the same set of coordinates, instead of writing the following code: + +p1 = new PointD(1.0,1.0); + +p2 = new PointD(1.0,1.0); + +one would merely need to write the following: + +p1 = new PointD(1.0,1.0); + +p2 = new PointD(p1); + +Given the two new constructors, the original PointD class could be rewritten as follows: + +Using this new class, a programmer could create three different instances of the PointD class as follows: + +PointD p1, p2, p3; + +p1 = new PointD(); + +p2 = new PointD(1.0,1.0); + +p3 = new PointD(p2); + +Notice that the objects are being created using three different constructors. The only difference is the number of arguments. Further, since the first constructor ensures that coordinates referenced by p1 will be initialized to 0.0, the second constructor initializes the variables referenced by p2 via the arguments, and the third constructor makes a copy of the previous object which will be referenced by p3, the set methods do not need to be called. However, if the values in the points need to be changed later, the set methods are still there if necessary. + +If a constructor is not included in a class by the programmer, the system will generate a default constructor. 
Should the programmer include a constructor without any parameters, then this constructor overrides the default constructor generated by the system. Although a bit confusing, this constructor provided by the programmer is also sometimes called a default constructor since it overrides the system default constructor. However, if one writes the two new constructors above and a default constructor is not included by the programmer, then the system will not generate a default constructor. In such a case, were one to code a p1 = new PointD(); statement, a syntax error would occur. The result is that if one wants to override the system default constructor, it is a good idea to override it with a programmer-defined default constructor to avoid a possible syntax error. Even if overloading is not being used in the class, it is generally best for a programmer to include a default constructor and not rely on the system default constructor.

Just as constructors can be overloaded, so can methods. As with constructors, the name of the method can be the same, but the number of parameters, the types of the parameters, or the order of the different types of parameters must be different. For example, take the distance method from Sect. 5.1, which requires one parameter, as shown again below:

public double distance(PointD p) {
    double dist;
    dist = Math.sqrt(Math.pow(x-p.getX(),2)
        + Math.pow(y-p.getY(),2));
    return dist;
}

What if another method was needed to determine the distance of a point from the origin? Certainly one could invoke the method above by having one of the two points be the origin, using the new constructors introduced in this section as follows:

PointD p1, p2;
p1 = new PointD();
p2 = new PointD(3.0,4.0);
dist = p2.distance(p1);

In this example, the default constructor initializes the coordinates of p1 to 0.0, and the second constructor initializes the coordinates of p2 to 3.0 and 4.0. But the assumption could be that the distance will be calculated from the origin, and it would be convenient not to need a parameter in the distance method. Such a method would look as follows:

public double distance() {
    double dist;
    dist = Math.sqrt(Math.pow(x,2) + Math.pow(y,2));
    return dist;
}

Instead of invoking the previous method with the dist = p2.distance(p1); statement, it could be invoked using the new method as follows:

dist = p2.distance();

Again, the name of the method is the same, but the number of parameters is different. As mentioned earlier, it is also possible to have the same number of parameters but different types of parameters or a different order of the different types of parameters.

For example, assume a method of the Student class was to be sent two parameters: one for the number of credit hours and another to indicate whether the student has graduated. In the main program below, notice that in one case an integer is in the first argument position and in the second case a Boolean value is in the first argument position. Would this cause a problem?

If there were only one method named setInformation, the answer would be yes. However, notice that the setInformation method is overloaded, as in the sketch below. The parameters are reversed in the second method so that the order of the arguments in the calling program does not matter. Thus, if a programmer accidentally puts the arguments in the wrong order, there is no error. As stated previously, overloading can sometimes be helpful if used carefully and not excessively.
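Since the original listing is not reproduced here, the following is a minimal sketch of the overloaded pair and a fragment of the calling code; the field names hours and graduated are assumptions:

class Student {
    private int hours;
    private boolean graduated;

    // version 1: credit hours first, graduation status second
    public void setInformation(int creditHours, boolean hasGraduated) {
        hours = creditHours;
        graduated = hasGraduated;
    }

    // version 2: same name, parameters reversed
    public void setInformation(boolean hasGraduated, int creditHours) {
        hours = creditHours;
        graduated = hasGraduated;
    }
}

In the main program, both of the following calls compile and have the same effect, because the compiler selects the version whose parameter types match the argument order:

Student student = new Student();
student.setInformation(120, true);
student.setInformation(true, 120);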
+ +## 5.4 Use of the Reserved Word this + +In looking at portion of the original PointD class from Fig. 5.1 shown below, the parameter names in the constructor and in the two set methods are listed as xp and yp. + +class PointD { + +private double x, y; + +public PointD(double xp, double yp) { + +x = xp; + +y = yp; + +} + +public void setX(double xp) { + +x = xp; + +} + +public void setY(double yp) { + +y = yp; + +} + +} + +What would happen if the names of the variables xp and yp were changed to x and y, respectively? What would x and y refer to, the data members or the parameters? + +/** Caution: Incorrectly Implemented code **/ + +class PointD { + +private double x, y; + +public PointD(double x, double y) { + +x = x; + +y = y; + +} + +public void setX(double x) { + +x = x; + +} + +public void setY(double y) { + +y = y; + +} + +} + +The answer to the second question is that the parameters and local variables declared in a method take precedence over any globally declared variables in the object. The answer to the first question is that the contents of the parameters x and y would merely be assigned back into the memory locations associated with the parameter. The result is that the private data members would not contain the new values sent from the invoking program, and this is probably not what was intended. + +Is it possible to use the same variable names for both the parameters and the instance data members? The answer is yes. In any particular instance, the reserved word this can be used to refer to the instance. Java uses this as a self-referencing pointer to refer to the current object. Using the reserved word this, the previous class can be rewritten as shown below: + +class PointD { + +private double x, y; + +public PointD(double x, double y) { + +this.x = x; + +this.y = y; + +} + +public void setX(double x) { + +this.x = x; + +} + +public void setY(double y) { + +this.y = y; + +} + +} + +So, for example, consider the shortened skeleton of the program presented at the beginning of this chapter that uses only the setX and getX methods shown below: + +In the setX method, x refers to the parameter, and the value in x is assigned to this.x which is the data member x in the object. In a sense, this is a pointer to the current object as illustrated in the contour in Fig. 5.7 showing the state of execution just prior to Line 17. + +Fig. 5.7 + +State of execution prior to Line 17 + +Notice the arrow pointing back to the object PointD. It illustrates the word this and shows how the data member x is referenced. Although the example in Fig. 5.7 includes the cell for this and a self-referencing arrow, it tends to clutter up the contour diagrams, so in general it will not be included because its existence is understood. Notice that the constructor and the getX do not use the reserved word this on Lines 12, 13, and 19. In this case the word this is not necessary. Although one could still include the word this, it can be distracting to use it when it is not needed. As a result, this text will not use the word this unless it is necessary. + +The reserved word this can also be used in situations beyond just referring to variables. It can refer to constructors and methods as well. For example, consider the three constructors presented in the previous section and relisted below using the reserved word this in the second constructor: + +In one sense, the first constructor is just a special case of the second constructor, so it could be defined in terms of the second constructor. 
In other words, it could invoke the second constructor with the values 0.0 for the x and y coordinates. But how could it invoke the second constructor? Again, since it is the current object that needs to be referenced, the reserved word this could be used as shown below: + +public PointD() { + +this(0.0,0.0); + +} + +Even the third constructor could be written to invoke the second constructor as: + +public PointD(PointD p) { + +this(p.getX(),p.getY()); + +} + +Since an object of type PointD is being passed to the constructor, the methods getX and getY can be invoked to retrieve the values in x and y, which in turn can be sent as arguments to the second constructor. In order to invoke the second constructor, it is referred to using this. + +The advantage of the above technique is that if later a change needs to be made to the constructors, it might not need to be made to all three constructors, but possibly only one of them. This reduces the possibility of introducing unintended errors into the program, and the result of the modifications introduced in this section can be seen below: + +As with variables and constructors, it is possible to use the word this when referring to methods in the same object. For example, suppose that a method needed to access another method such as the previous distance method within the same class. It could be invoked as this.distance(), but although the method can be invoked using the reserved word this, there is no need to do so. As a result, the use of the word this prior to the invoking of a method should be avoided. + +## 5.5 Class Constants, Variables, and Methods + +This section will discuss how constants, variables, and methods can be declared not only within a method and in each instance of a class but also how they can be declared in the class itself. First, it looks at constants, then variables, and lastly methods. + +### 5.5.1 Local, Instance, and Class Constants + +If a constant needs to be used only within a single method, then it can be declared within that method. However, if several methods in the same class use the same constant, it could be declared within each method but that will take up more memory. If that constant needs to be changed, then it will need to be changed in more than one location. Although there already exists the Math.PI constant discussed in Sect.​ 1.​7, consider for example, the following program which includes the user-defined constant PI: + +In addition to the existence of the local variables c and a to help with understanding the contour diagrams, notice that both methods have their own locally declared constant PI at Lines 24 and 30. When each method is executed, its own copy of the constant is allocated. The contour diagram in Fig. 5.8 illustrates that each method has its own copy and shows the state of execution prior to Line 33. + +Fig. 5.8 + +State of execution prior to Line 33 + +Even though one contour is deallocated (indicated by the shaded contour) before the next one is invoked, it still had to allocate the constant. While this is only a minor problem now, any local constants can take up much more space in a recursive algorithm as will be discussed in Chap. . Since there is a potential for wasted memory, it would be better if the constant were not associated with each method, but rather with the object as illustrated in the following section showing the Circle class: + +Only the class is shown here because the main program has not changed. 
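Since the listing itself is not reproduced here, the following is a minimal sketch of such a Circle class; the value 3.14159 for PI and the method bodies are reconstructed from the surrounding discussion, and the line numbers referenced in the text belong to the original listing:

class Circle {
    private final double PI = 3.14159;  // instance constant, one copy per object
    private double r;

    public Circle() {
        r = 0.0;
    }

    public void setRadius(double radius) {
        r = radius;
    }

    public double circumference() {
        double c;            // local variable kept to match the contour diagrams
        c = 2 * PI * r;
        return c;
    }

    public double area() {
        double a;            // local variable kept to match the contour diagrams
        a = PI * r * r;
        return a;
    }
}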
Again, the local variables in the method remain to help with the contour diagrams, but notice that the declaration of the constant is no longer within each method, but rather in the class at Line 16. An immediate and obvious advantage is that should the constant need to change, it needs only to be changed in one location. The contour diagram representing the state of execution prior to Line 32 is shown in Fig. 5.9.

Fig. 5.9

State of execution prior to Line 32

Note that the constant PI no longer appears in each of the methods, but rather is located in an instance of the Circle class. The advantage of declaring the constant in the class as opposed to each individual method is that the constant only needs to be allocated once per object.

However, what if more than one object was declared? Then there would be one constant allocated within each of the objects. Consider the following modification to the main program that declares and allocates two objects:

double radius1, radius2;  // Line 3
Circle c1, c2;            // Line 4
c1 = new Circle();        // Line 5
c2 = new Circle();        // Line 6
radius1 = 3.0;            // Line 7
radius2 = 4.0;            // Line 8
c1.setRadius(radius1);    // Line 9
c2.setRadius(radius2);    // Line 10

Using the same Circle class as before, without invoking any of the methods except for the constructor, note the state of execution just prior to Line 9 in the main program in Fig. 5.10.

Fig. 5.10

State of execution prior to Line 9

Notice that the constant PI appears in both instances of the Circle class. Just as when the constant was moved out of the individual methods, wouldn't it be nice if the constant could be moved so that it would be accessible by both objects? This can be accomplished by using what is known as a class constant. Showing the new complete program below, a class constant is created by using the reserved word static as shown in Line 24 below:

Executing the first few lines of the program as done previously, the contour diagram in Fig. 5.11 shows the state of execution just prior to Line 9. Notice that each of the instances no longer has a local constant PI. As mentioned previously in Sect. 2.7, just as the main program has a contour around it, as shown in Fig. 5.11, so does the class Circle. Using the word static creates the class constant PI that does not get allocated each time a new instance of the class Circle is created. When there is a reference to the constant PI, it is not found in the instance, but rather in the class. As can be seen, this saves memory, especially when many objects are created.

In contour diagrams, how can one distinguish the contour for the class itself from the contours associated with the instances of the class? One way is to note that variables of type Circle point to the instances of the Circle class. However, another way to help the reader is to allow the contour associated with the class itself to have the name of the class (in this case Circle) and then use a superscript for each instance of the class to indicate the order in which the objects were created, as shown in Fig. 5.11. When necessary to help make this distinction clear, this text will use superscripts.

Fig. 5.11

State of execution prior to Line 9

Just as this text has previously not drawn the contour around the main program in the interest of saving space, it would also help to save space to not draw the contour around all the instances of each object. As can be seen in Fig. 5.11, it could get rather cumbersome to draw such large contours.
However, on occasion it is still helpful to draw a contour to represent the class, so instead of drawing it around all the instances, it is sometimes convenient to draw it separately, with the understanding that all the instances are within that contour. This second alternative is shown in Fig. 5.12. + +Fig. 5.12 + +Alternative contour diagram illustrating class constants + +Figure 5.11 is the ideal drawing and it will be used as necessary. However, generally and if needed, the contour for the class using a class constant will be drawn as shown in Fig. 5.12, with the understanding that all instances will be within that contour. + +### 5.5.2 Local, Instance, and Class Variables + +Local and instance variables are similar to local and instance constants. In fact, the variables c and a representing the circumference and area in the previous section are local variables in the methods, and the variable r representing the radius in a Circle object is an instance variable. In trying to decide where a variable needs to be declared, it helps to ask which methods need access to the variable. For example, the variables c and a were used only by the circumference and area methods, so it made sense to declare them there. However, the variable r is used by both methods; hence, it makes sense to declare it once within the object instead of in both methods. + +Although using the two local variables wasted a little memory, it made understanding the contours easier, and in this case it is not much of a problem. In fact, these variables are not even needed, because the expression to calculate each value could have been included in the return statement, as shown below: + +public double circumference () { + +return 2 * PI * r; + +} + +public double area() { + +return PI * r * r; + +} + +It is sometimes helpful to write the initial version of the code using extra memory to help understand how it works and help debug any logic errors, and then later the extra memory locations can be removed to make the code more efficient. This technique will become even more helpful when learning about recursion in Chap. . + +As with the constants in the previous section, just as some variables are better placed in the object as instance variables instead of as local variables in the methods, there are cases where some variables should be declared as class variables instead of as instance variables. For example, what if one wanted to count each time a new object was created? Although this could be done in the main program, what if an object other than the main program was also creating the objects to be counted? In this case, the main program could not count them, nor could an instance variable be used, because each instance could not count how many other objects of its own type were created. As one might suspect, this would be a good candidate for a class variable. + +A class variable is declared similarly to a class constant except the reserved word final is not used as shown in Line 15 of the following program which simulates a program that creates objects for charge cards that contain an account number: + +Although it would be nice to create an indefinite number of objects, that would be difficult to illustrate using contours and would also be difficult to implement without the use of arrays which will be introduced in Chap. . Instead, this program creates only three ChargeCard objects to help illustrate the class variable cardCount. Notice that their class variable is initialized by the compiler to 0 in Line 15. 
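Since the listing is not reproduced here, the following is a minimal sketch consistent with the description; the accountNumber field and the constructor parameter are assumptions, and the line numbers referenced in the text belong to the original listing:

class ChargeCard {
    public static int cardCount;   // class variable, shared by all instances; defaults to 0
    private int accountNumber;     // instance variable, one per object

    public ChargeCard(int number) {
        accountNumber = number;
        cardCount = cardCount + 1; // count each card as it is created
    }

    public int getAccountNumber() {
        return accountNumber;
    }
}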
Then each time a new instance of the class is created, the class variable cardCount is incremented in the constructor. The contour in Fig. 5.13 illustrates the state of execution just prior to Line 10 in the main program. + +Fig. 5.13 + +State of execution prior to Line 10 in main + +As can be seen, the class variable is shown in the ChargeCard contour which is accessible by all of the instances of that class, as discussed in the previous section. Also note that instead of using a variable such as card1 to gain access to a class variable, the name of the class ChargeCard in Line 11 is used instead. Further, the reader might have noticed that whereas the class constant in the previous section was declared as private, the class variable cardCount is declared as public. In one sense this might seem convenient, because the class variable is accessible in the main program in Line 11. However, as mentioned in Chap. and as will be discussed in the next section, it is usually better to declare variables as private and access them using a public method. + +### 5.5.3 Class Methods + +Although declaring a class variable as public allowing it to be accessed from the main program works, it is not necessarily the best way to access class variables. Just as it is not a good idea to declare instance variables as public, the same applies to class variables. As before, it is better to declare class variables as private and then access them via a public class method. This is accomplished by declaring a method using the reserved word static as shown in the following modified program: + +First, notice that the method getCardCount has been added at Line 27. The use of the reserved word static makes it a class method instead of an instance method. Also note that the method is declared as public and the class variable cardCount at Line 15 is now declared as private. Next, notice in Line 11 that instead of accessing the class variable, the class method getCardCount is invoked to return the value of cardCount. As before, the class method is invoked using the class name ChargeCard. + +What is interesting to see is that when the main program invokes the class method getCardCount, the contour is not in one of the objects, but rather in the contour for the class ChargeCard as illustrated in Fig. 5.14 which shows the state of execution prior to Line 28 in the class method getCardCount. When Line 28 in the class method getCardCount is executed, it has access to the private class variable cardCount and will return the value 3 back to Line 11 in the main program. + +Fig. 5.14 + +State of execution prior to Line 28 in the getCardCount method + +Given the above, one needs to plan carefully where various constants, variables, and methods are declared. As a general rule, it makes sense to declare constants as class constants since they cannot be modified, they are accessible to all methods in the objects within the class, and they save memory. As another rule of thumb, it is generally a good idea to declare all variables as locally as possible. This helps organize a program and makes it easier to understand and maintain. However, if a method or object needs to communicate information with other methods or objects, then declaring the variables as instance or class variables makes sense. Although it might seem easy and be tempting to declare all variables as instance and class variables, this can make a program difficult to maintain and debug in the future. 
Likewise with methods, they should usually be declared as instance methods unless individual objects need to share a method, and then it should be declared as a class method. The key is to take the time when designing and creating a program to determine where each variable and method should be declared. + +## 5.6 Complete Programs: Implementing Objects + +The first complete program implements overloaded methods, and the second utilizes class data members and class methods. + +### 5.6.1 Program Focusing on Overloaded Methods + +After defining the PointD class earlier this chapter which represents a point, a class that represents a line will be developed in this section. Since a line consists of points, the PointD class can also be used. The main program will: + + * Set points and lines + + * Compare two lines + + * Find the distance between a line and a point + +A line can be defined in slope-intercept form y = mx + b, where m is the slope and b is the y-intercept, and the class will be named LineSI. The slope and y-intercept are kept in private instance variables, slope and intercept. + +Because a user may like to define a line in several different ways and to reinforce the concept of overloaded constructors, six constructors will be provided. The default constructor without any parameters will set the value of the slope and the y-intercept to 0.0. The next constructor accepts the value for the slope as a parameter and sets the y-intercept to 0.0 creating a line going through the origin. The third constructor receives a LineSI object and copies the slope and y-intercept of the line to the new object, essentially creating an identical line. This constructor is sometimes referred as a copy constructor. The fourth constructor accepts two parameters and assigns these values to the instance variables, slope and intercept. A line can also be defined in two-point form as + +where (x 0 , y 0) and (x 1 , y 1) are two different points on the line. So, the fifth constructor accepts two PointD objects, calculates the slope and the y-intercept, and assigns the results to appropriate data members. The last constructor receives the x and y coordinates of two points and calculates the slope and y-intercept. Initial implementations for the six overloaded constructors are shown below: + +All six overloaded constructors have the same name as the class and they are differentiated by their parameter lists. The first constructor has no parameters, the second and third constructors have one parameter, the fourth and fifth constructors have two parameters, and the sixth constructor has four parameters. Although both the second and third constructors have one parameter, the types are different; the second has one of type double and the third has one of type LineSI. The fourth and fifth constructors have two parameters; the fourth has two parameters of type double and the fifth has two parameters of type PointD. + +The reserved word this in a constructor invokes the other constructor with the corresponding parameter list within the same class. So, calling the default constructor in the main method to create a LineSI object causes the fourth constructor to be called as well. The second, third, fifth, and sixth constructors also call the fourth constructor by using the reserved word this. As was discussed in Sect. 5.4, the advantage of using the word this is that if a change needs to be made to a common feature of all the constructors, only the fourth constructor needs to be modified. 
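As the initial implementations themselves are not reproduced here, the following is a minimal sketch of the six overloaded constructors, reconstructed from the description above; the slope calculations assume non-vertical lines, matching the simple treatment in the text:

public LineSI() {
    this(0.0, 0.0);
}

public LineSI(double slope) {
    this(slope, 0.0);  // a line through the origin
}

public LineSI(LineSI line) {
    this(line.getSlope(), line.getIntercept());  // copy constructor
}

public LineSI(double slope, double intercept) {
    this.slope = slope;
    this.intercept = intercept;
}

public LineSI(PointD p0, PointD p1) {
    // slope m = (y1 - y0) / (x1 - x0), intercept b = y0 - m * x0
    this((p1.getY() - p0.getY()) / (p1.getX() - p0.getX()),
         p0.getY() - (p1.getY() - p0.getY()) / (p1.getX() - p0.getX()) * p0.getX());
}

public LineSI(double x0, double y0, double x1, double y1) {
    this((y1 - y0) / (x1 - x0),
         y0 - (y1 - y0) / (x1 - x0) * x0);
}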
Also, notice that in the fourth constructor, the keyword this is used in order to distinguish between the data member and the parameter. This ensures that the values in the parameters are correctly copied into the data members.

There will be the two usual mutators to set each instance data member and two accessors to get the values of the two data members, as shown below:

In addition to the two mutators above, there will be three more mutators named setLine to set both instance data members at the same time. Like constructors, methods can also be overloaded. The setLine method is overloaded; one takes the values of the slope and the y-intercept, another takes two PointD objects, and the last takes the x and y coordinates of two points as parameters. Even though the first and the second setLine methods have the same number of parameters, the types are different; the first setLine method has two parameters of type double and the second has two parameters of type PointD. The detailed implementations of these three overloaded methods are shown below:

First, notice that the second and third setLine methods use the first setLine method. This is similar to the constructors, where all the other constructors invoked the fourth constructor.

If one looks carefully, it can be seen that the implementations of the fourth constructor and the first setLine method are the same. Also, notice that the code for the fifth constructor appears similar to the code for the second setLine method, except that the constructor is invoking the fourth constructor and the setLine method is calling the first setLine method with the corresponding parameter list defined within the class. The calculations for the slope and y-intercept used as the arguments in these calls are exactly the same. The same thing can be said for the sixth constructor and the third setLine method. How can one avoid having duplicate code in the program? The answer is to invoke the setLine method in the constructor instead of repeating the same code twice. This is especially worthwhile when more complex computations would otherwise be repeated in separate methods within the class, as in the second and third setLine methods. The modification to the fourth, fifth, and sixth constructors is illustrated below:

The first setLine method can be further modified to avoid duplicate code. Notice that the two statements this.slope = slope; and this.intercept = intercept; are also in the setSlope and setIntercept methods, respectively. Therefore, the original first setLine method can be rewritten as follows:

// First setLine method, modified:
public void setLine(double slope, double intercept) {
    // using setSlope and setIntercept methods
    setSlope(slope);
    setIntercept(intercept);
}

In order to understand the nesting of method calls in overloaded constructors and methods, consider what would happen when a LineSI object is created using the default constructor in the main method. Calling the default constructor would result in the fourth constructor being invoked. The fourth constructor will call the first setLine method, which calls the setSlope and setIntercept methods to set the values of slope and intercept. Although at first this might seem more complicated, the purpose is to eliminate duplicate code, making the program easier to maintain.

The last two methods are named compareLines and distance. The LineSI object that calls the method compareLines is compared to the LineSI object passed to the method, returning true when the two lines are the same and false when they are different, while the distance method calculates the distance from the calling object to the point passed as a parameter. A sketch of both methods follows.
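This sketch assumes straightforward implementations (the book's actual listing may differ, for instance in how double values are compared); the distance method uses the standard point-to-line formula |m*x - y + b| / sqrt(m^2 + 1):

public boolean compareLines(LineSI line) {
    // two lines in slope-intercept form are the same when
    // both the slope and the y-intercept match
    return slope == line.slope && intercept == line.intercept;
}

public double distance(PointD point) {
    // distance from the given point to the line y = mx + b
    return Math.abs(slope * point.getX() - point.getY() + intercept)
           / Math.sqrt(slope * slope + 1.0);
}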
All the pieces are put together in the following class:

Notice that along with the two private instance variables, the private class constant DEFAULT_VALUE was defined. It was declared as a class data member so that any method defined in the class can use it as a constant, because the value does not need to be changed during execution. Declaring it as a class constant also avoids allocating memory for the same constant twice when it is used in the first and second constructors.

The Lines class in Fig. 5.15 will test the methods defined in LineSI. It will create two points and six lines using the six different constructors. Then it will output the properties of the lines and the results from the compareLines and distance methods.

Fig. 5.15 A client program for the LineSI and PointD classes

The output from the above program is given below:

line1: slope = 0.5, intercept = 3.5
line2: slope = 0.5, intercept = 3.5
line3: slope = -1.0, intercept = 3.0
line4: slope = 0.5, intercept = 3.5
line5: slope = 0.0, intercept = 0.0
line6: slope = 2.0, intercept = 0.0
line1 and line2 are the same.
line4 and line5 are not the same.
The distance between line3 and pt1 is 1.41.
The distance between line6 and pt2 is 3.58.

### 5.6.2 Program Focusing on Class Data Members and Class Methods

In this section, the ChargeCard class defined in Sect. 5.5.3 will be modified. Assume that a cardholder travels to Europe and uses the card for shopping. The amount charged in Euros should be converted into US dollars and added to the balance of the card. Using the application, a user should be able to:

* Open an account to receive a card
* Make purchases in either US dollars or Euros
* Print the current balance of the card

The program should perform the conversion from Euros to US dollars. The calculation used in the conversion is the same for any purchase made in Euros; therefore, all the Card objects can share the code for the conversion. For this reason, the convertEurosToDollars method will be declared as a class method. The program also keeps track of the conversion rate, named rate in the program. Since rate is used in the class method and a class method does not have access to instance data members, rate should be declared as a class data member. Because the conversion rate changes frequently, it should be declared as a variable, not a constant. The mutator and accessor for rate will also be class methods, since they deal with a class data member. The code segment implementing the class data member and class methods discussed so far is sketched at the end of this discussion.

So far there is no instance data member or instance method implemented in the Card class; therefore, all the methods can be used without creating an object. The following main method will set the rate and output its value and the result of the conversion of 1.00 Euro to US dollars:

public class Purchases {
    public static void main(String[] args) {
        // output the information for Euros conversion
        Card.setRate(1.2128);
        System.out.println("rate = " + Card.getRate());
        System.out.printf("1.00 euro is equal to %.2f dollars.",
                          Card.convertEurosToDollars(1.00));
        System.out.println();
    }
}

Notice that the three class methods are invoked using the class name Card in the dot notation.
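Since the Card listing itself is not reproduced above, the class data member and the three class methods might look like the following sketch (the names follow the text; the rest is an assumption):

public class Card {
    // class data member shared by all Card objects
    private static double rate;

    public static void setRate(double newRate) {
        rate = newRate;
    }

    public static double getRate() {
        return rate;
    }

    // class method shared by all objects: Euros * rate = dollars
    public static double convertEurosToDollars(double euros) {
        return euros * rate;
    }
}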
The following is the output from the above program:

rate = 1.2128
1.00 euro is equal to 1.21 dollars.

Now the data members, constructors, and instance methods can be added to the Card class. The additional data members include two class constants, DEFAULT_ACCOUNT_NUMBER and DEFAULT_BALANCE, and two instance variables, accountNum and balance. There will be two constructors: a default constructor and another constructor that has two formal parameters to store values in the instance variables. The setAccountNum method is a mutator to set the value of the variable accountNum. Both the purchaseInDollars and purchaseInEuros methods receive a formal parameter and increment the balance by the amount in the parameter. In the purchaseInEuros method, the amount of Euros passed to the method is converted to US dollars by calling the convertEurosToDollars method. There will also be two accessors, getAccountNum and getBalance, to get the values of the two instance variables. The following program defines the Card class:

The complete main method in Fig. 5.16 includes the creation of a Card object, two purchases, one each in US dollars and Euros, and the output of the balance after each purchase.

Fig. 5.16 A client program for the Card class

The following is the output from the above program:

rate = 1.2128
1.00 euro is equal to 1.2128 dollars.
after spending 100.00 dollars
card: Account Number = 12345, balance = 100.00 dollars
after spending 100.00 euros
card: Account Number = 12345, balance = 221.28 dollars

## 5.7 Summary

* In addition to primitive data types, objects can be sent to and returned from methods.
* Constructors and methods can be overloaded by having the same name, but they must have a different number of parameters, different types of parameters, or parameters of different types in a different order.
* The reserved word this is used to refer to instance variables when there are parameters of the same name, and to constructors when one constructor is defined in terms of another.
* If a constant or variable is declared within a constructor or method, it is known as a local constant or variable.
* If a constant or variable is declared within an object, it is known as an instance constant or variable and can be accessed by any constructor or method within the object.
* The reserved word static causes a constant, variable, or method to be a class constant, variable, or method that can be accessed by all instances of the class.
* Take the time to determine where variables and methods should be declared to help balance readability, communication, debugging, maintainability, and memory allocation.

## 5.8 Exercises (Items Marked with an * Have Solutions in Appendix E)

1. Identify the valid and invalid overloaded constructors in the following code:

2. Identify the valid and invalid overloaded methods in the following code:

3. A hexahedron is a three-dimensional shape with six faces. In this problem, a class which represents a hexahedron with squares at the top and the bottom, as shown below, will be implemented.

Assume that hexahedrons are made of different materials; therefore, the weight needs to be kept along with the side and the height in order to describe a particular hexahedron. The following code implements the data members and a portion of the constructors of the Hexahedron class. Complete the first six constructors to call the last one by using the reserved word this.
4. Draw contour diagrams to show the state of execution right after the execution of the statement line1 = new LineSI(pt1, pt2); in Fig. 5.15 in Sect. 5.6.1.

5. Draw contour diagrams to show the state of execution right after the execution of the statement card.purchaseInEuros(100.00); in Fig. 5.16 in Sect. 5.6.2.

6. Implement a class Rectangle which represents a rectangle shape as described below:

*A. The Rectangle class has one private class constant, DEFAULT_VALUE, that should be initialized to 0.0.

*B. The Rectangle class has two private instance data members, sideX and sideY, of type double.

*C. The first constructor is a default constructor and calls the third constructor (described below) using the reserved word this to set the instance data members to the default value.

D. The second constructor calls the third constructor (described below) using the reserved word this. It receives a Rectangle object as a formal parameter and copies the sideX and sideY of that object to the new object.

E. The third constructor calls the setSides method (described below). Its two formal parameters are used as the parameters for the setSides method.

*F. The mutator methods, setSideX and setSideY, each have one formal parameter and store it in the corresponding instance data member.

G. Another mutator method, setSides, has two formal parameters and stores them in the instance data members by using the setSideX and setSideY methods (described above).

H. The accessor methods, getSideX and getSideY, return the value of the appropriate instance data member.

I. A method named calcArea computes the area of a rectangle and returns the computed area.

Next, write a client program to test the Rectangle class defined above. This class should be named Rectangles and should contain the main method, which performs the following tasks:

a. Declare three Rectangle objects.

b. Create three Rectangle objects using the three different constructors.

c. Output the contents of sideX and sideY of the three objects.

d. Output the area of the third rectangle.

Here is some sample output:

rectangle1: sideX = 0.0, sideY = 0.0
rectangle2: sideX = 3.0, sideY = 4.0
rectangle3: sideX = 3.0, sideY = 4.0
rectangle3: area = 12.0

7. Expand the PointD class discussed in this chapter to include the quadrant information of a point. The x-axis and y-axis divide the plane into four regions called quadrants. The quadrants are labeled starting at the positive x-axis and going around counterclockwise as shown below:

Write the new PointD class as described below. Points falling on the x-axis or y-axis are not considered to be in any quadrant and therefore return the default value, 0:

A. The PointD class has two private class constants, DEFAULT_VALUE of type double and DEFAULT_QUADRANT of type int, that should be initialized to 0.0 and 0, respectively.

B. The PointD class has two private instance data members, x and y, of type double.

C. The PointD class has one private instance data member, quadrant, of type int.

D. The first constructor is a default constructor and calls the third constructor (described below), by using the reserved word this, to set the instance data members to the default values.

E. The second constructor receives a PointD object as a formal parameter and stores the x, y, and quadrant of the object as the values of the instance data members.
F. The third constructor calls the setPoint method (described below). Its two formal parameters are used as the parameters for the setPoint method.

G. The mutator methods, setX and setY, each have one formal parameter and call the setPoint method (described below). The setX method changes the value of the data member x to the value of the parameter, and the setY method changes the value of the data member y to the value of the parameter.

H. Another mutator method, setPoint, has two formal parameters and stores these values in the instance data members, x and y. It also sets the correct value for the data member quadrant depending on the values of the two parameters.

I. The accessor methods, getX, getY, and getQuadrant, return the value of the appropriate instance data member.

Next, write a client program to test the PointD class defined above. Call this class Points. The main method should perform the following tasks:

J. Declare five PointD objects.

K. Create five PointD objects using the three different constructors. The points should be in three different quadrants and also at the origin.

L. Output the contents of x, y, and quadrant for the five objects.

M. Change the value of x or y for one of the points using a mutator so that the point will move to a different quadrant.

Here is some sample output:

point1: (0.0, 0.0) in quadrant 0
point2: (2.0, -5.0) in quadrant 4
point3: (2.0, -5.0) in quadrant 4
point4: (2.0, 5.0) in quadrant 1
point5: (-2.0, 5.0) in quadrant 2
after calling set method
point3: (-2.0, -5.0) in quadrant 3

# 6. Strings

Abstract

This chapter discusses string variables and the String class. In addition to the concatenation of strings, various methods defined in the String class, such as the length, indexOf, and substring methods, are examined. The toString method, which returns a string representation of the properties of an object, is also shown, along with a complete program implementing String objects.

## 6.1 Introduction

Up until now, this text has focused on numerical values such as integers and real numbers. In this chapter the focus is on text values. Characters are another fundamental type of data used on a computer, and a string in Java is a sequence of characters. Each programming language supports a particular character set, which is a list of characters in a particular order. The ASCII (American Standard Code for Information Interchange) character set is the most common one. The basic ASCII set uses seven bits per character to support 128 different characters, including letters, punctuation, digits, special symbols, and control characters. In order to support more characters and symbols from many different natural languages, Java uses the Unicode character set, which uses 16 bits per character, supporting 65,536 unique characters. ASCII is a subset of the Unicode character set.

Strings are not represented as a primitive data type such as int, double, or char, but as an object of the String class. Text values can also be passed as an argument to methods such as System.out.print as described in Chap. 1.
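As a small aside illustrating the character set discussion, each char in Java holds a 16-bit Unicode value, which can be seen by casting (a hypothetical snippet, not from the text):

char letter = 'A';
System.out.println((int) letter);           // prints 65, the Unicode (and ASCII) value of 'A'
System.out.println((char) (letter + 1));    // prints B, the next character in the set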
Similar to numbers, strings can be assigned to variables and manipulated using operators and methods defined in the String class.

## 6.2 String Class

The String class is a standard class, like the Math or Scanner classes, defined in the java.lang package. The following illustrates how a String variable is declared and a String object is created:

String fullName;
fullName = new String("Maya Plisetskaya");

After the variable fullName is declared as type String, the second statement creates an object with the value "Maya Plisetskaya", and then a reference to the new object is placed in the variable fullName. The contour diagram in Fig. 6.1 illustrates the state of execution after the above two statements.

Fig. 6.1 An object of the String class

Because the String class is a predefined class, a variable name is not in the contour diagram of the String object. Although the String class is not a primitive data type, a String object can be created by assigning a string within double quotes to a String variable, for example,

String fullName;
fullName = "Maya Plisetskaya";

Even though it looks like the text value is directly assigned to the variable, the variable fullName does not contain an actual value, as with a primitive data type, but rather the address of the object. The contour diagram after the above two statements will be exactly the same as the one shown in Fig. 6.1.

Further, notice that the following statements using the keyword new will assign a reference to an empty string to the variable:

String fullName;
fullName = new String();

In contrast, a simple assignment statement can be used to indicate that the variable does not reference any object at all:

String fullName;
fullName = null;

The differences between creating String objects using new statements and assignment statements will become more apparent in Sect. 6.4. Except for a few occasions, the new statement will be used to create a String object in order to reinforce the ideas of object creation. In either case, once a String object is created, the string value inside of the object cannot be modified, which means that none of the characters in the string can be changed, nor can the string be shortened or lengthened. This property is called being immutable. If a string needs to be modified, an object of type StringBuffer, which is a mutable sequence of characters, can be used, but this is beyond the scope of this text.

## 6.3 String Concatenation

Although strings cannot be modified, there are a number of operators that can be used with strings. A useful String operation is concatenation, accomplished by the use of a plus symbol, +, which was introduced briefly in Chap. 1 to support output. Two strings can be combined to create a new string. Consider the following example code segment:

String firstName, lastName, fullName;
firstName = new String("Maya");
lastName = new String("Plisetskaya");
fullName = firstName + " " + lastName;

A first name and a last name are assigned to separate variables, firstName and lastName, respectively, and then combined together using the string concatenation operator. A contour diagram for fullName is again exactly the same as the one in Fig. 6.1. Notice that a space is concatenated between firstName and lastName. Without it, fullName would have the first name and the last name combined together as in "MayaPlisetskaya".

A plus symbol was introduced as an arithmetic addition and as a concatenation in the output statements in Chap. 1.
When an operator represents more than one operation, it is called an overloaded operator. What happens if overloaded operators appear in an expression with mixed types? The Java compiler treats + as an arithmetic addition when both the left and right operands are numbers; otherwise it will treat it as a string concatenation. Remember that the plus symbol is evaluated from left to right, and the result of an expression with mixed types is of String type. For example, what would the output be for the following code segment?

int num1, num2;
String str1, str2;
num1 = 2;
num2 = 3;
str1 = new String("num1 + num2 = ");
str2 = new String(" = num1 + num2");
System.out.println(str1 + num1 + num2);
System.out.println(num1 + num2 + str2);
System.out.println(str1 + (num1 + num2));

The first print statement results in

num1 + num2 = 23

Since the left operand of the first plus symbol is a String and the right operand is an int, it will treat the contents of num1 as a String. Because the first plus sign was treated as concatenation, the left operand of the second plus sign is of String type. Further, since the right operand of the second plus symbol is an int, it will again treat the contents of num2 as a String.

How about the second print statement? The first plus sign is treated as an arithmetic addition, because the left and the right operands of the first plus sign are both int types. Then, the second plus symbol is treated as a string concatenation, since the last operand is of type String and the operands are of mixed types. The output will be

5 = num1 + num2

In the third print statement, parentheses will force (num1 + num2) to be evaluated first. Therefore, the second + is treated as an arithmetic addition. The result will be

num1 + num2 = 5

Another operator that can be used on String objects is the shortcut operator +=. It has the same effect as the shortcut for arithmetic addition discussed in Chap. 1 and is left as an exercise at the end of the chapter.

## 6.4 Methods in String Class

There are over 50 methods defined in the String class, which can be found in the Java API specification document on the Oracle website at

* http://docs.oracle.com/javase/7/docs/api/java/lang/String.html

In this section, six of the most commonly used ones will be discussed: length, indexOf, substring, equals, equalsIgnoreCase, and charAt.

### 6.4.1 The length Method

In order to find the number of characters in a String object, the length method is used. For example, if the variable fullName refers to the string "Maya Plisetskaya", then

fullName.length()

will return the value 16, because there are 16 characters in the string. Notice that the space between the first name and the last name is counted as a character. If the string is empty, applying the length method results in 0.

### 6.4.2 The indexOf Method

A character in a string can be referred to by its position, in other words its index, in the string. The index of the first character is 0, the second character is 1, and so on, as illustrated in Fig. 6.2.

Fig. 6.2 Index of characters in the string

To find the position of a substring within a string, the indexOf method can be used. The method will return the position of the first character of the substring in the string.
Here are some examples using fullName:

statement | return value
---|---
fullName.indexOf("Maya") | 0
fullName.indexOf("set") | 8
fullName.indexOf("Set") | -1
fullName.indexOf("ya") | 2
fullName.indexOf(" ") | 4

The first statement returns 0 because "Maya" occurs at the beginning of the string. The word "set" starts at position 8. The return value -1 from the third statement indicates that the substring does not exist in the string; since indexOf performs a case-sensitive search, it did not find "Set" starting with an uppercase letter. There are two occurrences of "ya", at positions 2 and 14. When there is more than one occurrence of the substring in the string, the position of the first character of the first matching substring is returned, so the fourth statement returns 2. As was mentioned before, a space is considered to be a character; therefore, the last statement returns 4, which is the position of the space in the string.

### 6.4.3 The substring Method

On some occasions, one's name needs to be printed in the format of a last name, a comma, a space, and a first name. How can it be formatted if the full name is given as a first name, a space, and a last name? The answer is that the first name and the last name can be extracted from the full name and rearranged. In order to extract a substring from a string, the substring method can be used. The substring method takes two integers as arguments: the position of the first letter of the substring and the position of the last letter of the substring + 1. Using the string in Fig. 6.2, this means that the statement fullName.substring(8, 11); will return "set". Here are some more examples:

statement | return value
---|---
fullName.substring(0, 4) | Maya
fullName.substring(2, 2) | an empty string
fullName.substring(10, 6) | runtime error
fullName.substring(18, 20) | runtime error

The second statement will create a String object with an empty string. The third example gives a runtime error, because the first argument should be the same as or smaller than the second. In the fourth example, the arguments should be in the range of 0–16; otherwise they are out of bounds and cause a runtime error.

Obtaining the first name, "Maya", from fullName is not very difficult. The statement fullName.substring(0, 4) would work. However, consider when fullName contains a different name, for example, "George Balanchine". Then fullName.substring(0, 4) would return "Geor", which is not the first name. How can this be changed so that the statement will extract the first name from any full name? Notice that the first name and the last name are separated by a space. So, using the position of the space, spacePos = fullName.indexOf(" "), the first name can be easily extracted from any full name as in fullName.substring(0, spacePos). Once the first name is obtained, how can the last name be extracted? Remember that the last name starts right after the space, so the position of the first letter of the last name is spacePos + 1. Where does it end? It ends at the end of the string. Since fullName.length() returns 16 for "Maya Plisetskaya", which is the position of the last letter of the last name + 1, this is perfect for the second argument of the substring method when extracting the last name. All the pieces are put together in the following program:

Alternatively, without declaring the variables spacePos and len, one could use the return values from the indexOf and length methods as arguments for the substring method.
firstName = fullName.substring(0, fullName.indexOf(" "));
lastName = fullName.substring(fullName.indexOf(" ") + 1,
                              fullName.length());

Which way is better? The first option allocates memory for two more variables, spacePos and len; however, it does not call the indexOf method twice as in the second option. For a small example like this, it does not matter which option one uses. For large programs, try to remember not to waste too much memory by declaring unnecessary variables, and also try not to invoke complex methods multiple times. One should always be aware of the trade-off between space and time and strike a good balance between them when developing a large application.

An example of the input and output from the above program is shown below:

Enter full name, first name followed by last name: Maya Plisetskaya
Plisetskaya, Maya

### 6.4.4 Comparison of Two String Objects

While a double equal sign, ==, was used to compare primitive data types, comparing two String objects takes extra care. Examine the following code segment:

String str1, str2;
str1 = new String("saddles");
str2 = new String("saddles");
System.out.println(str1 == str2);

Is the output true or false? As a matter of fact, it prints false. Why does the comparison of str1 and str2 return false? Both String variables seem to contain the same value, "saddles", but remember that a String variable contains a reference to the String object, not the string itself. Since str1 and str2 are two completely different objects, the two variables refer to different addresses, as shown below:

The correct way to compare the contents of String objects is to use the String method equals:

System.out.println(str1.equals(str2));

The above statement will output true, since both str1 and str2 have the same value. The equals method does not compare the references, but rather the contents of the strings being referenced. What about when a String object is created by assigning a string literal?

String str3, str4;
str3 = "halters";
str4 = "halters";
System.out.println(str3 == str4);
System.out.println(str3.equals(str4));

Interestingly, both print statements output true. This is because when the value is assigned to str4, the Java compiler will search the existing String objects for an exact match. If it finds one, which is the case here, a new String object is not created. Instead, the variable is assigned a reference to the existing String object, shown below:

Of course, if the contents of one String variable are copied to another String variable, both variables would point to the same object, as shown below, because what is copied is the address of the object:

String str5, str6;
str5 = new String("bridles");
str6 = str5;
System.out.println(str5 == str6);
System.out.println(str5.equals(str6));

As can be seen in the above contour diagram, both print statements output true. Recall that this is exactly the same situation discussed in Sect. 2.9, where the variables of Number objects, num1 and num2, reference the same object containing the integer 5 after the assignment statement num1 = num2, shown in Fig. 2.24 and repeated below:

The contour diagram showed that the intended task of copying the integer 5 from num1 to num2 was not accomplished. In general it is not a good idea to have two variables pointing to the same object, unless it is a String object.
If the contents of the object num1 is referring to were modified by using a mutator method, the contents of the object num2 is referring to would be automatically changed, because they are pointing to the same object. Is it the same way with String objects? If one were to execute the following statement to modify the contents of str5,

str5 = "reins";

the Java compiler would search the existing String objects for one containing "reins". So far, two objects with "saddles", one object with "halters", and one object with "bridles" have been created. Since it does not find an object with "reins", a new String object will be created. Therefore, str5 and str6 will be referencing different String objects, as shown below:

Now, the following statements will both output false:

System.out.println(str5 == str6);
System.out.println(str5.equals(str6));

Unlike with num1 and num2, because of the immutable nature of the String type, there is no danger of modifying the contents of one object when two String variables are referencing the same object.

### 6.4.5 The equalsIgnoreCase Method

Assume that a program to play a Tic Tac Toe game has been written. At the end of each game, a user will be asked if he or she would like to play another game. For example, consider the code segment in Fig. 6.3:

Fig. 6.3 Use of a method from the String class to compare strings

Because of the !, the condition of the if statement is true when the user does not enter yes. Then, the variable selection will be changed to false, and eventually the program stops. What happens if a user wanted to play another game and entered Yes instead of yes? Because the equals method checks for an exact match, the if condition again is true. In case the user types yes in different ways, the if condition can be modified to

if(!(response.equals("yes") || response.equals("Yes") ||
     response.equals("YES")))
    selection = false;

Then, the user can enter "yes", "Yes", or "YES" to continue. Actually, there is a way to include all the combinations of upper- or lowercase characters in the word "yes", such as "yEs", "yeS", and "yES". One can compare the contents of String objects ignoring the case of the characters in the string. The equalsIgnoreCase method compares the contents of a String object to those of another String object, ignoring case considerations. Two strings are considered to be equal if they are of the same length and the corresponding characters in the two strings are equal ignoring case. In other words, the comparison can be done in a case-insensitive way. One can rewrite the if condition as

if(!response.equalsIgnoreCase("yes"))
    selection = false;

Given the equalsIgnoreCase method, the user can enter "yes", "Yes", "YES", or any other combination of uppercase or lowercase characters in the word "yes" to continue.

### 6.4.6 The charAt Method

The charAt method returns the character stored at the specified position in the string. For example, if the variable fullname refers to the string "George Balanchine", then fullname.charAt(0) will return the value 'G', because the character 'G' is the first character. The statement fullname.charAt(2) will return the value 'o', because the index of the character 'o' is 2. Suppose one would like to know the number of occurrences of a certain character in a string, for instance, the character 'G' in fullname. Each character in fullname can be checked using the charAt method inside a loop, and a counter can be incremented.
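A sketch of such a loop might look like the following (a hedged reconstruction, since the book's own listing is not reproduced here; it assumes fullname as above and a counter named count):

int count = 0;
char letter;
for(int i = 0; i < fullname.length(); i++) {
    letter = fullname.charAt(i);   // examine each character in turn
    if(letter == 'G')              // count only the uppercase 'G'
        count++;
}
System.out.println("The name " + fullname + " contains "
    + count + " character \"G\".");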
A code segment along these lines counts the number of 'G' characters in "George Balanchine", and its output would be

The name George Balanchine contains 1 character "G".

Notice that it only counts the capital letter 'G' and ignores the lowercase letter 'g'. If both uppercase and lowercase letters need to be counted, the if condition would look like

if(letter == 'G' || letter == 'g')

and the code will output 2, because one uppercase 'G' and one lowercase 'g' exist in "George Balanchine". A summary of some of the methods in the String class can be found in Table 6.1.

Table 6.1 Various methods in the String class

Method | Function performed | Arguments | Value returned
---|---|---|---
charAt(pos) | Returns character at given index | int | char
equals(str) | Compares strings | String | boolean
equalsIgnoreCase(str) | Compares strings ignoring case | String | boolean
indexOf(str) | Returns index of first occurrence of substring | String | int
length() | Returns length of string | None | int
substring(pos,pos) | Returns substring of string | int, int | String

## 6.5 The toString Method

The overriding method toString receives no parameters and returns a String type. Although overriding methods will be discussed further in Chap. 9, it is introduced here because it is a useful method that helps output data stored in objects. Prior to demonstrating how toString works, the PointD class from Fig. 5.4 in Chap. 5 is relisted in Fig. 6.4.

Fig. 6.4 A client program and the PointD class

The main method in Fig. 6.4 creates objects of the PointD class and finds the midpoint of the two points. After executing the program, the output is

The mid-point between (4.0,4.0) and (8.0,7.0) is (6.0,5.5)

What would happen if the last five print statements of the main method were replaced by the following statement?

System.out.println(middle);

This statement is trying to output middle, which is a PointD object. Does it output the contents of x and y of middle? The answer is no. Instead, the output would look like the following:

PointD@ae3364

What is this? Is it garbage? The answer to the second question is no, it is not garbage. However, it is not very useful information at this level of programming. The System.out.println outputs the name of the class PointD, an @ symbol, and the memory address of the object in hexadecimal (base 16) representation. Since each time the program is run the object might be in a different location in memory, the output may be different every time the program is executed. In order to output the contents of x and y, one needs to use accessor methods, such as getX and getY, as done in Fig. 6.4. However, wouldn't it be nice if there was a method to return the contents of an object? A toString method could be written in the PointD class to return a string representation of the contents of the data members of an object. The method could return x and y as the location of a point in the format (x,y) and would be written as follows:

public String toString() {
    return "(" + x + "," + y + ")";
}

Since the values in x and y are concatenated with strings, they are converted to type String and would be returned as a String. Then, in the following statement, the object middle can call the toString method

System.out.println(middle.toString());

and the above statement will produce an output of

(6.0,5.5)

Now, if the last five print statements in the main method in Fig. 6.4 were replaced by the following code,
System.out.println("The mid-point between "
    + p1.toString() + " and " + p2.toString() + " is "
    + middle.toString());

it would produce the same output as the original code, as follows:

The mid-point between (4.0,4.0) and (8.0,7.0) is (6.0,5.5)

The usefulness of the toString method will be appreciated more when objects are discussed further in Chap. 9.

## 6.6 Complete Program: Implementing String Objects

In this section, an application which outputs course information will be developed. The program will:

* Ask the user for the name of a class. The input consists of a department code, a course number, and a course title, such as "CS 360 Theory of Computation".
* Process the input.
* Output the title of the class, the level of the class, and the department that offers the class.

An example of the input and output for the Theory of Computation course would be

Enter the course: CS 360 Theory of Computation
The class, "Theory of Computation", is a
junior level class offered by the
Computer Science department.

and the input and output for a Calculus course could be

Enter the course: MA 213 Calculus I
The class, "Calculus I", is a
sophomore level class offered by the
Mathematics department.

When the user provides input, the program will create an object and store the pieces of information inside of the object. The name of the department will be determined by the department code, which is the first piece of the input. The course number is the second piece of the input, and the course title is the rest of the input. The level of the course will be obtained by checking the course number. Figure 6.5 contains the code defining the class for a Course object.

Fig. 6.5 Course class

The Course class consists of four data members that are all instance variables, two constructors, and mutators and accessors for each data member. The setDepartment method accepts a department code as a parameter, then the if-then-else structure determines the department, and the value is assigned to the data member. The setLevel method uses the value of the data member number to figure out the level of the class. In order to use a case structure, the first character of number is extracted and converted to a character, since only char, byte, short, or int types can be used in the case statement. The charAt method is used to convert the string to a character: it takes the position of a character in a string and returns the character at that position. The main program which uses the Course class is shown in Fig. 6.6.

Fig. 6.6 A client program for the Course class

After the user enters the input, the pieces of information are extracted and used to create a Course object. Notice that in order to include a double quote in a string literal, a backslash is used, as in \", which was discussed in the output section of Chap. 1. This application can be extended to accommodate more departments and graduate level classes. Course objects can also be stored in an array for further manipulation, which will be discussed in Chap. 7.

## 6.7 Summary

* A String object can be created by using the new, =, or += operators.
* String objects are immutable, which means their contents cannot be changed.
* When a String object is created by assigning a string literal, the Java compiler will search the existing String objects for an exact match. If it finds one, the variable is assigned a reference to the existing String object.
* When a String object is created using the keyword new, a new object will be created even if there already exists an object with the same string value.
* Individual characters of a string are numbered starting from 0.
* When the equals method is applied to String objects, it compares the contents of the objects being referenced.
* To compare the contents of String objects, the == operator cannot be used, since it compares the references to the objects.
* Some String methods include indexOf, length, substring, equals, equalsIgnoreCase, and charAt.

## 6.8 Exercises (Items Marked with an * Have Solutions in Appendix E)

1. Identify the errors in the following code segments:

A.
String text1;
text1 = new String(girth);

*B.
String text2;
text2 = new Text("shedding blade");

C.
String text3;
text3 = new Sting("grazing muzzle");
text3.indexOf("muzz");
text3.length(5);

2. Determine the return value for each of these expressions, assuming the following declaration:

String org;
org = new String("American Quarter Horse Association");

A. org.substring(5, 8)

*B. org.length()

C. org.substring(9, 22)

*D. org.substring(17, 19) + org.substring(20, 22)

E. org.substring(15, 16) + org.substring(18, 19) + org.substring(13, 14) + org.substring(org.length() - 5, org.length())

F. org += org

3. Draw contour diagrams to show the state of execution after the execution of the following code segment:

String s1, s2, s3, s4;
s1 = new String("stirrup irons");
s2 = "stirrup irons";
s3 = new String("stirrup irons");
s4 = s2;

4. Determine the output from the following code segment:

String star;
star = "*";
int i;
for (i=0; i<5; i++) {
    System.out.println(star);
    star += star;
}

5. Write a program that asks the user for a positive integer, receives the input as a String, and outputs the string with commas in the appropriate places. For example, if the input is

1000000

then the output is

1,000,000

6. Write a program for a given word and string that will

a. Check if the word is in the string.

b. Count all occurrences of the word in the string.

c. Remove all occurrences of the word from the string.

*7. With a given String object called org containing the value "American Quarter Horse Association", write a program to output an abbreviation of the string, AQHA.

8. Modify the previous program to ask a user for the name of his or her organization and print an abbreviation of the name. Realize that the name of the organization can consist of any number of words.

# 7. Arrays

Abstract

Arrays and array processing are illustrated in this chapter, starting with declaration, access, input, and output. In addition to simple processing, the passing of an array to and from a method is demonstrated. Other processing includes reversing, searching (sequential and binary), and sorting an array using the bubble sort. Also, two-dimensional arrays and arrays of objects are introduced, along with a complete program.
## 7.1 Introduction

Similar to a string, which can store a group of characters, an array can be used to store numbers of type int or double. Not only can arrays store numbers, but they can also be used to store strings, objects, and even other arrays. Arrays are extremely useful for storing data that needs to be processed more than once, such as data that needs to be searched or sorted.

Related to arrays are the predefined Array and Vector classes, which are beyond the scope of this text, because before learning how to use these classes, it is good to understand how to input, process, and output data using arrays. This chapter will first introduce the reader to declaring an array, and as in the past, the best way to learn is to get started with an example.

## 7.2 Array Declaration

When declaring an array, the type of data that will be stored in the elements of the array must be specified. For example, to declare a memory location to store a reference to an array of type int called number, one would write the following:

int number[];

Alternatively, and used more often, the above could be declared as

int[] number;

This reserves a memory location called number, the square brackets indicate that it will be an array, and the word int indicates that each element of the array can contain an integer. Initially, the memory location number will contain a null reference, which means it does not initially reference anything.

In order to create an array of three elements, the following instruction is needed:

number = new int[3];

Although the word new has also been used to create a new object, here it is used to create a new array. The number in the square brackets indicates the length of the array, in this case three elements. In this example, the first element is number[0] and the last one is number[2]. As with simple variables, the contents of the array are initialized to 0, but as in Chap. 1, this text will assume that the contents are indeterminate. Lastly, a reference to the array is placed into the memory location number via the assignment symbol and is represented as an arrow in the following diagram:

Alternatively, the previous two lines could be combined as follows:

int[] number = new int[3];

Although this takes up less space, the other two statements will be used more frequently to reinforce the concepts of declaration and allocation. As another alternative, a constant can be declared and used in the new statement. The advantage of this technique is that when iterating in a loop to process or output an array, the same constant can be used both to declare the array and as the end value of a for loop, as will be seen in the next section:

final int ARRAYSIZE = 3;
int[] number;
number = new int[ARRAYSIZE];

As another alternative, an array can be declared and initialized using the following technique:

int[] number = {0,0,0};

While this is somewhat useful for small arrays, it would be impractical to initialize hundreds of elements. Though often smaller arrays will be initialized this way in order to save space, an alternative is presented in the next section.

## 7.3 Array Access

Assuming that an array has been created at the beginning of the program using the statements in the preceding section, the array can now be accessed. In order to access an individual element of an array, the name of the array is followed by the index of the element to be accessed.
For example,

number[0] = 5;

indicates that the 0th element of the array, the first element, takes on the value 5. This is illustrated in the following diagram:

Be sure not to confuse the index, 0, with the contents of the array, 5. Notice that the 0th element of the array now contains the number 5. Should the contents of the first element need to be copied into the third element, it could be accomplished as follows:

number[2] = number[0];

and would be represented as shown below:

When accessing various elements of an array, be careful not to try to access or alter any elements outside the range of the array. In the example above, do not try to access number[-1] or below, or number[3] or above, because an execution error will occur.

Although the accessing of individual elements can be useful in particular instances, it is often more practical to be able to access all of the array elements. As an example, what if the elements of the array need to be initialized to zero? If only three elements need to be initialized, the technique illustrated at the end of the previous section could be used, but what if instead of three elements, one hundred elements needed to be initialized? Clearly, listing out one hundred individual zeros would be impractical. Instead, as mentioned previously in Chap. 4, this can be accomplished by using an iteration structure. Though any of the loop structures can be used, under different circumstances some iteration structures are better choices than others.

For example, if each element of the above array needs to be initialized to zero, which loop would be the best choice? Since there is a fixed number of elements to be initialized, a fixed iteration loop structure could be used, specifically the for loop as shown below:

for(int i=0; i<3; i++)
    number[i] = 0;

Notice that the loop control variable is of type int and iterates from 0 to 2, corresponding to the three elements of the array. For each iteration of the loop, the number 0 is placed into the ith element of the array. As when accessing individual elements of an array, be careful not to have the loop try to access elements that are outside the range of the array, such as number[-1] or number[3], because again an execution error will occur.

Assuming the declaration of the constant ARRAYSIZE in the previous section, the above code segment could be rewritten as follows:

for(int i=0; i<ARRAYSIZE; i++)
    number[i] = 0;

## 7.4 Input and Output

### 7.4.1 Input

One way to fill an array from the keyboard is to use a sentinel controlled loop, where the user keeps entering non-negative integers until a negative sentinel value is entered to stop. Assuming a Scanner object named scanner, a first attempt might look like the following:

// *** Caution: Incorrectly implemented code ***
i = 0;
System.out.print("Enter a non-negative integer ");
System.out.print("or a negative integer to stop: ");
number[i] = scanner.nextInt();
while(number[i] >= 0) {
    i++;
    System.out.print("Enter a non-negative integer ");
    System.out.print("or a negative integer to stop: ");
    number[i] = scanner.nextInt();
}

As indicated by the comment prior to the code, the above code segment is implemented incorrectly. Although it appears to input all the valid data into the array, what is the problem? The problem is that the sentinel value is also input into the array. While this is not a major issue, the array would have to be declared to be one element larger to accommodate the sentinel value. Further, one would need to write all subsequent code to not process or output the sentinel value, which could be a potential source of logic errors.

The best solution is not to put the sentinel value in the array in the first place. How could this be done? The problem is that both input statements put the values directly into the array. As an alternative, the value could be input into a temporary variable and checked to see whether it is the sentinel value before putting it into the array.
However, instead of adding a couple of extra if statements, note that the while loop already checks for the sentinel value. If the value in the temporary variable is not the sentinel value, the body of the loop is entered and the value in the temporary variable can be copied into the array. On the other hand, if the value in the temporary variable is the sentinel value, the loop is not executed and the sentinel value is not placed in the array. A good name for the temporary variable is temp, as in the following segment:

i = 0;
System.out.print("Enter a non-negative integer ");
System.out.print("or a negative integer to stop: ");
temp = scanner.nextInt();
while(temp >= 0) {
    number[i] = temp;
    i++;
    System.out.print("Enter a non-negative integer ");
    System.out.print("or a negative integer to stop: ");
    temp = scanner.nextInt();
}

However, what is preventing the user from entering more data than there is space for in the array? Assume that the array is fixed at a particular size, as in the following declaration and allocation:

final int ARRAYSIZE = 10;
int[] number;
number = new int[ARRAYSIZE];

Note that a constant is being used for the allocation of the array. The while statement in the above code segment can now be altered using the constant to ensure that the user does not enter more data than was allocated for the array, as shown below:

while(temp >= 0 && i < ARRAYSIZE) {

or alternatively

while(temp >= 0 && i < number.length) {

Whereas the previous example using the for loop had the advantage that the array was the exact size the user wanted, the disadvantage was that the user might miscount the number of data items to be entered. However, the advantage of the sentinel controlled loop above is that it does the counting for the user, but the disadvantage is that it is still using a fixed-size array. Can't the user enter the size of the array? It is possible that they could, but then the same problem could occur as before, where the user might miscount the number of items to be input. Further, the code in the sentinel controlled loop is already doing the counting of the number of items, and the array has to be declared before the data is input.

In the field of computer science, there are always trade-offs, and it is up to the designers of the algorithms to determine the best possible solution to the problem at hand. As will be seen in subsequent courses in computer science, the concept of a linked list is helpful in solving the above problem, but that solution is not without its own set of limitations. Another possible solution to the current problem, when there are more data items to be entered into an array than space has been allocated, is to have the program allocate an array of a larger size, say twice as large, then copy the contents of the old array into the new one and allow the user to continue to enter data into the new array. Although this solution might slow down the processing, it does avoid the consequences of an array that is not large enough, and it is left as an exercise at the end of the chapter. However, in this text, when using the sentinel controlled loop, the emphasis will be on selecting the right size array in the first place.

### 7.4.2 Output

The output of an array could be done as the data is input, but then the output would be intermixed with the input. A better solution is to output the contents of the array after all the data has been input.
But how does one know how many data items have been input when using a sentinel controlled loop? The answer is with the variable i used in the previous code segment. Since a fixed number of values have been input, a for loop is the best choice for output. The for loop could be written to iterate i times, but since i is typically used as a loop control variable, it might be better to copy the value in i to another variable such as n, and then have the for loop reuse the variable i as a loop control variable and iterate n times. It is also helpful to add a column heading prior to the output of the contents of the array, as shown in the following code segment:

n = i;
System.out.println();
System.out.println("Integers");
System.out.println();
for(i=0; i<n; i++)
    System.out.println(number[i]);

Should the contents need to be output in reverse order, the loop can instead start at the end of the array and work backward:

for(i=n-1; i>=0; i--)
    System.out.println(number[i]);

Notice that the loop control variable starts at n-1, the loop continues while i is greater than or equal to 0, and i is decremented each time through the loop. Although this would output the array in reverse order to the user, have the values in the array changed? The answer is no. So what if instead of outputting the array in reverse order, one actually wanted to reverse the contents of the array? One way to accomplish this task is to declare another array and then copy the contents of the first array into the second array in reverse order. However, what is a possible drawback of this solution? The problem is that it takes two arrays, or twice as much memory. In this example, it would require two 10-element arrays for a total of 20 elements. For a small array this is not much of a problem, but for a very large array, this would entail a substantial amount of memory. Instead, the solution is to reverse the array in place, thus using only one array.

The algorithm takes the first data item and the last data item and swaps them. Then, the second data item and the second to the last data item are swapped, and so on, as shown in Fig. 7.1.

Fig. 7.1 Reversing an array

Again, one needs to be careful not to swap elements that do not contain values. When n equals 6, element 0 is swapped with the n-1 element, then element 1 is swapped with the n-2 element, and so on. The loop control variable can be used for elements 0, 1, and 2, but how does one access elements n-1, n-2, and n-3? One solution is to use a second variable such as j, so that when the loop control variable, say i, is incremented, the variable j is decremented. But are two variables really needed? If one thinks about it, one should be able to see a pattern in accessing both ends of the data. When i is zero, the contents of location 0 need to be swapped with location n-1. Although a little difficult to see here, in the first instance i is equal to 0, so n-1 could be thought of as n-i-1. However, sometimes a pattern is difficult to see in the first instance but can be seen a little better in subsequent instances. Consider the next case: when i is 1, it needs to be swapped with n-2. Since i would be equal to 1, n-2 could again be thought of as n-i-1. So instead of using two indexes, only one index is needed, which is a little more elegant.

Lastly, the matter of the swap needs to be considered. If the contents of two simple variables need to be swapped, how can this be accomplished? When the value of one variable is transferred to another variable, the previous contents of the variable being swapped into are destroyed, so the previous contents need to be stored in a temporary memory location, often called temp.
First the contents of the variable x need to be put aside in the temporary memory location temp using a temp = x; instruction. + +Once the contents of variable x have been moved into temp, the contents of variable y can be copied into the variable x using an x = y; instruction. + +Now that the contents of y have been copied into x, the contents of temp can be copied into the variable y using a y = temp; instruction. + +The whole sequence of instructions is as follows: + +temp = x; + +x = y; + +y = temp; + +So how can this be used with an array? Instead of using simple variables, the corresponding locations of the array can be substituted using the indexes i and n-i-1 as discussed above and shown below: + +temp = number[i]; + +number[i] = number[n-i-1]; + +number[n-i-1] = temp; + +Assuming i is equal to 0 and n is equal to 6, then going from left to right in Fig. 7.2 the execution of the three instructions is shown in the dashed boxes above each array. + +Fig. 7.2 + +Swapping items in an array + +Putting it all together with the loop results in the following code segment. However, one needs to be careful when writing the code to solve this problem. For example, can the error in the following code segment be spotted? + +// *** Caution: Incorrectly implemented code *** + +for(i=0; i<n; i++) { + +temp = number[i]; + +number[i] = number[n-i-1]; + +number[n-i-1] = temp; + +} + +The problem is that the loop iterates n times, so after the elements have been swapped in the first half of the array, they are swapped back again in the second half, leaving the array in its original order. The solution is to stop at the middle of the array by iterating only n/2 times as shown below: + +for(i=0; i<n/2; i++) { + +temp = number[i]; + +number[i] = number[n-i-1]; + +number[n-i-1] = temp; + +} + +## 7.7 Sorting Data + +### 7.7.1 Simplified Bubble Sort + +One of the simplest ways to sort the data in an array is the bubble sort. The idea is to compare adjacent elements of the array and swap them if they are out of order, so that on each pass through the array the smallest value slowly bubbles its way toward the top of the array. The outer loop controls the number of passes and the inner loop compares each pair of adjacent elements as shown in the following code segment, where as before n contains the number of values in the array: + +for(i=0; i<n-1; i++) + +for(j=0; j<n-1-i; j++) + +if(number[j] > number[j+1]) { + +temp = number[j]; + +number[j] = number[j+1]; + +number[j+1] = temp; + +} + +The reader is encouraged to walk through the code segment to see how the algorithm works. Again, notice how the smallest number slowly moves or bubbles its way to the top of the array during each pass, thus giving the name to the bubble sort. To analyze the speed of this algorithm, it should be noticed that the outer loop iterates n-1 times. However, when doing analysis like this, iterating one less time than n is not very significant for a very large number n, so it is said to be of order n. The inner loop iterates one less time on each pass going from n-1 to 1 times, where it could be said that it loops on average n/2 times. But again, for a very large n, the result of the division by two would still be a large number, so it is also said to be of order n. Recall from Chap. 4 that with two nested loops each iterating n times, the total number of iterations would be n*n, or n 2. Since in the current example one loop is nested inside the other and each loop is iterating approximately n times, this algorithm is of order n 2, or O(n 2). + +### 7.7.2 Modified Bubble Sort + +In the previous simplified sorting algorithm, does it make any difference whether the data in the array is in reverse order, random order, or already sorted? The answer is no, because the outer loop will still iterate n-1 times and the inner loop will still iterate n/2 times on average. Although this does not make a difference if the array is in reverse order, nor does it make a lot of difference if the array is totally random, what if the array is already sorted? Granted this might not happen very often, but if it was already sorted, it would still take O(n 2) to sort an already sorted array. Is there some way that this can be improved? During the first pass through the array, if there are no swaps between any of the pairs of elements, then it would be known that the array is already in order. Can the program be modified to take advantage of this scenario? Yes, a boolean flag can be used to indicate whether a swap has or has not occurred, and a good name for this flag is swap.
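Before adding the flag, it may help to run the simplified sort from start to finish. The following is a minimal sketch (the class name BubbleSortDemo and the sample data are hypothetical, not from the text): + +class BubbleSortDemo { + +public static void main(String[] args) { + +int[] number = {42, 7, 19, 3, 25}; + +int n = number.length; + +int temp; + +// outer loop controls the number of passes + +for(int i=0; i<n-1; i++) + +// inner loop compares each pair of adjacent elements + +for(int j=0; j<n-1-i; j++) + +if(number[j] > number[j+1]) { + +temp = number[j]; + +number[j] = number[j+1]; + +number[j+1] = temp; + +} + +// output the sorted array: 3 7 19 25 42 + +for(int i=0; i<n; i++) + +System.out.println(number[i]); + +} + +}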
+ +The first for loop could be replaced with a while loop that not only checks to see how many passes have occurred but also checks to see if a swap has occurred. If a swap has not occurred, then another pass is not necessary. Initially the swap flag could be set to true prior to any code to indicate that a swap has occurred. This would force the execution of the first pass through the outer loop. The first thing to be done inside the loop is to reset the swap flag to false, so that in case there are no swaps during the inner loop, no subsequent passes through the outer loop need to occur. Lastly, should a swap occur in the if statement, the swap flag is set to true, thus forcing another pass through the outer loop: + +swap = true; + +i = 0; + +while(i < n-1 && swap) { + +swap = false; + +for(j=0; j<n-1-i; j++) + +if(number[j] > number[j+1]) { + +swap = true; + +temp = number[j]; + +number[j] = number[j+1]; + +number[j+1] = temp; + +} + +i++; + +} + +As before, notice that swap is used in the while loop instead of swap == true or swap != false. Also notice the addition of the extra set of braces for the while loop, because now syntactically there are three statements in the body of the loop: the setting of swap to false, the for statement, and the increment of i. Lastly, notice that if there is more than one swap in the inner for loop, swap is set repetitively to true. Although this seems a little redundant, it is quicker and easier to just keep setting swap back to true than adding code to check to see if it is already set to true. + +The result is that if the data in the array is in reverse order, there is no increase in the speed of the algorithm. However, if the data is already in order, then there is only one pass through the outer loop, and the inner loop iterates n-1 times. So, this algorithm with data already sorted is O(n), and the bubble sort is one of the fastest sorting algorithms for data that is already in order. Although it might seem a little confusing to use a sorting algorithm on data that is already sorted, the algorithm also works fairly well for data that is close to being in order. If only a few items need to be swapped, then the outer loop will only iterate a few times, until there is a pass without any swaps, in which case the outer loop stops iterating. So in cases where data is possibly in order, or close to being in order, the bubble sort is a very good sort. However, for large amounts of data that is in reverse order, close to being in reverse order, or totally random, the bubble sort is not the best choice. As will be seen in later courses, there are a number of other sorting algorithms that can handle these situations much faster. Nonetheless, for this text, the bubble sort provides a good starting point for understanding how sorting algorithms work and can be used to sort small sets of data. + +## 7.8 Two-Dimensional Arrays + +The preceding sections introduced how to declare variables for one-dimensional arrays, how to create them, and how to access elements in them. One-dimensional arrays work well when dealing with a set of data such as a collection of grades for one student. However, what if there are multiple sets of data, such as grades for several students? Then, the data could be stored in a two-dimensional array, which is sometimes called a 2D array. + +### 7.8.1 Declaration, Creation, and Initialization + +Suppose that there are four students in a class and they each took three exams.
Instead of creating four separate one-dimensional arrays in order to record the exam scores for each student, one two-dimensional array can be used to store all the scores. Three exam scores for each student are kept in a row; therefore, there will be four rows and three columns in the table. Assume that the scores are of type int and the name of the array is scores. To declare a two-dimensional array, two sets of brackets are required. The first one is for the rows and the second one is for the columns as shown below: + +int scores[][]; + +which is equivalent to + +int[][] scores; + +The two sets of brackets can appear either after or prior to the name of the array, and the second form above is used more often. + +The following creates a two-dimensional array of four by three integer values: + +scores = new int[4][3]; + +The number 4 in the first set of brackets specifies the number of rows and the number 3 in the second set of brackets specifies the number of columns. The diagram in Fig. 7.8 illustrates the array after its creation. Notice that a two-dimensional array is actually an array of one-dimensional arrays: each element of the outer array is itself a one-dimensional array. + +Fig. 7.8 + +After creation of 2D array + +An array can be declared and created at the same time using the following statement: + +int[][] scores = new int[4][3]; + +The diagram for the above statement is the same as that in Fig. 7.8. Again, in order to reinforce the concepts of declaration and allocation, two separate instructions are used in this text. + +To access the data in a two-dimensional array, two subscripts or indices are used, one for the row number and the other for the column number. As in a one-dimensional array, each index is of type int and starts from 0. The first exam score of the first student is stored in scores[0][0], the second exam score is stored in scores[0][1], and the third exam score is stored in scores[0][2]. The scores for the second student are kept in scores[1][0], scores[1][1], and scores[1][2]. The scores for the third and fourth students are stored in a similar fashion. Suppose that the first student made a 72 on the first exam, an 85 on the second exam, and a 91 on the third exam. Then, the following statements store the scores for the first student in the appropriate positions in the array: + +scores[0][0] = 72; + +scores[0][1] = 85; + +scores[0][2] = 91; + +If the second student made 95, 89, and 90 on the three exams, the statements below will initialize the scores for the second student: + +scores[1][0] = 95; + +scores[1][1] = 89; + +scores[1][2] = 90; + +Scores for the third and fourth students can be entered in a similar manner. The diagram in Fig. 7.9 shows the two-dimensional array after the initialization. + +Fig. 7.9 + +After initialization of 2D array + +Alternatively, the following statement will declare, create, and initialize a two-dimensional array: + +int[][] scores = {{72, 85, 91}, + +{95, 89, 90}, + +{77, 65, 73}, + +{97, 92, 93}}; + +The size of the array is determined by the number of values provided in the sets of braces without explicitly specifying it inside the brackets. The diagram after the above statement is equivalent to the one in Fig. 7.9. + +### 7.8.2 Input and Output + +Although the techniques of assigning data used in the previous section are adequate for testing programs, how can the data be entered by the user?
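Before answering that, the declaration and initialization forms above can be verified with a short sketch (the class name ScoresDemo and the use of Arrays.deepToString are illustrative, not from the text): + +import java.util.Arrays; + +class ScoresDemo { + +public static void main(String[] args) { + +int[][] scores = {{72, 85, 91}, + +{95, 89, 90}, + +{77, 65, 73}, + +{97, 92, 93}}; + +// scores.length is the number of rows and scores[0].length the number of columns + +System.out.println(scores.length + " rows, " + scores[0].length + " columns"); + +// prints [[72, 85, 91], [95, 89, 90], [77, 65, 73], [97, 92, 93]] + +System.out.println(Arrays.deepToString(scores)); + +} + +} + +As for having the user enter the data from the keyboard: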
It is similar to a one-dimensional array, but instead of using a simple for loop, a nested for loop is used as shown below: + +int[][] scores; + +scores = new int[4][3]; + +for(int i=0; i<4; i++) { + +for(int j=0; j<3; j++) { + +System.out.print("Student " + (i+1) + ", exam " + (j+1) + ": "); + +scores[i][j] = scanner.nextInt(); + +} + +System.out.println(); + +} + +Notice that each position in the array can be accessed using two index variables, i and j, for the row number and the column number, respectively, inside the loop. A portion of the output with sample input is as follows: + +Student 1, exam 1: 72 + +Student 1, exam 2: 85 + +Student 1, exam 3: 91 + +Student 2, exam 1: 95 + +Student 2, exam 2: 89 + +Student 2, exam 3: 90 + +... + +Alternatively, the number of rows and columns could be entered by the user, and a two-dimensional array could then be created dynamically as discussed in Sect. 7.4. Once the scores are in the array, one can output them using a nested for loop. Suppose the three exam scores for each student are to be output in a row. The code segment below outputs the column labels first followed by the row labels and scores: + +System.out.println("exam 1 exam 2 exam 3"); + +for(int i=0; i<4; i++) { + +System.out.print("Student " + (i+1)); + +for(int j=0; j<3; j++) + +System.out.print(" " + scores[i][j]); + +System.out.println(); + +} + +Notice that the print statement for the column headings is outside the nested for loop, since they are only output once. The print statement for the row label is located prior to the inner for loop, which means it is output every time the control variable i of the outer for loop changes. Also notice that the three scores for each student are output on the same line using the print in the inner for loop. The println after the inner for loop moves the cursor to the next line for the next student. The output from the above code segment is as follows: + +exam 1 exam 2 exam 3 + +Student 1 72 85 91 + +Student 2 95 89 90 + +Student 3 77 65 73 + +Student 4 97 92 93 + +What if all the scores of the three exams need to be output line by line as shown below? + +Student 1 Student 2 Student 3 Student 4 + +exam 1 72 95 77 97 + +exam 2 85 89 65 92 + +exam 3 91 90 73 93 + +Again, a nested for loop can be used. In order to access all the scores in one column of the array before going to the next column, the column number has to remain the same in the outer for loop, while the row number changes in the inner for loop. This is left as an exercise at the end of the chapter. + +### 7.8.3 Processing Data + +Using the array scores, how can the average of the three exam scores for the first student be calculated? All the scores for the first student are stored in the first row of the two-dimensional array. In order to find the average, the values in the first row have to be added together and divided by the number of exams. The following formula will find the average for the first student: + +(scores[0][0] + scores[0][1] + scores[0][2])/3; + +The average exam scores of the other students can be found in a similar way. However, if the instructor would like to find the averages for a large class, it would not be efficient to list the formula for each student. + +To process arrays, the length field is useful as discussed earlier in this chapter. When an array is created, a reference to the array is stored in the variable. At the same time, the length of the array is stored in an instance constant named length.
For a one-dimensional array, the length holds the number of elements in the array. Since a two-dimensional array is an array of one-dimensional arrays, there are several length fields associated with it. They keep track of the number of rows and the number of columns for each row. With the array shown in Fig. 7.9, the length of the array scores can be obtained by scores.length, which is the size of the one-dimensional array that the variable scores is referring to. In this case, the value would be 4 indicating the number of rows. As shown in Fig. 7.9, the elements of the array, scores[0], scores[1], scores[2], and scores[3], are references to one-dimensional arrays. Therefore, their length can be obtained by scores[0].length, scores[1].length, scores[2].length, and scores[3].length. Since it is a four by three array, all of them have a value of 3 indicating that the number of columns of the array scores is 3. + +Returning to finding the average of all the exam scores for the first student, a for loop can be used as shown below: + +double total, average; + +total = 0.0; + +for(int j=0; j<3; j++) + +total = total + scores[0][j]; + +average = total/3; + +The variable total contains the total of the three exam scores and the variable average holds the average. The variable total is initialized to 0.0 at the beginning, and inside the for loop, the three test scores, scores[0][0], scores[0][1], and scores[0][2], are added together. The row number is fixed at 0 and the value of the index variable j changes from 0 to 2 accessing the scores of the first student. Since there are three exams, the total was divided by 3. Although the elements of the array scores are of type int, the value for average most likely requires more precision. Therefore, both total and average were declared as type double in order to avoid integer division. Using the length field, the above code can be rewritten as + +double total, average; + +total = 0.0; + +for(int j=0; j<scores[0].length; j++) + +total = total + scores[0][j]; + +average = total/scores[0].length; + +# 8. Recursion + +One problem that can be solved recursively is raising a number x to a nonnegative integer power n, which can be defined as follows: + +x n = {if n > 0, then x n−1 * x, otherwise 1} + +This forms the basis of the method which could be written as follows: + +public static int power(int x, int n) { + +int answer; + +if (n > 0) + +answer = power(x,n-1)*x; + +else + +answer = 1; + +return answer; + +} + +Notice the method is declared as static, so that a class does not need to be defined nor does an object need to be created as discussed in Chap. 5. Further, note that a local variable answer has been declared. As will be discussed later, this will waste memory in recursion, but for now using a memory location will be very helpful in tracing through the program using contour diagrams. After the code is understood using contours, the method can be rewritten to save memory as will be shown later. More importantly, notice that the power method is calling itself. Is that legal? Yes it is, but as discussed above, there needs to be a way to stop the recursion, and that is the purpose of the else section and the terminal case of answer=1. Of course, a main program will need to be written to drive the method as shown in Fig. 8.1 with line numbers to help facilitate seeing the code execute via contours. + +Fig. 8.1 + +main program and power method + +Before calling the power method, notice that the main program checks whether x is greater than or equal to 0, that n is greater than or equal to 0, and that x and n are not both 0. It is often best to first test the base case to ensure that it is working properly. So to start, assume that the user has entered a value of 2 for x and 0 for n.
Since n is not greater than 0, there should be no recursion, and answer is assigned a value of 1, which is returned to the main program and output. Because this is a simple instance, a contour will not be drawn for this case. + +However, what if x is equal to 3 and n is equal to 2? This is when things start to get interesting and contours are very helpful. Figure 8.2 shows the state of execution just prior to Line 22 in power. + +Fig. 8.2 + +Contour prior to the execution of Line 22 in the first call to power + +As discussed in Chap. 1, although typically the contour for Ch8Sample1 would not be drawn, it is helpful to see it in this case. Since the power method is static, notice that an object is not created nor is there a reference to an object. Instead, the contour for power is drawn in the class Ch8Sample1, just like the main method, which is also declared as static. As can be seen in the contour for power, there is a new cell called ret. This is not the value returned from a method, but rather indicates where the method will return upon completion. Whereas previously it was fairly clear where a method was returning, with recursion and its multiple calls, it might not be so obvious. The ret cell also lists a type of addr, which is an abbreviation for address. Although there is not a type associated with this cell as there is with other variables and parameters, the address is the place where the flow of control will be transferred when the method is finished. Lastly, note that the line number is abbreviated as L14 and the name of the method main is included in the cell. Although in this case it should be apparent that Line 14 is in main, indicating the name of the method will be important as will be seen shortly. + +Since n is greater than 0, once Line 23 has begun to execute, the first thing that needs to be done is to recursively call the power method. Figure 8.3 shows the state of execution just prior to Line 22 in the second call to the power method. + +Fig. 8.3 + +Contour prior to the execution of Line 22 in the second call to power + +As can be seen, there are now two contours depicting the power method. Similar to when there was more than one object of the same type in Chap. 5, notice that superscripts have again been employed to distinguish between the two contours. Also note that when calling power a second time, the value of n has been decremented by 1. Lastly, notice that the ret field points back to Line 23 in the first call to power. Of course, when Line 22 in the second call to power is executed, n is still greater than 0, and there is another call to power as shown in Fig. 8.4 illustrating the state of execution prior to Line 22. + +Fig. 8.4 + +Contour prior to the execution of Line 22 in the third call to power + +The third contour has now been added, where the return is to Line 23 in the second call to power and n is equal to 0. This time when Line 22 is executed, n is no longer greater than 0, but rather equal to 0, so instead of making the recursive call in the then section of the if statement, the else section is executed. This is the terminal case and no more recursive calls will occur. Instead 1 is assigned to answer, and Fig. 8.5 shows the state of execution prior to Line 26 in the third call to power. + +Fig. 8.5 + +Contour prior to the execution of Line 26 in the third call to power + +After the execution of Line 26, the value in answer is returned to Line 23 in the second call to power. Then the value 1 is multiplied by the value 3 in x.
The result is then placed into the variable answer, and Fig. 8.6 shows the state of execution prior to Line 26 in the second call to power. + +Fig. 8.6 + +Contour prior to the execution of Line 26 in the second call to power + +Of course, the first thing one notices is that the contour for the third call to power is now shaded light gray to indicate that it is deallocated. Also, the value 3 is in answer ready to be returned to the first call to power. As before, contours can simply be erased as done in Fig. 8.7 which shows the state of execution prior to Line 26 in the first call to power. + +Fig. 8.7 + +Contour prior to the execution of Line 26 in the first call to power + +Notice that the value 3 returned from the second call to power has been multiplied by the value 3 in x and the result 9 is placed in answer. The flow of control continues to Line 26, and the value 9 is returned to the calling program. The 9 is then placed into answer as illustrated in Fig. 8.8 which shows the state of execution just prior to Line 15 in main. + +Fig. 8.8 + +Contour prior to the execution of Line 15 in the first call to main + +Looking back at the base case in Fig. 8.5, notice that there were a lot of memory locations used to find answer in Fig. 8.8. If recursion takes up so much memory, why use it? Again, some problems are more naturally expressed using recursion than iteration. Further, with memory being much less expensive than it was in the past, the use of recursion is much less costly. Still, some larger problems can use quite a bit of memory, and there are some techniques to cut down on its usage. For example, the previous method used a variable answer each time a contour was created. Instead of assigning the result of the calculation to a variable, it can simply be returned to the calling method as shown in the following segment: + +public static int power(int x, int n) { + +if (n > 0) + +return power(x,n-1)*x; + +else + +return 1; + +} + +Of course, the method uses two return statements, which is considered unstructured programming. Again, if memory is a concern, this might be a justifiable trade-off. It is often helpful to initially write an algorithm with some built-in inefficiencies to ensure that it is working properly and then optimize the code, rather than initially trying to optimize the code and risk creating code that does not work correctly in the first place. + +## 8.3 Stack Frames + +Notice that each time a recursive call occurs, another contour is drawn, and each time a new contour is created, more memory is used. Contours are helpful in understanding the process of recursion. But how is this actually accomplished in the computer? It is done using a stack. A stack is known as a LIFO structure, which stands for Last In First Out. That means that the last item put on the stack is the first one taken off the stack, not unlike a stack of papers on a desk. The process of putting an item on a stack is known as a push operation, and the task of removing an item is known as a pop operation. + +When a method is called the first time, the values are stored in the variables, like when the first contour is drawn. However, in the program there is only one set of variables. What would happen when there is a recursive call to a method? What happens to the values in the variables? Instead of drawing a new contour, the variables in the contour need to be reused.
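As an aside, push and pop can be seen directly using java.util.ArrayDeque, one of the classes Java provides for stack-like behavior (this sketch is illustrative and not from the text): + +import java.util.ArrayDeque; + +class StackDemo { + +public static void main(String[] args) { + +ArrayDeque<Integer> stack = new ArrayDeque<>(); + +// push the values 1, 2, and 3 onto the stack + +stack.push(1); + +stack.push(2); + +stack.push(3); + +// pop returns the values in reverse order: 3, 2, 1 + +while(!stack.isEmpty()) + +System.out.println(stack.pop()); + +} + +} + +The last value pushed is the first value popped, which is exactly the behavior needed to save and restore the values in a method's variables across recursive calls.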
The result is that all the variables in the method, along with some other possible information associated with the method, form what is known as a stack frame, and it is pushed onto the stack. Once the values from the variables are stored on the stack, new values can now be stored in the variables. Each time there is another recursive call, the process is repeated. When there is a terminal case, the process reverses itself. As a simple example, assume there is only one recursive call. The values are pushed onto the stack and the variables reused. Then after the terminal case, the values can be popped off the stack and placed back into the variables, and the processing can complete. + +Using the same example from the previous section calculating 3 2 and using only a partial contour diagram, Fig. 8.9 is the state of execution just prior to Line 26 in the program in Fig. 8.1 in the third call to power. + +Fig. 8.9 + +Contour and stack prior to the execution of Line 26 in the third call to power + +Figure 8.9 corresponds to Fig. 8.5 in Sect. 8.1. Note first that there is only one contour for power. Even though it represents power 3 , it is just labeled power since the contour is used for all calls to power. As each call is made, the contents of the power contour are pushed onto the stack. When power 1 called power 2 , the variables in power 1 were pushed onto the stack so that power 2 could use the variables in the contour. Then when power 3 was called, the contents for power 2 were pushed onto the stack so that power 3 could use the contour. Once power 3 is ready to return to power 2 , the stack frame for power 2 is popped off the stack and put back into the contour, and so on. Simply stated, each new contour created after the first one means another stack frame needs to be pushed onto the stack, and each time a contour is deallocated, that means that a stack frame is popped off the stack. + +Note that the names of the cells and their types are not pushed onto the stack, only the contents. However, also notice that the order in which they are pushed is the same as they occur in the contour, so one can determine which cell is which. Although one could draw the stack with the other information, it gets a little cumbersome, and this is one of the reasons why contours are sometimes a little more convenient. + +But wasn't it said that each recursive call wastes memory? The answer is yes, because the stack is implemented in the computer's memory and each time a stack frame is pushed onto the stack, more memory is used. If infinite recursion occurs, oftentimes a message will be output saying something to the effect that there is a stack overflow, meaning that the stack is full and no memory is available to push more items onto the stack. + +Notice that contours and stack frames are just two ways of looking at the same process. Although the stack frame model is more accurate, it is a little more cumbersome to draw, whereas the contour model is easier to draw and makes it easier to keep track of previous values. The importance of keeping track of previous values will become even more apparent in the next section with a more involved use of recursion. + +## 8.4 Fibonacci Numbers + +Another example of the use of recursion is the calculation of Fibonacci numbers that one may have encountered in a mathematics course.
The Fibonacci numbers can be defined as follows: + + * Fibonacci(0) = 0 + + * Fibonacci(1) = 1 + + * Fibonacci(2) = 0 + 1 = 1 + + * Fibonacci(3) = 1 + 1 = 2 + + * Fibonacci(4) = 1 + 2 = 3 + + * Fibonacci(5) = 2 + 3 = 5 + + * Fibonacci(6) = 3 + 5 = 8 + +Although this is an iterative definition, it can help in finding a recursive definition. First, notice the base or terminal cases for 0 and 1. Then notice that any other given line is the addition of the two previous lines. For example, Fibonacci(6) is the sum of the numbers 3 and 5, which are the answers for the fourth and fifth Fibonacci numbers. In other words, couldn't Fibonacci(6) be defined in terms of adding Fibonacci(5) and Fibonacci(4)? The answer is yes, but what would the nth Fibonacci number look like? It would be as follows: + + * Fibonacci(n) = Fibonacci(n − 1) + Fibonacci(n − 2) + +Putting the base case and the nth case together, the definition of the Fibonacci numbers for nonnegative integers would be as follows: + + * Fibonacci(n) = { if n = 0 or n = 1, then n, + + * otherwise Fibonacci(n − 1) + Fibonacci(n − 2)} + +Given this definition, the code can then be written. As in the previous sections, it helps to use local variables to make the reading of contour diagrams easier. + +public static int fib(int n) { + +int answer1,answer2,answer; + +if (n > 1) { + +answer1 = fib(n-1); + +answer2 = fib(n-2); + +answer = answer1 + answer2; + +} + +else + +answer = n; + +return answer; + +} + +Again notice that the method is static and the name of the method is fib to save space in subsequent contour diagrams. Putting the above method together with a main program and adding line numbers results in the program in Fig. 8.10. + +Fig. 8.10 + +Fibonacci program + +The main program checks for a negative number before calling the fib method. In the case where the input of n is either a 0 or a 1, the result is just a simple call to the terminal case, and a corresponding value of 0 or 1 is returned to the main program and output. However, more interesting is a nonterminal case, such as when n is equal to 3. Figure 8.11 shows the state of execution just prior to Line 21 in the first call to fib. + +Fig. 8.11 + +Contour prior to the execution of Line 21 in the first call to fib + +As before, notice L12 main in the ret cell and the superscript for fib indicating the first call. Since 3 is greater than 1, the then portion of the if is taken. Then a recursive call is made as shown in Fig. 8.12 just prior to the execution of Line 21 in the second call to fib. + +Fig. 8.12 + +Contour prior to the execution of Line 21 in the second call to fib + +In the second call to fib, the parameter n has been decremented by 1. Since 2 is greater than 1, another call is made, and Fig. 8.13 shows the state of execution prior to Line 21 in the third call to fib. + +Fig. 8.13 + +Contour prior to the execution of Line 21 in the third call to fib + +At Line 21, since n is no longer greater than 1 and the condition for the if statement is false, the else portion is executed and answer is set to 1. This value is then returned to Line 22 in the second call to fib, and the value 1 is stored in the variable answer1 as shown in Fig. 8.14 just prior to the execution of Line 23. + +Fig. 8.14 + +Contour prior to the execution of Line 23 in the second call to fib + +Notice that the variable answer in the third call to fib is 1 and that the contour is shaded gray.
Further, note that there are no values in answer1 and answer2 in the third call to fib, because it was a terminal case and no recursive calls were made. Again, notice the value 1 has been returned to the second call to fib and stored in answer1. However, instead of the flow of control returning back to the first call to fib as it did in the power example, there is another call to fib to calculate answer2. So Fig. 8.15 shows the state of execution prior to Line 21 in the fourth call to fib. + +Fig. 8.15 + +Contour prior to the execution of Line 21 in the fourth call to fib + +At first glance, it might appear that the contour for the third call to fib is no longer shaded gray. However, look carefully and notice that it is not the third call but rather it is labeled the fourth call to the method fib, the value for n is 0, and ret references Line 23 in the second call to fib. This is the calculation for the second part of the second Fibonacci number. As before, n is not greater than 1, so the else section of the if statement is executed and answer is assigned a value of 0 that is returned to the second call. Figure 8.16 illustrates the state of execution prior to Line 24 in the second call to fib. + +Fig. 8.16 + +Contour prior to the execution of Line 24 in the second call to fib + +As before, the contour for the fourth call to fib has been shaded to indicate deallocation, and the value 0 is returned to answer2 in the second call to fib. When Line 24 is executed, the values in answer1 and answer2 are added together and stored in answer. Then answer in the second call to fib is returned to answer1 in the first call to fib as shown in Fig. 8.17 illustrating the state of execution just prior to Line 23. + +Fig. 8.17 + +Contour prior to the execution of Line 23 in the first call to fib + +Note now that the fourth call to fib has been erased so as not to cause confusion with the second call to fib which is now shaded to indicate it has been deallocated. Also, answer in the second call to fib now contains the sum of answer1 and answer2. Further, the value 1 in answer in the second call to fib has been returned to answer1 in the first call to fib. Even though there have been a number of calls, the second half of the calculation still needs to be determined. Figure 8.18 shows the state of execution prior to Line 21 in the fifth call to fib. + +Fig. 8.18 + +Contour prior to the execution of Line 21 in the fifth call to fib + +As before, notice this is not the second call to fib, but rather it is the fifth call to fib to calculate answer2 in the first call to fib. Since n is not greater than 1, the else portion of the if statement in the fifth call to fib is executed, and a 1 is placed in answer and returned back to the first call to fib. Figure 8.19 shows the state of execution prior to Line 24 in the first call to fib. + +Fig. 8.19 + +Contour prior to the execution of Line 24 in the first call to fib + +The fifth call to fib is now shaded indicating deallocation, and the value in answer is returned to answer2 in the first call to fib. The values in answer1 and answer2 in the first call to fib are then added together and stored in answer, which is returned and assigned to answer in main. Figure 8.20 shows the state of execution prior to answer being output in Line 13 in main. + +Fig. 8.20 + +Contour prior to the execution of Line 13 in main + +As can be seen, the first call to fib is shaded to indicate deallocation, and answer in main contains the value 2 that was returned. 
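The total number of calls can also be confirmed by the program itself; the following is a minimal sketch (the static counter calls is an illustrative addition, not part of the program in Fig. 8.10): + +class FibCount { + +static int calls = 0; // counts every invocation of fib + +public static int fib(int n) { + +calls++; + +if (n > 1) + +return fib(n-1) + fib(n-2); + +else + +return n; + +} + +public static void main(String[] args) { + +System.out.println("fib(3) = " + fib(3)); + +System.out.println("calls = " + calls); // prints 5 for n = 3 + +} + +}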
Granted, this seems like a lot of work to calculate a Fibonacci number, but it shows the amount of memory that would be involved. Although there were a total of five calls to fib, only three contours were activated at any given time. As with the power method previously, the number of memory cells can be decreased by eliminating the temporary variables answer1, answer2, and answer as shown in the following code segment: + +public static int fib(int n) { + +if (n > 1) + +return fib(n-1) + fib(n-2); + +else + +return n; + +} + +As before, this introduces the unstructured practice of two return statements, but if memory is an issue, then this is a possible alternative. An even more efficient solution is to use iteration, which was an exercise in Chap. 4. + +As with the power function, a stack could also be used to represent recursion, but with more complex algorithms, it can be a little confusing. Yet another way to represent recursion is to use a tree of calls. The tree is drawn from the top down with the first call at the top, which is called the root. Then each call after that represents a branch, and terminal calls are referred to as leaves. The tree of calls for the Fibonacci number problem is shown in Fig. 8.21. + +Fig. 8.21 + +Tree of calls for fib(3) + +Notice that main makes a call to fib 1 (3), which then calls fib 2 (2), which then calls fib 3 (1). Once it is calculated, fib 3 returns the value 1 back to fib 2 , which calls fib 4 to calculate fib(0). Then the sum of those two can be returned to fib 1 , which calls fib 5 to calculate fib(1). When that is completed, a 1 is returned to fib 1 , which then adds the two numbers and returns a 2 to main. + +Which is a better method to walk through recursion: stack frames, a tree of calls, or contours? It depends on the situation. As stated previously, stack frames are the most realistic, but it is harder to keep track of each call with them. A tree of calls is short and convenient but lacks much of the detail. Given the drawbacks of these two extremes, contours are used in this text. As one gets more proficient with recursion, one might gravitate to using a tree of calls for a simple problem, but still use contours when a problem gets more complicated or stack frames when an accurate picture is needed. + +## 8.5 Complete Program: Implementing Recursion + +A program which computes the greatest common divisor of two integers using recursion will be developed in this section. The program will + + * Ask the user to enter two integers + + * Compute the greatest common divisor + + * Display the result + +Of all the integers that divide the two numbers given, the largest is known as the greatest common divisor. For example, the positive divisors of 36 are 1, 2, 3, 4, 6, 9, 12, 18, and 36, and the positive divisors of 8 are 1, 2, 4, and 8. Thus, the common divisors of 36 and 8 are 1, 2, and 4. It follows that the greatest common divisor of 36 and 8 is 4. The Euclidean algorithm, which computes the greatest common divisor of two integers, starts with a pair of positive integers. It forms a new pair that consists of the smaller number of the two and the remainder, which is obtained by dividing the larger number by the smaller number. This process repeats until one number is zero, and then the other number is the greatest common divisor of the original pair. The following illustrates how the greatest common divisor of 36 and 8 is found. First, 36 divided by 8 is 4 with a remainder of 4 (4 = 36 − 4 × 8).
Then, 8 divided by 4 is 2 with a remainder of 0 (0 = 8 − 2 × 4). Since the last remainder is zero, the algorithm ends with 4 as the greatest common divisor of 36 and 8. + +A recursive method to find the greatest common divisor of two positive integers can be defined by the following: + + * gcd(num1, num2) = {if num2 ≥ 1, then gcd(num2, num1%num2), otherwise num1} + +Recall from Sect.​ 1.​7 that % is the mod operator and if num1 and num2 are integers, num1%num2 returns the remainder. For example, 36%8 is 4. The implementation of the method gcd is shown below: + +public static int gcd(int num1, int num2) { + +if(num2 >= 1) + +return gcd(num2, num1%num2); + +else + +return num1; + +} + +The above method can be invoked for the pair 36 and 8 by + +int result; + +result = gcd(36, 8); + +After the execution of the method, the variable result will contain 4. In order to compute the greatest common divisor of 36 and 8, how many method calls were made? The first method call was gcd(36, 8), the next call was gcd(8, 4), and then gcd(4, 0) which was the last method call, resulting in a total of three method calls. The complete program with a main method is shown below: + +import java.util.Scanner; + +public class Gcd { // the class name Gcd is assumed; the original listing was not preserved + +public static int gcd(int num1, int num2) { + +if(num2 >= 1) + +return gcd(num2, num1%num2); + +else + +return num1; + +} + +public static void main(String[] args) { + +Scanner scanner = new Scanner(System.in); + +int num1, num2; + +System.out.print("Enter first number: "); + +num1 = scanner.nextInt(); + +System.out.print("Enter second number: "); + +num2 = scanner.nextInt(); + +System.out.println("The greatest common divisor of " + num1 + " and " + num2 + " is " + gcd(num1, num2) + "."); + +} + +} + +When the above code is compiled and executed using the sample input of 36 and 8, the output of the program is as follows: + +Enter first number: 36 + +Enter second number: 8 + +The greatest common divisor of 36 and 8 is 4. + +## 8.6 Summary + + * It helps to hunt for patterns when trying to create a recursive definition. + + * Be sure to identify the base or terminal case. + + * Without a base case, "infinite" recursion will occur. + + * When using contours, it is helpful to use local variables to store information. + + * To optimize recursion, eliminate local variables. + + * Drawing a stack frame and creating a tree of calls are alternatives to contour diagrams. + +## 8.7 Exercises (Items Marked with an * Have Solutions in Appendix E) + +1. + +Draw a series of contour diagrams to show the state of execution of the program in Fig. 8.1 for x = 2 and n = 3. + +2. + +Draw a series of contour diagrams to show the state of execution of the program in Fig. 8.10 for n = 2. + +3. + +Given the complete program in Sect. 8.5, what would happen if the numbers 36 and 8 were input in reverse order? How many contours for gcd would need to be drawn? + +4. + +Consider the program in Fig. 8.10 where Lines 22 and 23 are swapped. Draw a series of contour diagrams to show the state of execution for n = 3. + +5. + +Trace the program in Fig. 8.1 for x = 2 and n = 5 and draw a tree similar to the one in Fig. 8.21. + +6. + +Trace the program in Fig. 8.10 for n = 5 and draw a tree similar to the one in Fig. 8.21. + +*7. + +Write a recursive method to reverse a given string. The method accepts a string as a parameter and returns the reverse of the string. For example, if the argument is Java, then the method returns avaJ. + +8. + +Write a recursive method to multiply two positive integers using repeated addition. + +*9. + +Write a recursive method to compute the factorial of a nonnegative integer using the definition shown below: + +n! = {if n > 0, then n * (n − 1)!, otherwise 1} + +10. + +Write a recursive method to compute the binomial coefficient using the definition shown below: + +C(n, k) = {if k = 0 or k = n, then 1, otherwise C(n − 1, k − 1) + C(n − 1, k)} + +11. + +Find a reference on how to convert a decimal number to a binary number [4] and then write a recursive method to perform the conversion. + +Reference + +4. + +Streib JT (2011) Guide to assembly language: a concise introduction. Springer, London
# 9. Objects: Inheritance and Polymorphism + +James T. Streib and Takako Soma + +Department of Computer Science, Illinois College, Jacksonville, IL, USA + +Abstract + +This chapter returns to objects and explores the concept of inheritance. Contours are used to explain how a subclass is extended and inherits data members and methods from a superclass. Further, protected variables and methods along with abstract classes are discussed. Another object-oriented programming concept, polymorphism, which is a useful tool for developing software, is introduced. A complete program implementing inheritance and polymorphism is included. + +Objects were introduced in Chap.​ 2, and topics such as passing objects, method overloading, and class methods were discussed in Chap.​ 5. In this chapter the concepts of inheritance, overriding methods, abstract classes, and polymorphism will be illustrated. At first these concepts might sound a little bit intimidating, but introducing them with simple programs and contour diagrams makes the concepts easier to understand. + +## 9.1 Inheritance + +An important concept in object-oriented programming is software reuse. Writing a program when the same code needs to be written and rewritten with minor variations can be time-consuming and can also waste memory. Further, if the code has already been written for one situation, rewriting it not only wastes time and memory, but the chance of making a logic error in subsequent versions also increases. Instead, it makes sense to reuse software that has already been written and tested. A further advantage of software reuse is in the maintenance of code. When a segment needs to be changed, it only needs to be changed in one place, and again the chance of introducing logic errors decreases. An important way of maximizing software reuse is through inheritance. + +When a new class is created using inheritance, the new class can inherit data members and methods from an already existing class. The existing class is known as the parent class and the new class is called the child class. Also, the parent class is sometimes called the base class and the child class is called the derived class. An even more common name for the base class is the superclass, and the derived class is then called the subclass. + +As an example, a regular polygon has sides of equal length. Further, a three-sided regular polygon is an equilateral triangle, a four-sided regular polygon is a square, a six-sided regular polygon is a hexagon, and an eight-sided regular polygon is an octagon. Although there exists a generic formula for the area of an n-sided regular polygon, this text will use the specific algebraic formulas for each of the regular polygons to help illustrate the concepts of inheritance, overriding methods, abstract classes, and polymorphism. + +The specific equations for the area of each of these polygons share a common part: the length of one of its sides squared, or s 2. One might recognize that this is also the equation for the area of a square, and because a square is such a simple example, it is not included in subsequent examples. Since this equation is shared by all the other equations, it can be made local to the class for a regular polygon.
As a result, a regular polygon can be thought of as the superclass, and the triangle, hexagon, and octagon can be thought of as subclasses. + +Using a simple example, consider the RegPolygon class as shown in Fig. 9.1. Given the previous chapters on classes, the RegPolygon class should look fairly familiar. Notice the local private variable lenSide which is for the length of a side. The constructor initializes the variable with the value sent via the parameter. Further, there is one method that squares the length of the side using the pow method from the Math class. Lastly, as before, there is a local variable in the method that helps when using contour diagrams, but if memory were an issue, it could be eliminated and the expression could be used in the return statement. + +Fig. 9.1 + +RegPolygon class + +A main program segment that tests this class is shown in Fig. 9.2. Again, the statements in this program should be fairly familiar. A value is input from the user and a new instance of the RegPolygon class is created using the value that was input. Then the method is invoked and the value returned is output. + +Fig. 9.2 + +Main program segment using the RegPolygon class + +However, what if one wanted to write a new class for a triangle with a method to calculate the area of a triangle? One could just write the necessary expression and be done with it. + +However, as mentioned previously, isn't a triangle a regular polygon? The equation for the area of an equilateral triangle is (√3/4)*s 2, which includes s 2. If the RegPolygon class already exists, then couldn't methods of that class be used? The answer, as one might suspect, is yes. The RegPolygon class would then be the superclass and the Triangle class would be a subclass, and the Triangle class could inherit methods from the RegPolygon class. Another way of saying this is that the Triangle class is an extension of the RegPolygon class. + +How is this accomplished in a program? The first line in the Triangle class would indicate that it extends the RegPolygon class as follows: + +class Triangle extends RegPolygon { + +By doing so, the Triangle class now has access to the data member, method, and constructor in the RegPolygon class. So instead of having to rewrite code segments, it can now reuse these code segments. How is this accomplished? + +First, it helps to look at the constructor for the Triangle class. Since the RegPolygon class already contains the variable lenSide and a Triangle is an extension of a RegPolygon, instead of declaring a local private variable, the variable in the RegPolygon class could be reused. And instead of initializing it in the Triangle class, the constructor in the RegPolygon class can also be reused. The constructor in the superclass RegPolygon is invoked by using super(lenSide) as shown in the following constructor: + +public Triangle(int lenSide) { + +super(lenSide); + +} + +Note that in order to invoke the constructor of the superclass, super(lenSide) must be the first line in the constructor as shown above. To calculate the area of a triangle, one would need to multiply the result returned from the method calcRegPolyArea in the RegPolygon class by √3/4 as shown below: + +public double calcArea() { + +double area; + +area = Math.sqrt(3.0) / 4.0 * calcRegPolyArea(); + +return area; + +} + +Unlike the constructor, the invoking of other methods can occur anywhere in a method. As before, there is a local variable area declared in the method which will help later when creating contour diagrams.
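Assembled from the pieces above, the following is a minimal runnable sketch of the two classes with a small driver (the driver class name InheritanceDemo and the fixed side length are hypothetical, not from the text): + +class RegPolygon { + +private int lenSide; + +public RegPolygon(int lenSide) { + +this.lenSide = lenSide; + +} + +public double calcRegPolyArea() { + +return Math.pow(lenSide, 2); // s squared + +} + +} + +class Triangle extends RegPolygon { + +public Triangle(int lenSide) { + +super(lenSide); // reuse the superclass constructor + +} + +public double calcArea() { + +return Math.sqrt(3.0) / 4.0 * calcRegPolyArea(); // (√3/4) * s squared + +} + +} + +class InheritanceDemo { + +public static void main(String[] args) { + +Triangle tri = new Triangle(2); + +System.out.println(tri.calcArea()); // prints about 1.73 + +} + +}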
Would the word super need to be used as it was in the constructor? The answer in this case is no, but it is optional as in super.calcRegPolyArea(). Are there cases where super is needed? Yes, it is required in the constructor and in some other special cases as will be shown shortly. However, as a general rule, if it is not needed, do not include it. Before proceeding, it is helpful to see the complete Triangle class as shown in Fig. 9.3. + +Fig. 9.3 + +Triangle class + +As always, it helps to see the main program segment that invokes the method in the Triangle class as shown in Fig. 9.4. The main program inputs lenSide for the triangle. It then creates a new instance of the Triangle class by invoking the constructor, which as seen in Fig. 9.3 invokes the constructor of the RegPolygon class. It then invokes the calcArea method of the Triangle class which subsequently invokes the calcRegPolyArea method of the RegPolygon class. Lastly, the area is output. But how does this look using contour diagrams? To do so requires putting Figs. 9.1, 9.3, and 9.4 together in a complete program with line numbers as shown in Fig. 9.5. + +Fig. 9.4 + +Main program segment using the Triangle class + +Fig. 9.5 + +Complete main program with the RegPolygon and Triangle classes + +As in previous chapters, not every step will be shown using contour diagrams, but steps will be shown only at critical points to illustrate how the code executes. Assuming that the user inputs 2 for lenSide, a good first stopping point in the execution of the program is just prior to Line 20 (abbreviated L 20 in Fig. 9.5) in the Triangle class as shown in Fig. 9.6. + +Fig. 9.6 + +Contour just prior to the execution of Line 20 + +Although the contour for a constructor is often not shown, it is shown here to help with understanding the flow of control of the program. First note that the parameter lenSide contains the value 2 passed from the main program, but it has not yet been assigned to the variable lenSide in the RegPolygon object. Further notice that the contour for Triangle is nested inside the contour for the RegPolygon class. As might be suspected, the reason for this is that RegPolygon is the superclass and Triangle is the subclass. As in the past, since Triangle is nested inside RegPolygon, it now has access to the non-private variables and methods in RegPolygon. In other words, it can inherit the non-private members of RegPolygon. As the execution of super(lenSide) occurs, the flow of control is transferred to the constructor in RegPolygon, and Fig. 9.7 shows the state of execution just prior to Line 32. + +Fig. 9.7 + +Contour just prior to the execution of the end of the constructor at Line 32 + +The value in the argument lenSide in the Triangle constructor is transferred to the parameter lenSide in the RegPolygon constructor, and from there it is assigned to the data member lenSide in RegPolygon. Notice in Fig. 9.7 that both the parameter lenSide in the RegPolygon constructor and the variable lenSide in RegPolygon now contain the value 2 from lenSide in Triangle. After the constructor in RegPolygon is done, it returns to the constructor for Triangle and control is returned to the main program. Figure 9.8 shows the state of execution just prior to Line 12. + +Fig. 9.8 + +Contour just prior to the execution of Line 12 in main + +Notice that the two contours for the constructors are gone and the variable lenSide in RegPolygon now contains a 2.
The method calcArea is then invoked, and the state of execution just prior to Line 24 is shown in Fig. 9.9. + +Fig. 9.9 + +Contour prior to the execution of Line 24 in calcArea + +Since Triangle is a subclass of RegPolygon, the contour for the method calcArea is created in Triangle as the constructor was previously. Then as Line 24 is executed, the method calcRegPolyArea is invoked, and the value for the variable a is calculated as shown just prior to Line 36 in Fig. 9.10. + +Fig. 9.10 + +Contour just prior to the execution of Line 36 in calcRegPolyArea + +Upon return from the method calcRegPolyArea, the state of execution just prior to Line 25 is shown in Fig. 9.11. Lastly, control is returned to the main program as shown just prior to output of the area on Line 14 in Fig. 9.12. + +Fig. 9.11 + +Contour prior to the execution of Line 25 in calcArea + +Fig. 9.12 + +Contour prior to the execution of Line 14 in the main program + +However, what if the name of the calcArea method in the Triangle class was changed to calcRegPolyArea? Would this cause a problem with the method calcRegPolyArea in the RegPolygon class? The answer is yes, because calcRegPolyArea in the Triangle class would have the same number and type of parameters as the calcRegPolyArea method in the RegPolygon class. A method in a subclass that has the same name, the same number of parameters, and the same type of parameters as another method in the superclass is known as an overriding method. Does this mean that there cannot be two methods of the same name, the same number of parameters, and same type of parameters, one in the superclass and one in the subclass? The answer is no, but if there is an overriding method, how does one access the method in the superclass? If calcRegPolyArea is invoked in the subclass, the method in the subclass would be used, and in this case it would recursively call itself which is not what is intended. As mentioned earlier, there are instances where the word super must be used and this is one of those instances. So, should one want to access the calcRegPolyArea method in the superclass, then the word super is no longer optional and must be used as shown in the segment in Fig. 9.13. + +Fig. 9.13 + +Overriding the calcRegPolyArea() method + +First, note that the name of the method has been changed from calcArea to calcRegPolyArea. Further, by including the word super prior to the call to calcRegPolyArea, the method in the superclass RegPolygon is invoked instead of recursively calling the calcRegPolyArea method in the subclass. Again, in this case the word super is not optional. Using the word super only when it is needed helps alert other programmers reading the code that there are two methods of the same name. For now, instead of changing the method name to calcRegPolyArea, the program in Fig. 9.5 will retain the method name calcArea. + +## 9.2 Protected Variables and Methods + +In the program in Fig. 9.5, what would happen if a method in the Triangle class tried to access the variable in the RegPolygon class? Specifically, what if the constructor in the Triangle class tried to access the variable lenSide in the RegPolygon class? The answer is the same as if trying to access the variable from the main program. If a variable is private, then it can only be accessed by methods in the RegPolygon class; thus the variable lenSide is initialized using the constructor. + +However, if a variable were made public, then the methods of the subclass could access it. 
Unfortunately, the variable would also be accessible from the main program as well. Is there a way that would allow only methods in the subclass to access a variable in the superclass, but still not allow the variable to be accessed from the main program? The answer is yes. Instead of private or public access, protected access can be used as shown in the following: + +protected int lenSide; + +Now instead of initializing the variable via the RegPolygon constructor, the variable can be accessed directly as in the following modified Triangle constructor: + +public Triangle(int lenSide) { + +super.lenSide = lenSide; + +} + +To access the variable lenSide in the RegPolygon class, notice the use of the word super. Also note that this could have been used instead of super, but the use of the word super is preferred because it alerts programmers who might subsequently read the code that the variable is not located in the current class but rather in the superclass. + +Since the RegPolygon constructor would no longer be invoked, it could be deleted. However, if it were retained, but not invoked, a default constructor would need to be added to the RegPolygon class as follows: + +public RegPolygon() { + +} + +Although accessing a variable in this manner works and is better than declaring a variable as public, it can still suffer from some of the same problems as being declared public when there are a large number of subclasses. As a result, given a choice between accessing a protected variable or accessing a private variable via a method, this text will generally choose the latter as shown previously in Fig. 9.5. + +However, notice in Fig. 9.5 that although the variable in the RegPolygon class is private, the methods are public. While this is acceptable when access to the method is needed by both the main program and a subclass, what if access is only needed via the subclass and not from the main program? Is there a way that this can be accomplished? Again, as might be suspected, just as variables can be made accessible only by a subclass, this can also be true for methods. This is accomplished again using protected instead of public as shown in the following headings: + +protected RegPolygon(int lenSide) { + +protected double calcRegPolyArea() { + +This corresponds to the previous suggestion that variables should remain private and only be accessed through methods. Further, these methods can only be accessed from other methods within the class or any subclasses, and not from the main program. + +## 9.3 Abstract Classes + +Given the program in Fig. 9.5, there is nothing preventing the main program from creating an instance of the RegPolygon class. Although not very useful, even if the variable lenSide is private and the methods are protected, an instance could be created. Is there a way to make it so that an instance of the class cannot be created? Yes, and it is known as an abstract class. The result is that subclasses can still be defined, yet an instance of the superclass cannot be created. The following first line of the RegPolygon class shows how this is accomplished: + +abstract class RegPolygon { + +If it is possible to create an abstract class, is it also possible to create an abstract method? The answer again is yes.
## 9.3 Abstract Classes

Given the program in Fig. 9.5, there is nothing preventing the main program from creating an instance of the RegPolygon class. Although it would not be very useful, even if the variable lenSide is private and the methods are protected, an instance could still be created. Is there a way to make it so that an instance of the class cannot be created? Yes, and it is known as an abstract class. The result is that subclasses can still be defined, yet an instance of the superclass cannot be created. The following first line of the RegPolygon class shows how this is accomplished:

abstract class RegPolygon {

If it is possible to create an abstract class, is it also possible to create an abstract method? The answer again is yes. When creating an abstract method, the heading is declared in the superclass, but the body of the method is not defined, as in the following:

public abstract double calcArea();

Again, note that there is no body to the method and the first line of the method ends in a semicolon. If the heading is in the superclass and there is no body to the method, where is the body defined? The complete method is defined in the subclass as it was before and as shown below:

public double calcArea() {
    double area;
    area = Math.sqrt(3.0) * calcRegPolyArea() / 4.0;
    return area;
}

If the above method is the same as before, what is the advantage of doing this? The advantage is that it allows different subclasses to have different methods using the same heading to meet the needs of each subclass. For example, instead of a triangle, consider an octagon.

The name for this new class could be Octagon. Further, since the equation for the area of a regular octagon is 2(1 + √2)s², it could also be a subclass of the RegPolygon class. Since the factor s² is the same, the calcRegPolyArea method of the RegPolygon class could be invoked, but unlike the calculation for the area of the triangle, the result would not need to be multiplied by √3/4 but rather by 2(1 + √2). There is no change to the Triangle class, and the new Octagon class is as follows:

class Octagon extends RegPolygon {
    public Octagon(int lenSide) {
        super(lenSide);
    }
    public double calcArea() {
        double area;
        area = 2.0 * (1.0 + Math.sqrt(2.0)) * calcRegPolyArea();
        return area;
    }
}

Note in the first line that the Octagon class extends the RegPolygon class. Next, notice in the calcArea method that calcRegPolyArea() is not multiplied by √3/4 but rather by 2.0 * (1.0 + Math.sqrt(2.0)), as mentioned above.

Note that an abstract class does not have to have any abstract methods, but if a class has abstract methods, the class needs to be declared as an abstract class. Using an abstract method in the superclass forces both subclasses to define their own calcArea methods, and if the methods were not defined, a syntax error would occur. This is a handy feature to have when there are some differences among various subclasses, yet it is desired to retain some commonality among them.

## 9.4 Polymorphism

Another important feature of object-oriented programming is polymorphism, where the type of an object that is referenced by a superclass variable is determined at runtime instead of at compile time. This concept will be illustrated with the help of the examples below.

In Java, a variable of a superclass type can reference an object of any of its subclasses. In other words, both an object of the superclass and an object of a subclass can be referenced by a variable of the superclass type. Consider the definition of the class RegPolygon shown in Fig. 9.1, which is repeated below for convenience:

class RegPolygon {
    private int lenSide;
    public RegPolygon(int lenSide) {
        this.lenSide = lenSide;
    }
    public double calcRegPolyArea() {
        double a;
        a = Math.pow(lenSide, 2);
        return a;
    }
}
Further, the class Triangle from Fig. 9.3, with the modification described in Fig. 9.13 where the method calcArea is renamed to calcRegPolyArea, is shown below:

class Triangle extends RegPolygon {
    public Triangle(int lenSide) {
        super(lenSide);
    }
    public double calcRegPolyArea() {
        double area;
        area = Math.sqrt(3.0) / 4.0 * super.calcRegPolyArea();
        return area;
    }
}

The class Triangle is a subclass of the class RegPolygon, and the method calcRegPolyArea in the Triangle class overrides the method calcRegPolyArea in the RegPolygon class. Suppose two variables of type RegPolygon are declared in the main method as follows:

RegPolygon shape1, shape2;

Naturally, a reference to an object of the class RegPolygon can be assigned to these variables. For example, the following statement assigns an object of the RegPolygon class to the variable shape1:

shape1 = new RegPolygon(5);

In addition, a reference to an object of the class Triangle can also be assigned to these variables. The following statement assigns an object of the Triangle class to the variable shape2:

shape2 = new Triangle(2);

Next, using the method calcRegPolyArea defined in both the class RegPolygon and the class Triangle, the square of the side and the area of the triangle will be calculated. For the object referenced by shape1, the code segment can be found in Fig. 9.14. With a side of 5, this code segment will output

Fig. 9.14

Code segment finding the square of the side of shape1

area of shape1: 25.00

Now, what would happen when the code segment in Fig. 9.15 is executed for the object referenced by shape2? Recall that the variable shape2 references an object of type Triangle. Will the method calcRegPolyArea defined in the class RegPolygon be invoked, returning the square of the side, 4.00? The answer is no. Instead it will output the following:

Fig. 9.15

Code segment finding the area of shape2

area of shape2: 1.73

This is the area of a triangle with a side of length 2. The reason is that the type of the object invoking the method calcRegPolyArea determines which calcRegPolyArea method is called, either the one in the class RegPolygon or the one in the class Triangle. Even though the variable shape2 is of type RegPolygon, it references a Triangle object because that is the type assigned to it at runtime by the shape2 = new Triangle(2); statement. This means that the Triangle object is invoking the method calcRegPolyArea defined in the class Triangle when it is executed.

This is an example of polymorphism. The variables shape1 and shape2 could reference either a RegPolygon object or a Triangle object. At compile time, it cannot be determined what type of object they will reference. However, at runtime, when the object invokes the method calcRegPolyArea, the type of the object is determined and the appropriate calcRegPolyArea method is called.
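The code segments referenced as Figs. 9.14 and 9.15 are not reproduced above; a minimal sketch consistent with the surrounding discussion (the variable names area1 and area2 reappear below) is:

double area1, area2;
area1 = shape1.calcRegPolyArea();   // invokes the RegPolygon version: 5 squared
System.out.printf("area of shape1: %.2f%n", area1);
area2 = shape2.calcRegPolyArea();   // invokes the Triangle version at runtime
System.out.printf("area of shape2: %.2f%n", area2);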
If a variable of a superclass type can reference an object of a subclass type, can a variable of a subclass type reference an object of a superclass type? The answer is no. Consider the following code segment:

Triangle shape3;
shape3 = new RegPolygon(6);

The second statement causes a compile-time error, because a reference variable of a subclass type is not allowed to reference an object of its superclass. As one might suspect, the following statement is also incorrect,

shape3 = shape1;

because the variable shape1 is referencing an object of type RegPolygon. What about the following statement?

shape3 = shape2;

At first it looks okay, since the variable shape3 is of type Triangle and the variable shape2 references an object of the Triangle class. But the answer is again no. It causes a compile-time error because, even though shape2 references a Triangle object, the variable shape2 is of type RegPolygon. However, the following statement is legal:

shape3 = (Triangle) shape2;

The above statement uses a typecast operator, discussed in Chap. 1, which allows shape3 of type Triangle to reference the Triangle object that shape2 of type RegPolygon references.

Suppose another subclass of the class RegPolygon named Hexagon is defined. Since the equation for the area of a regular hexagon is (3√3/2)s², the class can be written as shown below:

class Hexagon extends RegPolygon {
    public Hexagon(int lenSide) {
        super(lenSide);
    }
    public double calcRegPolyArea() {
        double area;
        area = 3.0 * Math.sqrt(3.0) / 2.0 * super.calcRegPolyArea();
        return area;
    }
}

As discussed above, a variable of the class RegPolygon can reference an object of the class Hexagon, but a variable of the Hexagon class cannot reference an object of the RegPolygon class. Also, a variable of the Hexagon class cannot reference an object of the Triangle class, and vice versa, since the Hexagon and Triangle classes are both subclasses of the RegPolygon class, also known as sibling classes.

Returning to the output of the code segments in Figs. 9.14 and 9.15, instead of displaying the words "shape1" and "shape2" as shown below, would it be better if the type of the polygon were output?

area of shape1: 25.00

area of shape2: 1.73

Is there a way to determine the type of an object at runtime and output it? The answer is yes. To determine the type of an object, Java provides the operator instanceof. This operator is especially useful because a variable of a superclass type can reference an object of either its own class or a subclass type. Consider the following expression:

shape1 instanceof Triangle

This expression evaluates to true if the variable shape1 refers to an object of the class Triangle; otherwise it evaluates to false. Using the operator instanceof, the printf statements in Figs. 9.14 and 9.15 can be rewritten as follows:

if(shape1 instanceof Triangle)
    System.out.printf("area of triangle: %.2f", area1);
else
    System.out.printf("square of side: %.2f", area1);
System.out.println();
if(shape2 instanceof Triangle)
    System.out.printf("area of triangle: %.2f", area2);
else
    System.out.printf("square of side: %.2f", area2);
System.out.println();

The output of the above code segment is

square of side: 25.00

area of triangle: 1.73

Since the variable shape1 references a RegPolygon object, the first if condition evaluates to false, so the printf statement in the else block is executed, stating that the square of the side was calculated. For shape2, the then portion of the second if statement is executed. However, what would happen if there were a large number of shapes whose areas need to be calculated? Instead of having each object call the calcRegPolyArea method separately and having if statements for the output, an array of objects can be used to simplify the program.

Consider the creation of an array with different types of regular polygons. If the array is declared to be of type RegPolygon, each element of the array can reference an object of any of its subclasses. The following code segment declares and creates an array named shapes of type RegPolygon with five elements, each of which can reference a Triangle or a Hexagon object:

RegPolygon[] shapes;
shapes = new RegPolygon[5];

The following statements create either a Triangle object or a Hexagon object and place them in the array:

shapes[0] = new Hexagon(3);
shapes[1] = new Triangle(2);
shapes[2] = new Triangle(5);
shapes[3] = new Hexagon(4);
shapes[4] = new Triangle(4);

Once all the objects are stored in the array, a for loop can be used to calculate the areas and output them along with the type of each shape.
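A minimal sketch of such a loop, using the instanceof operator to label each shape, follows; the exact output format is an assumption:

for(int i = 0; i < shapes.length; i++) {
    double area = shapes[i].calcRegPolyArea();
    // the instanceof operator determines the runtime type of each element
    if(shapes[i] instanceof Triangle)
        System.out.printf("area of triangle: %.2f%n", area);
    else
        System.out.printf("area of hexagon: %.2f%n", area);
}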
Turning to file input, suppose a data file of exam scores ends with a negative sentinel value. A loop that reads and totals the scores through a Scanner object named inFile could be written as follows:

numStudents = 0;
totalExam1 = 0;
score = inFile.nextInt();
while(score >= 0) {
    numStudents++;
    System.out.println("score " + numStudents + ": " + score);
    totalExam1 = totalExam1 + score;
    score = inFile.nextInt();
}
average1 = totalExam1/numStudents;
System.out.println();
System.out.printf("average: %.2f", average1);

The variable numStudents is used to store the number of scores and to calculate the average after the loop. However, what if one did not want to include a sentinel value in the data file? It would seem that the program should be able to keep reading integers using a loop until there are no more scores in the file. Fortunately, the hasNextInt method can be used to check whether another integer value exists in the file. If it does not find an integer, the method returns false, and execution can continue with the statement that follows the loop. The revised loop is shown below:

numStudents = 0;
totalExam1 = 0;
while(inFile.hasNextInt()) {
    score = inFile.nextInt();
    numStudents++;
    System.out.println("score " + numStudents + ": " + score);
    totalExam1 = totalExam1 + score;
}
average1 = totalExam1/numStudents;
System.out.println();
System.out.printf("average: %.2f", average1);

The advantage of this technique is that the file does not need to contain a sentinel value, nor does the loop need a priming read. In addition to the method hasNextInt, there are a number of similar methods in the Scanner class that can be used with different types of data, as listed in Table 10.1.

Table 10.1

Selected methods of the Scanner class

Methods | Return type | Description

---|---|---

hasNext() | boolean | Returns true if there is another token available for input

hasNextDouble() | boolean | Returns true if the next token is a double value

hasNextInt() | boolean | Returns true if the next token is an int value

hasNextLine() | boolean | Returns true if there is another line available for input

next() | String | Returns the next token

nextDouble() | double | Returns the next token as a double value

nextInt() | int | Returns the next token as an int value

nextLine() | String | Returns the next line of input as a string. It may contain several tokens and spaces. A newline character \n may end the line of input, but it is not included in the returned string

Next, consider the case where the input file grades2.txt contains two sets of exam scores per line and the column headings as shown below:

Exam1 Exam2

71 95

60 80

75 76

The task is to find the average score of both sets of exam scores. Since the first two items in the file are not scores, they have to be extracted using the next method instead of nextInt and assigned to String variables to be output later.
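A minimal sketch of a complete program for this task follows; the class name, and accumulating the totals as double values so that the averages print with decimals, are assumptions here:

import java.io.File;
import java.io.IOException;
import java.util.Scanner;

public class ExamAverages {
    public static void main(String[] args) throws IOException {
        Scanner inFile = new Scanner(new File("grades2.txt"));
        String heading1 = inFile.next();   // "Exam1"
        String heading2 = inFile.next();   // "Exam2"
        System.out.println(heading1 + " " + heading2);
        int numStudents = 0;
        double totalExam1 = 0, totalExam2 = 0;
        while(inFile.hasNextInt()) {
            int score1 = inFile.nextInt(); // first score on the line
            int score2 = inFile.nextInt(); // second score on the same line
            numStudents++;
            System.out.println("Student " + numStudents + ": " + score1 + " " + score2);
            totalExam1 = totalExam1 + score1;
            totalExam2 = totalExam2 + score2;
        }
        inFile.close();
        System.out.printf("%s average: %.2f%n", heading1, totalExam1 / numStudents);
        System.out.printf("%s average: %.2f%n", heading2, totalExam2 / numStudents);
    }
}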
Notice that both sets of scores are read and added to the appropriate variables during each iteration of the while loop before moving on to the next line. Further, since the number of students is not known in advance, it is necessary for the program to count the number of input lines using the variable numStudents, as shown in Fig. 10.2.

Fig. 10.2

A program that inputs data from a text file

The output from the program would look like the following:

Exam1 Exam2

Student 1: 71 95

Student 2: 60 80

Student 3: 75 76

Exam1 average: 68.67

Exam2 average: 83.67

Each individual score is output as it is read from the file inside the loop, and the last two lines are output after the calculation of the averages outside the loop.

## 10.3 File Output

To send output to a file, the classes PrintWriter and FileWriter are used. The PrintWriter class prints formatted text using methods like print, println, and printf. The FileWriter class is the counterpart of the FileReader class and is meant for writing streams of characters. As with the FileReader class, the PrintWriter and FileWriter classes are contained in the package java.io, which needs to be imported at the beginning of the program. For file output, a variable of type PrintWriter is declared and associated with the destination, the file where the output will be stored. Suppose the output is to be stored in the file outs.txt in the same directory as the source code. Again, the way to specify an output file in a different directory will be discussed in Sect. 10.5. Consider the following statement:

PrintWriter outFile
    = new PrintWriter(new FileWriter("outs.txt"));

This statement creates an object of type PrintWriter named outFile and associates it with the file outs.txt. An output file does not have to exist before it is opened for output. If it does not exist, the system creates an empty file in the current directory. If the designated output file already exists, a new empty file will be created, replacing the previous file of the same name. Sometimes, however, new data should be appended to the end of the data that already exists in the file. The FileWriter class has an overloaded constructor that takes two arguments, as in

PrintWriter outFile
    = new PrintWriter(new FileWriter("outs.txt", true));

The first argument is the name of the file, and the second argument is a boolean value. If it is true and the file already exists, the contents of the file will not be erased, and the new data will be appended to the end of the file. If the argument is false and the file already exists, the file will be replaced by a new one. If the boolean value is not included in the argument list, the value false is assumed and an existing file will be replaced. Finally, in any case, if the file does not exist, a new file is created.

Similar to the Scanner class, an object of the File class can be associated with the file. Using an overloaded constructor of the PrintWriter class with a File object as an argument to create a PrintWriter object is shown below:

PrintWriter outFile
    = new PrintWriter(new File("outs.txt"));

Another overloaded constructor of the PrintWriter class simply takes a filename as an argument, just like the Scanner class, as shown below:

PrintWriter outFile = new PrintWriter("outs.txt");

The advantage of using an object of the class FileWriter over the File class or a simple filename is the ability to append text, if that is desired.
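As a usage example, the following sketch (the class name and the message written are mine) appends one line to outs.txt each time it runs, rather than replacing the file:

import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

public class AppendDemo {
    public static void main(String[] args) throws IOException {
        // true opens outs.txt in append mode, preserving existing contents
        PrintWriter outFile = new PrintWriter(new FileWriter("outs.txt", true));
        outFile.println("program run completed");
        outFile.close();
    }
}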
Once the object of type PrintWriter is created, methods such as print, println, and printf can be applied to the object outFile in the same way they have been used with System.out. When the output is completed, the output file should be closed by using the method close, as shown in the following statement:

outFile.close();

Data to be written to a file is stored in an output buffer in memory before it is written to the file. Closing a file ensures that any data remaining in the buffer is flushed to the file. Failing to close the file is not considered an error, but it is possible that not all the information generated by the program will reach the output file. Therefore, it is good practice to always close the output file. The program in Fig. 10.2 is modified to output the result to the file outs.txt as shown in Fig. 10.3.

Fig. 10.3

A program that outputs data to a text file

The program in Fig. 10.3 will have the same output as the program in Fig. 10.2, but this time it will be sent to the file outs.txt. To see the output, simply open the file in a text editor and examine the results.

## 10.4 File Input and Output Using an Array

Assuming the scores from different exams are kept in separate files, how can the scores in each file be processed using the same program? It would not be a good idea to have the input filename hardcoded into the program. Instead, the program should allow the user to enter the filename. Also, after the scores are processed, the results can be stored in a user-specified file. If variables are used for the names of both the input and output files, it is not necessary to change and recompile the code every time the program is executed for a different set of data.

If every course has a different number of students, the number of scores in the input file is not known in advance. If an array of the same size as the number of scores is to be created, the scores first need to be counted, and the count, stored in a variable, can then be used to allocate the array. In order to count the scores, every score is read without being stored or used for calculations. The code segment in Fig. 10.4 will count the scores in the file.

Fig. 10.4

A code segment that counts the data in an input file

Note that the user is prompted for and inputs the name of the file. Further, notice that inside the while loop, although the exam scores are read from the file using the statement inFile.nextInt();, the return values are not used for any calculations at this point, so they are not stored in memory. The statement inFile.nextInt(); is simply used to count the number of exam scores. At the end of the while loop, the variable numStudents will contain the number of scores in the file. The next step is to create an array of size numStudents, read the scores from the file again, and this time store them in the array. Consider the following code segment that could be added to the code in Fig. 10.4 to accomplish these tasks:
// create array of size numStudents
scores = new int[numStudents];

// read scores from input file and save them in array
for(i = 0; i < numStudents; i++)
    scores[i] = inFile.nextInt();

Turning from file I/O to exception handling, consider the following try-catch statement, which reads a score from a Scanner object named scanner and throws a RuntimeException if the score is not in the range 0 through 100:

try {
    score = scanner.nextInt();
    if(score < 0 || score > 100)
        throw new RuntimeException();
    flag = false;
}
catch(RuntimeException exception) {
    System.out.println("Error: Score must be in 0-100.");
}
catch(InputMismatchException exception) {
    scanner.next();
    System.out.println("Error: Score must be an integer.");
}

This results in a compiler error with the message:

exception java.util.InputMismatchException has already
been caught

Why? Recall that the InputMismatchException class is a subclass of the RuntimeException class as shown in Fig. B.1 and partially repeated below:

  * Exception

    * IOException

    *...

    * RuntimeException

      *...

      * NoSuchElementException

        * InputMismatchException

When an object of the InputMismatchException class is thrown, the first catch block is executed and all other catch blocks are ignored. This means that the second catch block would never be executed, because any exception object that is an instance of the RuntimeException class or its subclasses matches the first catch block.

When there are multiple catch blocks, each catch clause has to correspond to a specific type of exception. In the example above, since the InputMismatchException class is a subclass of the RuntimeException class, both exceptions could be caught by the catch clause with RuntimeException. Further, having two catch clauses for the same type of exception in the try-catch statement, as shown below, will cause the compiler to issue the error message "exception java.lang.RuntimeException has already been caught" for the second catch clause.

try {
    score = scanner.nextInt();
    if(score < 0 || score > 100)
        throw new RuntimeException();
    flag = false;
}
catch(RuntimeException exception) {
    scanner.next();
    System.out.println("Error: Score must be an integer.");
}
catch(RuntimeException exception) {
    System.out.println("Error: Score must be in 0-100.");
}

If there is a block of code that needs to be executed regardless of whether an exception is thrown, then the try-catch statement can include a finally block, which must appear after all of the catch blocks. Consider the while loop from Fig. B.4 (not reproduced here) with a finally block added at the end of the try-catch statement. The output using the same input values, 8o, 180, and 80, is shown below:

Enter the score: 8o

Error: Score must be an integer.

End of try-catch statement.

Enter the score: 180

Error: Score must be in 0-100.

End of try-catch statement.

Enter the score: 80

End of try-catch statement.

Your score is 80.

Since the first two inputs were invalid, both an error message from the catch block and a message from the finally block were output. The last input did not throw an exception, so all the catch blocks were skipped, but the message from the finally block was still displayed.

### B.4 Checked and Unchecked Exceptions

Among the exceptions, including the ones listed in Fig. B.1, there are two categories: checked and unchecked. Unchecked exceptions are those that inherit from the Error class or the RuntimeException class. They are also called runtime exceptions because they are detected during runtime. As mentioned before, the exceptions that inherit from the Error class are thrown when a critical error occurs, and therefore they should not be handled by the program.
Exceptions that were handled in the previous sections are all instances of the RuntimeException class or its subclasses. However, in general not all the possible exceptions from the RuntimeException class are handled in the program because handling each one of them in the program is not practical. As a result, exception handling should only be used when the problem can be corrected, and simply catching and ignoring any exception is a bad practice. + +A RuntimeException indicates programming errors, so it could possibly be avoided altogether by writing better code. However, large applications might never be entirely bug-free, and exception handling can be used to display an appropriate message instead of surprising the user by an abnormal termination of the program. If the application is running critical tasks and must not crash, exception handling can be used to log the problem and the execution can continue. + +All exceptions that are not inherited from the Error class or the RuntimeException class are called checked exceptions because they are checked during compile time. Consider a program which opens a file, reads numbers from the file, and outputs the total. Suppose the scores.txt file contains the following data and exists in the same directory as the .java file: + +70 + +80 + +90 + +The code in Fig. B.5 opens the scores.txt file, reads three numbers from the file, and outputs the total. What happens during the compilation of the program? The compiler will issue an error message "Unreported exception java.io.FileNotFoundException; must be caught or declared to be thrown" for the line inFile = new Scanner(new File("scores.txt")); because this statement can potentially throw a checked exception. If the file scores.txt does not exist as discussed in Chap.​ 10 , the checked exception of a FileNotFoundException has to be thrown. A simple solution to eliminate this error is to add a throws clause, throws IOException , in the method header. The throws clause informs the compiler of the exceptions that could be thrown from a program. If the exception actually occurs during runtime, because the system could not find the file scores.txt , the system will deal with the exception by halting execution. Consider the following modified version of the code from Fig. B.5 : + +Fig. B.5 + +A program with a checked exception + +Notice that throws IOException is added in the main method header. The FileNotFoundException could be used in the header instead of IOException since it is the class that the exception object is actually created from. However, because the IOException class is a superclass of the FileNotFoundException class as shown below from Fig. B.1 , the throws clause with IOException can catch the instance of the FileNotFoundException class. Including the more general exception class in the header is useful since it can catch exceptions of all the subclasses. + + * Exception + + * IOException + + * CharConversionException + + * EOFException + + * FileNotFoundException + + * RuntimeException + + *... + +The other way to handle a checked exception is to include the try-catch statement in the body of the program. Because the statement inFile = new Scanner(new File("scores.txt")); could possibly throw a checked exception, it should be included inside the try block. The statements that should be executed in response to the thrown exception are placed in the matching catch block. 
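For example, a minimal sketch of wrapping the file-opening statement in a try-catch, assuming the Scanner-based code of Fig. B.5, might look like this:

try {
    inFile = new Scanner(new File("scores.txt"));
}
catch(FileNotFoundException exception) {
    // executed only if scores.txt cannot be found at runtime
    System.out.println("Error: scores.txt was not found.");
}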
To simply display an error message and continue when the exception is thrown, such a try-catch statement is added to the code in Fig. B.5. If the designated file does not exist in the system, the program will stop whether a try-catch block exists or not. However, without a try-catch block, the execution stops abnormally, while with a try-catch block, the program terminates normally. If this were part of a larger application program, it would be convenient if the program did not crash just because it could not find one file, but instead continued with the execution of the next part of the program.

## Appendix C: Javadoc Comments

In Chap. 1, different ways of documenting a Java program were discussed. As was mentioned, comments are intended for programmers and are ignored during execution. However, documentation is an important aspect of developing applications. In the real world, once an application is released, programming bugs that were not detected during development need to be fixed, and new features may be added. Often those who modify a program are not the ones who developed it. The documentation then becomes very helpful for a programmer attempting to understand somebody else's program. This appendix explains more about the specialized comments called Javadoc.

### C.1 Javadoc

Java provides a standard form for writing comments and documenting classes. Javadoc comments in a program interact with the documentation tool, also named Javadoc, which comes with the Java Development Kit (JDK). The Javadoc tool reads the Javadoc comments from the source file and produces a collection of HyperText Markup Language (HTML) pages, which can be read and displayed by web browsers. These pages look just like the Java API specification document at the Oracle website at http://docs.oracle.com/javase/7/docs/api/index.html. The HTML pages created by the Javadoc tool contain only documentation and no actual Java code. The documentation allows programmers to understand and use the classes someone else has written without seeing how they are actually implemented.

Javadoc comments begin with a slash followed by two asterisks /** and end with an asterisk followed by a slash */. Many programmers also place a single asterisk * at the start of each line in the comment, as shown in the program in Fig. C.1. Although these have no significance and the Javadoc tool ignores them, they make it easy to see the entire extent of the comment in the program.

Fig. C.1

A program with Javadoc comments

The Javadoc comments for the class are placed between the import statements and the class header. After the description of the class, the rest of the comment consists of a series of Javadoc tags, which are special markers that begin with the @ symbol. Each tag tells the Javadoc tool certain information. The documentation for a class will usually contain an author tag. The Javadoc tag @author indicates the name of the programmer(s) who created the class. The Javadoc comments for the description of a method are placed above the method header. As an example, two Javadoc comments are added to the QuadEq class discussed in Sect. 1.10 of Chap. 1 and shown in Fig. C.1.

The use of Javadoc comments does not preclude the use of other types of comments in the program. In addition to the Javadoc comments in Fig. C.1, regular comments with two slashes // are used to describe the sections of the code.
Since Javadoc comments included in the HTML page are the only ones describing the class, its data members, and its methods, the comments describing the sections will not appear in the HTML page even if they are written as Javadoc comments. However, the comments in the middle of the code are still important when a programmer is reading to understand the code. Therefore, Javadoc comments are useful for a programmer who simply uses the classes without looking at the implementation, and other comments in the code are helpful for a programmer who is actually modifying the code. + +Once all the Javadoc comments are added to the class, the next step is to generate the corresponding HTML documentation file. Many Java editors and Integrated Development Environments (IDEs) include a menu option that can be used to generate a Javadoc documentation file quickly and easily. Part of the resulting HTML page for the QuadEq class is shown below: + +In the nicely formatted HTML page, the description of the class which has been added to the program as a Javadoc comment is shown. The author tag appears in boldface and the names of the authors are shown as well. Since there is no constructor defined in the class, a system-generated default constructor is listed in the Constructor Summary section. The Method Summary section contains only the main method along with the Javadoc comments added in the program because only one method exists in the class. + +### C.2 More Javadoc Tags + +The format of the Javadoc comments for a method is similar to the one for a class. In addition to a general description, a number of Javadoc tags can be included. The main purpose of the comments for a method is to record its purpose, a list of any parameters passed to the method, and any value returned from the method. If the method receives a parameter, the @param tag is used, and if the method returns a value, the @return tag is added. The Javadoc comments for the method convertEurosToDollars as defined in the Card class from Sect. 5.6.2 are shown below: + +/** + +* Convert the passed value to Dollars. + +* + +* @param euros the amount in Euros + +* @return the amount in Dollars + +*/ + +public static double convertEurosToDollars(double euros) { + +return euros*rate; + +} + +Notice that the Javadoc comments for the method need to be placed just above the method header. Each parameter of the method is documented by using a tag @param , followed by the name and the description of the parameter. A description of a return value is listed after the Javadoc tag @return . Notice the effect of the @param and @return tags in the following HTML document for the above method: + +The Javadoc comments for a constructor can be defined in a manner similar to the one for a method, except it does not have a @return tag. In addition to the above tags, if the method could throw exceptions, they can be listed using the @throws tag, just like the @param and the @return tags in the Javadoc comments. The topic of exceptions is discussed in Appendix B. + +More complex methods may need complete precondition and postcondition lists. Also an example of how the method is used may be useful information for other programmers. The tags such as @precondition , @postcondition , and @example that are not predefined in the Javadoc tool can be created by programmers. Since the convertEurosToDollars is a simple method, only the @example tag will be added to the Javadoc comments as shown below: + +/** + +* Convert the passed value to Dollars. 
*

* @param euros the amount in Euros

* @return the amount in Dollars

* @example conversion of 1.00 Euros to US dollars -

* Card.convertEurosToDollars(1.00);

*/

public static double convertEurosToDollars(double euros) {

return euros*rate;

}

Note that in order to include user-defined tags in the documentation, the HTML page may need to be generated from the command line if the Java editor does not have the capability of including the options, as will be discussed in the next section. The HTML document for the above method also appears in the next section.

Similar to the standard classes, programmer-defined classes and their HTML documentation can be shared with other programmers. First, .java files are written in the usual way but include the Javadoc comments described in this appendix. After they are compiled, the .class files can be moved to a location where other programmers have access to them. Then the Javadoc tool can be run on each .java file to create an HTML page, and all the Javadoc HTML files can be moved to a public place where a web browser can be used to read them. This way, by importing the classes at the beginning of a Java program, the programmer-defined classes are available to other programmers without compiling them, just like the standard classes.

### C.3 Generating Javadoc Documentation from a Command Line

An HTML page can also be generated from a command line. In the command prompt window, the commands javac and java are used to compile and run Java programs, respectively. Similarly, the javadoc command is used for generating Javadoc documentation files. For example, to generate a Javadoc documentation file for the QuadEq class, the following command is used:

javadoc QuadEq.java

After the command is executed, a collection of HTML files will be created. The documentation can be viewed by opening the file index.html and clicking the QuadEq link.

When a programmer-defined tag such as @example is included in the source code, options need to be included in the command line to generate the HTML. The following command can be used to create Javadoc documentation for the Card class, which implements the method convertEurosToDollars:

javadoc -private -author -tag param -tag return
-tag example:a:"Example:" Card.java

The -private option generates the documentation for the class, variables, and methods, including the public, protected, and private members of the class. The -author option puts the author tag in boldface followed by the author's name in the documentation. The other options starting with -tag indicate the order in which the tags appear in the HTML file: the parameter(s) first, then the return specification, and finally the example. Two of these options, param and return, are predefined in the Javadoc system, so only -tag param and -tag return are listed. However, because an example tag is not predefined in Javadoc, the extra information at the end, :a:"Example:", is needed and indicates how the tag is to appear in the documentation. The a: means that all occurrences of the @example tag should be put in the documentation along with a heading, which in this case is Example: as it appears in the quotation marks. Headings will always appear in boldface in the documentation created by the javadoc command. The HTML document for the method convertEurosToDollars, generated after the @example tag is added to the source code, shows the Example: heading followed by the sample call.
For more information about Javadoc, refer to the Java API specification document at the Oracle website at http://docs.oracle.com/javase/7/docs/technotes/tools/windows/javadoc.html.

## Appendix D: Glossary

All of the terms in italics in the text can be found in the index, and some of these terms (including abbreviations) can be found here in the glossary. The descriptions of terms in this glossary should not be used in lieu of the complete descriptions in the text, but rather they serve as a quick review. Should a more complete description be needed, the index can guide the reader to the appropriate pages where the terms are discussed in more detail.

Algorithm

A step-by-step sequence of instructions, but not necessarily a program for a computer.

API

Application Programming Interface.

Array

A collection of contiguous memory locations that have the same name and are distinguished from one another by an index.

Assembly language

A low-level language that uses mnemonics and is converted to machine language by an assembler.

Bytecode

An intermediate language between Java and machine language.

Class

A definition or blueprint of a set of objects.

Compiler

A translator that converts a high-level language program to a low-level language for subsequent execution.

Contour diagram

A visual representation of the state of execution of a program.

CPU

Central Processing Unit.

Data members

The variables and constants that are part of an object.

EOD

End of Data.

Exception

An execution error, an error condition, or an unexpected event during execution of a program.

GUI

Graphical User Interface.

High-level language

A more English-like and math-like programming language, such as Java.

HTML

HyperText Markup Language.

IDE

Integrated Development Environment.

Inheritance

The ability of a subclass to reuse methods and data members of a superclass.

Interpreter

A translator that converts and executes a high-level language program one instruction at a time.

IPO

Input Process Output.

Iteration structures

Allow a program to repeat a section of code, often called a loop.

Javadoc

Specialized comments for documenting classes and methods.

LCV

Loop Control Variable.

LIFO

Last In First Out, as with a stack.

Low-level language

A language closer to a particular CPU, such as assembly language and machine language.

Machine language

The native language of the processor, coded in ones and zeros.

Method

A series of instructions that can be invoked to access and manipulate the data members of an object.

Object

An instance of a class.

OOP

Object-Oriented Programming.

Overloading

A method in the same class that has the same name but a different number of parameters, different types of parameters, or parameters of different types in a different order.

Overriding

A method in a subclass that has the same name and also the same number and type of parameters as one in the superclass.

Polymorphism

The determination at runtime of the type of an object referenced by a superclass variable.

Pseudocode

A design tool consisting of a combination of English and a programming language that helps one concentrate on logic instead of syntax when developing a program.

RAM

Random Access Memory.

Recursion

A definition that is defined in terms of itself and includes a base or terminal case.
Selection structures

Allow a program to follow one of several paths, sometimes called decision structures.

Semantics

The meaning of what each instruction does in a programming language.

Syntax

The grammar of a programming language.

UML

Unified Modeling Language.

Variables

Named memory locations used to store data in a program.

## Appendix E: Answers to Selected Exercises

### Chapter 1

1.B. Correct.

1.D. Incorrect, a double number cannot be assigned to a variable of integer type.

2.A. 0

3.B. 5.34

4.B. final double EULER_NUMBER = 2.7182;

6.

System.out.println("** **");
System.out.println("** **");
System.out.println(" ****");
System.out.println(" ****");
System.out.println(" ****");
System.out.println(" ****");
System.out.println("** **");
System.out.println("** **");

7. After execution, value1 is 9, value2 is 4, and value3 is 9.

8.B. s = r * Math.PI * Math.sqrt(Math.pow(r,2) + Math.pow(h,2));

### Chapter 2

1.A. Incorrect, it should be Circle circle = new Circle();

1.C. Correct.

4.A.

Circle innerCircle;
innerCircle = new Circle();

4.C.

System.out.println("The value of radius is "
    + innerCircle.getRadius());

6. Answers to A. and D. of the Cone class

### Chapter 3

1.A. 40

2.B. 50

3.C. 3

5.A. true || false → true

5.C. true || flag1 && flag2 → true || false → true

5.E. (true || false) && false → true && false → false

8.

9.

### Chapter 4

2. The , in the for statement

3.

sum = 1
count = 2
sum = 3
count = 3
sum = 6
count = 4
sum = 10
count = 5
sum = 10
count = 5

6.

**
****
******
********
**********

7.B.

int total, count;
total = 0;
count = 1;
do {
    total += count;
    count += 3;
} while (count <= 40);

8.A.

int total, count, n;
total = 0;
n = 5;
for(count = 0; count < n; count++) {
    total += count;
}

### Chapter 5

1.

constructor 1: valid

constructor 3: invalid

2.

method 2: invalid

method 6: valid

method 10: valid

6. Answers to A., B., C., and F. of the Cone class

### Chapter 6

1.B. The second line should be text2 = new String("Shedding blade");

2.B. 34

2.D. Hose_

7.

### Chapter 7

1.B. Incorrect, the size has to be specified.

1.C. Incorrect, the braces have to be used instead of the square brackets.

1.E. Incorrect, the size should not be specified.

2.

int total = 0;

for(int i=0; i

> "Those who don't understand Unix are condemned to reinvent it, poorly." —Henry Spencer

Some of the tools I'll show you are available online via a web browser, which will be the easiest for most readers to use. Others you'll use from a command or a shell prompt, and a few you'll run on the desktop. The tools, if you don't have them, will be easy to download. The majority are free or won't cost you much money.

This book also goes light on jargon. I'll share with you what the correct terms are when necessary, but in small doses. I use this approach because over the years, I've found that jargon can often create barriers. In other words, I'll try not to overwhelm you with the dry language that describes regular expressions. That is because the basic philosophy of this book is this: Doing useful things can come before knowing everything about a given subject.

There are lots of different implementations of regular expressions.
You will find regular expressions used in Unix command-line tools like _vi_ ( _vim_ ), _grep_ , and _sed_ , among others. You will find regular expressions in programming languages like Perl (of course), Java, JavaScript, C# or Ruby, and many more, and you will find them in declarative languages like XSLT 2.0. You will also find them in applications like Notepad++, Oxygen, or TextMate, among many others. + +Most of these implementations have similarities and differences. I won't cover all those differences in this book, but I will touch on a good number of them. If I attempted to document _all_ the differences between _all_ implementations, I'd have to be hospitalized. I won't get bogged down in these kinds of details in this book. You're expecting an introductory text, as advertised, and that is what you'll get. + +# Who Should Read This Book + +The audience for this book is people who haven't ever written a regular expression before. If you are new to regular expressions or programming, this book is a good place to start. In other words, I am writing for the reader who has heard of regular expressions and is interested in them but who doesn't really understand them yet. If that is you, then this book is a good fit. + +The order I'll go in to cover the features of regex is from the simple to the complex. In other words, we'll go step by simple step. + +Now, if you happen to already know something about regular expressions and how to use them, or if you are an experienced programmer, this book may not be where you want to start. This is a beginner's book, for rank beginners who need some hand-holding. If you have written some regular expressions before, and feel familiar with them, you can start here if you want, but I'm planning to take it slower than you will probably like. + +I recommend several books to read after this one. First, try Jeff Friedl's _Mastering Regular Expressions, Third Edition_ (see ). Friedl's book gives regular expressions a thorough going over, and I highly recommend it. I also recommend the _Regular_ Expressions _Cookbook_ (see ) by Jan Goyvaerts and Steven Levithan. Jan Goyvaerts is the creator of RegexBuddy, a powerful desktop application (see ). Steven Levithan created RegexPal, an online regular expression processor that you'll use in the first chapter of this book (see ). + +# What You Need to Use This Book + +To get the most out of this book, you'll need access to tools available on Unix or Linux operating systems, such as Darwin on the Mac, a variant of BSD (Berkeley Software Distribution) on the Mac, or Cygwin on a Windows PC, which offers many GNU tools in its distribution (see and ). + +There will be plenty of examples for you to try out here. You can just read them if you want, but to really learn, you'll need to follow as many of them as you can, as the most important kind of learning, I think, always comes from doing, not from standing on the sidelines. You'll be introduced to websites that will teach you what regular expressions are by highlighting matched results, workhorse command line tools from the Unix world, and desktop applications that analyze regular expressions or use them to perform text search. + +You will find examples from this book on Github at . You will also find an archive of all the examples and test files in this book for download from . It would be best if you create a working directory or folder on your computer and then download these files to that directory before you dive into the book. 
+ +# Conventions Used in This Book + +The following typographical conventions are used in this book: + + _Italic_ + +Indicates new terms, URLs, email addresses, filenames, file extensions, and so forth. + +`Constant width` + +Used for program listings, as well as within paragraphs, to refer to program elements such as expressions and command lines or any other programmatic elements. + +### Tip + +This icon signifies a tip, suggestion, or a general note. + +# Using Code Examples + +This book is here to help you get your job done. In general, you may use the code in this book in your programs and documentation. You do not need to contact us for permission unless you're reproducing a significant portion of the code. For example, writing a program that uses several chunks of code from this book does not require permission. Selling or distributing a CD-ROM of examples from O'Reilly books does require permission. Answering a question by citing this book and quoting example code does not require permission. Incorporating a significant amount of example code from this book into your product's documentation does require permission. + +We appreciate, but do not require, attribution. An attribution usually includes the title, author, publisher, and ISBN. For example: " _Introducing Regular Expressions_ by Michael Fitzgerald (O'Reilly). Copyright 2012 Michael Fitzgerald, 978-1-4493-9268-0." + +If you feel your use of code examples falls outside fair use or the permission given above, feel free to contact O'Reilly at _permissions@oreilly.com_. + +# Safari® Books Online + +### Note + +Safari Books Online (www.safaribooksonline.com) is an on-demand digital library that delivers expert content in both book and video form from the world's leading authors in technology and business. + +Technology professionals, software developers, web designers, and business and creative professionals use Safari Books Online as their primary resource for research, problem solving, learning, and certification training. + +Safari Books Online offers a range of product mixes and pricing programs for organizations, government agencies, and individuals. Subscribers have access to thousands of books, training videos, and prepublication manuscripts in one fully searchable database from publishers like O'Reilly Media, Prentice Hall Professional, Addison-Wesley Professional, Microsoft Press, Sams, Que, Peachpit Press, Focal Press, Cisco Press, John Wiley & Sons, Syngress, Morgan Kaufmann, IBM Redbooks, Packt, Adobe Press, FT Press, Apress, Manning, New Riders, McGraw-Hill, Jones & Bartlett, Course Technology, and dozens more. For more information about Safari Books Online, please visit us online. + +# How to Contact Us + +Please address comments and questions concerning this book to the publisher: + +O'Reilly Media, Inc. +--- +1005 Gravenstein Highway North +Sebastopol, CA 95472 +800-998-9938 (in the United States or Canada) +707-829-0515 (international or local) +707-829-0104 (fax) + +This book has a web page listing errata, examples, and any additional information. You can access this page at: + + +--- + +To comment or to ask technical questions about this book, send email to: + +bookquestions@oreilly.com +--- + +For more information about O'Reilly books, courses, conferences, and news, see its website at . + +Find O'Reilly on Facebook: + +Follow O'Reilly on Twitter: + +Watch O'Reilly on YouTube: + +# Acknowledgments + +Once again, I want to express appreciation to my editor at O'Reilly, Simon St. 
Laurent, a very patient man without whom this book would never have seen the light of day. Thank you to Seara Patterson Coburn and Roger Zauner for your helpful reviews. And, as always, I want to recognize the love of my life, Cristi, who is my _raison d'être_. + +# Chapter 1. What Is a Regular Expression? + +Regular expressions are specially encoded text strings used as patterns for matching sets of strings. They began to emerge in the 1940s as a way to describe regular languages, but they really began to show up in the programming world during the 1970s. The first place I could find them showing up was in the QED text editor written by Ken Thompson. + +> "A regular expression is a pattern which specifies a set of strings of characters; it is said to match certain strings." —Ken Thompson + +Regular expressions later became an important part of the tool suite that emerged from the Unix operating system—the _ed_ , _sed_ and _vi_ ( _vim_ ) editors, _grep_ , _AWK_ , among others. But the ways in which regular expressions were implemented were not always so regular. + +### Note + +This book takes an inductive approach; in other words, it moves from the specific to the general. So rather than an example after a treatise, you will often get the example first and then a short treatise following that. It's a learn-by-doing book. + +Regular expressions have a reputation for being gnarly, but that all depends on how you approach them. There is a natural progression from something as simple as this: + + \d + +a _character shorthand_ that matches any digit from 0 to 9, to something a bit more complicated, like: + + ^(\(\d{3}\)|^\d{3}[.-]?)?\d{3}[.-]?\d{4}$ + +which is where we'll wind up at the end of this chapter: a fairly robust regular expression that matches a 10-digit, North American telephone number, with or without parentheses around the area code, or with or without hyphens or dots (periods) to separate the numbers. (The parentheses must be balanced, too; in other words, you can't just have one.) + +### Note + +Chapter 10 shows you a slightly more sophisticated regular expression for a phone number, but the one above is sufficient for the purposes of this chapter. + +If you don't get how that all works yet, don't worry: I'll explain the whole expression a little at a time in this chapter. If you will just follow the examples (and those throughout the book, for that matter), writing regular expressions will soon become second nature to you. Ready to find out for yourself? + +I at times represent Unicode characters in this book using their code point—a four-digit, hexadecimal (base 16) number. These code points are shown in the form _U+0000_. U+002E, for example, represents the code point for a full stop or period (.). + +# Getting Started with Regexpal + +First let me introduce you to the Regexpal website at . Open the site up in a browser, such as Google Chrome or Mozilla Firefox. You can see what the site looks like in Figure 1-1. + +Figure 1-1. Regexpal in the Google Chrome browser + +You can see that there is a text area near the top, and a larger text area below that. The top text box is for entering regular expressions, and the bottom one holds the subject or target text. The target text is the text or set of strings that you want to match. + +### Note + +At the end of this chapter and each following chapter, you'll find a "Technical Notes" section. 
These notes provide additional information about the technology discussed in the chapter and tell you where to get more information about that technology. Placing these notes at the end of the chapters helps keep the flow of the main text moving forward rather than stopping to discuss each detail along the way. + +# Matching a North American Phone Number + +Now we'll match a North American phone number with a regular expression. Type the phone number shown here into the lower section of Regexpal: + + 707-827-7019 + +Do you recognize it? It's the number for O'Reilly Media. + +Let's match that number with a regular expression. There are lots of ways to do this, but to start out, simply enter the number itself in the upper section, exactly as it is written in the lower section (hold on now, don't sigh): + + 707-827-7019 + +What you should see is the phone number you entered in the lower box highlighted from beginning to end in yellow. If that is what you see (as shown in Figure 1-2), then you are in business. + +### Note + +When I mention colors in this book, in relation to something you might see in an image or a screenshot, such as the highlighting in Regexpal, those colors may appear online and in e-book versions of this book, but, alas, not in print. So if you are reading this book on paper, then when I mention a color, your world will be grayscale, with my apologies. + +Figure 1-2. Ten-digit phone number highlighted in Regexpal + +What you have done in this regular expression is use something called a _string literal_ to match a string in the target text. A string literal is a literal representation of a string. + +Now delete the number in the upper box and replace it with just the number _7_. Did you see what happened? Now only the sevens are highlighted. The literal character (number) _7_ in the regular expression matches the four instances of the number 7 in the text you are matching. + +# Matching Digits with a Character Class + +What if you wanted to match all the numbers in the phone number, all at once? Or match any number for that matter? + +Try the following, exactly as shown, once again in the upper text box: + + [0-9] + +All the numbers (more precisely _digits_ ) in the lower section are highlighted, in alternating yellow and blue. What the regular expression `[0-9]` is saying to the regex processor is, "Match any digit you find in the range 0 through 9." + +The square brackets are not literally matched because they are treated specially as _metacharacters_. A metacharacter has special meaning in regular expressions and is reserved. A regular expression in the form `[0-9]` is called a _character class_ , or sometimes a _character set_. + +You can limit the range of digits more precisely and get the same result using a more specific list of digits to match, such as the following: + + [012789] + +This will match only those digits listed, that is, 0, 1, 2, 7, 8, and 9. Try it in the upper box. Once again, every digit in the lower box will be highlighted in alternating colors. + +To match any 10-digit, North American phone number, whose parts are separated by hyphens, you could do the following: + + [0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9] + +This will work, but it's bombastic. There is a better way with something called a shorthand. + +# Using a Character Shorthand + +Yet another way to match digits, which you saw at the beginning of the chapter, is with `\d` which, by itself, will match all Arabic digits, just like `[0-9]`. 
Try that in the top section and, as with the previous regular expressions, the digits below will be highlighted. This kind of regular expression is called a _character shorthand_. (It is also called a _character escape_ , but this term can be a little misleading, so I avoid it. I'll explain later.) + +To match any digit in the phone number, you could also do this: + + \d\d\d-\d\d\d-\d\d\d\d + +Repeating the `\d` three and four times in sequence will exactly match three and four digits in sequence. The hyphen in the above regular expression is entered as a literal character and will be matched as such. + +What about those hyphens? How do you match them? You can use a literal hyphen (-) as already shown, or you could use an escaped uppercase _D_ (`\D`), which matches any character that is _not_ a digit. + +This sample uses `\D` in place of the literal hyphen. + + \d\d\d\D\d\d\d\D\d\d\d\d + +Once again, the entire phone number, including the hyphens, should be highlighted this time. + +# Matching Any Character + +You could also match those pesky hyphens with a dot (.): + + \d\d\d.\d\d\d.\d\d\d\d + +The dot or period essentially acts as a wildcard and will match any character (except, in certain situations, a line ending). In the example above, the regular expression matches the hyphen, but it could also match a percent sign (%): + + 707%827%7019 + +Or a vertical bar (|): + + 707|827|7019 + +Or any other character. + +### Note + +As I mentioned, the dot character (officially, the full stop) will not normally match a new line character, such as a line feed (U+000A). However, there are ways to make it possible to match a newline with a dot, which I will show you later. This is often called the _dotall_ option. + +# Capturing Groups and Back References + +You'll now match just a portion of the phone number using what is known as a _capturing group_. Then you'll refer to the content of the group with a _backreference_. To create a capturing group, enclose a `\d` in a pair of parentheses to place it in a group, and then follow it with a `\1` to backreference what was captured: + + (\d)\d\1 + +The `\1` refers back to what was captured in the group enclosed by parentheses. As a result, this regular expression matches the prefix `707`. Here is a breakdown of it: + + * `(\d)` matches the first digit and captures it (the number _7_ ) + + * `\d` matches the next digit (the number _0_ ) but does not capture it because it is not enclosed in parentheses + + * `\1` references the captured digit (the number _7_ ) + +This will match only the area code. Don't worry if you don't fully understand this right now. You'll see plenty of examples of groups later in the book. + +You could now match the whole phone number with one group and several backreferences: + + (\d)0\1\D\d\d\1\D\1\d\d\d + +But that's not quite as elegant as it could be. Let's try something that works even better. + +# Using Quantifiers + +Here is yet another way to match a phone number using a different syntax: + + \d{3}-?\d{3}-?\d{4} + +The numbers in the curly braces tell the regex processor _exactly_ how many occurrences of those digits you want it to look for. The braces with numbers are a kind of _quantifier_. The braces themselves are considered metacharacters. + +The question mark (`?`) is another kind of quantifier. It follows the hyphen in the regular expression above and means that the hyphen is optional—that is, that there can be zero or one occurrence of the hyphen (one or none). 
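If you want to test a pattern like this outside of Regexpal, Perl gives you a quick scratchpad. This is a minimal sketch of my own, not one of the chapter's examples; the phone number is supplied inline, and `^` and `$` (explained at the end of this chapter) anchor the match to the whole string:

    perl -e 'print "matched\n" if "707-827-7019" =~ /^\d{3}-?\d{3}-?\d{4}$/;'

Remove one of the hyphens from the test number and the optional `?` still allows a match; remove a digit and the match fails.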
There are other quantifiers such as the plus sign (`+`), which means "one or more," or the asterisk (`*`) which means "zero or more." + +Using quantifiers, you can make a regular expression even more concise: + + (\d{3,4}[.-]?)+ + +The plus sign again means that the quantity can occur one or more times. This regular expression will match either three or four digits, followed by an optional hyphen or dot, grouped together by parentheses, one or more times (`+`). + +Is your head spinning? I hope not. Here's a character-by-character analysis of the regular expression above: + + * `(` open a capturing group + + * `\` start character shorthand (escape the following character) + + * `d` end character shorthand (match any digit in the range 0 through 9 with `\d`) + + * `{` open quantifier + + * `3` minimum quantity to match + + * `,` separate quantities + + * `4` maximum quantity to match + + * `}` close quantifier + + * `[` open character class + + * `.` dot or period (matches literal dot) + + * `-` literal character to match hyphen + + * `]` close character class + + * `?` zero or one quantifier + + * `)` close capturing group + + * `+` one or more quantifier + +This all works, but it's not quite right because it will also match other groups of 3 or 4 digits, whether in the form of a phone number or not. Yes, we learn from our mistakes better than our successes. + +So let's improve it a little: + + (\d{3}[.-]?){2}\d{4} + +This will match two nonparenthesized sequences of three digits each, followed by an optional hyphen, and then followed by exactly four digits. + +# Quoting Literals + +Finally, here is a regular expression that allows literal parentheses to optionally wrap the first sequence of three digits, and makes the area code optional as well: + + ^(\(\d{3}\)|^\d{3}[.-]?)?\d{3}[.-]?\d{4}$ + +To ensure that it is easy to decipher, I'll look at this one character by character, too: + + * `^` (caret) at the beginning of the regular expression, or following the vertical bar (`|`), means that the phone number will be at the beginning of a line. + + * `(` opens a capturing group. + + * `\(` is a literal open parenthesis. + + * `\d` matches a digit. + + * `{3}` is a quantifier that, following `\d`, matches exactly three digits. + + * `\)` matches a literal close parenthesis. + + * `|` (the vertical bar) indicates _alternation_ , that is, a given choice of alternatives. In other words, this says "match an area code with parentheses or without them." + + * `^` matches the beginning of a line. + + * `\d` matches a digit. + + * `{3}` is a quantifier that matches exactly three digits. + + * `[.-]?` matches an optional dot or hyphen. + + * `)` close capturing group. + + * `?` make the group optional, that is, the prefix in the group is not required. + + * `\d` matches a digit. + + * `{3}` matches exactly three digits. + + * `[.-]?` matches another optional dot or hyphen. + + * `\d` matches a digit. + + * `{4}` matches exactly four digits. + + * `$` matches the end of a line. + +This final regular expression matches a 10-digit, North American telephone number, with or without parentheses, hyphens, or dots. Try different forms of the number to see what will match (and what won't). + +### Note + +The capturing group in the above regular expression is not necessary. The group is necessary, but the capturing part is not. There is a better way to do this: a non-capturing group. When we revisit this regular expression in the last chapter of the book, you'll understand why. 
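Before we move on to the applications, here is one more way to exercise that final expression: a small Perl sketch of my own that runs it over a few invented candidates and reports which ones pass:

    perl -e 'for ("(707)827-7019", "707.827.7019", "7078277019", "(707-827-7019") {
        printf "%-15s %s\n", $_, /^(\(\d{3}\)|^\d{3}[.-]?)?\d{3}[.-]?\d{4}$/ ? "match" : "no match";
    }'

The last candidate has an unbalanced parenthesis, so it fails both alternatives inside the group, and the match fails, just as the chapter promised.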
+ +# A Sample of Applications + +To conclude this chapter, I'll show you the regular expression for a phone number in several applications. + +TextMate is an editor that is available only on the Mac and uses the same regular expression library as the Ruby programming language. You can use regular expressions through the Find (search) feature, as shown in Figure 1-3. Check the box next to _Regular expression_. + +Figure 1-3. Phone number regex in TextMate + +Notepad++ is available on Windows and is a popular, free editor that uses the PCRE regular expression library. You can access them through search and replace (Figure 1-4) by clicking the radio button next to _Regular expression_. + +Figure 1-4. Phone number regex in Notepad++ + +Oxygen is also a popular and powerful XML editor that uses Perl 5 regular expression syntax. You can access regular expressions through the search and replace dialog, as shown in Figure 1-5, or through its regular expression builder for XML Schema. To use regular expressions with Find/Replace, check the box next to _Regular expression_. + +Figure 1-5. Phone number regex in Oxygen + +This is where the introduction ends. Congratulations. You've covered a lot of ground in this chapter. In the next chapter, we'll focus on simple pattern matching. + +# What You Learned in Chapter 1 + + * What a regular expression is + + * How to use Regexpal, a simple regular expression processor + + * How to match string literals + + * How to match digits with a character class + + * How to match a digit with a character shorthand + + * How to match a non-digit with a character shorthand + + * How to use a capturing group and a backreference + + * How to match an exact quantity of a set of strings + + * How to match a character optionally (zero or one) or one or more times + + * How to match strings at either the beginning or the end of a line + +# Technical Notes + + * Regexpal () is a web-based, JavaScript-powered regex implementation. It's not the most complete implementation, and it doesn't do everything that regular expressions can do; however, it's a clean, simple, and very easy-to-use learning tool, and it provides plenty of features for you to get started. + + * You can download the Chrome browser from or Firefox from . + + * Why are there so many ways of doing things with regular expressions? One reason is because regular expressions have a wonderful quality called _composability_. A language, whether a formal, programming or schema language, that has the quality of _composability_ (James Clark explains it well at ) is one that lets you take its atomic parts and composition methods and then recombine them easily in different ways. Once you learn the different parts of regular expressions, you will take off in your ability to match strings of any kind. + + * TextMate is available at . For more information on regular expressions in TextMate, see . + + * For more information on Notepad, see . For documentation on using regular expressions with Notepad, see . + + * Find out more about Oxygen at . For information on using regex through find and replace, see . For information on using its regular expression builder for XML Schema, see . + +# Chapter 2. Simple Pattern Matching + +Regular expressions are all about matching and finding patterns in text, from simple patterns to the very complex. 
This chapter takes you on a tour of some of the simpler ways to match patterns using: + + * String literals + + * Digits + + * Letters + + * Characters of any kind + +In the first chapter, we used Steven Levithan's RegexPal to demonstrate regular expressions. In this chapter, we'll use Grant Skinner's RegExr site, found at (see Figure 2-1). + +### Note + +Each page of this book will take you deeper into the regular expression jungle. Feel free, however, to stop and smell the syntax. What I mean is, start trying out new things as soon as you discover them. Try. Fail fast. Get a grip. Move on. Nothing makes learning sink in like _doing_ something with it. + +Figure 2-1. Grant Skinner's RegExr in Firefox + +Before we go any further, I want to point out the helps that RegExr provides. Over on the right side of RegExr, you'll see three tabs. Take note of the Samples and Community tabs. The Samples tab provides helps for a lot of regular expression syntax, and the Community tab shows you a large number of contributed regular expressions that have been rated. You'll find a lot of good information in these tabs that may be useful to you. In addition, pop-ups appear when you hover over the regular expression or target text in RegExr, giving you helpful information. These resources are one of the reasons why RegExr is among my favorite online regex checkers. + +This chapter introduces you to our main text, "The Rime of the Ancient Mariner," by Samuel Taylor Coleridge, first published in _Lyrical Ballads_ (London, J. & A. Arch, 1798). We'll work with this poem in chapters that follow, starting with a plain-text version of the original and winding up with a version marked up in HTML5. The text for the whole poem is stored in a file called _rime.txt_ ; this chapter uses the file _rime-intro.txt_ that contains only the first few lines. + +The following lines are from _rime-intro.txt_ : + + THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS. + + ARGUMENT. + + How a Ship having passed the Line was driven by Storms to the cold + Country towards the South Pole; and how from thence she made her course + to the tropical Latitude of the Great Pacific Ocean; and of the strange + things that befell; and in what manner the Ancyent Marinere came back to + his own Country. + + I. + + 1 It is an ancyent Marinere, + 2 And he stoppeth one of three: + 3 "By thy long grey beard and thy glittering eye + 4 "Now wherefore stoppest me? + +Copy and paste the lines shown here into the lower text box in RegExr. You'll find the file _rime-intro.txt_ at Github at . You'll also find the same file in the download archive found at . You can also find the text online at Project Gutenberg, but without the numbered lines (see ). + +# Matching String Literals + +The most outright, obvious feature of regular expressions is matching strings with one or more literal characters, called _string literals_ or just _literals_. + +The way to match literal strings is with normal, literal characters. Sounds familiar, doesn't it? This is similar to the way you might do a search in a word processing program or when submitting a keyword to a search engine. When you search for a string of text, character for character, you are searching with a string literal. + +If you want to match the word _Ship_ , for example, which is a word (string of characters) you'll find early in the poem, just type the word _Ship_ in the box at the top of Regexpal, and then the word will be highlighted in the lower text box. (Be sure to capitalize the word.) 
+ +Did light blue highlighting show up below? You should be able to see the highlighting in the lower box. If you can't see it, check what you typed again. + +### Note + +By default, string matching is case-sensitive in Regexpal. If you want to match both lower- and uppercase, click the checkbox next to the words _Case insensitive_ at the top left of Regexpal. If you click this box, both _Ship_ and _ship_ would match if either was present in the target text. + +# Matching Digits + +In the top-left text box in RegExr, enter this character shorthand to match the digits: + + \d + +This matches all the Arabic digits in the text area below because the _global_ checkbox is selected. Uncheck that checkbox, and `\d` will match only the first occurrence of a digit. (See Figure 2-2.) + +Figure 2-2. Matching all digits in RegExr with \d + +Now in place of `\d` use a character class that matches the same thing. Enter the following range of digits in the top text box of RegExr: + + [0-9] + +As you can see in Figure 2-3, though the syntax is different, using `\d` does the same thing as `[0-9]`. + +Figure 2-3. Matching all digits in RegExr with [0-9] + +### Note + +You'll learn more about character classes in Chapter 5. + +The character class `[0-9]` is a _range_ , meaning that it will match the range of digits 0 through 9. You could also match digits 0 through 9 by listing all the digits: + + [0123456789] + +If you want to match only the binary digits 0 and 1, you would use this character class: + + [01] + +Try `[12]` in RegExr and look at the result. With a character class, you can pick the exact digits you want to match. The character shorthand for digits (`\d`) is shorter and simpler, but it doesn't have the power or flexibility of the character class. I use character classes when I can't use `\d` (it's not always supported) and when I need to get very specific about what digits I need to match; otherwise, I use `\d` because it's a simpler, more convenient syntax. + +# Matching Non-Digits + +As is often the case with shorthands, you can flip-flop—that is, you can go the other way. For example, if you want to match characters that are not digits, use this shorthand with an uppercase _D_ : + + \D + +Try this shorthand in RegExr now. An uppercase _D_ , rather than a lowercase, matches non-digit characters (check Figure 2-4). This shorthand is the same as the following character class, a negated class (a negated class says in essence, "don't match these" or "match all but these"): + + [^0-9] + +which is the same as: + + [^\d] + +Figure 2-4. Matching non-digits in RegExr with \D + +# Matching Word and Non-Word Characters + +In RegExr, now swap `\D` with: + + \w + +This shorthand will match all word characters (if the _global_ option is still checked). The difference between `\D` and `\w` is that `\D` matches whitespace, punctuation, quotation marks, hyphens, forward slashes, square brackets, and other similar characters, while `\w` does not—it matches letters and numbers. + +In English, `\w` matches essentially the same thing as the character class: + + [a-zA-Z0-9] + +### Note + +You'll learn how to match characters beyond the set of English letters in Chapter 6. + +Now to match a non-word character, use an uppercase _W_ : + + \W + +This shorthand matches whitespace, punctuation, and other kinds of characters that aren't used in words in this example. 
It is the same as using the following character class: + + [^a-zA-Z0-9] + +Character classes, granted, allow you more control over what you match, but sometimes you don't want or need to type out all those characters. This is known as the "fewest keystrokes win" principle. But sometimes you must type all that stuff out to get precisely what you want. It is your choice. + +Just for fun, in RegExr try both: + + [^\w] + +and + + [^\W] + +Do you see the differences in what they match? + +Table 2-1 provides an extended list of character shorthands. Not all of these work in every regex processor. + +Table 2-1. Character shorthands + +Character Shorthand| Description +---|--- + +\a | Alert + +\b | Word boundary + +[\b] | Backspace character + +\B | Non-word boundary + +`\c` _`x`_ | Control character + +\d | Digit character + +\D | Non-digit character + +`\d` _`xxx`_ | Decimal value for a character + +\f | Form feed character + +\r | Carriage return + +\n | Newline character + +pass:[\o\ _xxx_ ] | Octal value for a character + +\s | Space character + +\S | Non-space character + +\t | Horizontal tab character + +\v | Vertical tab character + +\w | Word character + +\W | Non-word character + +\0 | Nul character + +`\` `x` _`xx`_ | Hexadecimal value for a character + +`\u` _`xxxx`_ | Unicode value for a character + +# Matching Whitespace + +To match whitespace, you can use this shorthand: + + \s + +Try this in RegExr and see what lights up (see Figure 2-5). The following character class matches the same thing as `\s`: + + [ \t\n\r] + +In other words, it matches: + + * Spaces + + * Tabs (`\t`) + + * Line feeds (`\n`) + + * Carriage returns (`\r`) + +### Note + +Spaces and tabs are highlighted in RegExr, but not line feeds or carriage returns. + +Figure 2-5. Matching whitespace in RegExr with \s + +As you can imagine, `\s` has its _compañero_. To match a non-whitespace character, use: + + \S + +This matches everything except whitespace. It matches the character class: + + [^ \t\n\r] + +Or: + + [^\s] + +Test these out in RegExr to see what happens. + +In addition to those characters matched by `\s`, there are other, less common whitespace characters. Table 2-2 lists character shorthands for common whitespace characters and a few that are more rare. + +Table 2-2. Character shorthands for whitespace characters + +Character Shorthand| Description +---|--- + +\f | Form feed + +\h | Horizontal whitespace + +\H | Not horizontal whitespace + +\n | Newline + +\r | Carriage return + +\t | Horizontal tab + +\v | Vertical tab (whitespace) + +\V | Not vertical whitespace + +### Note + +If you try `\h`, `\H`, or `\V` in RegExr, you will see results, but not with `\v`. Not all whitespace shorthands work with all regex processors. + +# Matching Any Character, Once Again + +There is a way to match _any_ character with regular expressions and that is with the dot, also known as a period or a full stop (U+002E). The dot matches all characters but line ending characters, except under certain circumstances. + +In RegExr, turn off the _global_ setting by clicking the checkbox next to it. Now any regular expression will match on the first match it finds in the target. + +Now to match a single character, any character, just enter a single dot in the top text box of RegExr. + +In Figure 2-6, you see that the dot matches the first character in the target, namely, the letter _T_. + +Figure 2-6. Matching a single character in RegExr with "." 
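These shorthands are easy to probe outside of RegExr, too. Here is a little Perl sketch of my own that tests a handful of characters against `\d`, `\w`, and `\s` in turn:

    perl -e 'for my $c ("7", "b", "_", " ", "-") {
        printf "%s  digit:%s word:%s space:%s\n",
            $c,
            $c =~ /\d/ ? "y" : "n",
            $c =~ /\w/ ? "y" : "n",
            $c =~ /\s/ ? "y" : "n";
    }'

Notice that the underscore counts as a word character, while the hyphen matches none of the three.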
If you wanted to match the entire phrase _THE RIME_, you could use eight dots:

    ........

But this isn't very practical, so I don't recommend using a series of dots like this often, if ever. Instead of eight dots, use a quantifier:

    .{8}

and it would match the first two words and the space in between, but crudely so. To see what I mean by _crudely_, click the checkbox next to _global_ and see how useless this really is. It matches sequences of eight characters, end on end, all but the last few characters of the target.

Let's try a different tack with word boundaries and starting and ending letters. Type the following in the upper text box of RegExr to see a slight difference:

    \bA.{5}T\b

This expression has a bit more specificity. (Try saying _specificity_ three times, out loud.) It matches the word _ANCYENT_, an archaic spelling of _ancient_. How?

  * The shorthand `\b` matches a word boundary, without consuming any characters.

  * The characters _A_ and _T_ also bound the sequence of characters.

  * `.{5}` matches any five characters.

  * Match another word boundary with `\b`.

This regular expression would actually match either _ANCYENT_ or _ANCIENT_.

Now try it with a shorthand:

    \b\w{7}\b

Finally, I'll talk about matching zero or more characters:

    .*

which is the same as:

    [^\n]*

or

    [^\n\r]*

Similar to this is the dot used with the one or more quantifier (`+`):

    .+

Try these in RegExr, and either of them will match the first line (uncheck _global_). The reason is that, normally, the dot does not match newline characters, such as a line feed (U+000A) or a carriage return (U+000D). Click the checkbox next to _dotall_ in RegExr, and then `.*` or `.+` will match _all_ the text in the lower box. (_dotall_ means a dot will match all characters, including newlines.)

It can do this because these quantifiers are _greedy_; in other words, they match all the characters they can. But don't worry about that quite yet. Chapter 7 explains quantifiers and greediness in more detail.

# Marking Up the Text

"The Rime of the Ancient Mariner" is just plain text. What if you wanted to display it on the Web? What if you wanted to mark it up as HTML5 using regular expressions, rather than by hand? How would you do that?

In some of the following chapters, I'll show you ways to do this. I'll start out small in this chapter and then add more and more markup as you go along.

In RegExr, click the Replace tab, check _multiline_, and then, in the first text box, enter:

    (^T.*$)

Beginning at the top of the file, this will match the first line of the poem and then capture that text in a group using parentheses. In the next box, enter:

    <h1>$1</h1>

The replacement regex surrounds the captured group, represented by `$1`, in an _h1_ element. You can see the result in the lowest text area. The `$1` is a backreference, in Perl style. In most implementations, including Perl, you use this style: `\1`; but RegExr supports only `$1`, `$2`, `$3`, and so forth. You'll learn more about groups and backreferences in Chapter 4.

## Using _sed_ to Mark Up Text

On a command line, you could also do this with _sed_. _sed_ is a Unix streaming editor that accepts regular expressions and allows you to transform text. It was first developed in the early 1970s by Lee McMahon at Bell Labs. If you are on the Mac or have a Linux box, you already have it.

Test out _sed_ at a shell prompt (such as in a Terminal window on a Mac) with this line:

    echo Hello | sed s/Hello/Goodbye/

This is what should have happened:

  * The _echo_ command prints the word _Hello_ to standard output (which is usually just your screen), but the vertical bar (|) pipes it to the _sed_ command that follows.

  * This pipe directs the output of _echo_ to the input of _sed_.

  * The _s_ (substitute) command of _sed_ then changes the word _Hello_ to _Goodbye_, and _Goodbye_ is displayed on your screen.

If you don't have _sed_ on your platform already, at the end of this chapter you'll find some technical notes with pointers to installation information. Two versions of _sed_ are discussed there: BSD and GNU.

Now try this: At a command or shell prompt, enter:

    sed -n 's/^/<h1>/;s/$/<\/h1>/p;q' rime.txt

And the output will be:

    <h1>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</h1>

Here is what the regex did, broken down into parts:

  * The line starts by invoking the _sed_ program.

  * The `-n` option suppresses _sed_'s default behavior of echoing each line of input to the output. This is because you want to see only the line affected by the regex, that is, line 1.

  * `s/^/<h1>/` places an _h1_ start-tag at the beginning (`^`) of the line.

  * The semicolon (;) separates commands.

  * `s/$/<\/h1>/` places an _h1_ end-tag at the end (`$`) of the line.

  * The _p_ command prints the affected line (line 1). This is in contrast to `-n`, which suppresses the automatic printing of every line.

  * Lastly, the _q_ command quits the program so that _sed_ processes only the first line.

  * All these operations are performed against the file _rime.txt_.

Another way of writing this line is with the `-e` option. The `-e` option appends the editing commands, one after another. I prefer the method with semicolons, of course, because it's shorter.

    sed -ne 's/^/<h1>/' -e 's/$/<\/h1>/p' -e 'q' rime.txt

You could also collect these commands in a file, as with _h1.sed_ shown here (this file is in the code repository mentioned earlier):

    #!/usr/bin/sed

    s/^/<h1>/
    s/$/<\/h1>/
    q

To run it, type:

    sed -f h1.sed rime.txt

at a prompt in the same directory or folder as _rime.txt_.

## Using Perl to Mark Up Text

Finally, I'll show you how to do a similar process with Perl. Perl is a general-purpose programming language created by Larry Wall back in 1987. It's known for its strong support of regular expressions and its text processing capabilities.

Find out if Perl is already on your system by typing this at a command prompt, followed by Return or Enter:

    perl -v

This should return the version of Perl on your system or an error (see Technical Notes).

To accomplish the same output as shown in the _sed_ example, enter this line at a prompt:

    perl -ne 'if ($. == 1) { s/^/<h1>/; s/$/<\/h1>/m; print; }' rime.txt

and, as with the _sed_ example, you will get this result:

    <h1>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</h1>

Here is what happened in the Perl command, broken down again into pieces:

  * _perl_ invokes the Perl program.

  * The `-n` option loops through the input (the file _rime.txt_).

  * The `-e` option allows you to submit program code on the command line, rather than from a file (like _sed_).

  * The _if_ statement checks to see if you are on line 1. `$.` is a special variable in Perl that holds the current line number.

  * The first substitute command _s_ finds the beginning of the first line (`^`) and inserts an _h1_ start-tag there.

  * The second substitute command searches for the end of the line (`$`), and then inserts an _h1_ end-tag.

  * The _m_ or _multiline_ modifier or flag at the end of the substitute command indicates that you are treating this line distinctly and separately; consequently, the `$` matches the end of line 1, not the end of the file.

  * At last, it prints the result to standard output (the screen).

  * All these operations are performed against the file _rime.txt_.

You could also hold all these commands in a program file, such as this file, _h1.pl_, found in the example archive.

    #!/usr/bin/perl -n

    if ($. == 1) {
        s/^/<h1>/;
        s/$/<\/h1>/m;
        print;
    }

And then, in the same directory as _rime.txt_, run the program like this:

    perl h1.pl rime.txt

There are a lot of ways you can do things in Perl. I am not saying this is the most efficient way to add these tags. It is simply one way. Chances are, by the time this book is in print, I'll think of other, more efficient ways to do things with Perl (and other tools). I hope you will, too.

In the next chapter, we'll talk about boundaries and what are known as _zero-width assertions_.

# What You Learned in Chapter 2

  * How to match string literals

  * How to match digits and non-digits

  * What the _global_ mode is

  * How character shorthands compare with character classes

  * How to match word and non-word characters

  * How to match whitespace

  * How to match any character with the dot

  * What the _dotall_ mode is

  * How to insert HTML markup into a line of text using RegExr, _sed_, and Perl

# Technical Notes

  * RegExr is found at and also has a desktop version (). RegExr was built in Flex 3 () and relies on the ActionScript regular expression engine (). Its regular expressions are similar to those used by JavaScript (see ).

  * Git is a fast version control system (). GitHub is a web-based repository for projects using Git (). I suggest using the GitHub repository for samples in this book only if you feel comfortable with Git or with other modern version control systems, like Subversion or Mercurial.

  * HTML5 () is the fifth major revision of the W3C's HTML, the markup language for publishing on the World Wide Web. It has been in draft for several years and changes regularly, but it is widely accepted as the heir apparent of HTML 4.01 and XHTML.

  * _sed_ is readily available on Unix/Linux systems, including the Mac (Darwin or BSD version). It is also available on Windows through distributions like Cygwin () or individually at (currently at version 4.2.1, see ).

  * To use the Perl examples in this chapter, you may have to install Perl on your system. It comes by default with Mac OS X Lion and often is on Linux systems. If you are on Windows, you can get Perl by installing the appropriate Cygwin packages (see ) or by downloading the latest package from the ActiveState website (go to ). For detailed information on installing Perl, visit or .

To find out if you already have Perl, enter the command below at a shell prompt. To do this, open a command or shell window on your system, such as a Terminal window (under Applications/Utilities) on the Mac or a Windows command-line window (open Start, and then enter _cmd_ in the text box at the bottom of the menu). At the prompt, type:

    perl -v

If Perl is alive and well on your system, then this command will return version information for Perl. On my Mac running Lion, I've installed the latest version of Perl (5.16.0 at the time of this writing) from source and compiled it (see ). I get the following information back when I enter the command above:

    This is perl 5, version 16, subversion 0 (v5.16.0) built for darwin-2level

    Copyright 1987-2012, Larry Wall

    Perl may be copied only under the terms of either the Artistic License or the
    GNU General Public License, which may be found in the Perl 5 source kit.

    Complete documentation for Perl, including FAQ lists, should be found on
    this system using "man perl" or "perldoc perl". If you have access to the
    Internet, point your browser at http://www.perl.org/, the Perl Home Page.
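As that message suggests, _perldoc_ is also the fastest route to Perl's own regular expression documentation. These pages ship with any standard Perl installation:

    perldoc perlrequick
    perldoc perlre

The first is a short tutorial; the second is the full reference for Perl regular expressions.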
+ +Both `perl` and `perldoc` are installed at `/usr/local/bin` when compiled and built from source, which you can add to your path. For information on setting your path variable, see . + +# Chapter 3. Boundaries + +This chapter focuses on assertions. Assertions mark boundaries, but they don't consume characters—that is, characters will not be returned in a result. They are also known as _zero-width assertions_. A zero-width assertion doesn't match a character, per se, but rather a location in a string. Some of these, such as `^` and `$`, are also called _anchors_. + +The boundaries I'll talk about in this chapter are: + + * The beginning and end of a line or string + + * Word boundaries (two kinds) + + * The beginning and end of a subject + + * Boundaries that quote string literals + +To start, I'll use RegExr again, but this time, for variety, I'll use the Safari browser (however, you can use any browser you like). I'll also use the same text I used last time: the first 12 lines of _rime.txt_. Open the Safari browser with and copy the first 12 lines of _rime.txt_ from the code archive into the lower box. + +# The Beginning and End of a Line + +As you have seen a number of times already, to match the beginning of a line or string, use the caret or circumflex (U+005E): + + ^ + +Depending on the context, a `^` will match the beginning of a line or string, sometimes a whole document. The context depends on your application and what options you are using with that application. + +To match the end of a line or string, as you know, use the dollar sign: + + $ + +In RegExr, make sure that _multiline_ is checked. _global_ is checked by default when you open RegExr, but you can leave it checked or unchecked for this example. When _multiline_ is not checked, the entire target is considered one string. + +In the upper text box, enter this regular expression: + + ^How.*Country\.$ + +This will match the entire line beginning with the word _How_. Notice that the period or dot at the end is preceded by a backslash. This escapes the dot so that it is interpreted as a literal. If it was not escaped, what would it match? Any character. If you want to match a literal dot, you have to either escape it or put it in a character class (see Chapter 5). + +Figure 3-1. RegExr in Safari + +If you uncheck _multiline_ , then what happens? The highlighting is turned off. With it unchecked and _dotall_ checked, enter: + + ^THE.*\?$ + +and you'll see that it matches all the text. + +The _dotall_ option means that the dot will match newlines in addition to all other characters. Uncheck _dotall_ , and the expression matches nothing. However, the following: + + ^THE.* + +will match the first line. Click _dotall_ again, and all text is matched again. The `\?$` is not required to match to the end of the text. + +# Word and Non-word Boundaries + +You have already seen `\b` used several times. It marks a word boundary. Try: + + \bTHE\b + +and it will match both occurrences of _THE_ in the first line (with _global_ checked). Like, `^` or `$`, `\b` is a zero-width assertion. It may appear to match things like a space or the beginning of a line, but in actuality, what it matches is a zero-width nothing. Did you notice that the spaces around the second _THE_ are not highlighted? That is because they are not part of the match. Not the easiest thing to grasp, but you'll get it by seeing what it does and does not do. + +You can also match non-word boundaries. 
A non-word boundary matches locations that are not equivalent to a word boundary, like a letter or a number within a word or string. To match a non-word boundary, give this a spin: + + \Be\B + +and watch what it matches (see Figure 3-2). You'll see that it matches a lowercase _e_ when it is surrounded by other letters or non-word characters. Being a zero-width assertion, it does not match the surrounding characters, but it recognizes when the literal _e_ is surrounded by non-word boundaries. + +Figure 3-2. Matching non-word boundaries with \B + +In some applications, another way for specifying a word boundary is with: + + \< + +for the beginning of a word, and with: + + \> + +for the end of the word. This is an older syntax, not available in most recent regex applications. It is useful in some instances because, unlike `\b`, which matches _any_ word boundary, this syntax allows you to match either the beginning or ending of a word. + +If you have _vi_ or _vim_ on your system, you can try this out with that editor. Just follow these steps. They're easy even if you have never used _vim_ before. In a command or shell window, change directories to where the poem is located and then open it with: + + vim rime.txt + +Then enter the following search command: + + /\> + +and press Enter or Return. The forward slash (`/`) is the way you begin a search in _vim_. Watch the cursor and you'll see that this search will find the ends of words. Press _n_ to repeat the search. Next enter: + + /\< + +followed by Enter or Return. This time the search will find the beginning of words. To exit _vim_ , just type `ZZ`. + +This syntax also works with _grep_. Since the early 1970s, _grep_ like _sed_ has been a Unix mainstay. (In the 1980s, I had a coworker who had a vanity license plate that said _GREP._ ) Try this command from a shell prompt: + + grep -Eoc '\<(THE|The|the)\>' rime.txt + +The - _E_ option indicates that you want to use extended regular expressions (EREs) rather than the basic regular expressions (BREs) which are used by _grep_ by default. The `-o` option means you want to show in the result only that part of the line that matches the pattern, and the `-c` option means only return a count of the result. The pattern in single quotes will match either _THE_ , _The_ , or _the_ as whole words. That's what the `\<` and `\>` help you find. + +This command will return: + + 259 + +which is the count of the words found. + +On the other hand, if you don't include the `\<` and `\>`, you get a different result. Do it this way: + + grep -Eoc '(THE|The|the)' rime.txt + +and you will get a different number: + + 327 + +Why? Because the pattern will match only whole words, plus _any_ sequence of characters that contain the word. So that is one reason why the `\<` and `\>` can come in handy. + +# Other Anchors + +Similar to the `^` anchor is the following, a shorthand that matches the start of a subject: + + \A + +This is not available with all regex implementations, but you can get it with Perl and PCRE (Perl Compatible Regular Expressions), for example. To match the end of a subject, you can use `\A`'s companion. + + \Z + +Also, in some contexts: + + \z + + _pcregrep_ is a version of _grep_ for the PCRE library. (See Technical Notes to find out where to get it.) 
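While you track down _pcregrep_, note that `\b` gives you the same whole-word effect in Perl, which doesn't support `\<` and `\>`. Here is a rough equivalent of the whole-word _grep_ count, as a sketch of my own; the empty-list assignment is a common Perl idiom for counting matches in list context:

    perl -ne '$count += () = /\b(THE|The|the)\b/g; END { print "$count\n" }' rime.txt

Since `\b` and the older `\<`...`\>` syntax mark the same boundaries, this should print the same count of 259.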
Once _pcregrep_ is installed, to try this syntax, you could do something like this:

    pcregrep -c '\A\s*(THE|The|the)' rime.txt

which will return a count (`-c`) of 108 occurrences of the word _the_ (in its three case variants) that occur near the beginning of a line, preceded by zero or more whitespace characters. Next enter this command:

    pcregrep -n '(MARINERE|Marinere)(.)?\Z' rime.txt

This matches either _MARINERE_ or _Marinere_ at the end of a line (subject), followed by an optional character, which in this case is either a punctuation mark or the letter _S_. (The parentheses around the dot are not essential.)

You'll see this output:

    1:THE RIME OF THE ANCYENT MARINERE,
    10:  It is an ancyent Marinere,
    38:  The bright-eyed Marinere.
    63:  The bright-eyed Marinere.
    105: "God save thee, ancyent Marinere!
    282: "I fear thee, ancyent Marinere!
    702: He loves to talk with Marineres

The `-n` option with _pcregrep_ gives you the line numbers at the beginning of each line of output. The command-line options of _pcregrep_ are very similar to those of _grep_. To see them, do:

    pcregrep --help

# Quoting a Group of Characters as Literals

You can use these sequences to quote a set of characters as literals:

    \Q

and

    \E

To show you how this works, enter the following metacharacters in the lower box of RegExr:

    .^$*+?|(){}[]\-

These 15 metacharacters are treated as special characters in regular expressions, used for encoding a pattern. (The hyphen is treated specially, as signifying a range, inside the square brackets of a character class. Otherwise, it's not special.)

If you try to match those characters in the upper text box of RegExr, nothing will happen. Why? Because RegExr thinks (if it can think) that you are entering a regular expression, not literal characters. Now try:

    \Q$\E

and it will match `$` because anything between `\Q` and `\E` is interpreted as a literal character (see Figure 3-3). (Remember, you can also precede a metacharacter with a `\` to make it literal.)

Figure 3-3. Quoting metacharacters as literals

# Adding Tags

In RegExr, uncheck _global_ and check _multiline_, click the Replace tab, and then, in the first text box (marked number 1 in Figure 3-4), enter:

    ^(.*)$

This will match and capture the first line of text. Then in the next box (marked number 2), enter this or something similar:

    <html>\n<head>\n<title>Rime</title>\n</head>\n<body>
    <h1>$1</h1>
As you enter the replacement text, you'll notice that the subject text (shown in the box marked number 3) is changed in the results text box (marked number 4) to include the markup you've added (see Figure 3-4).

Figure 3-4. Adding markup with RegExr

RegExr does well to demonstrate one way to do this, but it is limited in what it can do. For example, it can't save any results out to a file. We have to look beyond the browser for that.

## Adding Tags with _sed_

On a command line, you could also do something similar to what we just did in RegExr with _sed_, which you saw in the last chapter. The insert (`i`) command in _sed_ allows you to insert text above or before a location in a document or a string. By the way, the opposite of _i_ in _sed_ is _a_, which appends text below or after a location. We'll use the append command later.

The following command inserts the HTML5 doctype and several other tags, beginning at line 1:

    sed '1 i\
    <!DOCTYPE html>\
    <html lang=\"en\">\
    <head>\
    <title>Rime</title>\
    </head>\
    <body>

    s/^/<h1>/
    s/$/<\/h1>/
    q' rime.txt

The backslashes (`\`) at the end of the lines allow you to insert newlines into the stream and not execute the command prematurely. The backslashes in front of the quotation marks _escape_ the quotes so that they are seen as literal characters, not part of the command.

When you run this _sed_ command correctly, this is what your output will look like:

    <!DOCTYPE html>
    <html lang="en">
    <head>
    <title>Rime</title>
    </head>
    <body>
    <h1>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</h1>
These same _sed_ commands are saved in the file _top.sed_ in the example archive. You can run this on the file using this command:

    sed -f top.sed rime.txt

You should get the same output as you saw in the previous command. If you want to save the output to a file, you can redirect the output, like so:

    sed -f top.sed rime.txt > temp

In addition to showing the result on the screen, the redirect part of the command (`> temp`) will save the output to the file _temp_.

## Adding Tags with Perl

Let's try to accomplish this same thing with Perl. Without explaining everything that's going on, just try this:

    perl -ne 'print "<!DOCTYPE html>\
    <html lang=\"en\">\
    <head>\
    <title>Rime</title>\
    </head>\
    <body>\
    " if $. == 1;
    s/^/<h1>/;s/$/<\/h1>/m;print;exit;' rime.txt

Compare this with the _sed_ command. How is it similar? How is it different? The _sed_ command is a little simpler, but Perl is a lot more powerful, in my opinion.

Here is how it works:

  * The `$.` variable, which is tested with the _if_ statement, holds the current line number. The _if_ statement returns _true_ when the current line is line 1.

  * When Perl finds line 1 with _if_, it prints the doctype and a few HTML tags. It is necessary to escape the quote marks, as in _sed_.

  * The first substitution inserts an _h1_ start-tag at the beginning of the line, and the second one inserts an _h1_ end-tag at the end of the line. The _m_ at the end of the second substitution means that it uses a _multiline_ modifier. This is done so that the command recognizes the end of the first line. Without _m_, the `$` would match to the end of the file.

  * The _print_ command prints the result of the substitutions.

  * The _exit_ command exits Perl immediately. Otherwise, because of the `-n` option, it would loop through every line of the file, which we don't want for this script.

That was a lot of typing, so I put all that Perl code in a file and called it _top.pl_, also found in the code archive.

    #!/usr/bin/perl -n

    if ($. == 1) {
        print "<!DOCTYPE html>\
    <html lang=\"en\">\
    <head>\
    <title>The Rime of the Ancyent Mariner (1798)</title>\
    </head>\
    <body>\
    ";
        s/^/<h1>
/; + s/$/<\/h1>/m; + print; + exit; + } + +Run this with: + + perl top.pl rime.txt + +You get a similar output as in the previous command, though it is formed a little differently. (You can redirect the output with >, as with _sed_.) + +The next chapter covers alternation, groups, and backreferences, among other things. See you over there. + +# What You Learned in Chapter 3 + + * How to use anchors at the beginning or end of a line with `^` or `$` + + * How to use word boundaries and non-word boundaries + + * How to match the beginning or end of a subject with `\A` and `\Z` (or `\z`) + + * How to quote strings as literals with `\Q` and `\E` + + * How to add tags to a document with RegExr, _sed_ , and Perl + +# Technical Notes + + * _vi_ is a Unix editor developed in 1976 by Sun cofounder Bill Joy that uses regular expressions. The _vim_ editor is a replacement for _vi_ , developed primarily by Bram Moolenaar (see ). An early paper on _vi_ by Bill Joy and Mark Horton is found here: . The first time I used _vi_ was in 1983, and I use it nearly every day. It lets me to do more things more quickly than with any other text editor. And it is so powerful that I am always discovering new features that I never knew about, even though I've been acquainted with it for nearly 30 years. + + * _grep_ is a Unix command-line utility for searching and printing strings with regular expressions. Invented by Ken Thompson in 1973, _grep_ is said to have grown out of the _ed_ editor command `g/re/p` (global/regular expression/print). It was superseded but not retired by _egrep_ (or _grep -E_ ), which uses extended regular expressions (EREs) and has additional metacharacters such as |, +, ?, (, and ). _fgrep_ ( _grep -F_ ) searches files using literal strings; metacharacters like $, *, and | don't have special meaning. _grep_ is available on Linux systems as well as the Mac OS X's Darwin. You can also get it as part of the Cygwin GNU distribution () or you can download it from . + + * PCRE () or Perl Compatible Regular Expressions is a C library of functions (8-bit and 16-bit) for regular expressions that are compatible with Perl 5, and include some features of other implementations. _pcregrep_ is an 8-bit, _grep_ -like tool that enables you to use the features of the PCRE library on the command line. You can get _pcregrep_ for the Mac through Macports () by running the command `sudo port install pcre`. (Xcode is a prerequisite; see . Login required.) + +# Chapter 4. Alternation, Groups, and Backreferences + +You have already seen groups in action. Groups surround text with parentheses to help perform some operation, such as the following: + + * Performing alternation, a choice between two or more optional patterns + + * Creating subpatterns + + * Capturing a group to later reference with a backreference + + * Applying an operation to a grouped pattern, such as a quantifer + + * Using non-capturing groups + + * Atomic grouping (advanced) + +We'll be using a few contrived examples, in addition to the text from "The Rime of the Ancyent Mariner" again, in _rime.txt_. This time, I'll use the desktop version of RegExr, as well as other tools like _sed_. You can download the desktop version of RegExr from , for Windows, Mac, or Linux (it was written with Adobe AIR). Click the Desktop Version link on the RegExr web page (lower-right corner) for more information. + +# Alternation + +Simply said, _alternation_ gives you a choice of alternate patterns to match. 
For example, let's say you wanted to find out how many occurrences of the article _the_ are in the "The Rime of the Ancient Mariner." The problem is, the word occurs as _THE_ , _The_ , and _the_ in the poem. You can use alternation to deal with this peculiarity. + +Open the RegExr desktop application by double-clicking on its icon. It looks very much like the online version but has the advantage of being local on your machine, so you won't suffer the network issues that sometimes occur when using web applications. I've copied and pasted the entire poem in RegExr desktop for the next exercise. I'm using it on a Mac running OS X Lion. + +In the top text box, enter the pattern: + + (the|The|THE) + +and you'll see all occurrences of _the_ in the poem highlighted in the lower box (see Figure 4-1). Use the scroll bar to view more of the result. + +Figure 4-1. Using alternation in RegExr desktop version + +We can make this group shorter by applying an option. Options let you specify the way you would like to search for a pattern. For example, the option: + + (?i) + +makes your pattern case-insensitive, so instead of using the original pattern with alternation, you can do this instead: + + (?i)the + +Try this in RegExr to see how it works. You can also specify case-insensitivity by checking _ignoreCase_ in RegExr, but both will work. This and other options or modifiers are listed in Table 4-1. + +Table 4-1. Options in regular expressions + +Option| Description| Supported by +---|---|--- + +`(?d)` | Unix lines | Java + +`(?i)` | Case insensitive | PCRE, Perl, Java + +`(?J)` | Allow duplicate names | PCRE[a] + +`(?m)` | Multiline | PCRE, Perl, Java + +`(?s)` | Single line (dotall) | PCRE, Perl, Java + +`(?u)` | Unicode case | Java + +`(?U)` | Default match lazy | PCRE + +`(?x)` | Ignore whitespace, comments | PCRE, Perl, Java + +`(?-...)` | Unset or turn off options | PCRE + +[a] See "Named Subpatterns" in . + +Let's now use alternation with _grep_. The options in Table 4-1, by the way, don't work with _grep_ , so you are going to use the original alternation pattern. To count the number of lines where the word _the_ occurs, regardless of case, one or more times, use: + + grep -Ec "(the|The|THE)" rime.txt + +and get this answer: + + 327 + +This result does not tell the whole story. Stay tuned. + +Here is an analysis of the _grep_ command: + + * The _-E_ option means that you want to use extended regular expressions (EREs) rather than basic regular expressions (BREs). This, for example, saves you from having to escape the parentheses and the vertical bar, like `\(THE\|The\|the\)`, as you must with BREs. + + * The _-c_ option returns a count of the matched lines (not matched words). + + * The parentheses group the choice or alternation of _the_ , _The_ , or _THE_. + + * The vertical bar separates possible choices, which are evaluated left to right. + +To get a count of actual words used, this approach will return each occurrence of the word, one per line: + + grep -Eo "(the|The|THE)" rime.txt | wc -l + +This returns: + + 412 + +And here is a bit more analysis: + + * The `-o` option means to show only that part of the line that matches the pattern, though this is not apparent due to the pipe (`|`) to _wc_. + + * The vertical bar, in this context, pipes the output of the _grep_ command to the input of the _wc_ command. _wc_ is a word count command, and `-l` counts the number of lines of the input. + +Why the big difference between 327 and 412? 
Because _-c_ gives you a count of matching lines, but there can be more than one match on each line. If you use _-o_ with _wc -l_ , then each occurrence of the various forms of the word will appear on a separate line and be counted, giving the higher number. + +To perform this same match with Perl, write your command this way: + + perl -ne 'print if /(the|The|THE)/' rime.txt + +Or better yet, you can do it with the `(?i)` option mentioned earlier, but without alternation: + + perl -ne 'print if /(?i)the/' rime.txt + +Or even better yet, append the _i_ modifier after the last pattern delimiter: + + perl -ne 'print if /the/i' rime.txt + +and you will get the same outcome. The simpler the better. For a list of additional modifiers (also called _flags_ ), see Table 4-2"). Also, compare options (similar but with a different syntax) in Table 4-1. + +Table 4-2. Perl modifiers (flags)[1] + +Modifier| Description +---|--- + +a | Match `\d`, `\s`, `\w`, and POSIX in ASCII range only + +c | Keep current position after match fails + +d | Use default, native rules of the platform + +g | Global matching + +i | Case-insensitive matching + +l | Use current locale's rules + +m | Multiline strings + +p | Preserve the matched string + +s | Treat strings as a single line + +u | Use Unicode rules when matching + +x | Ignore whitespace and comments + +[1] See . + +# Subpatterns + +Most often, when you refer to _subpatterns_ in regular expressions, you are referring to a group or groups within groups. A subpattern is a pattern within a pattern. Often, a condition in a subpattern is matchable when a preceding pattern is matched, but not always. Subpatterns can be designed in a variety of ways, but we're concerned primarily with those defined within parentheses here. + +In one sense, the pattern you saw earlier: + + (the|The|THE) + +has three subpatterns: _the_ is the first subpattern, _The_ is the second, and _THE_ the third, but matching the second subpattern, in this instance, is not dependent on matching the first. (The leftmost pattern is matched first.) + +Now here is one where the subpattern(s) depend on the previous pattern: + + (t|T)h(e|eir) + +In plain language, this will match the literal characters _t_ or _T_ followed by an _h_ followed by either an _e_ or the letters _eir_. Accordingly, this pattern will match any of: + + * _the_ + + * _The_ + + * _their_ + + * _Their_ + +In this case, the second subpattern `(e|eir)` is dependent on the first `(tT)`. + +Subpatterns don't require parentheses. Here is an example of subpatterns done with character classes: + + \b[tT]h[ceinry]*\b + +This pattern can match, in addition to _the_ or _The_ , words such as _thee_ , _thy_ and _thence_. The two word boundaries (`\b`) mean the pattern will match whole words, not letters embedded in other words. + +Here is a complete analysis of this pattern: + + * `\b` matches a beginning word boundary. + + * `[tT]` is a character class that matches either an lowercase _t_ or an uppercase _T_. We can consider this the first subpattern. + + * Then the pattern matches (or attempts to match) a lowercase _h_. + + * The second or last subpattern is also expressed as a character class `[ceinry]` followed by a quantifier `*` for zero or more. + + * Finally, another word boundary `\b` ends the pattern. + +### Note + +One interesting aspect of the state of regular expressions is that terminology, while usually close in meaning, can also range far. 
In defining _subpattern_ and other terms in this book, I've examined a variety of sources and have tried to bring them together under one roof. But I suspect that there are some who would argue that a character class is not a subpattern. My take is they can function as subpatterns, so I lump them in. + +# Capturing Groups and Backreferences + +When a pattern groups all or part of its content into a pair of parentheses, it captures that content and stores it temporarily in memory. You can reuse that content if you wish by using a backreference, in the form: + + \1 + +or: + + $1 + +where `\1` or `$1` reference the first captured group, `\2` or `$2` reference the second captured group, and so on. _sed_ will only accept the `\1` form, but Perl accepts both. + +### Note + +Originally, _sed_ supported backreferences in the range `\1` through `\9`, but that limitation does not appear to exist any longer. + +You have already seen this in action, but I'll demonstrate it here again. We'll use it to rearrange the wording of a line of the poem, with apologies to Samuel Taylor Coleridge. In the top text box in RegExr, after clicking the Replace tab, enter this pattern: + + (It is) (an ancyent Marinere) + +Scroll the subject text (third text area) down until you can see the highlighted line, and then in the second box, enter: + + $2 $1 + +and you'll see in the lowest box the line rearranged as: + + an ancyent Marinere It is, + +(See Figure 4-2.) + +Figure 4-2. Referencing backreferences with $1 and $2 + +Here is how to accomplish the same result with _sed_ : + + sed -En 's/(It is) (an ancyent Marinere)/\2 \1/p' rime.txt + +and the output will be: + + an ancyent Marinere It is, + +just as in RegExr. Let's analyze the _sed_ command to help you understand everything that is going on: + + * The _-E_ option once again invokes EREs, so you don't have to quote the parentheses, for example. + + * The _-n_ option suppresses the default behavior of printing every line. + + * The substitute command searches for a match for the text "It is an ancyent Marinere," capturing it into two groups. + + * The substitute command also replaces the match by rearranging the captured text in the output, with the backreference `\2` first, then `\1`. + + * The _p_ at the end of the substitute command means you want to print the line. + +A similar command in Perl will do the same thing: + + perl -ne 'print if s/(It is) (an ancyent Marinere)/\2 \1/' rime.txt + +Notice that this uses the `\1` style syntax. You can, of course, use the `$1` syntax, too: + + perl -ne 'print if s/(It is) (an ancyent Marinere)/$2 $1/' rime.txt + +I like how Perl lets you print a selected line without jumping through hoops. + +I'd like to point out something about the output: + + an ancyent Marinere It is, + +The capitalization got mixed up in the transformation. Perl can fix that with `\u` and `\l`. Here's how: + + perl -ne 'print if s/(It is) (an ancyent Marinere)/\u$2 \l$1/' rime.txt + +Now the result looks much better: + + An ancyent Marinere it is, + +And here is why: + + * The `\l` syntax does not match anything, but it changes the character that follows to lowercase. + + * The `\u` syntax capitalizes the character that follows it. + + * The `\U` directive (not shown) turns the text string that follows into all uppercase. + + * The `\L` directive (not shown) turns the text string that follows into all lowercase. + +These directives remain in effect until another is found (like `\l` or `\E`, the end of a quoted string). 
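You can watch the directives work without the poem file; here is the same substitution run on an inline string, plus a second sketch of my own using `\U`:

    perl -e '$_ = "it is an ancyent Marinere"; s/(it is) (an ancyent Marinere)/\u$2 \l$1/; print "$_\n";'
    perl -e '$_ = "rime"; s/(rime)/\U$1/; print "$_\n";'

The first prints _An ancyent Marinere it is_; the second prints _RIME_.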
Experiment with these to see how they work.

## Named Groups

_Named groups_ are captured groups with names. You can access those groups by name later, rather than by integer. I'll show you how here in Perl:

    perl -ne 'print if s/(?<one>It is) (?<two>an ancyent Marinere)/\u$+{two} \l$+{one}/' rime.txt

Let's look at it:

  * Adding `?<one>` and `?<two>` inside the parentheses names the groups _one_ and _two_, respectively.
  * `$+{one}` references the group named _one_, and `$+{two}`, the group named _two_.

You can also reuse named groups within the pattern where the group was named. I'll show you what I mean. Let's say you were searching for a string that contained six zeros all together:

    000000

It's a shallow example, but it serves to show you how this works. So name a group of three zeros with this pattern (the _z_ is arbitrary):

    (?<z>0{3})

You can then use the group again like this:

    (?<z>0{3})\k<z>

Or this:

    (?<z>0{3})\k'z'

Or this:

    (?<z>0{3})\g{z}

Try this in RegExr for quick results. All these examples will work. Table 4-3 shows many of the possibilities with named group syntax.

Table 4-3. Named group syntax

Syntax| Description
---|---
(?<_name_>...) | A named group
(?'_name_'...) | Another named group
(?P<_name_>...) | A named group in Python
\k<_name_> | Reference by name in Perl
\k'_name_' | Reference by name in Perl
\g{_name_} | Reference by name in Perl
\k{_name_} | Reference by name in .NET
(?P=_name_) | Reference by name in Python

# Non-Capturing Groups

There are also groups that are non-capturing groups—that is, they don't store their content in memory. Sometimes this is an advantage, especially if you never intend to reference the group. Because it doesn't store its content, it may yield better performance, though performance issues are hardly perceptible when running the simple examples in this book.

Remember the first group discussed in this chapter? Here it is again:

    (the|The|THE)

You don't need to backreference anything, so you could write a non-capturing group this way:

    (?:the|The|THE)

Going back to the beginning of this chapter, you could add an option to make the pattern case-insensitive, like this (though the option obviates the need for a group):

    (?i)(?:the)

Or you could do it this way:

    (?:(?i)the)

Or, better yet, the _pièce de résistance_:

    (?i:the)

The option letter _i_ can be inserted between the question mark and the colon.

## Atomic Groups

Another kind of non-capturing group is the _atomic group_. If you are using a regex engine that does backtracking, this group will turn backtracking off, not for the entire regular expression, but just for the part enclosed in the atomic group. The syntax looks like this:

    (?>the)

When would you want to use atomic groups? One of the things that can really slow regex processing is backtracking. The reason is that trying all the possibilities takes time and computing resources. Sometimes it can gobble up a lot of time. When it gets really bad, it's called _catastrophic backtracking_.

You can turn off backtracking altogether by using a non-backtracking engine like RE2, or by turning it off for parts of your regular expression with atomic grouping.

### Note

My focus in this book is to introduce syntax. I talk very little about performance tuning here. Atomic groups are mainly a performance consideration in my view.
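To see the effect of switching backtracking off, here is a small sketch in Perl (the strings are contrived for illustration):

    perl -e 'print "plain:  match\n" if "aaaa" =~ /a+a/;
             print "atomic: match\n" if "aaaa" =~ /(?>a+)a/'

Only the first line prints. With `/a+a/`, the `a+` grabs all four _a_'s, fails to find one more, backtracks to give one up, and the match succeeds. With `(?>a+)`, the group swallows all four _a_'s and refuses to give any back, so the trailing `a` can never match.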
In Chapter 5, you'll learn about character classes.

# What You Learned in Chapter 4

  * That alternation allows a choice between two or more patterns
  * What options and modifiers are and how to use them in a pattern
  * Different kinds of subpatterns
  * How to use capturing groups and backreferences
  * How to use named groups and how to reference them
  * How to use non-capturing groups
  * A little about atomic grouping

# Technical Notes

  * The Adobe AIR runtime lets you use HTML, JavaScript, Flash, and ActionScript to build web applications that run as standalone client applications without having to use a browser.
  * Python is an easy-to-understand, high-level programming language with its own regular expression implementation.
  * .NET is a programming framework for the Windows platform. It, too, has a regular expression implementation.
  * More advanced explanations of atomic grouping are available online.

# Chapter 5. Character Classes

I'll now talk more about character classes, or what are sometimes called _bracketed expressions_. Character classes help you match specific characters, or sequences of specific characters. They can be just as broad or far-reaching as character shorthands—for example, the character shorthand `\d` will match the same characters as:

    [0-9]

But you can use character classes to be even more specific than that. In this way, they are more versatile than shorthands.

Try these examples in whatever regex processor you prefer. I'll use Rubular in Opera and Reggy on the desktop.

To do this testing, enter this string in the subject or target area of the web page:

    ! " # $ % & ' ( ) * + , - . /
    0 1 2 3 4 5 6 7 8 9
    : ; < = > ? @
    A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
    [ \ ] ^ _ `
    a b c d e f g h i j k l m n o p q r s t u v w x y z
    { | } ~

You don't have to type all that in. You'll find this text stored in the file _ascii-graphic.txt_ in the code archive that comes with this book.

To start out, use a character class to match a set of English characters—in this case, the English vowels:

    [aeiou]

The lowercase vowels should be highlighted in the lower text area (see Figure 5-1). How would you highlight the uppercase vowels? How would you highlight or match both?

Figure 5-1. Character class with Rubular in the Opera browser

With character classes, you can also match a range of characters:

    [a-z]

This matches the lowercase letters _a_ through _z_. Try matching a smaller range of those characters, something like _a_ through _f_:

    [a-f]

Of course, you can also specify a range of digits:

    [0-9]

Or an even smaller range such as 3, 4, 5, and 6:

    [3-6]

Now expand your horizon. If you wanted to match even numbers in the range 10 through 19, you could combine two character classes side by side, like this:

    \b[1][24680]\b

Or you could push things further and look for even numbers in the range 0 through 99 with this (yes, as we learned in high school, zero by itself is even):

    \b[24680]\b|\b[1-9][24680]\b

If you want to create a character class that matches hexadecimal digits, how would you do it? Here is a hint:

    [a-fA-F0-9]

You can also use shorthands inside of a character class. For example, to match whitespace and word characters, you could create a character class like this:

    [\w\s]

which is the same as:

    [_a-zA-Z0-9 \t\n\r]

but easier to type.
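If you want to check that even-number pattern outside a browser tool, here is a quick sketch; because _seq_ prints one number per line, I swap the word boundaries for line anchors:

    seq 0 99 | grep -E '^([24680]|[1-9][24680])$'

You should get all fifty even numbers from 0 through 98, one per line, and nothing else.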
# Negated Character Classes

You have already seen this syntax a number of times, so I'll be brief. A negated character class matches characters that do not match the content of the class. For example, if you didn't want to match vowels, you could write (try it in your browser, then see Figure 5-2):

    [^aeiou]

In essence, the caret (`^`) at the beginning of the class means "No, I don't want these characters." (The caret _must_ appear at the beginning.)

Figure 5-2. Negated character class with Regexpal in Opera

# Union and Difference

Character classes can act like sets. In fact, one other name for a character class is a _character set_. This functionality is not supported by all implementations, but Java supports it.

I'll now show you a Mac desktop application called Reggy (see Technical Notes). Under Preferences (Figure 5-3), I changed the Regular Expression Syntax to _Java_, and in Font (under Format), I changed the point size to 24 points for readability.

Figure 5-3. Reggy preferences

If you wanted a union of two character sets, you could do it like this:

    [0-3[6-9]]

The regex would match 0 through 3 or 6 through 9. Figure 5-4 shows you how this looks in Reggy.

Figure 5-4. Union of two character sets in Reggy

To match a difference (in essence, subtraction):

    [a-z&&[^m-r]]

which matches all the letters from _a_ to _z_, except _m_ through _r_ (see Figure 5-5).

Figure 5-5. Difference of two character sets in Reggy

# POSIX Character Classes

POSIX, or Portable Operating System Interface, is a family of standards maintained by IEEE. It includes a regular expression standard (ISO/IEC/IEEE 9945:2009), which provides a set of named character classes that have the form:

    [[:xxxx:]]

where _xxxx_ is a name, such as _digit_ or _word_.

To match alphanumeric characters (letters and digits), try:

    [[:alnum:]]

Figure 5-6 shows the alphanumeric class in Reggy.

Figure 5-6. POSIX alphanumeric character class in Reggy

An alternative for this is simply the shorthand `\w`. Which is easier to type, the POSIX character class or the shorthand? You know where I'm going: The least amount of typing wins. I admit I don't use POSIX classes very often. But they're still worth knowing about.

For alphabetic characters in either upper- or lowercase, use:

    [[:alpha:]]

If you want to match characters in the ASCII range, choose:

    [[:ascii:]]

Of course, there are negated POSIX character classes as well, in the form:

    [[:^xxxx:]]

So if you wanted to match non-alphabetic characters, you could use:

    [[:^alpha:]]

To match space and tab characters, do:

    [[:blank:]]

Or to match all whitespace characters, there's:

    [[:space:]]

There are a number of these POSIX character classes, which are shown in Table 5-1.

Table 5-1. POSIX character classes

Character Class| Description
---|---
[[:alnum:]] | Alphanumeric characters (letters and digits)
[[:alpha:]] | Alphabetic characters (letters)
[[:ascii:]] | ASCII characters (all 128)
[[:blank:]] | Blank characters (space and tab)
[[:cntrl:]] | Control characters
[[:digit:]] | Digits
[[:graph:]] | Graphic characters
[[:lower:]] | Lowercase letters
[[:print:]] | Printable characters
[[:punct:]] | Punctuation characters
[[:space:]] | Whitespace characters
[[:upper:]] | Uppercase letters
[[:word:]] | Word characters
[[:xdigit:]] | Hexadecimal digits
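POSIX classes also work on the command line. As a quick sketch with the _ascii-graphic.txt_ file from earlier (the exact output depends on how your copy of the file is laid out), this pulls out the hexadecimal digits:

    grep -Eo '[[:xdigit:]]+' ascii-graphic.txt

Because the characters in that file are space-separated, each digit (0 through 9, plus the letters _a_ through _f_ in both cases) prints on its own line, which makes it easy to eyeball what a class really matches.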
The next chapter is dedicated to matching Unicode and other characters.

# What You Learned in Chapter 5

  * How to create a character class or set with a bracketed expression
  * How to create one or more ranges within a character class
  * How to match even numbers in the range 0 through 99
  * How to match a hexadecimal number
  * How to use character shorthands within a character class
  * How to negate a character class
  * How to perform union and difference with character classes
  * What POSIX character classes are

# Technical Notes

  * The Mac desktop application Reggy can be downloaded for free. Reggy shows you what it has matched by changing the color of the matched text. The default is blue, but you can change this color in Preferences under the Reggy menu. Under Preferences, choose Java under Regular Expression Syntax.
  * The Opera Next browser, currently in beta, can be downloaded from the Opera website.
  * Rubular is an online Ruby regular expression editor created by Michael Lovitt that supports both versions 1.8.7 and 1.9.2 of Ruby.
  * Zero, as noted in this chapter, really is an even number.
  * The Java (1.6) implementation of regular expressions is documented with the java.util.regex package.
  * You can find out more about IEEE and its family of POSIX standards on the IEEE website.

# Chapter 6. Matching Unicode and Other Characters

You will have occasion to match characters or ranges of characters that are outside the scope of ASCII. ASCII, or the American Standard Code for Information Interchange, defines an English character set—the letters A through Z in upper- and lowercase, plus control and other characters. It's been around for a long time: The 128-character Latin-based set was standardized in 1968. That was back before there was such a thing as a personal computer, before VisiCalc, before the mouse, before the Web, but I still look up ASCII charts online regularly.

I remember when I started my career many years ago, I worked with an engineer who kept an ASCII code chart in his wallet. Just in case. The ASCII Code Chart: Don't leave home without it.

So I won't gainsay the importance of ASCII, but now it is dated, especially in light of the Unicode standard, which currently represents over 100,000 characters. Unicode, however, does not leave ASCII in the dust; it incorporates ASCII into its Basic Latin code table.

In this chapter, you will step out of the province of ASCII into the not-so-new world of Unicode.

The first text is _voltaire.txt_ from the code archive, a quote from Voltaire (1694–1778), the French Enlightenment philosopher.

> Qu'est-ce que la tolérance? c'est l'apanage de l'humanité. Nous sommes tous pétris de faiblesses et d'erreurs; pardonnons-nous réciproquement nos sottises, c'est la première loi de la nature.

Here is an English translation:

> What is tolerance? It is the consequence of humanity. We are all formed of frailty and error; let us pardon reciprocally each other's folly—that is the first law of nature.

# Matching a Unicode Character

There are a variety of ways you can specify a Unicode character, also known as a code point. (For the purposes of this book, a Unicode character is one that is outside of the range of ASCII, though that is not strictly accurate.)

Start out by placing the Voltaire quote in Regexpal, and then enter this regular expression:

    \u00e9

The `\u` is followed by the hexadecimal value 00e9 (this is case insensitive—that is, 00E9 works, too). The value 00e9 is equivalent to the decimal value 233, well out of the ASCII range (0–127).
Notice that the letter _é_ (small letter e with an acute accent) is highlighted in Regexpal (see Figure 6-1). That's because _é_ is the code point U+00E9 in Unicode, which was matched by `\u00e9`.

Figure 6-1. Matching U+00E9 in Regexpal

Regexpal uses the JavaScript implementation of regular expressions. JavaScript also allows you to use this syntax:

    \xe9

Try this in Regexpal and see how it matches the same character as `\u00e9`.

Let's try it with a different regex engine. Open Regex Hero in a browser. Regex Hero is written in .NET and has a slightly different syntax. Drop the contents of the file _basho.txt_ into the text area labeled Target String. This contains a famous haiku written by the Japanese poet Matsuo Basho (who, coincidentally, died just one week before Voltaire was born).

Here is the poem in Japanese:

    古池
    蛙飛び込む
    水の音
    —芭蕉 (1644–1694)

And here is a translation in English:

    At the ancient pond
    a frog plunges into
    the sound of water.
    —Basho (1644–1694)

To match part of the Japanese text, in the text area marked Regular Expression, type the following:

    \u6c60

This is the code point for the Japanese (Chinese) character for _pond_. It will be highlighted below (see Figure 6-2).

Figure 6-2. Matching U+6C60 in Regex Hero

While you are here, try matching the em dash (—) with:

    \u2014

Or the en dash (–) with:

    \u2013

Now look at these characters in an editor.

## Using _vim_

If you have _vim_ on your system, you can open _basho.txt_ with it, as shown:

    vim basho.txt

Now, starting with a forward slash (/), enter a search with this line:

    /\%u6c60

followed by Enter or Return. The cursor moves to the beginning of the match, as you can see in Figure 6-3. Table 6-1 shows you your options. You can use _x_ or _X_ following the `\%` to match values in the range 0–255 (0–FF), _u_ to match up to four hexadecimal digits in the range 256–65,535 (100–FFFF), or _U_ to match up to eight hexadecimal digits in the range 65,536–2,147,483,647 (10000–7FFFFFFF). That takes in a lot of code points—a lot more than currently exist in Unicode.

Table 6-1. Matching Unicode in Vim

First Character| Maximum Characters| Maximum Value
---|---|---
x or X | 2 | 255 (FF)
u | 4 | 65,535 (FFFF)
U | 8 | 2,147,483,647 (7FFFFFFF)

Figure 6-3. Matching U+6C60 in Vim

# Matching Characters with Octal Numbers

You can also match characters using an octal (base 8) number, which uses the digits 0 to 7. In regex, this is done with three digits, preceded by a backslash (\).

For example, the following octal number:

    \351

is the same as:

    \u00e9

Experiment with it in Regexpal with the Voltaire text. `\351` matches _é_, with a little less typing.

# Matching Unicode Character Properties

In some implementations, such as Perl, you can match on Unicode character properties. The properties include characteristics like whether the character is a letter, number, or punctuation mark.

I'll now introduce you to _ack_, a command-line tool written in Perl that acts a lot like _grep_. It probably won't already be on your system; you have to download and install it yourself (see Technical Notes).

We'll use _ack_ on an excerpt from Friedrich Schiller's "An die Freude," composed in 1785 (German, if you can't tell):

    An die Freude.

    Freude, schöner Götterfunken,
    Tochter aus Elisium,
    Wir betreten feuertrunken
    Himmlische, dein Heiligthum.
    Deine Zauber binden wieder,
    was der Mode Schwerd getheilt;
    Bettler werden Fürstenbrüder,
    wo dein sanfter Flügel weilt.

    Seid umschlungen, Millionen!
    Diesen Kuß der ganzen Welt!
    Brüder, überm Sternenzelt
    muß ein lieber Vater wohnen.

There are a few interesting characters in this excerpt, beyond ASCII's small realm. We'll look at the text of this poem through properties. (If you would like a translation of this poem fragment, you can drop it into Google Translate.)

Using _ack_ on a command line, you can specify that you want to see all the characters whose property is Letter (L):

    ack '\pL' schiller.txt

This will show you all the letters highlighted. For lowercase letters, use _Ll_, surrounded by braces:

    ack '\p{Ll}' schiller.txt

You must add the braces. For uppercase, it's _Lu_:

    ack '\p{Lu}' schiller.txt

To specify characters that do _not_ match a property, we use uppercase _P_:

    ack '\PL' schiller.txt

This highlights characters that are not letters.

The following finds those that are not lowercase letters:

    ack '\P{Ll}' schiller.txt

And this highlights the ones that are not uppercase:

    ack '\P{Lu}' schiller.txt

You can also do this in yet another browser-based regex tester. Figure 6-4 shows the Schiller text with its lowercase letters highlighted using the lowercase property (`\p{Ll}`).

Figure 6-4. Characters with the lowercase letter property

Table 6-2 lists character property names for use with `\p{`_`property`_`}` or `\P{`_`property`_`}` (see the pcresyntax(3) man page). You can also match human languages with properties; see Table A-8.

Table 6-2. Character properties

Property| Description
---|---
C | Other
Cc | Control
Cf | Format
Cn | Unassigned
Co | Private use
Cs | Surrogate
L | Letter
Ll | Lowercase letter
Lm | Modifier letter
Lo | Other letter
Lt | Title case letter
Lu | Uppercase letter
L& | Ll, Lu, or Lt
M | Mark
Mc | Spacing mark
Me | Enclosing mark
Mn | Non-spacing mark
N | Number
Nd | Decimal number
Nl | Letter number
No | Other number
P | Punctuation
Pc | Connector punctuation
Pd | Dash punctuation
Pe | Close punctuation
Pf | Final punctuation
Pi | Initial punctuation
Po | Other punctuation
Ps | Open punctuation
S | Symbol
Sc | Currency symbol
Sk | Modifier symbol
Sm | Mathematical symbol
So | Other symbol
Z | Separator
Zl | Line separator
Zp | Paragraph separator
Zs | Space separator

# Matching Control Characters

How do you match control characters? It's not all that common that you will search for control characters in text, but it's a good thing to know. In the example repository or archive, you'll find the file _ascii.txt_, which is a 128-line file that contains all the ASCII characters in it, each on a separate line (hence the 128 lines). When you perform a search on the file, it will usually return a single line if it finds a match. This file is good for testing and general fun.

### Note

If you search for strings or control characters in _ascii.txt_ with _grep_ or _ack_, they may interpret the file as a binary file. If so, when you run a script on it, either tool may simply report "Binary file ascii.txt matches" when it finds a match. That's all.

In regular expressions, you can specify a control character like this:

    \cx

where _x_ is the control character you want to match.
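For instance, `\cI` names the tab character (Ctrl-I), so a sketch like this (the input strings are made up) prints only the line that contains a tab:

    printf 'col1\tcol2\nno tabs here\n' | perl -ne 'print if /\cI/'

Only the first line comes back, because the second has no tab in it.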
Let's say, for example, you wanted to find a null character in a file. You can use Perl to do that with the following command:

    perl -n -e 'print if /\c@/' ascii.txt

Provided that you've got Perl on your system and it's running properly, you will get this result:

    0. Null

That line matches because it contains a null character, even though you can't see the character in the result.

### Note

If you open _ascii.txt_ with an editor other than _vim_, it will likely remove the control characters from the file, so I suggest you don't do it.

You can also use `\0` to find a null character. Try this, too:

    perl -n -e 'print if /\0/' ascii.txt

Pressing on, you can find the bell (BEL) character using:

    perl -n -e 'print if /\cG/' ascii.txt

It will return the line:

    7. Bell

Or you can use the shorthand:

    perl -n -e 'print if /\a/' ascii.txt

To find the escape character, use:

    perl -n -e 'print if /\c[/' ascii.txt

which gives you:

    27. Escape

Or do it with a shorthand:

    perl -n -e 'print if /\e/' ascii.txt

How about a backspace character? Try:

    perl -n -e 'print if /\cH/' ascii.txt

which spits back:

    8. Backspace

You can also find a backspace using a bracketed expression:

    perl -n -e 'print if /[\b]/' ascii.txt

Without the brackets, how would `\b` be interpreted? That's right, as a word boundary, as you learned in Chapter 2. The brackets change the way the `\b` is understood by the processor. In this case, Perl sees it as a backspace character.

Table 6-3 lists the ways we matched characters in this chapter.

Table 6-3. Matching Unicode and other characters

Code| Description
---|---
`\u`_`xxxx`_ | Unicode (four places)
`\x`_`xx`_ | Unicode (two places)
`\x{`_`xxxx`_`}` | Unicode (four places)
`\x{`_`xx`_`}` | Unicode (two places)
`\000` | Octal (base 8)
`\c`_`x`_ | Control character
`\0` | Null
`\a` | Bell
`\e` | Escape
`[\b]` | Backspace

That wraps things up for this chapter. In the next, you'll learn more about quantifiers.

# What You Learned in Chapter 6

  * How to match any Unicode character with `\u`_`xxxx`_ or `\x`_`xx`_
  * How to match any Unicode character inside of _vim_ using `\%x`_`xx`_, `\%X`_`xx`_, `\%u`_`xxxx`_, or `\%U`_`xxxxxxxx`_
  * How to match characters in the range 0–255 using octal format with `\000`
  * How to use Unicode character properties with `\p{`_`x`_`}`
  * How to match control characters with `\e` or `\cH`
  * More on how to use Perl on the command line (more Perl one-liners)

# Technical Notes

  * I entered control characters in _ascii.txt_ using _vim_. In _vim_, you can use Ctrl+V followed by the appropriate control sequence for the character, such as Ctrl+C for the end-of-text character. I also used Ctrl+V followed by _x_ and the two-digit hexadecimal code for the character. You can also use digraphs to enter control codes; in _vim_ enter `:digraph` to see the possible codes. To enter a digraph, use Ctrl+K while in Insert mode, followed by a two-character digraph (for example, _NU_ for null).
  * RegexHero is a .NET regex implementation in a browser written by Steve Wortham. This one is for pay, but you can test it out for free, and if you like it, the prices are reasonable (you can buy it at a standard or a professional level).
  * _vim_ is an evolution of the _vi_ editor that was created by Bill Joy in 1976. The _vim_ editor was developed primarily by Bram Moolenaar. It seems archaic to the uninitiated, but as I've mentioned, it is incredibly powerful.
  * The _ack_ tool is written in Perl. It acts like _grep_ and has many of its command-line options, but it outperforms _grep_ in many ways. For example, it uses Perl regular expressions instead of basic regular expressions like _grep_ (without _-E_). For installation instructions, see the ack website; I used the specific instructions under "Install the ack executable." I didn't use _curl_ but just downloaded _ack_ with the link provided and then copied the script into _/usr/bin_ on both my Mac and a PC running Cygwin in Windows 7.

# Chapter 7. Quantifiers

You have already seen some quantifiers at work earlier in this book, but here I'll talk about them in more detail.

For our example this time, we'll use a Mac desktop application called Reggy (Figure 7-1), as we did in Chapter 5. Uncheck _Match All_ at the bottom to start.

If you are not on a Mac, you can try these examples in one of the applications you've seen earlier in the book. Paste in the right triangle of digits from the file _triangle.txt_, which is in the archive of examples.

Figure 7-1. Reggy application

# Greedy, Lazy, and Possessive

I'm not talking about your teenager here. I'm talking about quantifiers. These adjectives may not sound like good character qualities, but they are interesting features of quantifiers that you need to understand if you want to use regular expressions with skill.

Quantifiers are, by themselves, greedy. A greedy quantifier first tries to match the whole string. It grabs as much as it can, the whole input, trying to make a match. If the first attempt to match the whole string goes awry, it backs up one character and tries again. This is called _backtracking_. It keeps backing up one character at a time until it finds a match or runs out of characters to try. It also keeps track of what it is doing, so it puts the most load on resources compared with the next two approaches. It takes a mouthful, then spits back a little at a time, chewing on what it just ate. You get the idea.

A lazy (sometimes called _reluctant_) quantifier takes a different tack. It starts at the beginning of the target, trying to find a match, looking at the string one character at a time. Only at last will it attempt to match the whole string. To get a quantifier to be lazy, you append a question mark (`?`) to the regular quantifier. It chews one nibble at a time.

A possessive quantifier grabs the whole target and then tries to find a match, but it makes only one attempt. It does not do any backtracking. A possessive quantifier appends a plus sign (`+`) to the regular quantifier. It doesn't chew; it just swallows, then wonders what it just ate. I'll demonstrate each of these in the pages that follow.

# Matching with *, +, and ?

If you have the triangle of digits in Reggy, you can now begin testing. First we'll use the Kleene star, named for the man credited as the inventor of regular expressions, Stephen Kleene. If you use the star or asterisk following a dot like this:

    .*

it would match, being greedy, all the characters (digits) in the subject text. As you know from earlier reading, `.*` matches any character zero or more times. All the digits in the lower box should be highlighted by changing color. Of the Kleene star, an early manual said:

> A regular expression followed by "*" [Kleene star] is a regular expression which matches any number (including zero) of adjacent occurrences of the text matched by the regular expression.
Now try:

    9*

and the row of nines near the bottom should be highlighted. Now:

    9.*

lights up the row of nines and the row of zeros below it. Because _Multiline_ is checked (at the bottom of the application window), the dot will match the newline character between the rows; normally, it would not.

To match one or more 9s, try:

    9+

How is that different? You can't really tell because there are nine 9s in the subject text. The main difference is that `+` is looking for at least one 9, but `*` is looking for zero or more.

To match zero or one time (optional), use:

    9?

This will match the first occurrence of 9 only. That 9 is considered optional, so because it does exist in the subject text, it is matched and highlighted. If you do this:

    99?

then both the first and second 9 are matched.

Table 7-1 lists the basic quantifiers and some of the possibilities that they have. These quantifiers are by default _greedy_, meaning that they match as many characters as they possibly can on the first attempt.

Table 7-1. Basic quantifiers

Syntax| Description
---|---
`?` | Zero or one (optional)
`+` | One or more
`*` | Zero or more

# Matching a Specific Number of Times

When you use braces or squiggly brackets, you can match a pattern a specific number of times in a range. Unmodified, these are greedy quantifiers. For example:

    7{1}

will match the first occurrence of 7. If you wanted to match one _or more_ occurrences of the number 7, all you have to do is add a comma:

    7{1,}

You've probably realized that both:

    7+

and

    7{1,}

are essentially the same thing, and that:

    7*

and

    7{0,}

are likewise the same. In addition:

    7?

is the same as:

    7{0,1}

To find a range of matches, that is, to match _m_ to _n_ times:

    7{3,5}

This will match three, four, or five occurrences of 7.

So to review, the squiggly bracket or range syntax is the most flexible and precise quantifier. Table 7-2 summarizes these features.

Table 7-2. Summary of range syntax

Syntax| Description
---|---
{ _n_ } | Match _n_ times exactly
{ _n_ ,} | Match _n_ or more times
{ _m,n_ } | Match _m_ to _n_ times
{0,1} | Same as `?` (zero or one)
{1,} | Same as `+` (one or more)
{0,} | Same as `*` (zero or more)

# Lazy Quantifiers

Now let's set aside greediness and get lazy. The easiest way for you to understand this is by seeing it in action. In Reggy (making sure _Match All_ is unchecked), try to match zero or one 5 using a single question mark (`?`):

    5?

The first 5 is highlighted. Add an additional `?` to make the quantifier lazy:

    5??

Now it doesn't appear to match anything. The reason is that the pattern is being lazy; it's not even forced to match that first 5. By nature, the _lazy_ match matches as few characters as it can get away with. It's a slacker.

Try this zero or more times:

    5*?

and it won't match anything either, because you gave it the option to match a minimum of zero times, and that's what it does.

Try it again matching one or more times, à la lazy:

    5+?

And there you go. Lazy just got off the couch and matched one 5. That's all it had to do to keep its day job.

Things get a bit more interesting as you apply _m,n_ matching. Try this:

    5{2,5}?

Only two 5s are matched, not all five of them, as a greedy match would.
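Outside Reggy, a Perl one-liner makes the greedy/lazy contrast easy to see. In this sketch (the tagged string is invented for illustration), the greedy pattern swallows everything from the first `<` to the last `>`, while the lazy version stops at the first `>` it can:

    perl -e '$_ = "<b>bold</b> and <i>italic</i>";
             print "greedy: $&\n" if /<.+>/;
             print "lazy:   $&\n" if /<.+?>/'

The greedy line prints the entire string, `<b>bold</b> and <i>italic</i>`; the lazy line prints just `<b>`.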
Table 7-3 lists the lazy quantifiers. When is lazy matching useful? You can use lazy matching when you want to match the bare minimum of characters, not the maximum possible.

Table 7-3. Lazy quantifiers

Syntax| Description
---|---
?? | Lazy zero or one (optional)
+? | Lazy one or more
*? | Lazy zero or more
{ _n_ }? | Lazy _n_
{ _n_ ,}? | Lazy _n_ or more
{ _m,n_ }? | Lazy _m,n_

# Possessive Quantifiers

A possessive match is like a greedy match in that it grabs as much as it can get away with. But unlike a greedy match, it does not backtrack. It does not give up anything it finds. It is selfish. That is why it is called _possessive_. Arms folded firmly, it doesn't give up any ground. But the good thing about possessive quantifiers is that they are faster, because they don't do any backtracking, and they also fail in a hurry.

### Note

The truth is, you can hardly tell the difference between greedy, lazy, and possessive matches with the examples in this book. But as you gain more experience, and performance tuning becomes important, you'll want to be aware of these differences.

To make sense of this, first we'll try matching the zeroes with a leading zero, then with a trailing zero. In Reggy, make sure _Match All_ is checked, and enter this expression with a leading zero:

    0.*+

What happened? All the zeroes are highlighted. There was a match. The possessive match appears to do the same thing as a greedy match, with one subtle difference: There is no backtracking. You can now prove it. Enter this with a trailing zero:

    .*+0

No match. The reason is that there was no backtracking. It gobbled up the entire input and never looked back. It wasted its inheritance with riotous living. It can't find the trailing zero. It doesn't know where to look. If you remove the plus sign, it finds all the zeroes, as it goes back to being a greedy match:

    .*0

You might want to use a possessive quantifier when you are aware of what is in your text and you know where you will find matches. You don't care if it grabs with gusto. A possessive match can help you match with improved performance. Table 7-4 shows the possessive quantifiers.

Table 7-4. Possessive quantifiers

Syntax| Description
---|---
?+ | Possessive zero or one (optional)
++ | Possessive one or more
*+ | Possessive zero or more
{ _n_ }+ | Possessive _n_
{ _n_ ,}+ | Possessive _n_ or more
{ _m,n_ }+ | Possessive _m,n_

You'll be introduced to lookarounds in the next chapter.

# What You Learned in Chapter 7

  * The differences between greedy, lazy, and possessive matching
  * How to match one or more (`+`)
  * How to match optionally (zero or one, `?`)
  * How to match zero or more (`*`)
  * How to use { _m,n_ } quantifiers
  * How to use greedy, lazy (reluctant), and possessive quantifiers

# Technical Notes

The quote comes from Dennis Ritchie and Ken Thompson, _QED Text Editor_ (Murray Hill, NJ: Bell Labs, 1970), p. 3.

# Chapter 8. Lookarounds

Lookarounds are non-capturing groups that match patterns based on what they find either in front of or behind a pattern. Lookarounds are also considered _zero-width assertions_.

Lookarounds include:

  * Positive lookaheads
  * Negative lookaheads
  * Positive lookbehinds
  * Negative lookbehinds

In this chapter, I'll show you how each of these works. We'll start out using RegExr on the desktop and then move on to Perl and _ack_ (_grep_ doesn't know about lookarounds). Our text is still Coleridge's well-worn poem.
# Positive Lookaheads

Suppose you want to find every occurrence of the word _ancyent_ that is followed by _marinere_ (I use the archaic spellings because that is what is found in the file). To do this, we could use a positive lookahead.

First let's try it in RegExr desktop. The following case-insensitive pattern goes in the text box at the top:

    (?i)ancyent (?=marinere)

### Note

You can also specify case-insensitivity with RegExr by simply checking the box next to _ignoreCase_; both methods work.

Because you use the case-insensitive option (`?i`), you don't need to worry about what case you use in your pattern. You are looking for every line that has the word _ancyent_ followed hard by _marinere_. The results will be highlighted in the text area below the pattern area (see Figure 8-1); however, only the first part of the pattern will be highlighted (_ancyent_), not the lookahead pattern (_Marinere_).

Figure 8-1. Positive lookahead in RegExr

Let's now use Perl to do a positive lookahead. You can form the command like so:

    perl -ne 'print if /(?i)ancyent (?=marinere)/' rime.txt

and the output should look like this:

    THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.
    How a Ship having passed the Line was driven by Storms to the cold Country towards
    the South Pole; and how from thence she made her course to the tropical Latitude of
    the Great Pacific Ocean; and of the strange things that befell; and in what manner the
    Ancyent Marinere came back to his own Country.
    It is an ancyent Marinere,
    "God save thee, ancyent Marinere!
    "I fear thee, ancyent Marinere!

There are five lines in the poem where the word _ancyent_ shows up right before the word _marinere_. What if we just wanted to check whether the word following _ancyent_ started with the letter _m_, either in upper- or lowercase? We could do it this way:

    perl -ne 'print if /(?i)ancyent (?=m)/' rime.txt

In addition to `Marinere`, you would get `man` and `Man`:

    And thus spake on that ancyent man,
    And thus spake on that ancyent Man,

_ack_ can also do lookarounds, as it is written in Perl. The command-line interface for _ack_ is very similar to _grep_'s.

Try this:

    ack '(?i)ancyent (?=ma)' rime.txt

and you'll see highlighted results, as shown in Figure 8-2.

Figure 8-2. Positive lookahead with ack in Terminal

With _ack_, you can specify case-insensitivity with the command-line option _-i_, rather than with the embedded option `(?i)`:

    ack -i 'ancyent (?=ma)' rime.txt

I'll throw something in here for good measure. If you want to add line numbers to _ack_'s output, you can do several things. You can add the _-H_ option:

    ack -Hi 'ancyent (?=ma)' rime.txt

Or you could add this code with the _--output_ option:

    ack -i --output '$.:$_' 'ancyent (?=ma)' rime.txt

This is a bit of a hack, and it turns off highlighting, but it works.

# Negative Lookaheads

The flip side of a positive lookahead is a negative lookahead. This means that as you try to match a pattern, you _won't_ find a given lookahead pattern. A negative lookahead is formed like this:

    (?i)ancyent (?!marinere)

Only one character changed: The equals sign (`=`) in the positive lookahead became an exclamation point (`!`) in the negative lookahead. Figure 8-3 shows you this negative lookahead in Opera.

Figure 8-3. Negative lookahead with RegExr in Opera
In Perl, we could do a negative lookahead this way:

    perl -ne 'print if /(?i)ancyent (?!marinere)/' rime.txt

and this is what we would get back:

    And thus spake on that ancyent man,
    And thus spake on that ancyent Man,

In _ack_, the same results could be produced with:

    ack -i 'ancyent (?!marinere)' rime.txt

# Positive Lookbehinds

A positive lookbehind looks to the left, in the opposite direction from a lookahead. The syntax is:

    (?i)(?<=ancyent) marinere

The positive lookbehind throws in a less-than sign (`<`), reminding you which direction the lookbehind faces. Try this in RegExr and see what the difference is. Instead of _ancyent_ being highlighted, _marinere_ is. Why? Because the positive lookbehind is a condition of the match and is not included or consumed in the match results.

Do it like so in Perl:

    perl -ne 'print if /(?i)(?<=ancyent) marinere/' rime.txt

And like this with _ack_:

    ack -i '(?<=ancyent) marinere' rime.txt

# Negative Lookbehinds

Finally, there is the negative lookbehind. And how do you think this one works? It looks to see whether a pattern does _not_ show up behind, in the left-to-right stream of text. Again, it adds a less-than sign (`<`), reminding you which direction the lookbehind faces.

Do this in RegExr and see the results:

    (?i)(?<!ancyent) marinere

This matches each occurrence of _marinere_, in any case, that is _not_ preceded by the word _ancyent_. In Perl and _ack_, the same pattern drops in exactly where the positive lookbehind went.

# What You Learned in Chapter 8

  * How to use positive and negative lookaheads
  * How to use positive and negative lookbehinds

# Chapter 9. Marking Up a Document with HTML

In this chapter, we'll use what you've learned so far to mark up a plain-text file with HTML, first with _sed_ and then with Perl.

# Matching Tags

Many patterns that claim to match HTML or XML start-tags (e.g., `<html>`) or end-tags (e.g., `</html>`) are unreliable, but I have found the one that follows to be reliable. It will match start-tags, with or without attributes:

    <[_a-zA-Z][^>]*>

Here is what it does:

  * The first character is a left angle bracket (`<`).
  * Elements can begin with an underscore character (`_`) in XML or a letter in the ASCII range, in either upper- or lowercase (see Technical Notes).
  * Following the start character, the name can be followed by zero or more characters, any character other than a right angle bracket (`>`).
  * The expression ends with a right angle bracket.

Try this with _grep_. Match it against a sample DITA file in the archive, _lorem.dita_:

    grep -Eo '<[_a-zA-Z][^>]*>' lorem.dita

yields this answer:

    <topic id="...">
    <title>
    <body>
    <p>
    <p>
    <ul>
    <li>
    <li>
    <li>
    <p>
    <p>

To match both start- and end-tags, simply add a forward slash followed by a question mark. The question mark makes the forward slash optional:

    </?[_a-zA-Z][^>]*>

I'm sticking with start-tags only here. To refine the output, I often pipe in a few other tools to make it prettier:

    grep -Eo '<[_a-zA-Z][^>]*>' lorem.dita | sort | uniq | sed 's/^<//;s/ id=\".*\"//;s/>$//'

This gives you a list of sorted XML tag names:

    body
    li
    p
    p
    title
    topic
    ul

I'll take this a step further in the next and final chapter. The following sections will take you through some of the steps you have learned before, but with a few new twists.

# Transforming Plain Text with _sed_

Let's add some markup to the top of the text in _rime.txt_. We can do this with the insert command (`i\`). In the directory where the _rime.txt_ file is located, enter the following at a shell prompt:

    sed '1 i\
    <!DOCTYPE html>\
    <html lang="en">\
    <head>\
    <title>The Rime of the Ancyent Marinere (1798)</title>\
    </head>\
    <body>
    q' rime.txt

After you press Enter or Return, your output should look like the following, with the tags at the top:

    <!DOCTYPE html>
    <html lang="en">
    <head>
    <title>The Rime of the Ancyent Marinere (1798)</title>
    </head>
    <body>
    THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.

The command you just entered did not actually change the file—it only produced an output to your screen.
I'll show you how to write your changes to a file later.

## Substitution with _sed_

In the next example, _sed_ finds the first line of the file and captures the entire line in a capturing group using escaped parentheses `\(` and `\)`. _sed_ needs to escape the parentheses used to capture a group unless you use the _-E_ option (more on this in a moment). The beginning of the line is demarcated with `^`, and the end of the line with `$`. The backreference `\1` pulls the captured text into the content of the _title_ element, indented with one space.

Run the command that follows:

    sed '1s/^\(.*\)$/ <title>\1<\/title>/;q' rime.txt

The resulting line looks like this:

     <title>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</title>

Now try it this way:

    sed -E '1s/^(.*)$/<!DOCTYPE html>\
    <html lang="en">\
    <head>\
     <title>\1<\/title>\
    <\/head>\
    <body>\
     <h1>\1<\/h1>\
    /;q' rime.txt

Let's talk about it:

  * The _-E_ option tells _sed_ to use extended regular expressions or EREs (so you don't have to escape the parentheses, etc.).
  * Using a substitute (_s_) command, grab line 1 in a capturing group (`^(.*)$`) so you can reuse the text with `\1`.
  * Create HTML tags and escape the newlines with `\`.
  * Insert the captured text in the _title_ and _h1_ tags using `\1`.
  * Quit at this point (`q`) to stop printing the rest of the poem to the screen.

The correct result is:

    <!DOCTYPE html>
    <html lang="en">
    <head>
     <title>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</title>
    </head>
    <body>
     <h1>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</h1>
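A small portability aside, not something these examples require: if your _sed_ doesn't recognize _-E_ (older GNU sed releases, for instance, spell the extended-regex flag _-r_ instead), the same commands work with that flag swapped in:

    sed -r '1s/^(.*)$/ <title>\1<\/title>/;q' rime.txt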
## Handling Roman Numerals with _sed_

The poem is divided into seven sections, with each section introduced with a Roman numeral. There is also an "ARGUMENT" heading. The following line will use _sed_ to capture that heading and those Roman numerals and surround them in _h2_ tags:

    sed -En 's/^(ARGUMENT\.|I{0,3}V?I{0,2}\.)$/<h2>\1<\/h2>/p' rime.txt

and here is what you'll see:

    <h2>ARGUMENT.</h2>

    <h2>I.</h2>

    <h2>II.</h2>

    <h2>III.</h2>

    <h2>IV.</h2>

    <h2>V.</h2>

    <h2>VI.</h2>

    <h2>VII.</h2>
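Curious exactly which numerals `I{0,3}V?I{0,2}` accepts? Here is a quick sketch that feeds it candidate strings (the list is made up for the test):

    printf 'I\nII\nIII\nIV\nV\nVI\nVII\nVIII\nIX\n' | grep -E '^I{0,3}V?I{0,2}$'

It prints I through VII and rejects VIII and IX, which is exactly the range the poem uses. (It would also accept a malformed string like IIV, but nothing of that shape appears in the file, so the looseness is harmless here.)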

## Handling a Specific Paragraph with _sed_

The following command finds the introductory paragraph on line 5:

    sed -En '5s/^([A-Z].*)$/<p>\1<\/p>/p' rime.txt

and places that paragraph in a _p_ tag:

    <p>How a Ship having passed the Line was driven by Storms to the cold Country towards
    the South Pole; and how from thence she made her course to the tropical Latitude
    of the Great Pacific Ocean; and of the strange things that befell; and in what
    manner the Ancyent Marinere came back to his own Country.</p>

I know this looks like we are moving inchmeal at the moment, but hang on and I'll bring it all together in a page or two.

## Handling the Lines of the Poem with _sed_

Next we'll mark up the lines of the poem with:

    sed -E '9s/^[ ]*(.*)/ <p>\1<br\/>/;10,832s/^([ ]{5,7}.*)/\1<br\/>/;833s/^(.*)/\1<\/p>/' rime.txt

These _sed_ substitutions depend on line numbers to get their little jobs done. This wouldn't work in a generalized case, but it works quite well when you know exactly what you are dealing with.

  * On line 9, the first line of verse, the _s_ command grabs the line and, after prepending a few spaces, inserts a _p_ start-tag and appends a _br_ (break) tag at the end of the line.
  * Between lines 10 and 832, every line that begins with 5 to 7 spaces gets a _br_ appended to it.
  * On line 833, the last line of the poem, instead of a _br_, the _s_ appends a _p_ end-tag.

A sample of the resulting markup is here:

     <p>It is an ancyent Marinere,<br/>
     And he stoppeth one of three:<br/>
     "By thy long grey beard and thy glittering eye<br/>
     "Now wherefore stoppest me?<br/>

     "The Bridegroom's doors are open'd wide<br/>
     "And I am next of kin;<br/>
     "The Guests are met, the Feast is set,--<br/>
     "May'st hear the merry din.--<br/>
You should also replace the blank lines with a _br_, to keep the verses separated:

    sed -E 's/^$/<br\/>/' rime.txt

See what you just did:

     He prayeth best who loveth best,
     All things both great and small:
     For the dear God, who loveth us,
     He made and loveth all.
    <br/>
     The Marinere, whose eye is bright,
     Whose beard with age is hoar,
     Is gone; and now the wedding-guest
     Turn'd from the bridegroom's door.
    <br/>
     He went, like one that hath been stunn'd
     And is of sense forlorn:
     A sadder and a wiser man
     He rose the morrow morn.

I have found that I can play with this kind of thing endlessly, getting the tags and space just right. I encourage you to do so yourself.

# Appending Tags

Now we'll append some tags to the end of the poem. With the append command (`a\`), the `$` finds the end (the last line) of the file, and appends (`a\`) the _body_ and _html_ end-tags after the last line:

    sed '$ a\
    <\/body>\
    <\/html>\
    ' rime.txt

Here's how the end of the file will look now:

     He went, like one that hath been stunn'd
     And is of sense forlorn:
     A sadder and a wiser man
     He rose the morrow morn.

    </body>
    </html>

Enough _sed_.

What if you wanted to do all of these changes at the same time? You know what to do. You've already done it. You just have to put all these commands in a file and use the _-f_ option with _sed_.

## Using a Command File with _sed_

This example shows the file _html.sed_, which collects all the previous _sed_ commands into one file, plus a command or two more. We'll use this file of commands to transform _rime.txt_ to HTML using _sed_. The numbered callouts in the example will guide you through what is happening in the _sed_ script.

    #!/usr/bin/sed ![1](callouts/1.png)

    1s/^(.*)$/\ ![2](callouts/2.png)
    <!DOCTYPE html>\
    <html lang="en">\
    <head>\
     <title>\1<\/title>\
    <\/head>\
    <body>\
     <h1>\1<\/h1>\
    /

    s/^(ARGUMENT|I{0,3}V?I{0,2})\.$/<h2>\1<\/h2>/ ![3](callouts/3.png)
    5s/^([A-Z].*)$/<p>\1<\/p>/ ![4](callouts/4.png)
    9s/^[ ]*(.*)/ <p>\1<br\/>/ ![5](callouts/5.png)
    10,832s/^([ ]{5,7}.*)/\1<br\/>/ ![6](callouts/6.png)
    833s/^(.*)/\1<\/p>/ ![7](callouts/7.png)
    13,$s/^$/<br\/>/ ![8](callouts/8.png)
    $ a\ ![9](callouts/9.png)
    <\/body>\
    <\/html>

The first line is called the _shebang_ line, a hint to the shell of where the executable (_sed_) is located.

At line 1, substitute (_s_) the line with the tags that follow. The backslash (`\`) indicates that the text you want to add continues on the next line, so a newline is inserted. Insert the title of the poem from line 1, with `\1`, as the content of the _title_ and _h1_ elements.

Surround headings and Roman numerals with _h2_ tags.

On line 5, enclose the introductory paragraph in a _p_ element.

On line 9, prepend a _p_ start-tag and add a _br_ at the end of the line.

Between lines 10 and 832, add a _br_ at the end of each line that begins with a certain number of spaces.

At the end of the poem, append a _p_ end-tag.

After line 13, replace each blank line with a break (_br_).

Append a few tags at the end (`$`) of the document.

To apply this command file to _rime.txt_, enter this line, followed by Enter or Return:

    sed -E -f html.sed rime.txt

To redirect the output to a file:

    sed -E -f html.sed rime.txt > rime.html

Open _rime.html_ in a browser to see what you have created (see Figure 9-1).

Figure 9-1. rime.html in Firefox

# Transforming Plain Text with Perl

I'll now show you how to mark up a file with HTML using Perl. First, like with _sed_, I'll give you a series of one-liners; then I'll show those same commands in a file.

### Note

This book introduces you to only the rudiments of the Perl language, and how to get started using it. It is not a Perl tutorial or manual, but I hope to pique your interest in Perl and show you a few possibilities.
A good place to get started with Perl is the Learning Perl website found at <http://learn.perl.org/>, which also includes instructions on how to install it.

If the current line (`$.`) is line 1, assign the whole line (`$_`) to the _$title_ variable and print _$title_:

    perl -ne 'if ($. == 1) {chomp($title = $_); print "<h1>" . $title . "</h1>" . "\n";};' rime.txt

If all goes well, the result should be:

    <h1>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</h1>

Here is an explanation for the Perl command:

  * Test if you are on line 1 with `$.`
  * Chomp the line (`$_`) and assign the string to the `$title` variable. When you chomp the line with the _chomp_ function, it removes the trailing newline from the string.
  * Print `$title` in an _h1_ element, followed by a newline (`\n`).

### Note

For more information on Perl's built-in variables, such as `$.`, enter the command `perldoc -v $.` at a prompt (_perldoc_ normally is installed when you install Perl). If this doesn't work, see Technical Notes.

To prepend some markup to the top of the file, including that _h1_ tag, use this:

    perl -ne 'if ($. == 1) {chomp($title = $_)};
    print "<!DOCTYPE html>\
    <html xmlns=\"http://www.w3.org/1999/xhtml\">\
    <head>\
    <title>$title</title>\
    </head>\
    <body>\
    <h1>$title</h1>\
    \n" if $. == 1; exit' rime.txt

and you'll get the following output:

    <!DOCTYPE html>
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>
    <title>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</title>
    </head>
    <body>
    <h1>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</h1>
The _print_ function prints the tags that follow, and each line (except the last) is followed by a `\`, which enters a newline into the output. The `$title` variable is expanded within the _title_ and _h1_ elements.

## Handling Roman Numerals with Perl

To tag up the heading and those Roman numeral section breaks, use:

    perl -ne 'print if s/^(ARGUMENT\.|I{0,3}V?I{0,2}\.)$/<h2>\1<\/h2>/;' rime.txt

This is the output:

    <h2>ARGUMENT.</h2>

    <h2>I.</h2>

    <h2>II.</h2>

    <h2>III.</h2>

    <h2>IV.</h2>

    <h2>V.</h2>

    <h2>VI.</h2>

    <h2>VII.</h2>

The substitute (_s_) command captures the _ARGUMENT_ heading and those seven uppercase Roman numerals, each on a separate line and followed by a period, in the range I through VII. Then it encloses the captured text in an _h2_ tag.

## Handling a Specific Paragraph with Perl

Use this code to enclose the introductory paragraph in a _p_ element, if the line number is equal to 5:

    perl -ne 'if ($. == 5) {s/^([A-Z].*)$/<p>$1<\/p>/;print;}' rime.txt

You should see this:

    <p>How a Ship having passed the Line was driven by Storms to the cold Country towards
    the South Pole; and how from thence she made her course to the tropical Latitude
    of the Great Pacific Ocean; and of the strange things that befell; and in what
    manner the Ancyent Marinere came back to his own Country.</p>

## Handling the Lines of the Poem with Perl

The following command places a _p_ start-tag at the beginning of the first line of the poem, and a _br_ tag after the end of that line:

    perl -ne 'if ($. == 9) {s/^[ ]*(.*)/ <p>$1<br\/>/;print;}' rime.txt

It gives you:

     <p>It is an ancyent Marinere,<br/>
Next, between lines 10 and 832, this bit of Perl puts a _br_ at the end of each line of the poem:

    perl -ne 'if (10..832) { s/^([ ]{5,7}.*)/$1<br\/>/; print;}' rime.txt

A sample of what you will see:

     Farewell, farewell! but this I tell<br/>
     To thee, thou wedding-guest!<br/>
     He prayeth well who loveth well<br/>
     Both man and bird and beast.<br/>

Add a _p_ end-tag to the end of the last line of the poem:

    perl -ne 'if ($. == 833) {s/^(.*)/$1<\/p>/; print;}' rime.txt

It shows:

     He rose the morrow morn.</p>

Replace each blank line with a _br_ tag:

    perl -ne 'if (9..eof) {s/^$/<br\/>/; print;}' rime.txt

to yield this:

    <br/>
     He prayeth best who loveth best,
     All things both great and small:
     For the dear God, who loveth us,
     He made and loveth all.
    <br/>
     The Marinere, whose eye is bright,
     Whose beard with age is hoar,
     Is gone; and now the wedding-guest
     Turn'd from the bridegroom's door.
    <br/>

And finally, when the end of the file is discovered, print a couple of end-tags:

    perl -ne 'if (eof) {print "</body>\n</html>\n"};' rime.txt

All this code works together more easily when it's in a file. You'll see that next.

## Using a File of Commands with Perl

The following lists _html.pl_, which transforms _rime.txt_ to HTML using Perl. The numbered callouts in the example guide you through what is happening in the script.

    #!/usr/bin/perl -p ![1](callouts/1.png)

    if ($. == 1) { ![2](callouts/2.png)
     chomp($title = $_);
    }
    print "\ ![3](callouts/3.png)
    <!DOCTYPE html>\
    <html xmlns=\"http://www.w3.org/1999/xhtml\">\
    <head>\
    <title>$title</title>\
    </head>\
    <body>\
    <h1>$title</h1>\
    \n" if $. == 1;
    s/^(ARGUMENT|I{0,3}V?I{0,2})\.$/<h2>$1<\/h2>/; ![4](callouts/4.png)
    if ($. == 5) { ![5](callouts/5.png)
     s/^([A-Z].*)$/<p>$1<\/p>/;
    }
    if ($. == 9) { ![6](callouts/6.png)
     s/^[ ]*(.*)/ <p>$1<br\/>/;
    }
    if (10..832) { ![7](callouts/7.png)
     s/^([ ]{5,7}.*)/$1<br\/>/;
    }
    if (9..eof) { ![8](callouts/8.png)
     s/^$/<br\/>/;
    }
    if ($. == 833) { ![9](callouts/9.png)
     s/^(.*)$/$1<\/p>\n <\/body>\n<\/html>\n/;
    }

This is called the _shebang_ directive, which gives a hint to the shell of where the program you are running is located.

If the current line (`$.`) is line 1, then assign the whole line (`$_`) to the _$title_ variable, chomping off (with `chomp`) the last character in the string (a newline) in the process.

Print a doctype and several HTML tags at the top of the document at line 1, and reuse the value of the `$title` variable in several places.

Give the ARGUMENT heading and the Roman numerals _h2_ tags.

Surround the introductory paragraph with _p_ tags.

Prepend a _p_ start-tag to the beginning of the first line of verse, and append a _br_ to that line.

Append a _br_ tag to the end of each line of verse, except the last line.

Replace each blank line, after line 9, with a _br_ tag.

Append _p_, _body_, and _html_ end-tags to the last line.

To run this, simply do the following:

    perl html.pl rime.txt

You can also redirect the output with a `>` to save it to a file. In the next and final chapter, I'll conclude our regex tutorial.

# What You Learned in Chapter 9

  * How to use _sed_ on the command line
  * How to prepend (insert), substitute, and append text (tags) with _sed_
  * How to use Perl to do the same

# Technical Notes

  * AsciiDoc, by Stuart Rackham, is a text format that can be converted, using a Python processor, into HTML, PDF, ePub, DocBook, and man pages. The syntax for the text files is similar to Wiki or Markdown and much quicker than hand-coding HTML or XML tags.
  * The underscore applies to XML tag names only, not HTML. In addition, XML tags can of course have a much wider range of characters in their names than what is represented in the ASCII set.
  * If the command `perldoc` doesn't work, you have some alternatives. First, you can easily read about Perl online at perldoc.perl.org. (To learn more about `$.`, for example, see the perlvar page there.) If you are on a Mac, try `perldoc5.12`. If you installed Perl from ActiveState, you will find it at `/usr/local/ActivePerl-5.XX/bin`. Both `perl` and `perldoc` are installed at `/usr/local/bin` when compiled and built from source. You can add `/usr/local/bin` to your path so `perl` and `perldoc` will run.

# Chapter 10. The End of the Beginning

> "Unix was not designed to stop you from doing stupid things, because that would also stop you from doing clever things." —Doug Gwyn

Congratulations for making it this far. You're not a regular expression novice anymore. You have been introduced to the most commonly used regular expression syntax, and it will open a lot of possibilities up to you in your work as a programmer.

Learning regular expressions has saved me a lot of time. Let me give you an example.

I use a lot of XSLT at work, and often I have to analyze the tags that exist in a group of XML files.
I showed you part of this in the last chapter, but here is a long one-liner that takes a list of tag names from _lorem.dita_ and converts it into a simple XSLT stylesheet:

 grep -Eo '<[_a-zA-Z][^>]*>' lorem.dita | sort | uniq | sed '1 i\
 <?xml version="1.0" encoding="UTF-8"?>\
 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0">
 s/^</<xsl:template match="/; s/>$/">\
  <xsl:apply-templates\/>\
 <\/xsl:template>/;$ a\
 \
 </xsl:stylesheet>
 '

I know this script may appear a bit acrobatic, but after you work with this stuff for a long time, you start thinking like this. I am not even going to explain what I've done here, because I am sure you can figure it out on your own now.

Here is what the output looks like:

 <?xml version="1.0" encoding="UTF-8"?>
 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0">
 <xsl:template match="body">
  <xsl:apply-templates/>
 </xsl:template>
 <xsl:template match="li">
  <xsl:apply-templates/>
 </xsl:template>
 <xsl:template match="p">
  <xsl:apply-templates/>
 </xsl:template>
 <xsl:template match="title">
  <xsl:apply-templates/>
 </xsl:template>
 <xsl:template match="topic">
  <xsl:apply-templates/>
 </xsl:template>
 <xsl:template match="ul">
  <xsl:apply-templates/>
 </xsl:template>

 </xsl:stylesheet>

That's only a start. Of course, this simple stylesheet will need a lot of editing before it can do anything useful, but this is the kind of thing that can save you a lot of keystrokes.

I'll admit, it would be easier if I put these _sed_ commands in a file. As a matter of fact, I did. You'll find _xslt.sed_ in the sample archive. This is the file:

 #!/usr/bin/sed

 1 i\
 <?xml version="1.0" encoding="UTF-8"?>\
 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0">

 s/^</<xsl:template match="/; s/>$/">\
  <xsl:apply-templates\/>\
 <\/xsl:template>/;$ a\
 \
 </xsl:stylesheet>

And here is how to run it:

 grep -Eo '<[_a-zA-Z][^>]*>' lorem.dita | sort | uniq | sed -f xslt.sed

# Learning More

Even though you have a good, strong grip on regex now, there is still lots to learn. I have a couple of suggestions for where to go next.

I pass these recommendations along out of experience and observation, not from any sense of obligation or to be "salesy." I won't get any kickbacks for mentioning them. I talk about them because these resources will actually benefit you.

Jeffrey E. F. Friedl's _Mastering Regular Expressions, Third Edition_ is the source many programmers look to for a definitive treatment of the regular expression. It is both expansive and well written; if you are going to do any significant work with regex, you need to have this book on your shelf or in your e-reader. Period.

Jan Goyvaerts and Steven Levithan's _Regular Expressions Cookbook_ is another great piece of work, especially if you are comparing different implementations. I'd get this one, too.

The _Regular Expression Pocket Reference: Regular Expressions for Perl, Ruby, PHP, Python, C, Java and .NET_ by Tony Stubblebine is a 128-page guide that, though it is several years old, remains popular.

Andrew Watt's book _Beginning Regular Expressions_ (Wrox, 2005) is highly rated. I have found Bruce Barnett's online _sed_ tutorial particularly useful (see ). He demonstrates a number of _sed_'s less understood features, features I have not explained here.

# Notable Tools, Implementations, and Libraries

I've mentioned a number of tools, implementations, and libraries in this book. I'll recap those here and mention several others.

## Perl

Perl is a popular, general-purpose programming language. A lot of people prefer Perl for text processing with regular expressions over other languages. You likely already have it, but for information on how to install Perl on your system, go to . Read about Perl's regular expressions at . Don't get me wrong. There are plenty of other languages that do a great job with regex, but it pays to have Perl in your toolbox. To learn more, I'd get a copy of the latest edition of _Learning Perl_, by Randal Schwartz, brian d foy, and Tom Phoenix, also published by O'Reilly.
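To give you a taste of why, here is a small sketch of my own (not from _Learning Perl_); it prints only the lines of a file that contain something shaped like a four-digit year, using nothing more than the syntax you already know (`datebook.txt` is a hypothetical input file):

 perl -ne 'print if /\b(19|20)\d\d\b/' datebook.txt

The `-n` switch wraps the pattern in a read loop over the input, so one short expression does the work of a small program.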
## PCRE

Perl Compatible Regular Expressions or PCRE (see ) is a regular expression library written in C (both 8-bit and 16-bit). This library mainly consists of functions that may be called from within any C framework or from any other language that can use C libraries. It is compatible with Perl 5 regular expressions, as its name suggests, and includes some features from other regex implementations. The Notepad++ editor uses the PCRE library.

_pcregrep_ is an 8-bit, _grep_-like tool that enables you to use the features of the PCRE library on the command line. You used it in Chapter 3. See  for download information (from ). You can get _pcregrep_ for the Mac through MacPorts () by running the command `sudo port install pcre` (Xcode is a prerequisite; see , where a login is required). To install it on the Windows platform (binaries), go to .

## Ruby (Oniguruma)

Oniguruma is a regular expression library that is standard with Ruby 1.9; see . It is written in C and was developed specifically to support Ruby. You can try out Ruby's regular expressions using Rubular, an online app that supports both 1.8.7 and 1.9.2 (see  and Figure 10-1). TextMate, by the way, uses the Oniguruma library.

Figure 10-1. Phone number regex in Rubular

## Python

Python is a general-purpose programming language that supports regular expressions (see ). It was created by Guido van Rossum and first released in 1991. You can read about Python 3's regular expression syntax here: .

## RE2

RE2 is a non-backtracking C++ regular expression library (see ). While RE2 is quite fast, it does not do backtracking or backreferences. It is available as a CPAN package for Perl and can fall back on Perl's native regex engine when backreferences are needed. For instructions on making API calls, see . For an interesting discussion of RE2, see "Regular Expression Matching in the Wild" at .

# Matching a North American Phone Number

You remember the North American phone number example from the first chapter? You've come a long way since then.

Here is a more robust regular expression for matching phone numbers than the one we used there. It is adapted from Goyvaerts and Levithan's example on page 235 of their _Regular Expressions Cookbook_ (first edition).

 ^\(?(?:\d{3})\)?[-.]?(?:\d{3})[-.]?(?:\d{4})$

Play with it in the tool of your choice (see it in Reggy in Figure 10-2). By now, you should be able to pick this regex apart with hardly any hand-holding. I'm proud of you for that. But I'll go over it for good measure.

 * `^` is the zero-width assertion for the beginning of a line or subject.

 * `\(?` is a literal left parenthesis, but it is optional (`?`).

 * `(?:\d{3})` is a non-capturing group matching three consecutive digits.

 * `\)?` is an optional right parenthesis.

 * `[-.]?` allows for an optional hyphen or period (dot).

 * `(?:\d{3})` is another non-capturing group matching three more consecutive digits.

 * `[-.]?` allows for an optional hyphen or dot again.

 * `(?:\d{4})` is yet another non-capturing group matching exactly four consecutive digits.

 * `$` matches the end of a line or subject.

This expression could be even more refined, but I leave that to you because you can now do it on your own.

Figure 10-2. Phone number regex in Reggy

# Matching an Email Address

Lastly, I'll throw one more regular expression at you, an email address:

 ^([\w-.!#$%&'*+-/=?^_`{|}~]+)@((?:\w+\.)+)(?:[a-zA-Z]{2,4})$

This is an adaptation of one provided by Grant Skinner with RegExr.
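If you'd like to test both of these patterns side by side without reaching for an online tool, here is a minimal Perl sketch of my own (the sample strings are just assumptions; substitute your own):

 #!/usr/bin/perl
 use strict;
 use warnings;

 # The phone and email patterns discussed above. The email pattern is kept
 # in a single-quoted string so Perl does not try to interpolate the $ and
 # @ characters it contains; backslashes were added before the two hyphens
 # so they cannot be read as ranges inside the character class.
 my $phone = qr/^\(?(?:\d{3})\)?[-.]?(?:\d{3})[-.]?(?:\d{4})$/;
 my $email = q{^([\w\-.!#$%&'*+\-/=?^_`{|}~]+)@((?:\w+\.)+)(?:[a-zA-Z]{2,4})$};

 for my $s ('707-827-7000', '(707) 827-7000', 'regex@example.com') {
     printf "%-20s phone? %-3s email? %s\n", $s,
         ($s =~ $phone   ? 'yes' : 'no'),
         ($s =~ /$email/ ? 'yes' : 'no');
 }

Note that the second sample fails the phone pattern, because the expression makes no allowance for a space after the area code; that is one of the refinements I left to you.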
I'd like to challenge you to do your best to explain what each character means in the context of a regular expression, and to see if you can improve on it. I am sure you can.

Thank you for your time. I've enjoyed spending it with you. You should now have a good grasp of the fundamental concepts of regular expressions. You are no longer a member of the beginners' club. I hope you've made friends with regular expressions and learned something worthwhile along the way.

# What You Learned in Chapter 10

 * How to extract a list of XML elements from a document and convert the list into an XSLT stylesheet.

 * Where to find additional resources for learning about regular expressions.

 * Some notable regex tools, implementations, and libraries.

 * A slightly more robust pattern for matching a North American phone number.

# Appendix A. Regular Expression Reference

This appendix is a reference for regular expressions.

# Regular Expressions in QED

QED (short for Quick Editor) was originally written for the Berkeley Time-Sharing System, which ran on the Scientific Data Systems SDS 940. A rewrite of the original QED editor by Ken Thompson for MIT's Compatible Time-Sharing System yielded one of the earliest (if not the first) practical implementations of regular expressions in computing. Table A-1, taken from pages 3 and 4 of a 1970 Bell Labs memo, outlines the regex features in QED. It amazes me that most of this syntax has remained in use to this day, over 40 years later.

Table A-1. QED regular expressions

Feature| Description
---|---

_literal_ | "a) An ordinary character [literal] is a regular expression which matches that character."

^ | "b) _^_ is a regular expression which matches the null character at the beginning of a line."

$ | "c) _$_ is a regular expression which matches the null character before the character [newline] (usually at the end of a line)."

. | "d) _._ is a regular expression which matches any character except [newline]."

[] | "e) "[<list>]" is a regular expression which matches any of the characters in the <list> and no others."

[^] | "f) "[^<list>]" is a regular expression which matches any character but [newline] and the characters of the <list>."

* | "g) A regular expression followed by "*" is a regular expression which matches any number (including zero) of adjacent occurrences of the text matched by the regular expression."

|

"h) Two adjacent regular expressions form a regular expression which matches adjacent occurrences of the text matched by the regular expressions."

| | "i) Two regular expressions separated by "|" form a regular expression which matches the text matched by either of the regular expressions."

( ) | "j) A regular expression in parentheses is a regular expression which matches the same text as the original regular expression. Parentheses are used to alter the order of evaluation implied by g), h), and i): _a(b|c)d_ will match _abd_ or _acd_ , while _ab|cd_ matches _ab_ or _cd_."

{ } | "k) If "<regexp>" is a regular expression, "{<regexp>}x" is a regular expression, where _x_ is any character. This regular expression matches the same things as <regexp>; it has certain side effects as explained under the Substitute command." [The Substitute command took the form _(.,.)S/<regexp>/<text>/_ (see page 13 of the memo), similar to the way it is still used in programs like _sed_ and Perl.]
\E | "l) If <name> is the name of a regular expression named by the E command (below), then "\E<name>" is a regular expression which matches the same things as the regular expression specified in the E command. More discussion is presented under the E command." [The \E command allowed you to name a regular expression and repeat its use by name.]

|

"m) The null regular expression standing alone is equivalent to the last regular expression encountered. Initially the null regular expression is undefined; it also becomes undefined after an erroneous regular expression and after use of the E command."

|

"n) Nothing else is a regular expression."

|

"o) No regular expression will match text spread across more than one line."

# Metacharacters

There are 14 metacharacters used in regular expressions, each with special meaning, as described in Table A-2. If you want to use one of these characters as a literal, you must precede it with a backslash to escape it. For example, you would escape the dollar sign like this `\$`, or a backslash like this `\\`.

Table A-2. Metacharacters in regular expressions

Metacharacter| Name| Code Point| Purpose
---|---|---|---

. | Full Stop | U+002E | Match any character

\ | Backslash | U+005C | Escape a character

| | Vertical Bar | U+007C | Alternation (or)

^ | Circumflex | U+005E | Beginning of a line anchor

$ | Dollar Sign | U+0024 | End of a line anchor

? | Question Mark | U+003F | Zero or one quantifier

* | Asterisk | U+002A | Zero or more quantifier

+ | Plus Sign | U+002B | One or more quantifier

[ | Left Square Bracket | U+005B | Open character class

] | Right Square Bracket | U+005D | Close character class

{ | Left Curly Brace | U+007B | Open quantifier or block

} | Right Curly Brace | U+007D | Close quantifier or block

( | Left Parenthesis | U+0028 | Open group

) | Right Parenthesis | U+0029 | Close group
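As a quick illustration of escaping, this one-liner (my own example; `prices.txt` is a hypothetical file) prints only the lines that contain a dollar amount, escaping both the dollar sign and the dot so each is matched literally:

 perl -ne 'print if /\$\d+\.\d\d/' prices.txt

Unescaped, the `$` would anchor to the end of the line and the `.` would match any character at all.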
# Character Shorthands

Table A-3 lists character shorthands used in regular expressions.

Table A-3. Character shorthands

Character Shorthand| Description
---|---

\a | Alert

\b | Word boundary

[\b] | Backspace character

\B | Non-word boundary

\cx | Control character

\d | Digit character

\D | Non-digit character

\d _xxx_ | Decimal value for a character

\f | Form feed character

\r | Carriage return

\n | Newline character

\o _xxx_ | Octal value for a character

\s | Space character

\S | Non-space character

\t | Horizontal tab character

\v | Vertical tab character

\w | Word character

\W | Non-word character

\0 | Null character

\x _xx_ | Hexadecimal value for a character

\u _xxxx_ | Unicode value for a character

# Whitespace

Table A-4 is a list of character shorthands for whitespace.

Table A-4. Whitespace characters

Character Shorthand| Description
---|---

\f | Form feed

\h | Horizontal whitespace

\H | Not horizontal whitespace

\n | Newline

\r | Carriage return

\t | Horizontal tab

\v | Vertical whitespace

\V | Not vertical whitespace

# Unicode Whitespace Characters

Whitespace characters in Unicode are listed in Table A-5.

Table A-5. Whitespace characters in Unicode

Abbreviation or Nickname| Name| Unicode Code Point| Regex
---|---|---|---

HT | Horizontal tab | U+0009 | \u0009 or \t

LF | Line feed | U+000A | \u000A or \n

VT | Vertical tab | U+000B | \u000B or \v

FF | Form feed | U+000C | \u000C or \f

CR | Carriage return | U+000D | \u000D or \r

SP | Space | U+0020 | \u0020 or \s[a]

NEL | Next line | U+0085 | \u0085

NBSP | No-break space | U+00A0 | \u00A0

— | Ogham space mark | U+1680 | \u1680

MVS | Mongolian vowel separator | U+180E | \u180E

BOM | Byte order mark | U+FEFF | \uFEFF

NQSP | En quad | U+2000 | \u2000

MQSP, Mutton Quad | Em quad | U+2001 | \u2001

ENSP, Nut | En space | U+2002 | \u2002

EMSP, Mutton | Em space | U+2003 | \u2003

3MSP, Thick space | Three-per-em space | U+2004 | \u2004

4MSP, Mid space | Four-per-em space | U+2005 | \u2005

6/MSP | Six-per-em space | U+2006 | \u2006

FSP | Figure space | U+2007 | \u2007

PSP | Punctuation space | U+2008 | \u2008

THSP | Thin space | U+2009 | \u2009

HSP | Hair space | U+200A | \u200A

ZWSP | Zero width space | U+200B | \u200B

LSEP | Line separator | U+2028 | \u2028

PSEP | Paragraph separator | U+2029 | \u2029

NNBSP | Narrow no-break space | U+202F | \u202F

MMSP | Medium mathematical space | U+205F | \u205F

IDSP | Ideographic space | U+3000 | \u3000

[a] Also matches other whitespace.

# Control Characters

Table A-6 shows a way to match control characters in regular expressions.

Table A-6. Matching control characters

Control Character| Unicode Value| Abbreviation| Name
---|---|---|---

\c@[a] | U+0000 | NUL | Null

\cA | U+0001 | SOH | Start of heading

\cB | U+0002 | STX | Start of text

\cC | U+0003 | ETX | End of text

\cD | U+0004 | EOT | End of transmission

\cE | U+0005 | ENQ | Enquiry

\cF | U+0006 | ACK | Acknowledge

\cG | U+0007 | BEL | Bell

\cH | U+0008 | BS | Backspace

\cI | U+0009 | HT | Character tabulation or horizontal tab

\cJ | U+000A | LF | Line feed (newline, end of line)

\cK | U+000B | VT | Line tabulation or vertical tab

\cL | U+000C | FF | Form feed

\cM | U+000D | CR | Carriage return

\cN | U+000E | SO | Shift out

\cO | U+000F | SI | Shift in

\cP | U+0010 | DLE | Data link escape

\cQ | U+0011 | DC1 | Device control one

\cR | U+0012 | DC2 | Device control two

\cS | U+0013 | DC3 | Device control three

\cT | U+0014 | DC4 | Device control four

\cU | U+0015 | NAK | Negative acknowledge

\cV | U+0016 | SYN | Synchronous idle

\cW | U+0017 | ETB | End of transmission block

\cX | U+0018 | CAN | Cancel

\cY | U+0019 | EM | End of medium

\cZ | U+001A | SUB | Substitute

\c[ | U+001B | ESC | Escape

\c\ | U+001C | FS | Information separator four

\c] | U+001D | GS | Information separator three

\c^ | U+001E | RS | Information separator two

\c_ | U+001F | US | Information separator one

[a] Can use upper- or lowercase. For example, `\cA` and `\ca` are equivalent; however, Java implementations require uppercase.

# Character Properties

Table A-7 lists character property names for use with `\p{`_property_`}` or `\P{`_property_`}`.
Table A-7. Character properties[2]

Property| Description
---|---

C | Other

Cc | Control

Cf | Format

Cn | Unassigned

Co | Private use

Cs | Surrogate

L | Letter

Ll | Lowercase letter

Lm | Modifier letter

Lo | Other letter

Lt | Title case letter

Lu | Uppercase letter

L& | Ll, Lu, or Lt

M | Mark

Mc | Spacing mark

Me | Enclosing mark

Mn | Non-spacing mark

N | Number

Nd | Decimal number

Nl | Letter number

No | Other number

P | Punctuation

Pc | Connector punctuation

Pd | Dash punctuation

Pe | Close punctuation

Pf | Final punctuation

Pi | Initial punctuation

Po | Other punctuation

Ps | Open punctuation

S | Symbol

Sc | Currency symbol

Sk | Modifier symbol

Sm | Mathematical symbol

So | Other symbol

Z | Separator

Zl | Line separator

Zp | Paragraph separator

Zs | Space separator

[2] See pcresyntax(3) at .

# Script Names for Character Properties

Table A-8 shows the language script names for use with `\p{`_property_`}` or `\P{`_property_`}`.

Table A-8. Script names[3]

Arabic (Arab)| Glagolitic (Glag)| Lepcha (Lepc)| Samaritan (Samr)
---|---|---|---

Armenian (Armn) | Gothic (Goth) | Limbu (Limb) | Saurashtra (Saur)

Avestan (Avst) | Greek (Grek) | Linear B (Linb) | Shavian (Shaw)

Balinese (Bali) | Gujarati (Gujr) | Lisu (Lisu) | Sinhala (Sinh)

Bamum (Bamu) | Gurmukhi (Guru) | Lycian (Lyci) | Sundanese (Sund)

Bengali (Beng) | Han (Hani) | Lydian (Lydi) | Syloti Nagri (Sylo)

Bopomofo (Bopo) | Hangul (Hang) | Malayalam (Mlym) | Syriac (Syrc)

Braille (Brai) | Hanunoo (Hano) | Meetei Mayek (Mtei) | Tagalog (Tglg)

Buginese (Bugi) | Hebrew (Hebr) | Mongolian (Mong) | Tagbanwa (Tagb)

Buhid (Buhd) | Hiragana (Hira) | Myanmar (Mymr) | Tai Le (Tale)

Canadian Aboriginal (Cans) | Hrkt (Katakana or Hiragana) | New Tai Lue (Talu) | Tai Tham (Lana)

Carian (Cari) | Imperial Aramaic (Armi) | Nko (Nkoo) | Tai Viet (Tavt)

Cham (None) | Inherited (Zinh/Qaai) | Ogham (Ogam) | Tamil (Taml)

Cherokee (Cher) | Inscriptional Pahlavi (Phli) | Ol Chiki (Olck) | Telugu (Telu)

Common (Zyyy) | Inscriptional Parthian (Prti) | Old Italic (Ital) | Thaana (Thaa)

Coptic (Copt/Qaac) | Javanese (Java) | Old Persian (Xpeo) | Thai (None)

Cuneiform (Xsux) | Kaithi (Kthi) | Old South Arabian (Sarb) | Tibetan (Tibt)

Cypriot (Cprt) | Kannada (Knda) | Old Turkic (Orkh) | Tifinagh (Tfng)

Cyrillic (Cyrl) | Katakana (Kana) | Oriya (Orya) | Ugaritic (Ugar)

Deseret (Dsrt) | Kayah Li (Kali) | Osmanya (Osma) | Unknown (Zzzz)

Devanagari (Deva) | Kharoshthi (Khar) | Phags Pa (Phag) | Vai (Vaii)

Egyptian Hieroglyphs (Egyp) | Khmer (Khmr) | Phoenician (Phnx) | Yi (Yiii)

Ethiopic (Ethi) | Lao (Laoo) | Rejang (Rjng) |

Georgian (Geor) | Latin (Latn) | Runic (Runr) |

[3] See pcresyntax(3) at or .

# POSIX Character Classes

Table A-9 shows a list of POSIX character classes.
Table A-9. POSIX character classes

Character Class| Description
---|---

[[:alnum:]] | Alphanumeric characters (letters and digits)

[[:alpha:]] | Alphabetic characters (letters)

[[:ascii:]] | ASCII characters (all 128)

[[:blank:]] | Blank characters

[[:cntrl:]] | Control characters

[[:digit:]] | Digits

[[:graph:]] | Graphic characters

[[:lower:]] | Lowercase letters

[[:print:]] | Printable characters

[[:punct:]] | Punctuation characters

[[:space:]] | Whitespace characters

[[:upper:]] | Uppercase letters

[[:word:]] | Word characters

[[:xdigit:]] | Hexadecimal digits

# Options/Modifiers

Tables A-10 and A-11 list options and modifiers.

Table A-10. Options in regular expressions

Option| Description| Supported by
---|---|---

`(?d)` | Unix lines | Java

`(?i)` | Case insensitive | PCRE, Perl, Java

`(?J)` | Allow duplicate names | PCRE[a]

`(?m)` | Multiline | PCRE, Perl, Java

`(?s)` | Single line (dotall) | PCRE, Perl, Java

`(?u)` | Unicode case | Java

`(?U)` | Default match lazy | PCRE

`(?x)` | Ignore whitespace, comments | PCRE, Perl, Java

`(?-...)` | Unset or turn off options | PCRE

[a] See "Named Subpatterns" in .

Table A-11. Perl modifiers (flags)[4]

Modifier| Description
---|---

a | Match `\d`, `\s`, `\w`, and POSIX classes in the ASCII range only

c | Keep current position after match fails

d | Use default, native rules of the platform

g | Global matching

i | Case-insensitive matching

l | Use current locale's rules

m | Multiline strings

p | Preserve the matched string

s | Treat strings as a single line

u | Use Unicode rules when matching

x | Ignore whitespace and comments

[4] See .
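To see a couple of the Perl modifiers from Table A-11 at work, here is a small sketch of my own that uses `g` and `i` together to replace every occurrence of a word in _rime.txt_, regardless of case:

 perl -pe 's/marinere/MARINERE/gi' rime.txt

Without `g`, only the first occurrence on each line would change; without `i`, only the lowercase spellings would match.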
# ASCII Code Chart with Regex

Table A-12 is an ASCII code chart with regex cross-references.

Table A-12. ASCII code chart

Binary| Oct| Dec| Hex| Char| Kybd| Regex| Name
---|---|---|---|---|---|---|---

00000000 | 0 | 0 | 0 | NUL | ^@ | \c@ | Null character

00000001 | 1 | 1 | 1 | SOH | ^A | \cA | Start of header

00000010 | 2 | 2 | 2 | STX | ^B | \cB | Start of text

00000011 | 3 | 3 | 3 | ETX | ^C | \cC | End of text

00000100 | 4 | 4 | 4 | EOT | ^D | \cD | End of transmission

00000101 | 5 | 5 | 5 | ENQ | ^E | \cE | Enquiry

00000110 | 6 | 6 | 6 | ACK | ^F | \cF | Acknowledgment

00000111 | 7 | 7 | 7 | BEL | ^G | \a, \cG | Bell

00001000 | 10 | 8 | 8 | BS | ^H | [\b], \cH | Backspace

00001001 | 11 | 9 | 9 | HT | ^I | \t, \cI | Horizontal tab

00001010 | 12 | 10 | 0A | LF | ^J | \n, \cJ | Line feed

00001011 | 13 | 11 | 0B | VT | ^K | \v, \cK | Vertical tab

00001100 | 14 | 12 | 0C | FF | ^L | \f, \cL | Form feed

00001101 | 15 | 13 | 0D | CR | ^M | \r, \cM | Carriage return

00001110 | 16 | 14 | 0E | SO | ^N | \cN | Shift out

00001111 | 17 | 15 | 0F | SI | ^O | \cO | Shift in

00010000 | 20 | 16 | 10 | DLE | ^P | \cP | Data link escape

00010001 | 21 | 17 | 11 | DC1 | ^Q | \cQ | Device control 1 (XON)

00010010 | 22 | 18 | 12 | DC2 | ^R | \cR | Device control 2

00010011 | 23 | 19 | 13 | DC3 | ^S | \cS | Device control 3 (XOFF)

00010100 | 24 | 20 | 14 | DC4 | ^T | \cT | Device control 4

00010101 | 25 | 21 | 15 | NAK | ^U | \cU | Negative acknowledgement

00010110 | 26 | 22 | 16 | SYN | ^V | \cV | Synchronous idle

00010111 | 27 | 23 | 17 | ETB | ^W | \cW | End of transmission block

00011000 | 30 | 24 | 18 | CAN | ^X | \cX | Cancel

00011001 | 31 | 25 | 19 | EM | ^Y | \cY | End of medium

00011010 | 32 | 26 | 1A | SUB | ^Z | \cZ | Substitute

00011011 | 33 | 27 | 1B | ESC | ^[ | \e, \c[ | Escape

00011100 | 34 | 28 | 1C | FS | ^\ | \c\ | File separator

00011101 | 35 | 29 | 1D | GS | ^] | \c] | Group separator

00011110 | 36 | 30 | 1E | RS | ^^ | \c^ | Record separator

00011111 | 37 | 31 | 1F | US | ^_ | \c_ | Unit separator

00100000 | 40 | 32 | 20 | SP | SP | \s, [ ] | Space

00100001 | 41 | 33 | 21 | ! | ! | ! | Exclamation mark

00100010 | 42 | 34 | 22 | " | " | " | Quotation mark

00100011 | 43 | 35 | 23 | # | # | # | Number sign

00100100 | 44 | 36 | 24 | $ | $ | \$ | Dollar sign

00100101 | 45 | 37 | 25 | % | % | % | Percent sign

00100110 | 46 | 38 | 26 | & | & | & | Ampersand

00100111 | 47 | 39 | 27 | ' | ' | ' | Apostrophe

00101000 | 50 | 40 | 28 | ( | ( | (, \( | Left parenthesis

00101001 | 51 | 41 | 29 | ) | ) | ), \) | Right parenthesis

00101010 | 52 | 42 | 2A | * | * | * | Asterisk

00101011 | 53 | 43 | 2B | + | + | + | Plus sign

00101100 | 54 | 44 | 2C | , | , | , | Comma

00101101 | 55 | 45 | 2D | - | - | - | Hyphen-minus

00101110 | 56 | 46 | 2E | . | . | \., [.] | Full stop
00101111 | 57 | 47 | 2F | / | / | / | Solidus

00110000 | 60 | 48 | 30 | 0 | 0 | \d, [0] | Digit zero

00110001 | 61 | 49 | 31 | 1 | 1 | \d, [1] | Digit one

00110010 | 62 | 50 | 32 | 2 | 2 | \d, [2] | Digit two

00110011 | 63 | 51 | 33 | 3 | 3 | \d, [3] | Digit three

00110100 | 64 | 52 | 34 | 4 | 4 | \d, [4] | Digit four

00110101 | 65 | 53 | 35 | 5 | 5 | \d, [5] | Digit five

00110110 | 66 | 54 | 36 | 6 | 6 | \d, [6] | Digit six

00110111 | 67 | 55 | 37 | 7 | 7 | \d, [7] | Digit seven

00111000 | 70 | 56 | 38 | 8 | 8 | \d, [8] | Digit eight

00111001 | 71 | 57 | 39 | 9 | 9 | \d, [9] | Digit nine

00111010 | 72 | 58 | 3A | : | : | : | Colon

00111011 | 73 | 59 | 3B | ; | ; | ; | Semicolon

00111100 | 74 | 60 | 3C | < | < | < | Less-than sign

00111101 | 75 | 61 | 3D | = | = | = | Equals sign

00111110 | 76 | 62 | 3E | > | > | > | Greater-than sign

00111111 | 77 | 63 | 3F | ? | ? | ? | Question mark

01000000 | 100 | 64 | 40 | @ | @ | @ | Commercial at

01000001 | 101 | 65 | 41 | A | A | \w, [A] | Latin capital letter A

01000010 | 102 | 66 | 42 | B | B | \w, [B] | Latin capital letter B

01000011 | 103 | 67 | 43 | C | C | \w, [C] | Latin capital letter C

01000100 | 104 | 68 | 44 | D | D | \w, [D] | Latin capital letter D

01000101 | 105 | 69 | 45 | E | E | \w, [E] | Latin capital letter E

01000110 | 106 | 70 | 46 | F | F | \w, [F] | Latin capital letter F

01000111 | 107 | 71 | 47 | G | G | \w, [G] | Latin capital letter G

01001000 | 110 | 72 | 48 | H | H | \w, [H] | Latin capital letter H

01001001 | 111 | 73 | 49 | I | I | \w, [I] | Latin capital letter I

01001010 | 112 | 74 | 4A | J | J | \w, [J] | Latin capital letter J

01001011 | 113 | 75 | 4B | K | K | \w, [K] | Latin capital letter K

01001100 | 114 | 76 | 4C | L | L | \w, [L] | Latin capital letter L

01001101 | 115 | 77 | 4D | M | M | \w, [M] | Latin capital letter M

01001110 | 116 | 78 | 4E | N | N | \w, [N] | Latin capital letter N

01001111 | 117 | 79 | 4F | O | O | \w, [O] | Latin capital letter O

01010000 | 120 | 80 | 50 | P | P | \w, [P] | Latin capital letter P

01010001 | 121 | 81 | 51 | Q | Q | \w, [Q] | Latin capital letter Q

01010010 | 122 | 82 | 52 | R | R | \w, [R] | Latin capital letter R

01010011 | 123 | 83 | 53 | S | S | \w, [S] | Latin capital letter S

01010100 | 124 | 84 | 54 | T | T | \w, [T] | Latin capital letter T

01010101 | 125 | 85 | 55 | U | U | \w, [U] | Latin capital letter U

01010110 | 126 | 86 | 56 | V | V | \w, [V] | Latin capital letter V

01010111 | 127 | 87 | 57 | W | W | \w, [W] | Latin capital letter W

01011000 | 130 | 88 | 58 | X | X | \w, [X] | Latin capital letter X

01011001 | 131 | 89 | 59 | Y | Y | \w, [Y] | Latin capital letter Y

01011010 | 132 | 90 | 5A | Z | Z | \w, [Z] | Latin capital letter Z

01011011 | 133 | 91 | 5B | [ | [ | \[ | Left square bracket

01011100 | 134 | 92 | 5C | \ | \ | \\ | Reverse solidus

01011101 | 135 | 93 | 5D | ] | ] | \] | Right square bracket

01011110 | 136 | 94 | 5E | ^ | ^ | ^, [^] | Circumflex accent

01011111 | 137 | 95 | 5F | _ | _ | _, [_] | Low line

01100000 | 140 | 96 | 60 | ` | ` | \` | Grave accent

01100001 | 141 | 97 | 61 | a | a | \w, [a] | Latin small letter A

01100010 | 142 | 98 | 62 | b | b | \w, [b] | Latin small letter B

01100011 | 143 | 99 | 63 | c | c | \w, [c] | Latin small letter C

01100100 | 144 | 100 | 64 | d | d | \w, [d] | Latin small letter D

01100101 | 145 | 101 | 65 | e | e | \w, [e] | Latin small letter E
01100110 | 146 | 102 | 66 | f | f | \w, [f] | Latin small letter F

01100111 | 147 | 103 | 67 | g | g | \w, [g] | Latin small letter G

01101000 | 150 | 104 | 68 | h | h | \w, [h] | Latin small letter H

01101001 | 151 | 105 | 69 | i | i | \w, [i] | Latin small letter I

01101010 | 152 | 106 | 6A | j | j | \w, [j] | Latin small letter J

01101011 | 153 | 107 | 6B | k | k | \w, [k] | Latin small letter K

01101100 | 154 | 108 | 6C | l | l | \w, [l] | Latin small letter L

01101101 | 155 | 109 | 6D | m | m | \w, [m] | Latin small letter M

01101110 | 156 | 110 | 6E | n | n | \w, [n] | Latin small letter N

01101111 | 157 | 111 | 6F | o | o | \w, [o] | Latin small letter O

01110000 | 160 | 112 | 70 | p | p | \w, [p] | Latin small letter P

01110001 | 161 | 113 | 71 | q | q | \w, [q] | Latin small letter Q

01110010 | 162 | 114 | 72 | r | r | \w, [r] | Latin small letter R

01110011 | 163 | 115 | 73 | s | s | \w, [s] | Latin small letter S

01110100 | 164 | 116 | 74 | t | t | \w, [t] | Latin small letter T

01110101 | 165 | 117 | 75 | u | u | \w, [u] | Latin small letter U

01110110 | 166 | 118 | 76 | v | v | \w, [v] | Latin small letter V

01110111 | 167 | 119 | 77 | w | w | \w, [w] | Latin small letter W

01111000 | 170 | 120 | 78 | x | x | \w, [x] | Latin small letter X

01111001 | 171 | 121 | 79 | y | y | \w, [y] | Latin small letter Y

01111010 | 172 | 122 | 7A | z | z | \w, [z] | Latin small letter Z

01111011 | 173 | 123 | 7B | { | { | { | Left curly brace

01111100 | 174 | 124 | 7C | | | | | \| | Vertical line (bar)

01111101 | 175 | 125 | 7D | } | } | } | Right curly brace

01111110 | 176 | 126 | 7E | ~ | ~ | \~ | Tilde

01111111 | 177 | 127 | 7F | DEL | ^? | \c? | Delete

# Technical Notes

You can find Ken Thompson and Dennis Ritchie's QED memo-cum-manual at .

# Regular Expression Glossary

anchor

Specifies a location in a line or string. For example, the caret or circumflex character (`^`) signifies the beginning of a line or string of characters, and the dollar sign character (`$`), the end of a line or string.

alternation

Separating a list of regular expressions with a vertical bar (`|`) character, indicating _or_. In other words, match any of the regular expressions separated by one or more | characters. In some applications, such as _grep_ or _sed_ that use basic regular expressions (BREs), the `|` is preceded by a backslash, as in `\|`. _See also_ basic regular expressions.

ASCII

American Standard Code for Information Interchange. A 128-character encoding scheme for English (Latin) characters developed in the 1960s. _See also_ Unicode.

assertions

 _See_ zero-width assertions.

atom

 _See_ metacharacter.

atomic group

A grouping that turns off backtracking when a regular expression inside `(?>...)` fails to match. _See also_ backtracking, groups.

backreference

Refers to a previous regular expression captured with parentheses, using a reference in the form of \1, \2, and so forth.

backtracking

Stepping back, character by character, through an attempted match to find a successful match. Used with a greedy match, but not a lazy or possessive match. Catastrophic backtracking occurs when a regex processor makes perhaps thousands of attempts to make a match and consumes a vast amount (read _most_) of the computing resources available. One way to avoid catastrophic backtracking is with atomic grouping. _See also_ atomic group, greedy match, lazy match, possessive match.
basic regular expressions

An early implementation of regular expressions that is less advanced and considered obsolete by most. Also called _BREs_. BREs required you to escape certain characters in order for them to function as metacharacters, such as braces (`\{` and `\}`). _See also_ extended regular expressions.

bound

 _See_ quantifier.

bracketed expression

A regular expression given in square brackets; for example, _[a-f]_, that is, the range of lowercase letters a through f. _See also_ character class.

branch

A concatenation of pieces in a regular expression in POSIX.1 terminology. _See also_ POSIX.

BREs

 _See_ basic regular expressions.

capturing group

 _See_ groups.

catastrophic backtracking

 _See_ backtracking.

character class

Usually, a set of characters enclosed in square brackets; for example, _[a-zA-Z0-9]_ is a character class for all upper- and lowercase characters plus digits in the ASCII or Low Basic Latin character set.

character escape

A character preceded by a backward slash. Examples are \t (horizontal tab), \v (vertical tab), and \f (form feed).

character set

 _See_ character class.

code point

 _See_ Unicode.

composability

"A schema language (or indeed a programming language) provides a number of atomic objects and a number of methods of composition. The methods of composition can be used to combine atomic objects into compound objects which can in turn be composed into further compound objects. The composability of the language is the degree to which the various methods of composition can be applied uniformly to all the various objects of the language, both atomic and compound...Composability improves ease of learning and ease of use. Composability also tends to improve the ratio between complexity and power: for a given amount of complexity, a more composable language will be more powerful than a less composable one." From James Clark, "The Design of RELAX NG," .

ed

The Unix line editor created by Ken Thompson in 1971, which implemented regular expressions. It was a precursor to _sed_ and _vi_.

EREs

 _See_ extended regular expressions.

extended regular expressions

Extended regular expressions or EREs added additional functionality to basic regular expressions or BREs, such as alternation (`|`) and quantifiers such as ? and +, which work with _egrep_ (extended grep). These new features were delineated in IEEE POSIX standard 1003.2-1992. You can use the _-E_ option with _grep_ (same as using _egrep_), which means that you want to use extended regular expressions rather than basic regular expressions. _See also_ alternation, basic regular expressions, grep.

flag

 _See_ modifier.

greedy match

A greedy match consumes as much of a target string as possible, and then backtracks through the string to attempt to find a match. _See_ backtracking, lazy match, possessive match.

grep

A Unix command-line utility for searching strings with regular expressions. Invented by Ken Thompson in 1973, _grep_ is said to have grown out of the _ed_ editor command `g/re/p` (global/regular expression/print). It is superseded but not retired by _egrep_ (or _grep -E_), which has additional metacharacters such as |, +, ?, (, and ). _grep_ uses basic regular expressions, whereas _grep -E_ or _egrep_ use extended regular expressions. _fgrep_ (_grep -F_) searches files using literal strings, and metacharacters like $, *, and | don't have special meaning. _See also_ basic regular expressions, extended regular expressions.
groups

Groups combine regular expression atoms within a pair of parentheses, `( )`. In some applications, such as _grep_ or _sed_ (without _-E_), you must precede the parenthesis with a backslash, as in `\(` or `\)`. There are capturing groups and non-capturing groups. A capturing group stores the captured group in memory so that it can be reused, while a non-capturing group does not. Atomic groups do not backtrack. _See also_ atomic group.

hexadecimal

A base 16 numbering system represented by the digits 0–9 and the letters A–F or a–f. For example, the base 10 number 15 is represented as F in hexadecimal, and 16 is 10.

hold buffer

 _See_ hold space.

hold space

Used by _sed_ to store one or more lines for further processing. Also called the _hold buffer_. _See also_ pattern space, _sed_.

lazy match

A lazy match consumes a subject string one character at a time, attempting to find a match. It does not backtrack. _See also_ backtracking, greedy match, possessive match.

literal

 _See_ string literal.

lookaround

 _See_ lookahead, lookbehind.

lookahead

A regular expression that matches only if another specified regular expression follows it. A positive lookahead uses the syntax `regex(?=regex)`. A negative lookahead matches only if the first regular expression is _not_ followed by the second. It uses the syntax `regex(?!regex)`.

lookbehind

A regular expression that matches only if another specified regular expression precedes it. A positive lookbehind uses the syntax `(?<=regex)regex`. A negative lookbehind matches only if the first regular expression is _not_ preceded by the second. It uses the syntax `(?<!regex)regex`.

metacharacter

A character with special meaning in a regular expression, such as the dot (.), the asterisk (*), or the dollar sign ($). The 14 regex metacharacters are listed in Table A-2. Also called an _atom_.

modifier

A character that changes the behavior of a regular expression, such as Perl's `g` (global) or `i` (case-insensitive) modifiers listed in Table A-11. Also called a _flag_. _See also_ quantifier.

pattern space

The buffer used by _sed_ to hold the current line being processed. Also called the _work buffer_. _See also_ hold space, _sed_.

piece

A portion of a regular expression, usually concatenated, in POSIX.1 terminology. _See also_ POSIX.

positive lookahead

 _See_ lookahead.

positive lookbehind

 _See_ lookbehind.

POSIX

Portable Operating System Interface for Unix. A family of Unix-related standards by the Institute of Electrical and Electronics Engineers (IEEE). The most recent POSIX standard for regular expressions is POSIX.1-2008 (see ).

possessive match

A possessive match consumes an entire subject string in one fell swoop, attempting to find a match. It does not backtrack. _See also_ backtracking, greedy match, lazy match.

quantifier

Defines the number of times a regular expression may occur in an attempted match. An integer or pair of integers separated by a comma, surrounded by braces, is one form; for example, `{3}` indicates that the expression may occur exactly three times (with older tools that use basic regular expressions, you must escape the braces, as in `\{3\}`).

Other quantifiers include `?` (zero or one times), `+` (one or more), and `*` (zero or more). A quantifier is also called a _bound_ or a _modifier_. By themselves, quantifiers are greedy. There are also lazy quantifiers (e.g., `{3}?`) and possessive quantifiers (e.g., `{3}+`). _See also_ basic regular expressions, greedy match, lazy match, possessive match.

regular expression

A specially encoded string of characters that, when used within an application or utility, may match other strings or sets of strings. First described in the early 1950s by the mathematician Stephen Kleene (1909–1994) in his work with formal language theory in his book _Introduction to Metamathematics_, published in 1952. It began to gain momentum in computer science with the work of Ken Thompson, _et al._, on the QED editor (under the General Electric Time Sharing System [GE-TSS] on a GE-635 computer) and, later, other tools under AT&T Bell Labs' Unix operating system in the early 1970s.
sed

A Unix streaming editor that accepts regular expressions and transforms text. It was developed in the early 1970s by Lee McMahon at Bell Labs. An example of _sed_: `sed -n 's/this/that/gp' file.ext > new.ext`. Use _sed -E_ to indicate that you want to use extended regular expressions. _See also_ extended regular expressions.

string literal

A string of characters interpreted literally—for example, the literal string "It is an ancyent Marinere" as opposed to something like "[Ii]t[ ]is[ ].*nere."

Unicode

Unicode is a system for encoding characters for writing systems of the world. Each character in Unicode is assigned a numeric code point. There are over 100,000 characters represented in Unicode. In regular expressions, a Unicode character can be specified as `\u`_xxxx_ or `\x{`_xxxx_`}`, where _x_ represents a hexadecimal digit in the range 0–9, A–F (or a–f), using one to four places. For example, `\u00E9` represents the character _é_, the Latin small letter _e_ with an acute accent.

vi

A Unix editor that was first developed in 1976 by Bill Joy and that uses regular expressions. The _vim_ editor is an improved replacement for _vi_, developed primarily by Bram Moolenaar (see ). I currently use six or seven different editors during a regular work day, but the one I use most often is _vim_. In fact, if I were shipwrecked on a desert island and could have only one text editor, I would choose _vim_. No question.

vim

 _See_ vi.

work buffer

 _See_ pattern space.

zero-width assertions

Boundaries that do not consume any characters in a match. `^` and `$`, which match the beginning and end of a line, respectively, are examples.

# Index

### A note on the digital index

A link in an index entry is displayed as the section title in which that entry appears. Because some sections have multiple index markers, it is not unusual for an entry to have several links to the same section. Clicking on any link will take you directly to the place in the text in which the marker appears.
### Symbols

$ (dollar sign), Quoting Literals, The Beginning and End of a Line, Regular Expressions in QED, Metacharacters

matching end of line with, Regular Expressions in QED
as metacharacter, Metacharacters
usage examples, Quoting Literals, The Beginning and End of a Line
() (parentheses), Capturing Groups and Back References, Quoting Literals, Alternation, Groups, and Backreferences, Subpatterns, Regular Expressions in QED, Metacharacters

as metacharacters, Metacharacters
QED regex feature, Regular Expressions in QED
subpatterns and, Subpatterns
usage examples, Capturing Groups and Back References, Quoting Literals, Alternation, Groups, and Backreferences
* (asterisk), Using Quantifiers, Matching Any Character, Once Again, Subpatterns, Matching with *, +, and ?, Regular Expressions in QED, Metacharacters, Regular Expression Glossary

as metacharacter, Metacharacters
QED regex feature, Regular Expressions in QED
as quantifier, Using Quantifiers, Matching Any Character, Once Again, Subpatterns, Matching with *, +, and ?, Regular Expression Glossary
+ (plus sign), Using Quantifiers, Matching with *, +, and ?, Metacharacters, Regular Expression Glossary

as metacharacter, Metacharacters
as quantifier, Using Quantifiers, Matching with *, +, and ?, Regular Expression Glossary
- (hyphen) metacharacter, Quoting a Group of Characters as Literals
. (dot) character, Matching Any Character, Matching Any Character, Once Again, Regular Expressions in QED, Metacharacters

described, Matching Any Character, Once Again
matching any character, Matching Any Character
as metacharacter, Metacharacters
QED regex feature, Regular Expressions in QED
/ (forward slash), Word and Non-word Boundaries, Matching Tags
\0 (Null) character shorthand, Matching Word and Non-Word Characters, Character Shorthands
; (semicolon), Using sed to Mark Up Text
<> (angle brackets), Matching Tags
? (question mark), Using Quantifiers, Quoting Literals, Matching with *, +, and ?, Matching Tags, Metacharacters, Regular Expression Glossary

matching tags, Matching Tags
as metacharacter, Metacharacters
as quantifier, Using Quantifiers, Matching with *, +, and ?, Regular Expression Glossary
usage examples, Quoting Literals
[] (square brackets), Quoting Literals, Character Classes, Metacharacters

as metacharacters, Metacharacters
usage examples, Quoting Literals, Character Classes
\ (backslash) metacharacter, Quoting Literals, The Beginning and End of a Line, Adding Tags with sed, Metacharacters, Metacharacters

described, Metacharacters
escaping metacharacters, The Beginning and End of a Line, Metacharacters
inserting newlines, Adding Tags with sed
usage example, Quoting Literals
^ (caret), Quoting Literals, The Beginning and End of a Line–The Beginning and End of a Line, Negated Character Classes, Regular Expressions in QED, Metacharacters

matching beginning or end of lines, The Beginning and End of a Line–The Beginning and End of a Line
as metacharacter, Metacharacters
negated character classes, Negated Character Classes
QED regex feature, Regular Expressions in QED
usage example, Quoting Literals
_ (underscore), Matching Tags, Technical Notes
{} (curly braces), Using Quantifiers, Quoting Literals, Matching a Specific Number of Times, Regular Expressions in QED, Metacharacters

as metacharacters, Using Quantifiers, Metacharacters
QED regex feature, Regular Expressions in QED
usage example, Quoting Literals, Matching a Specific Number of Times
| (vertical bar), Quoting Literals, Regular Expressions in QED, Metacharacters

as metacharacter, Metacharacters
QED regex feature, Regular Expressions in QED
usage example, Quoting Literals

### A

a (append) command (sed), Appending Tags
\a (alert) character shorthand, Matching Word and Non-Word Characters, Character Shorthands
\A (start of subject) character shorthand, Other Anchors
a modifier (Perl), Alternation, Options/Modifiers
ack tool, Matching Unicode Character Properties, Technical Notes
Adobe AIR runtime, Technical Notes
alert (\a) character shorthand, Matching Word and Non-Word Characters, Character Shorthands
[[:alnum:]] POSIX character class, POSIX Character Classes, POSIX Character Classes
[[:alpha:]] POSIX character class, POSIX Character Classes, POSIX Character Classes
alternation, Quoting Literals, Alternation, Alternation, Alternation, Alternation, Regular Expression Glossary

described, Quoting Literals, Alternation, Regular Expression Glossary
with grep, Alternation
with Perl, Alternation
with RegExr, Alternation
American Standard Code for Information Interchange (ASCII), ASCII Code Chart with Regex–ASCII Code Chart with Regex, Regular Expression Glossary

described, Regular Expression Glossary
regex cross-references, ASCII Code Chart with Regex–ASCII Code Chart with Regex
"An die Freude" (Schiller), Matching Unicode Character Properties
anchors, Boundaries, Regular Expression Glossary
angle brackets (<>), Matching Tags
append (a) command (sed), Appending Tags
ASCII (American Standard Code for Information Interchange), ASCII Code Chart with Regex–ASCII Code Chart with Regex, Regular Expression Glossary

described, Regular Expression Glossary
regex cross-references, ASCII Code Chart with Regex–ASCII Code Chart with Regex
[[:ascii:]] POSIX character class, POSIX Character Classes, POSIX Character Classes
AsciiDoc text format, Technical Notes
assertions, Boundaries, Boundaries, Boundaries, Regular Expression Glossary

as boundaries, Boundaries
described, Boundaries
zero-width, Boundaries, Regular Expression Glossary
asterisk (*), Using Quantifiers, Matching Any Character, Once Again, Subpatterns, Matching with *, +, and ?, Regular Expressions in QED, Metacharacters, Regular Expression Glossary

as metacharacter, Metacharacters
QED regex feature, Regular Expressions in QED
as quantifier, Using Quantifiers, Matching Any Character, Once Again, Subpatterns, Matching with *, +, and ?, Regular Expression Glossary
atom, Regular Expression Glossary (see metacharacters)
atomic groups, Atomic Groups, Technical Notes, Regular Expression Glossary

### B

[\b] (backspace) character shorthand, Matching Word and Non-Word Characters, Character Shorthands
\b (word boundary) character shorthand, Matching Word and Non-Word Characters, Word and Non-word Boundaries–Word and Non-word Boundaries, Character Shorthands
\B (non-word boundary) character shorthand, Matching Word and Non-Word Characters, Word and Non-word Boundaries, Character Shorthands
backreferences, capturing groups and, Capturing Groups and Back References, Capturing Groups and Backreferences–Named Groups
backslash (\) metacharacter, Quoting Literals, The Beginning and End of a Line, Adding Tags with sed, Metacharacters, Metacharacters

described, Metacharacters
escaping metacharacters, The Beginning and End of a Line, Metacharacters
inserting newlines, Adding Tags with sed
usage example, Quoting Literals
backspace [\b] character shorthand, Matching Word and Non-Word Characters, Character Shorthands
backtracking, Atomic Groups, Atomic Groups, Greedy, Lazy, and Possessive–Matching a Specific Number of Times, Greedy, Lazy, and Possessive, Greedy, Lazy, and Possessive, Greedy, Lazy, and Possessive, Lazy Quantifiers, Possessive Quantifiers, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary

catastrophic, Atomic Groups, Regular Expression Glossary
described, Greedy, Lazy, and Possessive, Regular Expression Glossary
greedy match and, Greedy, Lazy, and Possessive–Matching a Specific Number of Times, Regular Expression Glossary
lazy match and, Greedy, Lazy, and Possessive, Lazy Quantifiers, Regular Expression Glossary
possessive match and, Greedy, Lazy, and Possessive, Possessive Quantifiers, Regular Expression Glossary
turning off, Atomic Groups
Barnett, Bruce, Learning More
Basho (poet), Matching a Unicode Character
basic regular expressions (BREs), Word and Non-word Boundaries, Alternation, Regular Expression Glossary

described, Regular Expression Glossary
grep and, Word and Non-word Boundaries, Alternation
Berkeley Time-Sharing System (BTSS), Preface, Regular Expressions in QED
[[:blank:]] POSIX character class, POSIX Character Classes, POSIX Character Classes
bound, Regular Expression Glossary (see quantifiers)
boundaries, Matching Word and Non-Word Characters, Boundaries, The Beginning and End of a Line–The Beginning and End of a Line, Word and Non-word Boundaries–Word and Non-word Boundaries, Word and Non-word Boundaries, Other Anchors, Quoting a Group of Characters as Literals, Adding Tags–Adding Tags with Perl, Character Shorthands

adding tags, Adding Tags–Adding Tags with Perl
assertions as, Boundaries
matching beginning and end of lines, The Beginning and End of a Line–The Beginning and End of a Line
matching start and end of subject, Other Anchors
non-word, Word and Non-word Boundaries
quoting groups of characters as literals, Quoting a Group of Characters as Literals
word, Matching Word and Non-Word Characters, Word and Non-word Boundaries–Word and Non-word Boundaries, Character Shorthands
bracketed expressions, Character Classes, Character Classes, Regular Expression Glossary

(see also character classes)
branches, Regular Expression Glossary
BREs (basic regular expressions), Word and Non-word Boundaries, Alternation, Regular Expression Glossary

described, Regular Expression Glossary
grep and, Word and Non-word Boundaries, Alternation
BTSS (Berkeley Time-Sharing System), Preface, Regular Expressions in QED

### C

\cx (control) character shorthand, Matching Word and Non-Word Characters, Matching Control Characters, Character Shorthands
c modifier (Perl), Alternation, Options/Modifiers
capturing groups, Capturing Groups and Back References, Capturing Groups and Back References, Capturing Groups and Backreferences–Named Groups, Named Groups, Regular Expression Glossary

backreferences and, Capturing Groups and Back References, Capturing Groups and Backreferences–Named Groups
described, Capturing Groups and Back References, Regular Expression Glossary
named groups, Named Groups
caret (^), Quoting Literals, The Beginning and End of a Line–The Beginning and End of a Line, Negated Character Classes, Regular Expressions in QED, Metacharacters

matching beginning or end of lines, The Beginning and End of a Line–The Beginning and End of a Line
as metacharacter, Metacharacters
negated character classes, Negated Character Classes
QED regex feature, Regular Expressions in QED
usage example, Quoting Literals
carriage return (\r) character shorthand, Matching Word and Non-Word Characters, Character Shorthands
case sensitivity, Matching String Literals, Alternation, Alternation, Positive Lookaheads

in Regexpal, Matching String Literals
in RegExr, Alternation, Positive Lookaheads
in regular expressions, Alternation
catastrophic backtracking, Atomic Groups, Regular Expression Glossary
character classes, Matching Digits with a Character Class, Matching Digits with a Character Class, Matching Word and Non-Word Characters, Character Classes–Character Classes, Character Classes, Character Classes, Character Classes, Negated Character Classes, Union and Difference, Union and Difference, POSIX Character Classes–POSIX Character Classes, POSIX Character Classes, Regular Expression Glossary, Regular Expression Glossary

creating, Character Classes
described, Matching Digits with a Character Class, Character Classes–Character Classes, Regular Expression Glossary
difference of, Union and Difference
fewest keystrokes win principle and, Matching Word and Non-Word Characters
matching digits with, Matching Digits with a Character Class
matching range of characters, Character Classes
matching range of digits, Character Classes
negated, Negated Character Classes, Regular Expression Glossary
POSIX, POSIX Character Classes–POSIX Character Classes, POSIX Character Classes
union of, Union and Difference
character escape, Using a Character Shorthand, The Beginning and End of a Line, Metacharacters, Regular Expression Glossary

described, Using a Character Shorthand, Regular Expression Glossary
metacharacters and, The Beginning and End of a Line, Metacharacters
character properties, Matching Unicode Character Properties–Matching Unicode Character Properties, Matching Unicode Character Properties, Character Properties, Script Names for Character Properties
described, Matching Unicode Character Properties, Character Properties
matching, Matching Unicode Character Properties–Matching Unicode Character Properties
script names for, Script Names for Character Properties
character sets, Matching Digits with a Character Class (see character classes)
character shorthand, What Is a Regular Expression?, Using a Character Shorthand, Using a Character Shorthand, Matching Word and Non-Word Characters, Matching Whitespace, Other Anchors, Quoting a Group of Characters as Literals, Character Classes, Character Shorthands, Whitespace

character class and, Character Classes
described, What Is a Regular Expression?, Using a Character Shorthand, Matching Word and Non-Word Characters, Character Shorthands
matching digits with, Using a Character Shorthand
quoting group of characters as literals, Quoting a Group of Characters as Literals
start and end of subject, Other Anchors
for whitespace, Matching Whitespace, Whitespace
characters, Matching Any Character, Matching Any Character, Once Again–Matching Any Character, Once Again, Quoting a Group of Characters as Literals, Character Classes, Matching Unicode and Other Characters

matching any, Matching Any Character, Matching Any Character, Once Again–Matching Any Character, Once Again
matching range of, Character Classes, Matching Unicode and Other Characters
quoting groups of characters as literals, Quoting a Group of Characters as Literals
Chrome browser, Technical Notes
circumflex, The Beginning and End of a Line (see caret (^))
Clark, James, Technical Notes, Regular Expression Glossary
code points, Regular Expression Glossary (see Unicode)
Coleridge, Samuel Taylor, Simple Pattern Matching
command files, Using sed to Mark Up Text, Using Perl to Mark Up Text, Adding Tags with sed, Adding Tags with Perl, Using a Command File with sed, Using a File of Commands with Perl, The End of the Beginning

using with Perl, Using Perl to Mark Up Text, Adding Tags with Perl, Using a File of Commands with Perl
using with sed, Using sed to Mark Up Text, Adding Tags with sed, Using a Command File with sed, The End of the Beginning
composability, Technical Notes, Regular Expression Glossary
control characters, Matching Word and Non-Word Characters, Matching Control Characters, Matching Control Characters, Technical Notes, Character Shorthands, Control Characters

additional information, Technical Notes
character shorthand, Matching Word and Non-Word Characters, Matching Control Characters, Character Shorthands
matching, Matching Control Characters
in regular expressions, Control Characters
[[:cntrl:]] POSIX character class, POSIX Character Classes, POSIX Character Classes
curly braces {}, Using Quantifiers, Quoting Literals, Matching a Specific Number of Times, Regular Expressions in QED, Metacharacters

as metacharacters, Using Quantifiers, Metacharacters
QED regex feature, Regular Expressions in QED
usage example, Quoting Literals, Matching a Specific Number of Times

### D

\d (digit) character shorthand, What Is a Regular Expression?, Using a Character Shorthand, Quoting Literals, Matching Digits–Matching Digits, Matching Word and Non-Word Characters, Character Shorthands

described, Matching Word and Non-Word Characters, Character Shorthands
matching digits, What Is a Regular Expression?, Matching Digits–Matching Digits
usage example, Using a Character Shorthand, Quoting Literals
Matching Word and Non-Word Characters, Character Shorthands +\D (non-digit) character shorthand, Using a Character Shorthand, Matching Non-Digits, Matching Word and Non-Word Characters, Character Shorthands + +described, Matching Word and Non-Word Characters, Character Shorthands +matching non-digits, Matching Non-Digits +usage example, Using a Character Shorthand +d modifier (Perl), Alternation, Options/Modifiers +decimal value (\d xxx) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +difference of character sets, Union and Difference +[:digit:] POSIX character class, POSIX Character Classes, POSIX Character Classes +digits, What Is a Regular Expression?, Matching Digits with a Character Class, Using a Character Shorthand, Using a Character Shorthand, Matching Any Character, Capturing Groups and Back References, Quoting Literals–A Sample of Applications, Quoting Literals, Matching Digits–Matching Digits, Matching Digits–Matching Digits, Matching Word and Non-Word Characters, Character Classes, Character Shorthands + +capturing groups and backreferences, Capturing Groups and Back References +character shorthand, What Is a Regular Expression?, Using a Character Shorthand, Quoting Literals, Matching Digits–Matching Digits, Matching Word and Non-Word Characters, Character Shorthands +matching any characters, Matching Any Character +matching range of, Character Classes +matching with character classes, Matching Digits with a Character Class +matching with character shorthand, Using a Character Shorthand +matching with shorthand, Matching Digits–Matching Digits +quoting literals, Quoting Literals–A Sample of Applications +documents, marking up with HTML, Marking Up a Document with HTML (see marking up documents with HTML5) +dollar sign ($), Quoting Literals, The Beginning and End of a Line, Regular Expressions in QED, Metacharacters + +matching end of line with, Regular Expressions in QED +as metacharacter, Metacharacters +usage examples, Quoting Literals, The Beginning and End of a Line +dot (.) 
character, Matching Any Character, Matching Any Character, Once Again, Regular Expressions in QED, Metacharacters + +described, Matching Any Character, Once Again +matching any character, Matching Any Character +as metacharacter, Metacharacters +QED regex feature, Regular Expressions in QED +dotall option, Matching Any Character, Matching Any Character, Once Again, The Beginning and End of a Line + +### E + +\E (quoting literal characters) character shorthand, Quoting a Group of Characters as Literals +E command-line option, Regular Expressions in QED +echo command, Using sed to Mark Up Text +ed editor, Regular Expression Glossary +egrep utility, Technical Notes, Regular Expression Glossary +email address example, Matching an Email Address +EREs (extended regular expressions), Word and Non-word Boundaries, Alternation, Regular Expression Glossary + +described, Regular Expression Glossary +grep -E option for, Word and Non-word Boundaries, Alternation + +### F + +\f (form feed) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +fewest keystrokes win principle, Matching Word and Non-Word Characters +fgrep utility, Technical Notes, Regular Expression Glossary +flags, Regular Expression Glossary (see modifiers (flags)) +form feed (\f) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +forward slash (/), Word and Non-word Boundaries, Matching Tags +Friedl, Jeff, Who Should Read This Book, Learning More +full stop, Matching Any Character, Once Again (see dot character) + +### G + +g modifier (Perl), Alternation, Options/Modifiers +GE-TSS (General Electric Time Sharing System), Regular Expression Glossary +Git version control system, Technical Notes +Goyvaerts, Jan, Who Should Read This Book, Learning More, Matching a North American Phone Number +[:graph:] POSIX character class, POSIX Character Classes, POSIX Character Classes +greedy match, Greedy, Lazy, and Possessive–Matching a Specific Number of Times, Regular Expression Glossary +grep utility, Word and Non-word Boundaries, Word and Non-word Boundaries, Word and Non-word Boundaries, Word and Non-word Boundaries, Word and Non-word Boundaries, Technical Notes, Alternation, Alternation, Alternation, Alternation, Alternation, Regular Expression Glossary + +alternation with, Alternation +BREs and, Word and Non-word Boundaries, Alternation +-c option, Word and Non-word Boundaries, Alternation +described, Technical Notes, Regular Expression Glossary +-E option, Word and Non-word Boundaries, Alternation +-o option, Word and Non-word Boundaries, Alternation +search syntax, Word and Non-word Boundaries +groups and grouping, Capturing Groups and Back References, Quoting a Group of Characters as Literals, Subpatterns, Capturing Groups and Backreferences–Named Groups, Named Groups, Non-Capturing Groups, Atomic Groups, Technical Notes, Lookarounds–What You Learned in Chapter 8, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary + +atomic, Atomic Groups, Technical Notes, Regular Expression Glossary +capturing, Capturing Groups and Back References, Capturing Groups and Backreferences–Named Groups, Regular Expression Glossary +described, Regular Expression Glossary +lookarounds, Lookarounds–What You Learned in Chapter 8 +named, Named Groups +non-capturing, Non-Capturing Groups, Regular Expression Glossary, Regular Expression Glossary +quoting groups of characters as literals, Quoting a Group of Characters 
as Literals +subpatterns, Subpatterns +Gwyn, Doug, The End of the Beginning + +### H + +\h (horizontal) whitespace character, Matching Whitespace, Whitespace +\H (non-horizontal) whitespace character, Matching Whitespace, Whitespace +hexadecimal numbering system, Matching Word and Non-Word Characters, Character Classes, Matching a Unicode Character, Character Shorthands, Regular Expression Glossary + +character shorthand, Matching Word and Non-Word Characters, Character Shorthands +described, Regular Expression Glossary +matching character classes, Character Classes +matching Unicode characters, Matching a Unicode Character +hold space, Regular Expression Glossary +horizontal (\h) whitespace character, Matching Whitespace, Whitespace +horizontal tab (\t) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +Horton, Mark, Technical Notes +HTML5, Marking Up the Text, Technical Notes, Marking Up a Document with HTML–Technical Notes + +additional information, Technical Notes +marking up documents with, Marking Up a Document with HTML–Technical Notes +marking up text as, Marking Up the Text +hyphen (-) metacharacter, Quoting a Group of Characters as Literals + +### I + +i (insert) command (sed), Adding Tags with sed, Transforming Plain Text with sed +i modifier (Perl), Alternation, Options/Modifiers +IEEE (Institute of Electrical and Electronics Engineers), POSIX Character Classes, Technical Notes, Regular Expression Glossary +insert (i) command (sed), Adding Tags with sed, Transforming Plain Text with sed + +### J + +Java programming language, Union and Difference, Technical Notes +Joy, Bill, Technical Notes, Technical Notes, Regular Expression Glossary + +### K + +Kernighan, Brian, Preface +Kleene star, Matching with *, +, and ? 
+Kleene, Stephen, Preface, Matching with *, +, and ?, Regular Expression Glossary + +### L + +l modifier (Perl), Alternation, Options/Modifiers +lazy match, Greedy, Lazy, and Possessive, Lazy Quantifiers, Regular Expression Glossary +Levithan, Steven, Who Should Read This Book, Learning More, Matching a North American Phone Number +lines, matching beginning and end of, The Beginning and End of a Line–The Beginning and End of a Line +literals, Regular Expression Glossary (see string literals) +lookaheads, Positive Lookaheads–Positive Lookaheads, Negative Lookaheads, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary + +described, Regular Expression Glossary +negative, Negative Lookaheads, Regular Expression Glossary +positive, Positive Lookaheads–Positive Lookaheads, Regular Expression Glossary +lookarounds, Regular Expression Glossary (see lookaheads; lookbehinds) +lookbehinds, Positive Lookbehinds, Negative Lookbehinds, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary + +described, Regular Expression Glossary +negative, Negative Lookbehinds, Regular Expression Glossary +positive, Positive Lookbehinds, Regular Expression Glossary +[:lower:] POSIX character class, POSIX Character Classes, POSIX Character Classes + +### M + +m modifier (Perl), Alternation, Options/Modifiers +marking up documents with HTML5, Using sed to Mark Up Text–Using sed to Mark Up Text, Using Perl to Mark Up Text–Using Perl to Mark Up Text, Adding Tags with sed–Adding Tags with sed, Adding Tags with Perl–Adding Tags with Perl, Marking Up a Document with HTML, Matching Tags, Transforming Plain Text with sed–Handling the Lines of the Poem with sed, Appending Tags–Using a Command File with sed, Transforming Plain Text with Perl–Using a File of Commands with Perl + +adding tags with Perl, Adding Tags with Perl–Adding Tags with Perl +adding tags with sed, Adding Tags with sed–Adding Tags with sed +appending tags, Appending Tags–Using a Command File with sed +described, Marking Up a Document with HTML +marking up with Perl, Using Perl to Mark Up Text–Using Perl to Mark Up Text +marking up with sed, Using sed to Mark Up Text–Using sed to Mark Up Text +matching tags, Matching Tags +transforming plain text with Perl, Transforming Plain Text with Perl–Using a File of Commands with Perl +transforming plain text with sed, Transforming Plain Text with sed–Handling the Lines of the Poem with sed +marking up text, Using sed to Mark Up Text, Using Perl to Mark Up Text–Using Perl to Mark Up Text + +using Perl, Using Perl to Mark Up Text–Using Perl to Mark Up Text +using sed, Using sed to Mark Up Text +McMahon, Lee, Using sed to Mark Up Text, Regular Expression Glossary +metacharacters, Matching Digits with a Character Class, Quoting a Group of Characters as Literals, Metacharacters, Metacharacters, Regular Expression Glossary + +described, Matching Digits with a Character Class, Quoting a Group of Characters as Literals, Regular Expression Glossary +escaping, Metacharacters +in regular expressions, Metacharacters +modifiers (flags), Alternation, Alternation, Options/Modifiers, Regular Expression Glossary + +described, Regular Expression Glossary +in regular expressions, Alternation, Alternation, Options/Modifiers +Moolenaar, Bram, Technical Notes, Technical Notes, Regular Expression Glossary + +### N + +\n (newline) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +named groups, Named Groups +negated character classes, Negated 
Character Classes, Regular Expression Glossary +negative lookaheads, Negative Lookaheads, Regular Expression Glossary +negative lookbehinds, Negative Lookbehinds, Regular Expression Glossary +.NET programming framework, Technical Notes +newlines, Matching Any Character, Matching Word and Non-Word Characters, The Beginning and End of a Line, Adding Tags with sed, Character Shorthands + +character shorthand, Matching Word and Non-Word Characters, Character Shorthands +inserting, Adding Tags with sed +matching with dotall option, Matching Any Character, The Beginning and End of a Line +non-capturing groups, Non-Capturing Groups, Lookarounds–What You Learned in Chapter 8, Regular Expression Glossary, Regular Expression Glossary + +described, Non-Capturing Groups, Regular Expression Glossary, Regular Expression Glossary +lookarounds, Lookarounds–What You Learned in Chapter 8 +non-digit (\D) character shorthand, Using a Character Shorthand, Matching Non-Digits, Matching Word and Non-Word Characters, Character Shorthands + +described, Matching Word and Non-Word Characters, Character Shorthands +matching non-digits, Matching Non-Digits +usage example, Using a Character Shorthand +non-horizontal (\H) whitespace character, Matching Whitespace, Whitespace +non-space (\S) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +non-vertical (\V) whitespace character, Matching Whitespace, Whitespace +non-word (\W) character shorthand, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters, Character Shorthands + +described, Matching Word and Non-Word Characters, Character Shorthands +matching, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +non-word boundary (\B) character shorthand, Matching Word and Non-Word Characters, Word and Non-word Boundaries, Character Shorthands +Notepad++ editor, A Sample of Applications, Technical Notes +null (\0) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +numbers, Matching Digits with a Character Class (see digits) + +### O + +\o (octal value) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +occurrence constraints, Regular Expression Glossary (see quantifiers) +octal characters, Matching Word and Non-Word Characters, Matching Characters with Octal Numbers, Character Shorthands, Regular Expression Glossary + +character shorthand, Matching Word and Non-Word Characters, Character Shorthands +described, Regular Expression Glossary +matching Unicode with, Matching Characters with Octal Numbers +Oniguruma library (Ruby), Ruby (Oniguruma) +Opera Next browser, Technical Notes +options, Alternation, Options/Modifiers, Regular Expression Glossary + +described, Regular Expression Glossary +in regular expressions, Alternation, Options/Modifiers +Oxygen XML editor, A Sample of Applications, Technical Notes + +### P + +p modifier (Perl), Alternation, Options/Modifiers +parentheses (), Capturing Groups and Back References, Quoting Literals, Alternation, Groups, and Backreferences, Subpatterns, Regular Expressions in QED, Metacharacters + +as metacharacters, Metacharacters +QED regex feature, Regular Expressions in QED +subpatterns and, Subpatterns +usage examples, Capturing Groups and Back References, Quoting Literals, Alternation, Groups, and Backreferences +pattern matching, Simple Pattern Matching–Simple Pattern Matching, Matching String Literals, Matching Digits–Matching Digits, Matching Non-Digits, 
Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Whitespace, Matching Any Character, Once Again–Matching Any Character, Once Again, Marking Up the Text–Using Perl to Mark Up Text, Subpatterns, Regular Expression Glossary + +described, Simple Pattern Matching–Simple Pattern Matching, Regular Expression Glossary +marking up text, Marking Up the Text–Using Perl to Mark Up Text +matching any character, Matching Any Character, Once Again–Matching Any Character, Once Again +matching digits, Matching Digits–Matching Digits +matching non-digits, Matching Non-Digits +matching non-word characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +matching string literals, Matching String Literals +matching whitespace, Matching Whitespace +matching word characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +subpatterns and, Subpatterns +pattern space, Regular Expression Glossary +PCRE (Perl Compatible Regular Expressions), Other Anchors, Technical Notes, PCRE +pcregrep utility, Other Anchors, Other Anchors, Other Anchors, Technical Notes, PCRE + +-c option, Other Anchors +described, Other Anchors, Technical Notes, PCRE +-n option, Other Anchors +period, Matching Any Character, Once Again (see dot character) +Perl Compatible Regular Expressions (PCRE), Other Anchors, Technical Notes, PCRE +Perl programming language, Using Perl to Mark Up Text–Using Perl to Mark Up Text, Using Perl to Mark Up Text, Using Perl to Mark Up Text, Technical Notes, Other Anchors, Adding Tags with Perl, Adding Tags with Perl, Alternation, Alternation, Named Groups, Transforming Plain Text with Perl–Using a File of Commands with Perl, Handling Roman Numerals with Perl, Using a File of Commands with Perl, Perl, Options/Modifiers, Regular Expression Glossary + +accessing named groups, Named Groups +adding tags, Adding Tags with Perl +additional information, Technical Notes +alternation and, Alternation +command files and, Using Perl to Mark Up Text, Adding Tags with Perl, Using a File of Commands with Perl +described, Using Perl to Mark Up Text, Perl, Regular Expression Glossary +handling Roman numerals, Handling Roman Numerals with Perl +marking up text, Using Perl to Mark Up Text–Using Perl to Mark Up Text +modifiers in regular expressions, Alternation, Options/Modifiers +start and end of subjects, Other Anchors +transforming plain text with, Transforming Plain Text with Perl–Using a File of Commands with Perl +perldoc command, Technical Notes +phone numbers, Matching a North American Phone Number–Matching a North American Phone Number, Matching a North American Phone Number–Matching a North American Phone Number, Matching Digits with a Character Class, Matching Digits with a Character Class, Using a Character Shorthand, Matching Any Character, Capturing Groups and Back References, Quoting Literals–A Sample of Applications, Matching a North American Phone Number + +capturing groups and backreferences, Capturing Groups and Back References +matching any characters, Matching Any Character +matching digits with character classes, Matching Digits with a Character Class +matching in regular expressions, Matching a North American Phone Number–Matching a North American Phone Number, Matching a North American Phone Number–Matching a North American Phone Number, Matching a North American Phone Number +matching with character classes, Matching Digits with a 
Character Class +matching with character shorthand, Using a Character Shorthand +quoting literals, Quoting Literals–A Sample of Applications +piece (regular expressions), Regular Expression Glossary +plain text, Transforming Plain Text with sed (see strings and string literals) +plus sign (+), Using Quantifiers, Matching with *, +, and ?, Metacharacters, Regular Expression Glossary + +as metacharacter, Metacharacters +as quantifier, Using Quantifiers, Matching with *, +, and ?, Regular Expression Glossary +Portable Operating System Interface for Unix (POSIX), POSIX Character Classes, Regular Expression Glossary +positive lookaheads, Positive Lookaheads–Positive Lookaheads, Regular Expression Glossary +positive lookbehinds, Positive Lookbehinds, Regular Expression Glossary +POSIX (Portable Operating System Interface for Unix), POSIX Character Classes, Regular Expression Glossary +POSIX character classes, POSIX Character Classes–POSIX Character Classes, POSIX Character Classes +POSIX.1-2008 standard, Regular Expression Glossary +possessive match, Greedy, Lazy, and Possessive, Possessive Quantifiers, Regular Expression Glossary +[:print:] POSIX character class, POSIX Character Classes, POSIX Character Classes +Project Gutenberg, Simple Pattern Matching +[:punct:] POSIX character class, POSIX Character Classes, POSIX Character Classes +Python programming language, Technical Notes, Python + +### Q + +q (quit) command (sed), Using sed to Mark Up Text +\Q (quoting literal characters) character shorthand, Quoting a Group of Characters as Literals +QED editor, What Is a Regular Expression?, Technical Notes, Regular Expressions in QED–Regular Expressions in QED, Regular Expressions in QED, Technical Notes, Technical Notes, Regular Expression Glossary + +additional information, Technical Notes +Ken Thompson and, What Is a Regular Expression?, Technical Notes, Regular Expressions in QED, Technical Notes, Regular Expression Glossary +regular expressions in, Regular Expressions in QED–Regular Expressions in QED +quantifiers, Using Quantifiers, Using Quantifiers, Quoting Literals, Greedy, Lazy, and Possessive, Greedy, Lazy, and Possessive–Matching a Specific Number of Times, Greedy, Lazy, and Possessive, Greedy, Lazy, and Possessive, Matching with *, +, and ?, Matching a Specific Number of Times, Lazy Quantifiers, Possessive Quantifiers, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary + +described, Using Quantifiers, Greedy, Lazy, and Possessive, Regular Expression Glossary +greedy match and, Greedy, Lazy, and Possessive–Matching a Specific Number of Times, Regular Expression Glossary +lazy match and, Greedy, Lazy, and Possessive, Lazy Quantifiers, Regular Expression Glossary +matching specific number of times, Matching a Specific Number of Times +matching with *, +, and ?, Matching with *, +, and ? 
+possessive match and, Greedy, Lazy, and Possessive, Possessive Quantifiers, Regular Expression Glossary +usage examples, Using Quantifiers, Quoting Literals +question mark (?), Using Quantifiers, Quoting Literals, Matching with *, +, and ?, Matching Tags, Metacharacters, Regular Expression Glossary + +matching tags, Matching Tags +as metacharacter, Metacharacters +as quantifier, Using Quantifiers, Matching with *, +, and ?, Regular Expression Glossary +usage examples, Quoting Literals +quit (q) command (sed), Using sed to Mark Up Text +quoting literals, Quoting Literals–A Sample of Applications, Quoting a Group of Characters as Literals + +quoting groups of characters as, Quoting a Group of Characters as Literals +usage example, Quoting Literals–A Sample of Applications + +### R + +\r (carriage return) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +Rackham, Stuart, Technical Notes +range of characters, matching, Character Classes, Matching Unicode and Other Characters +range of digits, matching, Character Classes +Re2 library, RE2 +Regex Hero, Matching a Unicode Character, Technical Notes +RegexBuddy application, Who Should Read This Book +Regexpal regex processor, Who Should Read This Book, Getting Started with Regexpal, Matching a North American Phone Number–Matching a North American Phone Number, Technical Notes, Matching String Literals, Negated Character Classes, Matching a Unicode Character + +additional information, Technical Notes +described, Who Should Read This Book, Getting Started with Regexpal +matching phone numbers, Matching a North American Phone Number–Matching a North American Phone Number +matching Unicode characters, Matching a Unicode Character +negated character classes and, Negated Character Classes +string matching in, Matching String Literals +RegExr regex processor, Simple Pattern Matching, Simple Pattern Matching, Simple Pattern Matching, Matching Digits–Matching Digits, Matching Non-Digits, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Whitespace, Matching Any Character, Once Again–Matching Any Character, Once Again, Marking Up the Text, Marking Up the Text, Technical Notes, The Beginning and End of a Line–The Beginning and End of a Line, Quoting a Group of Characters as Literals, Adding Tags, Alternation, Groups, and Backreferences, Alternation, Alternation, Capturing Groups and Backreferences, Named Groups, Positive Lookaheads + +adding tags, Adding Tags +additional information, Technical Notes +alternation with, Alternation +backreference support, Capturing Groups and Backreferences +case-insensitivity, Alternation, Positive Lookaheads +Community tab, Simple Pattern Matching +described, Simple Pattern Matching +downloading, Alternation, Groups, and Backreferences +marking up text, Marking Up the Text +matching any characters, Matching Any Character, Once Again–Matching Any Character, Once Again +matching beginning and end of lines, The Beginning and End of a Line–The Beginning and End of a Line +matching digits, Matching Digits–Matching Digits +matching non-digits, Matching Non-Digits +matching non-word characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +matching whitespace, Matching Whitespace +matching word characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +metacharacters and, Quoting a Group of Characters as Literals +named groups, 
Named Groups +Replace tab, Marking Up the Text +Samples tab, Simple Pattern Matching +Reggy application, Technical Notes, Quantifiers +regular expressions, Preface, What Is a Regular Expression?, Matching a North American Phone Number–Matching a North American Phone Number, Matching Digits with a Character Class, Using a Character Shorthand, Matching Any Character, Capturing Groups and Back References, Using Quantifiers, Quoting Literals–A Sample of Applications, Matching String Literals, Matching Digits–Matching Digits, Matching Non-Digits, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters, Matching Whitespace, Matching Any Character, Once Again–Matching Any Character, Once Again, Marking Up the Text, Quoting a Group of Characters as Literals, Alternation, Alternation, Subpatterns, Capturing Groups and Backreferences–Named Groups, Character Classes–Technical Notes, Learning More–What You Learned in Chapter 10, Matching a North American Phone Number, Matching an Email Address, Regular Expressions in QED–Regular Expressions in QED, Metacharacters, Character Shorthands, Control Characters, Options/Modifiers, Options/Modifiers, ASCII Code Chart with Regex–ASCII Code Chart with Regex, Regular Expression Glossary, Regular Expression Glossary + +additional information, Learning More–What You Learned in Chapter 10 +ASCII code chart, ASCII Code Chart with Regex–ASCII Code Chart with Regex +capturing groups and backreferences, Capturing Groups and Back References, Capturing Groups and Backreferences–Named Groups +character shorthand in, Matching Word and Non-Word Characters, Character Shorthands +control characters in, Control Characters +described, Preface, What Is a Regular Expression?, Regular Expression Glossary +marking up text, Marking Up the Text +matching any character, Matching Any Character, Matching Any Character, Once Again–Matching Any Character, Once Again +matching digits, Matching Digits–Matching Digits +matching digits with character classes, Matching Digits with a Character Class +matching email addresses, Matching an Email Address +matching non-digits, Matching Non-Digits +matching non-word characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +matching phone numbers, Matching a North American Phone Number–Matching a North American Phone Number, Matching a North American Phone Number +matching string literals, Matching String Literals +matching whitespace, Matching Whitespace +matching with character classes, Character Classes–Technical Notes +matching with character shorthand, Using a Character Shorthand +matching word characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +metacharacters in, Quoting a Group of Characters as Literals, Metacharacters +modifiers in, Alternation, Options/Modifiers +options in, Alternation, Options/Modifiers +pieces of, Regular Expression Glossary +in QED editor, Regular Expressions in QED–Regular Expressions in QED +quantifiers in, Using Quantifiers +quoting literals in, Quoting Literals–A Sample of Applications +subpatterns and, Subpatterns +reluctant (lazy) quantifiers, Greedy, Lazy, and Possessive +"The Rime of the Ancient Mariner" (Coleridge), Simple Pattern Matching +Ritchie, Dennis, Preface, Technical Notes, Technical Notes +Roman numerals, Handling Roman Numerals with sed, Handling Roman Numerals with Perl + +handling with Perl, 
Handling Roman Numerals with Perl +handling with sed, Handling Roman Numerals with sed +Rubular Ruby regex processor, Technical Notes, Ruby (Oniguruma) + +### S + +s (substitute) command, Using sed to Mark Up Text, Using Perl to Mark Up Text, Capturing Groups and Backreferences, Capturing Groups and Backreferences, Substitution with sed, Using a Command File with sed, Handling Roman Numerals with Perl + +with Perl, Using Perl to Mark Up Text, Capturing Groups and Backreferences, Handling Roman Numerals with Perl +with sed, Using sed to Mark Up Text, Capturing Groups and Backreferences, Substitution with sed, Using a Command File with sed +\s (space) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +\S (non-space) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +s modifier (Perl), Alternation, Options/Modifiers +Schiller, Friedrich, Matching Unicode Character Properties +script names for character properties, Script Names for Character Properties +search command (vim), Word and Non-word Boundaries +sed editor, Using sed to Mark Up Text, Using sed to Mark Up Text, Using sed to Mark Up Text, Using sed to Mark Up Text, Using sed to Mark Up Text, Technical Notes, Adding Tags with sed–Adding Tags with sed, Adding Tags with sed, Adding Tags with sed, Capturing Groups and Backreferences, Capturing Groups and Backreferences, Capturing Groups and Backreferences, Capturing Groups and Backreferences, Capturing Groups and Backreferences, Transforming Plain Text with sed–Handling the Lines of the Poem with sed, Transforming Plain Text with sed, Substitution with sed, Handling Roman Numerals with sed, Appending Tags, Using a Command File with sed, Using a Command File with sed, The End of the Beginning, Regular Expression Glossary + +a (append) command, Appending Tags +adding tags with, Adding Tags with sed–Adding Tags with sed +additional information, Technical Notes +backreference support, Capturing Groups and Backreferences, Capturing Groups and Backreferences +command files and, Using sed to Mark Up Text, Adding Tags with sed, Using a Command File with sed, The End of the Beginning +described, Using sed to Mark Up Text, Regular Expression Glossary +-E option, Capturing Groups and Backreferences +handling Roman numerals, Handling Roman Numerals with sed +i (insert) command, Adding Tags with sed, Transforming Plain Text with sed +marking up text, Using sed to Mark Up Text +-n option, Capturing Groups and Backreferences +q (quit) command, Using sed to Mark Up Text +s (substitute) command, Using sed to Mark Up Text, Capturing Groups and Backreferences, Substitution with sed, Using a Command File with sed +transforming plain text with, Transforming Plain Text with sed–Handling the Lines of the Poem with sed +semicolon (;), Using sed to Mark Up Text +shebang directive, Using a File of Commands with Perl +Skinner, Grant, Simple Pattern Matching +space character (\s) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +[:space:] POSIX character class, POSIX Character Classes, POSIX Character Classes +special characters in regular expressions, Quoting a Group of Characters as Literals (see metacharacters) +Spencer, Henry, Preface +square brackets [], Quoting Literals, Character Classes, Metacharacters + +as metacharacters, Metacharacters +usage examples, Quoting Literals, Character Classes +strings and string literals, Matching a North American Phone Number, Matching a North American Phone Number, Quoting 
Literals–A Sample of Applications, Matching String Literals, The Beginning and End of a Line–The Beginning and End of a Line, Quoting a Group of Characters as Literals, Transforming Plain Text with sed–Handling the Lines of the Poem with sed, Transforming Plain Text with Perl–Using a File of Commands with Perl, Regular Expression Glossary + +described, Matching a North American Phone Number, Regular Expression Glossary +matching, Matching String Literals +matching beginning and end of lines, The Beginning and End of a Line–The Beginning and End of a Line +matching phone numbers, Matching a North American Phone Number +quoting, Quoting Literals–A Sample of Applications, Quoting a Group of Characters as Literals +transforming with Perl, Transforming Plain Text with Perl–Using a File of Commands with Perl +transforming with sed, Transforming Plain Text with sed–Handling the Lines of the Poem with sed +Stubblebine, Tony, Learning More +subpatterns, Subpatterns +substitute (s) command, Using sed to Mark Up Text, Using Perl to Mark Up Text, Capturing Groups and Backreferences, Capturing Groups and Backreferences, Substitution with sed, Using a Command File with sed, Handling Roman Numerals with Perl + +with Perl, Using Perl to Mark Up Text, Capturing Groups and Backreferences, Handling Roman Numerals with Perl +with sed, Using sed to Mark Up Text, Capturing Groups and Backreferences, Substitution with sed, Using a Command File with sed + +### T + +\t (horizontal tab) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +tab characters, Matching Word and Non-Word Characters, Matching Word and Non-Word Characters, Character Shorthands, Character Shorthands + +horizontal tab shorthand, Matching Word and Non-Word Characters, Character Shorthands +vertical tab shorthand, Matching Word and Non-Word Characters, Character Shorthands +tags, Adding Tags, Adding Tags with sed–Adding Tags with sed, Adding Tags with Perl, Matching Tags, Appending Tags–Using a Command File with sed + +adding with Perl, Adding Tags with Perl +adding with sed, Adding Tags with sed–Adding Tags with sed +appending, Appending Tags–Using a Command File with sed +described, Adding Tags +matching, Matching Tags +text, Transforming Plain Text with sed (see strings and string literals) +TextMate editor, A Sample of Applications, Technical Notes +Thompson, Ken, Preface, What Is a Regular Expression?, Technical Notes, Technical Notes, Regular Expressions in QED, Technical Notes, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary + +ed editor and, Regular Expression Glossary +grep and, Technical Notes, Regular Expression Glossary +QED editor and, What Is a Regular Expression?, Technical Notes, Regular Expressions in QED, Technical Notes, Regular Expression Glossary +regular expressions and, Preface + +### U + +\u (Unicode) character shorthand, Matching Word and Non-Word Characters, Matching a Unicode Character, Character Shorthands +u modifier (Perl), Alternation, Options/Modifiers +underscore (_), Matching Tags, Technical Notes +Unicode, What Is a Regular Expression?, Matching Word and Non-Word Characters, Matching Unicode and Other Characters, Matching a Unicode Character–Matching Characters with Octal Numbers, Matching a Unicode Character, Matching Unicode Character Properties–Matching Unicode Character Properties, Character Shorthands, Unicode Whitespace Characters, Regular Expression Glossary, Regular Expression Glossary + +character shorthand, Matching Word and Non-Word 
Characters, Matching a Unicode Character, Character Shorthands +code point assignments, What Is a Regular Expression?, Regular Expression Glossary +described, Matching Unicode and Other Characters, Regular Expression Glossary +matching character properties, Matching Unicode Character Properties–Matching Unicode Character Properties +matching characters, Matching a Unicode Character–Matching Characters with Octal Numbers +whitespace characters in, Unicode Whitespace Characters +union of character sets, Union and Difference +[:upper:] POSIX character class, POSIX Character Classes, POSIX Character Classes + +### V + +\v (vertical tab) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +\v (vertical) whitespace character, Matching Whitespace, Whitespace +\V (non-vertical) whitespace character, Matching Whitespace, Whitespace +van Rossum, Guido, Python +vertical (\v) whitespace character, Matching Whitespace, Whitespace +vertical bar (|), Quoting Literals, Regular Expressions in QED, Metacharacters + +as metacharacter, Metacharacters +QED regex feature, Regular Expressions in QED +usage example, Quoting Literals +vertical tab (\v) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +vi editor, Technical Notes, Regular Expression Glossary +vim editor, Word and Non-word Boundaries, Using vim, Technical Notes, Regular Expression Glossary + +additional information, Technical Notes +described, Regular Expression Glossary +matching Unicode characters, Using vim +search command in, Word and Non-word Boundaries +Voltaire (philosopher), Matching Unicode and Other Characters + +### W + +\w (word) character shorthand, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters, Character Shorthands + +described, Matching Word and Non-Word Characters, Character Shorthands +matching, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +\W (non-word) character shorthand, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters, Character Shorthands + +described, Matching Word and Non-Word Characters, Character Shorthands +matching, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +Wall, Larry, Using Perl to Mark Up Text, Regular Expression Glossary +Watt, Andrew, Learning More +wc command, Alternation +whitespace, Matching Whitespace, Matching Whitespace, Whitespace, Unicode Whitespace Characters + +character shorthand for, Matching Whitespace, Whitespace +matching with RegExr, Matching Whitespace +in Unicode, Unicode Whitespace Characters +wildcards, matching any character, Matching Any Character +word (\w) character shorthand, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters, Character Shorthands + +described, Matching Word and Non-Word Characters, Character Shorthands +matching, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +word boundary (\b) character shorthand, Matching Word and Non-Word Characters, Word and Non-word Boundaries–Word and Non-word Boundaries, Character Shorthands +[:word:] POSIX character class, POSIX Character Classes, POSIX Character Classes +work buffer, Regular Expression Glossary +Wortham, Steve, Technical Notes + +### X + +x modifier (Perl), Alternation, Options/Modifiers +[:xdigit:] POSIX character class, POSIX Character Classes, POSIX Character Classes +XML 
tags, Matching Tags, Technical Notes +XSLT stylesheet, The End of the Beginning + +### Z + +\Z (end of subject) character shorthand, Other Anchors +zero-width assertions, Boundaries, Regular Expression Glossary + +# About the Author + +Michael Fitzgerald describes Ruby as "my favorite language so far" and is working regularly with Ruby and the Rails framework. He has written over 150 Ruby programs for testing and demonstration, and has been developing a library of sample Ruby code. He is the author of _Learning XSLT_ and _XML Hacks_, and co-author of the _XML Pocket Reference_. + +# Colophon + +The animal on the cover of _Introducing Regular Expressions_ is a fruit bat. + +Members of the suborder _Megachiroptera_ and family _Pteropodidae_ are known as fruit bats, flying foxes, old world fruit bats, or megabats. Despite the latter nickname, members of the Pteropodidae family vary greatly in size—the smallest measure six centimeters, while others weigh in at two pounds, with wingspans up to approximately five feet. + +True to their name, fruit bats are frugivorous, or nectarivorous, meaning they eat fruit or lick nectar from flowers. Some use their teeth to bite through fruit skin and actually eat the fruit, while others lick juices from crushed fruit. Because many of them dine on flower nectar, fruit bats are excellent pollinators and seed-spreaders—in fact, the World Bat Sanctuary estimates that approximately 95% of all new rainforest growth can be attributed to fruit bats' distribution of seeds. This relationship between the bats and plants is a form of mutualism—the way organisms of different species interact biologically for a mutual fitness benefit—known as chiropterophily. + +Fruit bats can be found all over the world, though they prefer warm, tropical climates, due in part to the availability of fruit and flowers. While they're excellent flyers, fruit bats are known for their clumsy landings; they often crash land into trees or try to grab limbs with their feet in order to stop themselves. This perpetuates the misconception that they're blind, when in fact, fruit bats are said to have the best vision of all the bat species, most of which rely on echolocation to get around. Fruit bats use vision—along with their advanced senses of smell—to locate food and navigate. + +The cover image is from Cassell's _Natural History_. The cover font is Adobe ITC Garamond. The text font is Linotype Birka; the heading font is Adobe Myriad Condensed; and the code font is LucasFont's TheSansMonoCondensed. + +# Introducing Regular Expressions + +### Michael Fitzgerald + +#### Editor + +### Simon St. Laurent + +Revision History + +2012-07-10: First release + +Copyright © 2012 Michael Fitzgerald + +O'Reilly books may be purchased for educational, business, or sales promotional use. Online editions are also available for most titles (http://my.safaribooksonline.com). For more information, contact our corporate/institutional sales department: 800-998-9938 or corporate@oreilly.com. + +Nutshell Handbook, the Nutshell Handbook logo, and the O'Reilly logo are registered trademarks of O'Reilly Media, Inc. _Introducing Regular Expressions_, the image of a fruit bat, and related trade dress are trademarks of O'Reilly Media, Inc. 
+ +Many of the designations used by manufacturers and sellers to distinguish their products are claimed as trademarks. Where those designations appear in this book, and O'Reilly Media, Inc., was aware of a trademark claim, the designations have been printed in caps or initial caps. + +While every precaution has been taken in the preparation of this book, the publisher and authors assume no responsibility for errors or omissions, or for damages resulting from the use of the information contained herein. + +O'Reilly Media + +1005 Gravenstein Highway North + +Sebastopol, CA 95472 + +2012-07-10T09:13:05-07:00 + diff --git a/kag/examples/csqa/builder/data/introduction_to_the_theory_of_programming_languages.txt b/kag/examples/csqa/builder/data/introduction_to_the_theory_of_programming_languages.txt new file mode 100644 index 00000000..1c28aec0 --- /dev/null +++ b/kag/examples/csqa/builder/data/introduction_to_the_theory_of_programming_languages.txt @@ -0,0 +1,2866 @@ +Introduction to the Theory of Programming Languages + +Gilles Dowek and Jean-Jacques Lévy, Undergraduate Topics in Computer Science: Introduction to the Theory of Programming Languages, DOI 10.1007/978-0-85729-076-2, © Springer-Verlag London Limited 2011 + +Undergraduate Topics in Computer Science + +Series Editor: Ian Mackie + +Advisory Editors: Samson Abramsky, Chris Hankin, Dexter Kozen, Andrew Pitts, Hanne Riis Nielson, Steven Skiena and Iain Stewart + +Undergraduate Topics in Computer Science (UTiCS) delivers high-quality instructional content for undergraduates studying in all areas of computing and information science. From core foundational and theoretical material to final-year topics and applications, UTiCS books take a fresh, concise, and modern approach and are ideal for self-study or for a one- or two-semester course. The texts are all authored by established experts in their fields, reviewed by an international advisory board, and contain numerous examples and problems. Many include fully worked solutions. + +For other volumes: http://www.springer.com/series/7592 + +Gilles Dowek and Jean-Jacques Lévy + +Introduction to the Theory of Programming Languages + +Gilles Dowek + +Labo. d'Informatique, École polytechnique, Palaiseau, France + +Jean-Jacques Lévy + +Centre de Recherche Commun, INRIA-Microsoft Research, Parc Orsay Université, Orsay Cedex, France + +ISSN 1863-7310 + +ISBN 978-0-85729-075-5, e-ISBN 978-0-85729-076-2 + +Springer London Dordrecht Heidelberg New York + +British Library Cataloguing in Publication Data A catalogue record for this book is available from the British Library + +© Springer-Verlag London Limited 2011 + +The work was first published in 2006 by Les éditions de l'École polytechnique with the following title: 'Introduction à la théorie des langages de programmation'. The translator of the work is Maribel Fernandez. + +Apart from any fair dealing for the purposes of research or private study, or criticism or review, as permitted under the Copyright, Designs and Patents Act 1988, this publication may only be reproduced, stored or transmitted, in any form or by any means, with the prior permission in writing of the publishers, or in the case of reprographic reproduction in accordance with the terms of licenses issued by the Copyright Licensing Agency. Enquiries concerning reproduction outside those terms should be sent to the publishers. 
+ +The use of registered names, trademarks, etc., in this publication does not imply, even in the absence of a specific statement, that such names are exempt from the relevant laws and regulations and therefore free for general use. + +The publisher makes no representation, express or implied, with regard to the accuracy of the information contained in this book and cannot accept any legal responsibility or liability for any errors or omissions that may be made. + +Printed on acid-free paper + +Springer is part of Springer Science+Business Media (www.springer.com) + +What Is the Theory of Programming Languages? + +The ultimate, definitive programming language has not been created yet, far from it. Almost every day a new language is created, and new functionalities are added to existing languages. Improvements in programming languages contribute to making programs more reliable, shorten the development time, and make programs easier to maintain. Improvements are also needed to satisfy new requirements, such as the development of parallel, distributed or mobile programs. + +The first thing that we need to describe, when defining a programming language, is its syntax. Should we write x := 1 or x = 1? Should we put brackets after an if or not? More generally, what are the strings of symbols that can be used as a program? There is a useful tool for this: the notion of a formal grammar. Using a grammar, we can describe the syntax of the language in a precise way, and this makes it possible to build programs to check the syntactical correctness of programs. + +But it is not sufficient to know what a syntactically correct program is in order to know what is going to happen when we run the program. When defining a programming language, it is also necessary to describe its semantics, that is, the expected behaviour of the program when it is executed. Two languages may have the same syntax but different semantics. + +The following is an example of what is meant (informally) by semantics. Function evaluation is often explained as follows. "The result V of the evaluation of an expression of the form f e₁ ... eₙ, where the symbol f is a function defined by the expression f x₁ ... xₙ = e', is obtained in the following way. First, the arguments e₁, ..., eₙ are evaluated, returning values W₁, ..., Wₙ. Then, these values are associated to the variables x₁, ..., xₙ, and finally the expression e' is evaluated. The value V is the result of this evaluation." + +This explanation of the semantics of the language, expressed in a natural language (English), allows us to understand what happens when a program is executed, but is it precise? Consider, for example, the program + +Depending on the way we interpret the explanation given above, we can deduce that the program will result in the value 2 or in the value 9. This is because the natural language explanation does not indicate whether we have to evaluate g 2 before or after g 7, and the order in which we evaluate these expressions is important in this case. Instead, the explanation should have said: "the arguments e₁, ..., eₙ are evaluated starting from e₁" or else "starting from eₙ". + +If two different programmers read an ambiguous explanation, they might understand different things. Even worse, the designers of the compilers for the language might choose different conventions. Then the same program will give different results depending on the compiler used. 
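+
+As a concrete illustration of this ambiguity, here is a small OCaml sketch (our own illustration, not necessarily the program referred to above, although it does produce the values 2 and 9): the function g has a side effect, so the value of f (g 2) (g 7) depends on the order in which the arguments are evaluated.
+
+let r = ref 0
+let g x = r := !r + x; !r   (* g adds x to a shared counter and returns it *)
+let f x y = x               (* f returns its first argument *)
+let () = print_int (f (g 2) (g 7))
+(* Evaluating g 2 first prints 2; evaluating g 7 first prints 9.
+   OCaml itself leaves the argument evaluation order unspecified,
+   which is exactly the kind of convention a semantics must pin down. *)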
+ +It is well known that natural languages are too imprecise to express the syntax of a programming language; a formal language should be used instead. Similarly, natural languages are too imprecise to express the semantics of a programming language, and we need to use a formal language for this. + +What is the semantics of a program? Let us take for instance a program p that requests an integer, computes its square, and displays the result of this operation. To describe the behaviour of this program, we need to describe a relation R between the input value and the associated output. + +The semantics of this program is, thus, a relation R between elements of the set E of input values and elements of the set S of output values, that is, a subset of E × S. + +The semantics of a program is then a binary relation. The semantics of a programming language is, in turn, a ternary relation: "the program p with input value e returns the output value s". We denote this relation by p, e ↪ s. The program p and the input e are available before the execution of the program starts. Often, these two elements are paired in a term p e, and the semantics of the language assigns a value to this term. The semantics of the language is then a binary relation t ↪ s. + +To express the semantics of a programming language we need a language that can express relations. + +When the semantics of a program is a functional relation, that is, for each input value there is at most one output value, we say that the program is deterministic. Video games are examples of non-deterministic programs, since some randomness is necessary to make the game enjoyable. A language is deterministic if all the programs that can be written in the language are deterministic, or equivalently, if the semantics is a functional relation. In this case, it is possible to define its semantics using a language to define functions instead of a language to define relations. + +Acknowledgements + +The authors would like to thank Gérard Assayag, Antonio Bucciarelli, Roberto Di Cosmo, Xavier Leroy, Dave MacQueen, Luc Maranget, Michel Mauny, François Pottier, Didier Rémy, Alan Schmitt, Élodie-Jane Sims and Véronique Viguié Donzeau-Gouge. 
+ +Contents + +1 Terms and Relations + +1.1 Inductive Definitions + +1.1.1 The Fixed Point Theorem + +1.1.2 Inductive Definitions + +1.1.3 Structural Induction + +1.1.4 The Reflexive-Transitive Closure of a Relation + +1.2 Languages + +1.2.1 Languages Without Variables + +1.2.2 Variables + +1.2.3 Many-Sorted Languages + +1.2.4 Free and Bound Variables + +1.2.5 Substitution + +1.3 Three Ways to Define the Semantics of a Language + +1.3.1 Denotational Semantics + +1.3.2 Big-Step Operational Semantics + +1.3.3 Small-Step Operational Semantics + +1.3.4 Non-termination + +2 The Language PCF + +2.1 A Functional Language: PCF + +2.1.1 Programs Are Functions + +2.1.2 Functions Are First-Class Objects + +2.1.3 Functions with Several Arguments + +2.1.4 No Assignments + +2.1.5 Recursive Definitions + +2.1.6 Definitions + +2.1.7 The Language PCF + +2.2 Small-Step Operational Semantics for PCF + +2.2.1 Rules + +2.2.2 Numbers + +2.2.3 A Congruence + +2.2.4 An Example + +2.2.5 Irreducible Closed Terms + +2.2.6 Non-termination + +2.2.7 Confluence + +2.3 Reduction Strategies + +2.3.1 The Notion of a Strategy + +2.3.2 Weak Reduction + +2.3.3 Call by Name + +2.3.4 Call by Value + +2.3.5 A Bit of Laziness Is Needed + +2.4 Big-Step Operational Semantics for PCF + +2.4.1 Call by Name + +2.4.2 Call by Value + +2.5 Evaluation of PCF Programs + +3 From Evaluation to Interpretation + +3.1 Call by Name + +3.2 Call by Value + +3.3 An Optimisation: de Bruijn Indices + +3.4 Construction of Functions via Fixed Points + +3.4.1 First Variation: Recursive Closures + +3.4.2 Second Variation: Rational Values + +4 Compilation + +4.1 An Interpreter Written in a Language Without Functions + +4.2 From Interpretation to Compilation + +4.3 An Abstract Machine for PCF + +4.3.1 The Environment + +4.3.2 Closures + +4.3.3 PCF Constructs + +4.3.4 Using de Bruijn Indices + +4.3.5 Small-Step Operational Semantics + +4.4 Compilation of PCF + +5 PCF with Types + +5.1 Types + +5.1.1 PCF with Types + +5.1.2 The Typing Relation + +5.2 No Errors at Run Time + +5.2.1 Using Small-Step Operational Semantics + +5.2.2 Using Big-Step Operational Semantics + +5.3 Denotational Semantics for Typed PCF + +5.3.1 A Trivial Semantics + +5.3.2 Termination + +5.3.3 Scott's Ordering Relation + +5.3.4 Semantics of Fixed Points + +6 Type Inference + +6.1 Inferring Monomorphic Types + +6.1.1 Assigning Types to Untyped Terms + +6.1.2 Hindley's Algorithm + +6.1.3 Hindley's Algorithm with Immediate Resolution + +6.2 Polymorphism + +6.2.1 PCF with Polymorphic Types + +6.2.2 The Algorithm of Damas and Milner + +7 References and Assignment + +7.1 An Extension of PCF + +7.2 Semantics of PCF with References + +8 Records and Objects + +8.1 Records + +8.1.1 Labelled Fields + +8.1.2 An Extension of PCF with Records + +8.2 Objects + +8.2.1 Methods and Functional Fields + +8.2.2 What Is "Self"? + +8.2.3 Objects and References + +9 Epilogue + +References + +Index +Gilles Dowek and Jean-Jacques Lévy, Undergraduate Topics in Computer Science: Introduction to the Theory of Programming Languages, DOI 10.1007/978-0-85729-076-2_1, © Springer-Verlag London Limited 2011 + +# 1. Terms and Relations + +Gilles Dowek (1) and Jean-Jacques Lévy (2) + +(1) + +Labo. 
d'Informatique, École polytechnique, route de Saclay, 91128 Palaiseau, France + +(2) + +Centre de Recherche Commun, INRIA-Microsoft Research, Parc Orsay Université, 28 rue Jean Rostand, 91893 Orsay Cedex, France + +Gilles Dowek (Corresponding author) + +Email: gilles.dowek@polytechnique.edu + +Jean-Jacques Lévy + +Email: jean-jacques.levy@inria.fr + +Abstract + +For the book to be really self-contained, this chapter introduces all the basic notions about inductive definitions and formal languages in general (variables, expressions, substitution, bound and free variables, sorts, ...). Then it introduces three ways to define the semantics of a programming language: denotational semantics, big-step and small-step operational semantics. This chapter starts from scratch and gives many examples. + +## 1.1 Inductive Definitions + +Since the semantics of a programming language is a relation, we will start by introducing some tools to define sets and relations. + +The most basic tool is the notion of an explicit definition. We can, for example, define explicitly the function that multiplies its argument by 2: x ↦ 2 * x, the set of even numbers: {n ∈ ℕ | ∃p ∈ ℕ n = 2 * p}, or the divisibility relation: {(n,m) ∈ ℕ² | ∃p ∈ ℕ n = m * p}. However, these explicit definitions are not sufficient to define all the objects we need. A second tool to define sets and relations is the notion of an inductive definition. This notion is based on a simple theorem: the fixed point theorem. + +### 1.1.1 The Fixed Point Theorem + +Let ≤ be an ordering relation—that is, a reflexive, antisymmetric and transitive relation—over a set E, and let u₀, u₁, u₂, ... be an increasing sequence, that is, a sequence such that u₀ ≤ u₁ ≤ u₂ ≤ ... The element l of E is called the limit of the sequence u₀, u₁, u₂, ... if it is a least upper bound of the set {u₀, u₁, u₂, ...}, that is, if + + * for all i, uᵢ ≤ l + + * if, for all i, uᵢ ≤ l', then l ≤ l'. + +If it exists, the limit of a sequence (uᵢ)ᵢ is unique, and we denote it by limᵢ uᵢ. + +The ordering relation ≤ is said to be weakly complete if all the increasing sequences have a limit. + +The standard ordering relation over the real numbers interval [0, 1] is an example of a weakly complete ordering. In addition, this relation has a least element 0. However, the standard ordering relation over ℝ⁺ is not weakly complete since the increasing sequence 0, 1, 2, 3, ... does not have a limit. + +Let A be an arbitrary set. The inclusion relation ⊆ over the set ℘(A) of all the subsets of A is another example of a weakly complete ordering. The limit of an increasing sequence U₀, U₁, U₂, ... is the set ⋃ᵢ Uᵢ. In addition, this relation has a least element ∅. + +Let f be a function from E to E. The function f is increasing if x ≤ y implies f x ≤ f y. It is continuous if, in addition, for any increasing sequence, f (limᵢ uᵢ) = limᵢ (f uᵢ). + +First Fixed Point Theorem + +Let ≤ be a weakly complete ordering relation over a set E that has a least element m. Let f be a function from E to E. If f is continuous then p = limᵢ (fⁱ m) is the least fixed point of f. + +Proof + +First, since m is the smallest element in E, m ≤ f m. The function f is increasing, therefore fⁱ m ≤ fⁱ⁺¹ m. Since the sequence fⁱ m is increasing, it has a limit. The sequence fⁱ⁺¹ m also has p as limit, thus, p = limᵢ (f (fⁱ m)) = f (limᵢ (fⁱ m)) = f p. Moreover, p is the least fixed point, because if q is another fixed point, then m ≤ q and fⁱ m ≤ fⁱ q = q (since f is increasing). Hence p = limᵢ (fⁱ m) ≤ q. 
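+
+When E is the set of subsets of a finite set, ordered by inclusion, the construction used in this proof can be executed directly: start from the least element ∅ and iterate f until the sequence of iterates fⁱ ∅ stabilises. A minimal OCaml sketch (the names lfp and step and the bound 20 are our own choices) computing the even numbers up to 20 as the least fixed point of the function C ↦ {0} ∪ {n + 2 | n ∈ C, n + 2 ≤ 20}:
+
+module IS = Set.Make (Int)
+
+(* Iterate f from x until a fixed point is reached; this terminates here
+   because f is increasing and the chain of iterates stabilises. *)
+let rec lfp f x =
+  let y = f x in
+  if IS.equal y x then x else lfp f y
+
+(* step c = {0} ∪ {n + 2 | n ∈ c, n + 2 <= 20} *)
+let step c =
+  IS.add 0 (IS.map (fun n -> n + 2) (IS.filter (fun n -> n <= 18) c))
+
+let () = IS.iter (Printf.printf "%d ") (lfp step IS.empty)
+(* prints: 0 2 4 6 8 10 12 14 16 18 20 *)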
The second fixed point theorem states the existence of a fixed point for increasing functions, even if they are not continuous, provided the ordering satisfies a stronger property.

An ordering ≤ over a set E is strongly complete if every subset A of E has a least upper bound sup A.

The standard ordering relation over the interval [0, 1] is an example of a strongly complete ordering relation. The standard ordering over ℝ+ is not strongly complete because the set ℝ+ itself has no upper bound.

Let A be an arbitrary set. The inclusion relation ⊆ over the set ℘(A) of all the subsets of A is another example of a strongly complete ordering. The least upper bound of a set B of subsets of A is the union ⋃ {C | C ∈ B}.

Exercise 1.1

Show that any strongly complete ordering is also weakly complete.

Is the ordering

weakly complete? Is it strongly complete?

Note that if the ordering ≤ over the set E is strongly complete, then any subset A of E has a greatest lower bound inf A. Indeed, let A be a subset of E, let B be the set {y ∈ E | ∀x ∈ A, y ≤ x} of lower bounds of A, and let l be the least upper bound of B. By definition, l is an upper bound of the set B

  * ∀y ∈ B, y ≤ l

and it is the least one

  * (∀y ∈ B, y ≤ l') ⇒ l ≤ l'.

It is easy to show that l is the greatest lower bound of A. Indeed, if x is an element of A, it is an upper bound of B, and since l is the least upper bound, l ≤ x. Thus, l is a lower bound of A. To show that it is the greatest one, it is sufficient to note that if m is another lower bound of A, it is an element of B and therefore m ≤ l.

The greatest lower bound of a set B of subsets of A is, of course, the intersection ⋂ {C | C ∈ B}.

Second Fixed Point Theorem

Let ≤ be a strongly complete ordering over a set E. Let f be a function from E to E. If f is increasing, then p = inf {c | f c ≤ c} is the least fixed point of f.

Proof

Let C be the set {c | f c ≤ c} and let c be an element of C. Then p ≤ c because p is a lower bound of C. Since the function f is increasing, we deduce that f p ≤ f c. Also, f c ≤ c because c is an element of C, so by transitivity f p ≤ c.

The element f p is smaller than all the elements of C; it is therefore also smaller than or equal to their greatest lower bound: f p ≤ p.

Since the function f is increasing, f (f p) ≤ f p, thus f p is an element of C, and since p is a lower bound of C, we deduce p ≤ f p. By antisymmetry, p = f p.

Finally, by definition, all the fixed points of f belong to C, and they are therefore greater than p. □

### 1.1.2 Inductive Definitions

We will now see how these fixed point theorems can be used to define sets and relations.

Let A be a set, f a function from A^n to A and E a subset of A. The set E is closed under the function f if for all a1, ..., an in E, f a1 ... an is also in E. For example, the set of even numbers is closed under the function n ↦ n + 2.

Let A be a set. An inductive definition of a subset E of A is a family of partial functions f1 from A^{n1} to A, f2 from A^{n2} to A, .... The set E is defined as the smallest subset of A that is closed under the functions f1, f2, ....

For example, the subset of ℕ containing all the even numbers is inductively defined by the number 0—that is, the function from ℕ^0 to ℕ that returns the value 0—and the function n ↦ n + 2 from ℕ to ℕ. The subset of {a, b, c}* containing all the words of the form aⁿbcⁿ is inductively defined by the word b and the function m ↦ a m c. In general, a context-free grammar can always be specified as an inductively defined set.
In logic, the set of theorems is defined as the subset of the propositions that is inductively defined by the axioms and the deduction rules.

The functions f1, f2, ... are called rules. Instead of writing a rule as x1 ... xn ↦ t, we will use the notation

$$\frac{x_1 \quad \ldots \quad x_n}{t}$$

For example, the set of even numbers is defined by the rules

$$\frac{}{0} \qquad \frac{n}{n + 2}$$

Let P be the set of even numbers. We will sometimes write the rules as follows:

$$\frac{}{0 \in P} \qquad \frac{n \in P}{n + 2 \in P}$$

In order to define a language inductively, we will sometimes use a notation borrowed from language theory, where, for example, the set of words of the form aⁿbcⁿ is defined as follows

m ::= b | a m c

To show that there is indeed a smallest subset of A that is closed under the functions f1, f2, ..., we define a function F from ℘(A) to ℘(A)

F C = {x ∈ A | there exist a rule fi and elements y1, ..., y_{ni} of C such that x = fi y1 ... y_{ni}}.

A subset C of A is closed under the functions f1, f2, ... if and only if F C ⊆ C.

The function F is trivially increasing: if C ⊆ C' then F C ⊆ F C'. In addition, it is continuous, that is, if C0 ⊆ C1 ⊆ C2 ⊆ ⋅⋅⋅ then F (⋃j Cj) = ⋃j (F Cj). Indeed, if an element x of A is in F (⋃j Cj), then there exist a number i and elements y1, ..., y_{ni} in ⋃j Cj such that x = fi y1 ... y_{ni}. Each of these elements is in one of the Cj. Since the sequence Cj is increasing, they are all in Ck, the largest of these sets. Therefore, the element x belongs to F Ck and also to ⋃j (F Cj). Conversely, if x is in ⋃j (F Cj), then it belongs to some F Ck, and there are therefore a number i and elements y1, ..., y_{ni} of Ck such that x = fi y1 ... y_{ni}. The elements y1, ..., y_{ni} are in ⋃j Cj, and therefore x is in F (⋃j Cj).

The set E is defined as the least fixed point of the function F. This is the smallest set that satisfies the property F E = E and, according to the second fixed point theorem, it is also the smallest set that satisfies the property F E ⊆ E. Thus, it is the smallest set that is closed under the functions f1, f2, ....

The set of even numbers is not the only subset of ℕ that contains 0 and is closed under the function n ↦ n + 2—the set ℕ, for example, also satisfies these properties—but it is the smallest one. It can be defined as the intersection of all those sets. The second fixed point theorem allows us to generalise this observation and define E as the intersection of all the sets that are closed under the functions f1, f2, ....

The first fixed point theorem shows that an element x is in E if and only if there is some number k such that x is in the set F^k ∅, that is, if there is a rule fi such that x = fi y1 ... y_{ni} where y1, ..., y_{ni} are in F^(k−1) ∅. Iterating, that is, by induction on k, we can show that an element x of A is in E if and only if there exists a tree where the nodes are labelled by elements of A, the root is labelled by x, and whenever a node is labelled by c, its children are labelled by d1, ..., dn such that for some rule f we have c = f d1 ... dn. Such a tree is called a derivation of x. This notion of a derivation generalises the notion of proof in logic. We can then define the set E as the set of elements x of A for which there is a derivation.
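Derivations can be represented directly as trees in a program. The following OCaml sketch is our own illustration (the type and names are not from the book): it checks that a tree is a correct derivation for the two rules defining the even numbers.

```ocaml
(* A node carries the element it derives and the subderivations of its
   premises; `check` verifies that every node is an instance of one of the
   two rules: the axiom 0, and "from n, derive n + 2". *)
type derivation = Node of int * derivation list

let rec check (Node (c, premises)) =
  match premises with
  | [] -> c = 0                                 (* axiom: 0 is even *)
  | [ (Node (d, _) as p) ] -> c = d + 2 && check p
  | _ -> false

(* A derivation of 8, built from derivations of 6, 4, 2 and 0. *)
let d8 = Node (8, [Node (6, [Node (4, [Node (2, [Node (0, [])])])])])
let () = assert (check d8)
```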
We will use a specific notation for derivations. First, the root of the tree will be written at the bottom and the leaves at the top. Then, we will write a line over each node of the tree, and we will write its children over the line.

The number 8, for example, is in the set of even numbers, as the following derivation shows

$$\dfrac{\dfrac{\dfrac{\dfrac{\dfrac{}{0}}{2}}{4}}{6}}{8}$$

If we call P the set of even numbers, we can write the derivation as follows

$$\dfrac{\dfrac{\dfrac{\dfrac{\dfrac{}{0 \in P}}{2 \in P}}{4 \in P}}{6 \in P}}{8 \in P}$$

### 1.1.3 Structural Induction

Inductive definitions suggest a method to write proofs. If a property is hereditary, that is, if whenever it holds for y1, ..., y_{ni} it also holds for fi y1 ... y_{ni}, then we can deduce that it holds for all the elements of E.

One way to show this is to use the second fixed point theorem and to observe that the subset P of A containing all the elements that satisfy the property is closed under the functions fi, and thus it includes E. Another way is to use the first fixed point theorem and to show, by induction on k, that all the elements of F^k ∅ satisfy the property.

### 1.1.4 The Reflexive-Transitive Closure of a Relation

The reflexive-transitive closure of a relation is an example of an inductive definition. If R is a binary relation on a set A, we can inductively define another relation R*, called the reflexive-transitive closure of R

$$\frac{}{x \mathrel{R^*} y}\ (\text{if } x \mathrel{R} y) \qquad \frac{}{x \mathrel{R^*} x} \qquad \frac{x \mathrel{R^*} y \quad y \mathrel{R^*} z}{x \mathrel{R^*} z}$$

If we see R as a directed graph, then R* is the relation that links two nodes whenever there is a path from one to the other.

## 1.2 Languages

### 1.2.1 Languages Without Variables

Now that we have introduced inductive definitions, we will use this technique to define the notion of a language. The notion of language that we will define does not take into account superficial syntactic conventions: for instance, it does not matter whether we write 3 + 4, +(3,4), or 3 4 +. This term will be represented in an abstract way by a tree.

Each node in the tree will be labelled by a symbol. The number of children of a node depends on the node's label—2 children if the label is +, 0 if it is 3 or 4, ....

A language is thus a set of symbols, each with an associated number called the arity, or simply the number of arguments, of the symbol. The symbols without arguments are called constants.

The set of terms of the language is the set of trees inductively defined by

  * if f is a symbol with n arguments and t1, ..., tn are terms, then f(t1, ..., tn)—that is, the tree that has a root labelled by f and subtrees t1, ..., tn—is a term.

### 1.2.2 Variables

Imagine that we want to design a language to define functions. One possibility would be to use constants sin, cos, ... and a symbol ○ with two arguments. We could, for instance, build the term sin ○ (cos ○ sin) in this language.

However, we know that, to specify functions, it is easier to use a notion invented by F. Viète (1540–1603): the notion of a variable. Thus, the function described above can be written sin (cos (sin x)).

Since the 1930s, we write this function x ↦ sin (cos (sin x)) or λx sin (cos (sin x)), using the symbol ↦ or λ to bind the variable x. By indicating explicitly which variables are bound, we can distinguish the arguments of the function from potential parameters, and we also fix the order of the arguments.
The symbol ↦ appears to have been introduced by N. Bourbaki around 1930, and the symbol λ by A. Church around the same time. The notation λ is a simplified version of an earlier notation, x̂ sin (cos (sin x)), used by A.N. Whitehead and B. Russell since the 1900s.

The definition f = x ↦ sin (cos (sin x)) is sometimes written f x = sin (cos (sin x)). The advantage of writing f = x ↦ sin (cos (sin x)) is that this way we can distinguish two different operations: the construction of the function x ↦ sin (cos (sin x)), and the definition itself, which gives a name to a previously constructed object. It is often important, in computer science, to have notations that allow us to build objects without necessarily giving them a name.

In this book, we use the notation fun x -> sin (cos (sin x)) to specify this function.

The term fun x -> sin (cos (sin x)) specifies a function. However, its subterm sin x does not specify anything: it is not a real number and it is not a function, because it contains a free variable whose value we do not know.

To bind variables in terms, we need to extend the notion of term to include free variables, which will be bound later. This also requires new symbols, such as fun, which act as binders for the variables in some of their arguments. Other examples of binders are the symbol { | }, the symbol ∂/∂, the symbol ∫ d, the symbols ∑ and ∏, the quantifiers ∀ and ∃, .... In this book we will use several binders: the symbol fun above, and the symbols fix, let, fixfun, ....

The arity of a symbol f will no longer be a number n; instead, we will use a finite sequence of numbers (k1, ..., kn) indicating that f binds k1 variables in its first argument, k2 variables in its second, ..., and kn variables in its nth.

In this way, when a language is defined—that is, a set of symbols with their arities—and an infinite set of variables is given, we can define the set of terms inductively as follows

  * variables are terms,

  * if f is a symbol with arity (k1, ..., kn), t1, ..., tn are terms and $x^1_1, \ldots, x^1_{k_1}, \ldots, x^n_1, \ldots, x^n_{k_n}$ are variables, then $f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots, x^n_1 \ldots x^n_{k_n}\ t_n)$ is a term.

The notation $f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots, x^n_1 \ldots x^n_{k_n}\ t_n)$ denotes the tree whose root is labelled by f and whose ith subtree is ti, in which the variables $x^i_1, \ldots, x^i_{k_i}$ are bound.

This definition is better understood with an example. We build a language in which terms specify real numbers and functions over the reals, and which includes two constants sin and cos to represent the functions sine and cosine, a symbol α, called application, such that α(f,x) is the object obtained by applying the function f to the object x, and a symbol fun to build functions. This language includes four symbols: the constants sin and cos, α with arity (0,0), and fun with arity (1); the set of terms is inductively defined by

  * variables are terms,

  * sin is a term,

  * cos is a term,

  * if t and u are terms then α(t,u) is a term,

  * if t is a term and x is a variable then fun(x t) is a term.

We will adopt a simplified notation, writing t u for the term α(t,u) and fun x -> t for the term fun(x t).

For example, fun x -> sin (cos (sin x)) is a term in this language.

### 1.2.3 Many-Sorted Languages

In this book, we will sometimes use more general languages, called many-sorted languages. Consider, for instance, the language used to describe vectors, with a finite number of constants, addition, and scalar multiplication. In this language, there are two sorts of terms: terms describing a vector and terms describing a scalar.
In the definition of the language, we indicate that the symbol + has two arguments, which are both vectors, and that the symbol . has two arguments, which are a scalar and a vector.

For this, we introduce a set with two elements {vect, scal}, called sorts, and we associate to the symbol . the arity (scal, vect, vect). This arity indicates that in a term of the form λ.v, the term λ must be of sort scal, the term v of sort vect, and the term λ.v is itself of sort vect.

When, in addition, there are bound variables, the arity of a symbol f is a finite sequence $((s^1_1, \ldots, s^1_{k_1}, s'_1), \ldots, (s^n_1, \ldots, s^n_{k_n}, s'_n), s'')$ indicating that the symbol has n arguments, the ith one of sort $s'_i$ and binding $k_i$ variables of sorts $s^i_1, \ldots, s^i_{k_i}$, and that the resulting term is itself of sort s''.

Given a language—that is, a set of sorts and a set of symbols, each with an associated arity—and a family, indexed by sorts, of infinite, pairwise disjoint sets of variables, we can inductively define terms as follows:

  * variables of sort s are terms of sort s,

  * if f is a symbol of arity $((s^1_1, \ldots, s^1_{k_1}, s'_1), \ldots, (s^n_1, \ldots, s^n_{k_n}, s'_n), s'')$, the $x^i_j$ are variables of sort $s^i_j$, and t1, ..., tn are terms of sorts $s'_1, \ldots, s'_n$, then $f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots, x^n_1 \ldots x^n_{k_n}\ t_n)$ is a term of sort s''.

### 1.2.4 Free and Bound Variables

The set of variables of a term is defined by structural induction:

  * Var(x) = {x},

  * $Var(f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots, x^n_1 \ldots x^n_{k_n}\ t_n)) = Var(t_1) \cup \{x^1_1, \ldots, x^1_{k_1}\} \cup \cdots \cup Var(t_n) \cup \{x^n_1, \ldots, x^n_{k_n}\}$.

We can also define the set of free variables of a term:

  * FV(x) = {x},

  * $FV(f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots, x^n_1 \ldots x^n_{k_n}\ t_n)) = (FV(t_1) \setminus \{x^1_1, \ldots, x^1_{k_1}\}) \cup \cdots \cup (FV(t_n) \setminus \{x^n_1, \ldots, x^n_{k_n}\})$.

For example, Var(fun x -> sin (cos (sin x))) = {x} and FV(fun x -> sin (cos (sin x))) = ∅.

A term without free variables is said to be closed.

The height of a term is also defined by structural induction:

  * Height(x) = 0,

  * $Height(f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots, x^n_1 \ldots x^n_{k_n}\ t_n)) = 1 + \max(Height(t_1), \ldots, Height(t_n))$.
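These definitions translate directly into a program. Here is a minimal OCaml sketch of Var and FV, ours rather than the book's, restricted to the sin/cos language of Sect. 1.2.2 (the type and constructor names are our own):

```ocaml
module S = Set.Make (String)

type term =
  | Var of string
  | Sin
  | Cos
  | App of term * term
  | Fun of string * term        (* fun binds one variable in its argument *)

let rec vars = function
  | Var x -> S.singleton x
  | Sin | Cos -> S.empty
  | App (t, u) -> S.union (vars t) (vars u)
  | Fun (x, t) -> S.add x (vars t)

let rec fv = function
  | Var x -> S.singleton x
  | Sin | Cos -> S.empty
  | App (t, u) -> S.union (fv t) (fv u)
  | Fun (x, t) -> S.remove x (fv t)

(* fun x -> sin (cos (sin x)): Var = {x}, FV = ∅. *)
let t = Fun ("x", App (Sin, App (Cos, App (Sin, Var "x"))))
let () = assert (S.is_empty (fv t) && S.elements (vars t) = ["x"])
```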
### 1.2.5 Substitution

The first operation that we need to define is substitution: indeed, the rôle of variables is not only to be bound, but also to be substituted. For example, when we apply the function fun x -> sin (cos (sin x)) to the term 2 * π, at some point we will need to substitute the variable x by the term 2 * π in the term sin (cos (sin x)).

A substitution is simply a mapping from variables to terms, with a finite domain. In other words, a substitution is a finite set of pairs where the first element is a variable and the second a term, such that each variable occurs at most once as the first element of a pair. We can also write a substitution as an association list θ = t1/x1 ... tn/xn.

When a substitution is applied to a term, each occurrence of a variable x1, ..., xn in the term is replaced by t1, ..., tn, respectively.

Of course, this replacement only affects the free variables. For example, if we substitute the variable x by the term 2 in the term x + 3, we should obtain the term 2 + 3. However, if we substitute the variable x by the term 2 in the term fun x -> x, which represents the identity function, we should obtain the term fun x -> x and not fun x -> 2.

A first attempt to define the application of a substitution θ to a term is as follows:

  * 〈θ〉xi = ti,

  * 〈θ〉x = x if x is not in the domain of θ,

  * $\langle\theta\rangle f(y^1_1 \ldots y^1_{k_1}\ u_1, \ldots) = f(y^1_1 \ldots y^1_{k_1}\ \langle\theta_{|V \setminus \{y^1_1, \ldots, y^1_{k_1}\}}\rangle u_1, \ldots)$

where we use the notation $\theta_{|V \setminus \{y_1, \ldots, y_k\}}$ for the restriction of the substitution θ to the set V ∖ {y1, ..., yk}, that is, the substitution in which we have omitted all the pairs whose first element is one of the variables y1, ..., yk.

This definition is problematic because substitutions could capture variables. For example, the term fun x -> (x + y) represents the function that adds y to its argument. If we substitute y by 4 in this term, we obtain the term fun x -> (x + 4), representing the function that adds 4 to its argument. If we substitute y by z, we get the term fun x -> (x + z), representing the function that adds z to its argument. But if we substitute y by x, we obtain the function fun x -> (x + x), which doubles its argument, instead of the function that adds x to its argument, as expected. We can avoid this problem if we change the name of the bound variable: bound variables are dummies, their name does not matter. In other words, in the term fun x -> (x + y), we can replace the bound variable x by any other variable, except of course y. Similarly, when we substitute in the term u the variables x1, ..., xn by the terms t1, ..., tn, we can change the names of the bound variables in u to make sure that they do not occur among x1, ..., xn, in the variables of t1, ..., tn, or in the variables of u, to avoid capture.

We start by defining an equivalence relation on terms, by induction on the height of terms. This relation is called alphabetic equivalence—or α-equivalence—and it corresponds to variable renaming.

  * x ∼ x,

  * $f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots) \sim f(x'^1_1 \ldots x'^1_{k_1}\ t'_1, \ldots)$ if, for all i and for any sequence $z_1, \ldots, z_{k_i}$ of fresh variables (that is, variables occurring neither in ti nor in t'i), we have $(z_1/x^i_1 \ldots z_{k_i}/x^i_{k_i})\, t_i \sim (z_1/x'^i_1 \ldots z_{k_i}/x'^i_{k_i})\, t'_i$.

For example, the terms fun x -> x + z and fun y -> y + z are α-equivalent.

In the rest of the book we will work with terms modulo α-equivalence, that is, we will implicitly consider α-equivalence classes of terms.

We can now define the operation of substitution by induction on the height of terms:

  * θxi = ti,

  * θx = x if x is not in the domain of θ,

  * $\theta f(y^1_1 \ldots y^1_{k_1}\ u_1, \ldots) = f(z^1_1 \ldots z^1_{k_1}\ \theta((z^1_1/y^1_1 \ldots z^1_{k_1}/y^1_{k_1})\, u_1), \ldots)$, where the $z^i_j$ are variables occurring neither in $f(y^1_1 \ldots y^1_{k_1}\ u_1, \ldots)$ nor in θ.

For example, if we substitute the variable y by the term 2 * x in the term fun x -> x + y, we obtain the term fun z -> z + (2 * x). The choice of the variable z is arbitrary: we could have chosen v or w, and we would have obtained the same term modulo α-equivalence.

The composition of the substitutions θ = t1/x1 ... tn/xn and σ = u1/y1 ... up/yp is the substitution

θ ∘ σ = (θu1)/y1 ... (θup)/yp t1/x1 ... tn/xn

where we keep only those pairs ti/xi such that xi does not occur among y1, ..., yp. We can prove, by induction on the height of t, that for any term t

(θ ∘ σ) t = θ(σ t).
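Continuing the previous sketch, here is one possible implementation of substitution modulo renaming, again for the small sin/cos language. This is our illustration, not the book's code: it systematically renames every bound variable with a machine-generated fresh name, which is a simple way to avoid capture, under the assumption that the generated names z1, z2, ... are never used as source variables.

```ocaml
(* A substitution is an association list of (variable, term) pairs. *)
let counter = ref 0
let fresh () = incr counter; Printf.sprintf "z%d" !counter

let rec subst theta = function
  | Var x -> (try List.assoc x theta with Not_found -> Var x)
  | (Sin | Cos) as t -> t
  | App (t, u) -> App (subst theta t, subst theta u)
  | Fun (x, t) ->
      let z = fresh () in                         (* rename the bound variable *)
      Fun (z, subst ((x, Var z) :: theta) t)      (* then substitute underneath *)

(* Substituting y by x under fun x -> ... does not capture x:
   subst [("y", Var "x")] (Fun ("x", App (Var "x", Var "y")))
   = Fun ("z1", App (Var "z1", Var "x")). *)
```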
## 1.3 Three Ways to Define the Semantics of a Language

The semantics of a programming language is a binary relation over the set of terms of the language. Since we have already defined the notion of a language and introduced tools to define relations, we are ready to describe the three main techniques used for semantic definitions. The semantics of a language is usually given as a function, as an inductive definition, or as the reflexive-transitive closure of an explicitly defined relation. These are called denotational semantics, big-step operational semantics and small-step operational semantics, respectively.

### 1.3.1 Denotational Semantics

Denotational semantics is useful for deterministic languages. In this case, for each program p, the input-output relation defined by the program is a function, written 〚p〛. The relation ↪ is then defined by

p e1 ... en ↪ s  if and only if  〚p〛(e1, ..., en) = s.

Of course, this simply moves the problem further down: we now need to define the function 〚p〛. For this, we will use two tools: explicit definitions of functions, and the fixed point theorem... but we will leave this for later.

### 1.3.2 Big-Step Operational Semantics

Big-step operational semantics is also called structural operational semantics (S.O.S.) or natural semantics. It gives an inductive definition of the relation ↪.

### 1.3.3 Small-Step Operational Semantics

Small-step operational semantics is also called reduction semantics. It defines the relation ↪ by means of another relation, ⟶, that describes the basic steps transforming the initial term t into the final term s.

For example, when we run the program fun x -> (x * x) + x with input 4, we obtain the result 20. But the term (fun x -> (x * x) + x) 4 does not become 20 in one step: it is first transformed into (4 * 4) + 4, then 16 + 4, and finally 20.

The most important relation is thus not the one that links (fun x -> (x * x) + x) 4 with 20, but the relation ⟶, which relates the term (fun x -> (x * x) + x) 4 with (4 * 4) + 4, then the term (4 * 4) + 4 with 16 + 4, and finally the term 16 + 4 with the term 20.

Once the relation ⟶ is given, ↪ can be derived from its reflexive-transitive closure:

t ↪ s  if and only if  t ⟶* s and s is irreducible.

The fact that the term s is irreducible guarantees that there is nothing left to compute in s. For example, the term 20 is irreducible, but the term 16 + 4 is not. A term s is irreducible if there is no term s' such that s ⟶ s'.

### 1.3.4 Non-termination

The execution of a program may produce a result, produce an error, or never terminate. Errors can be seen as particular kinds of results. For non-terminating programs, there are several ways to define a semantics. A first alternative is to consider that if the term t does not terminate, then there is no pair (t, s) in the relation ↪. Another alternative is to add a specific element ⊥ to the set of output values, and to state that the relation ↪ contains the pair (t, ⊥) when the term t does not terminate.

The difference may seem superficial: it is easy to delete all the pairs of the form (t, ⊥), or to add such a pair when there is no pair of the form (t, s) in the relation. However, readers familiar with computability will notice that, if we add the pairs (t, ⊥), the relation ↪ is no longer recursively enumerable.
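The way small-step semantics induces the relation ↪ is itself a one-line program. A small sketch of ours: given any one-step function `step` returning None on irreducible terms, `eval` iterates it; on a non-terminating term, `eval` loops forever, which corresponds to the first alternative above (no result at all).

```ocaml
let rec eval (step : 'a -> 'a option) (t : 'a) : 'a =
  match step t with
  | None -> t                 (* t is irreducible: it is the result *)
  | Some t' -> eval step t'
```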
# 2. The Language PCF

Gilles Dowek and Jean-Jacques Lévy

Abstract

This chapter introduces a specific programming language called PCF (sometimes also called Mini-ML). This language is one of the backbones of the book: it will be evaluated, interpreted, compiled, and extended (with types, references, records and objects) in the rest of the book. The chapter gives an informal description of the language, defines its small-step and big-step operational semantics, and culminates with the implementation of an evaluator for the language.

We will illustrate the various styles of semantics of programming languages with an example: the language PCF—Programming language for computable functions—also called Mini-ML.

## 2.1 A Functional Language: PCF

### 2.1.1 Programs Are Functions

We observed in the previous chapter that a deterministic program computes a function, and from this observation we derived the principles of denotational semantics. This remark is also the basis of a class of programming languages: functional languages, such as Caml, Haskell or Lisp, which are traditionally used to begin the study of programming languages.

In these languages, the goal is to shorten the distance between the notion of a program and the notion of a mathematical function. In other words, the idea is to bring programs closer to their denotational semantics.

The basic constructions of the language PCF are the explicit construction of a function, written fun x -> t, and the application of a function to an argument, written t u.

PCF also includes a constant for each natural number, the operations +, -, * and /, and a test for zero, ifz t then u else v. Addition and multiplication are defined for all natural numbers, and so is subtraction, using the convention n - m = 0 if n < m. Division is standard Euclidean division; division by 0 produces an error.

### 2.1.2 Functions Are First-Class Objects

In many programming languages it is possible to define a function that takes another function as argument, or that returns another function, but this often requires a syntax different from the one used for an ordinary argument such as an integer or a string. In a functional language, functions are defined in the same way whether they take numbers or functions as arguments.

For example, the composition of a function with itself is defined by fun f -> fun x -> f (f x).

To highlight the fact that functions are not treated differently from other objects—they can be passed as arguments to, and returned as results by, other functions—we say that functions are first-class objects.

### 2.1.3 Functions with Several Arguments

In PCF, there is no symbol to build a function with several arguments. Such functions are built as functions of one argument, using the isomorphism (A × B) -> C = A -> (B -> C). For instance, the function that associates to x and y the number x * x + y * y is defined as the function associating to x a function, which in turn associates to y the number x * x + y * y, that is, fun x -> fun y -> x * x + y * y.

Then, to apply a function f to the numbers 3 and 4, we first apply it to 3, obtaining the term f 3, which represents the function that associates 3 * 3 + y * y to y, and then to 4, obtaining the term (f 3) 4. Since, by convention, application associates to the left, we write this term simply f 3 4.

### 2.1.4 No Assignments

In contrast with languages such as Caml or Java, a distinctive feature of PCF is its total lack of assignments. There is no construction of the form x := t or x = t to assign a value to a "variable". We will describe, in Chap. 7, an extension of PCF with assignments.

### 2.1.5 Recursive Definitions

In mathematics, some functions cannot be defined explicitly. For example, in a high-school textbook, the power function is often defined by

x^n = x * ⋯ * x  (n times)

or through a definition by induction. In programming languages, we use similar constructs: iterations and recursive definitions.
PCF includes a special construct to define recursive functions.

It is often said that a function is recursive if the function is used in its own definition. This is absurd: in programming languages, as everywhere else, circular definitions are meaningless. We cannot "define" the function fact by fun n -> ifz n then 1 else n * (fact (n - 1)). In general, we cannot define a function f by a term G that contains an occurrence of f. However, we can define the function f as the fixed point of the function fun f -> G. For example, we can define the function fact as the fixed point of the function fun f -> fun n -> ifz n then 1 else n * (f (n - 1)).

Does this function have a fixed point? And if it does, is this fixed point unique? Otherwise, which fixed point are we referring to? We will leave these questions aside for a moment, and simply state that a recursive function is defined as a fixed point.

In PCF, the symbol fix binds a variable in its argument, and the term fix f G denotes the fixed point of the function fun f -> G. The function fact can then be defined as fix f fun n -> ifz n then 1 else n * (f (n - 1)).

Note, again, that using the symbol fix we can build the factorial function without necessarily giving it a name.

### 2.1.6 Definitions

We could, in theory, do without definitions and replace everywhere the defined symbols by their definitions. However, programs are simpler and clearer if we use definitions.

We therefore add a final construct to PCF, written let x = t in u. The occurrences of the variable x in u are bound, but those in t are not. The symbol let is thus a symbol with two arguments, which binds a variable in its second argument.

### 2.1.7 The Language PCF

The language PCF contains

  * a symbol fun with one argument, which binds a variable in its argument,

  * a symbol α with two arguments, which does not bind any variable in its arguments,

  * an infinite number of constants to represent the natural numbers,

  * four symbols +, -, * and / with two arguments, which do not bind any variables in their arguments,

  * a symbol ifz with three arguments, which does not bind any variables in its arguments,

  * a symbol fix with one argument, which binds a variable in its argument,

  * a symbol let with two arguments, which binds a variable in its second argument.

In other words, the syntax of PCF is inductively defined by

t ::= x | fun x -> t | t t | n | t + t | t - t | t * t | t / t
    | ifz t then t else t | fix x t | let x = t in t

Despite its small size, PCF is Turing complete, that is, all computable functions can be programmed in PCF.
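This inductive definition of the syntax can be written down directly as a datatype. The following OCaml sketch is one natural representation, ours rather than the book's (the constructor names are our own); variables are represented by their names.

```ocaml
type term =
  | Var of string
  | Fun of string * term            (* fun x -> t : binds x in t *)
  | App of term * term              (* t u        : the symbol α *)
  | Nat of int                      (* constants 0, 1, 2, ... *)
  | Op of char * term * term        (* '+', '-', '*', '/' *)
  | Ifz of term * term * term       (* ifz t then u else v *)
  | Fix of string * term            (* fix x t    : binds x in t *)
  | Let of string * term * term     (* let x = t in u : binds x in u *)

(* fact = fix f fun n -> ifz n then 1 else n * (f (n - 1)) *)
let fact =
  Fix ("f", Fun ("n",
    Ifz (Var "n", Nat 1,
         Op ('*', Var "n",
             App (Var "f", Op ('-', Var "n", Nat 1))))))
```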
Exercise 2.1

Write a PCF program that takes two natural numbers n and p as input and returns n^p.

Exercise 2.2

Write a PCF program that takes a natural number n as input and returns the number 1 if the input is a prime number, and 0 otherwise.

Exercise 2.3

(Polynomials in PCF) Write a PCF program that takes a natural number q as input and returns the greatest natural number u such that u (u + 1) / 2 ≤ q.

Cantor's function K is the function from ℕ² to ℕ defined by fun n -> fun p -> (n + p) (n + p + 1) / 2 + n. Let K' be the function from ℕ to ℕ² defined by fun q -> (q - (u (u + 1) / 2), u - q + u (u + 1) / 2), where u is the greatest natural number such that u (u + 1) / 2 ≤ q.

Show that K ∘ K' = id. Let n and p be two natural numbers; show that the greatest natural number u such that u (u + 1) / 2 ≤ (n + p) (n + p + 1) / 2 + n is n + p. Deduce that K' ∘ K = id, and from this fact deduce that K is a bijection from ℕ² to ℕ.

Let L be the function fun n -> fun p -> (K n p) + 1. A polynomial with integer coefficients a0 + a1 X + ⋯ + ai X^i + ⋯ + an X^n can be represented by the integer L a0 (L a1 (L a2 ... (L an 0) ...)).

Write a PCF program that takes two natural numbers as input and returns the value of the polynomial represented by the first number at the second.

## 2.2 Small-Step Operational Semantics for PCF

### 2.2.1 Rules

Let us apply the program fun x -> 2 * x to the constant 3. We obtain the term (fun x -> 2 * x) 3. According to the principles of small-step operational semantics, let us try to evaluate this term step by step, hoping to obtain a result: 6 if all goes well. The first step in this simplification process is parameter passing, that is, the replacement of the formal argument x by the actual argument 3. The initial term becomes, after this first small transformation step, the term 2 * 3. In a second step, the term 2 * 3 is evaluated, yielding the number 6. The first small step, parameter passing, can be performed whenever we have a term of the form (fun x -> t) u, where a function fun x -> t is applied to an argument u. As a consequence, we define the following rule, called the β-reduction rule

(fun x -> t) u ⟶ (u/x)t

The relation t ⟶ u should be read "t reduces—or rewrites—to u". The second step mentioned above can be generalised as follows

p ⊗ q ⟶ n  (if n = p ⊗ q)

where ⊗ is any of the four arithmetic operators included in PCF. We add similar rules for conditionals

ifz 0 then t else u ⟶ t

ifz p then t else u ⟶ u  (if p is a number different from 0)

a rule for fixed points

fix x t ⟶ ((fix x t)/x)t

and a rule for let

let x = t in u ⟶ (t/x)u

A redex is a term t that can be reduced. In other words, a term t is a redex if there exists a term u such that t ⟶ u.

### 2.2.2 Numbers

It could be said, quite rightly, that the rule p ⊗ q ⟶ n (if n = p ⊗ q), of which 2 * 3 ⟶ 6 is an instance, does not really explain the semantics of the arithmetic operators, since it just replaces multiplication in PCF by multiplication in mathematics. This choice is deliberate, however: we are not really interested in the semantics of the arithmetic operators; our goal is to highlight the semantics of the other constructs of the language.

To define the semantics of the arithmetic operators of PCF without referring to the mathematical operators, we should consider a variant of PCF without numeric constants, with just one constant for the number 0 and a symbol S—"successor"—with one argument. The number 3, for instance, is represented by the term S(S(S(0))). We then add the small-step rules

0 + u ⟶ u
S(t) + u ⟶ S(t + u)
0 - u ⟶ 0
t - 0 ⟶ t
S(t) - S(u) ⟶ t - u
0 * u ⟶ 0
S(t) * u ⟶ t * u + u
t / S(u) ⟶ ifz t - u then 0 else S((t - S(u)) / S(u))

Note that, to be precise, we should also add a rule for division by 0, which should raise an error.

Exercise 2.4

(Church numerals) Instead of introducing the symbols 0 and S, we can represent the number n by the term fun z -> fun s -> s (s (... (s z) ...)) rather than S(S(...(0)...)). Show that addition and multiplication can be programmed on these representations. Show that the function that checks whether a number is 0 can also be programmed.

Exercise 2.5

(Position numerals) It could be said that the representations of numbers using the symbols 0 and S, or using Church numerals, are not efficient, since the size of the term representing a number grows linearly with the number—as in unary notation, where writing the number n requires n symbols—and not logarithmically, as is the case with the usual position-based notation.
An alternative could be to use a symbol z for the number 0 and two functions O and I to represent the functions n ↦ 2 * n and n ↦ 2 * n + 1. The number 26 would then be represented by the term O(I(O(I(I(z))))); reading it backwards, we obtain IIOIO, the binary representation of this number.

Write a small-step operational semantics for the arithmetic operators in this language.

### 2.2.3 A Congruence

Using the rules of the small-step semantics, we obtain

(fun x -> 2 * x) 3 ⟶ 2 * 3 ⟶ 6

Thus, denoting by ⟶* the reflexive-transitive closure of ⟶, we can write (fun x -> 2 * x) 3 ⟶* 6.

However, with this definition, the term (2 + 3) + 4 does not reduce to the term 9 according to ⟶*. Indeed, to reduce a term of the form t + u, the terms t and u must be numeric constants, but our first term 2 + 3 is a sum, not a constant. The first step should then be the evaluation of 2 + 3, which produces the number 5; a second step then reduces 5 + 4 to 9. The problem is that, with our definition, the term 2 + 3 reduces to 5, but (2 + 3) + 4 does not reduce to 5 + 4.

We need to define another relation, in which rules can be applied to any subterm of the term to be reduced. Let us define inductively the relation ▷ as follows

$$\frac{}{t \mathrel{\triangleright} u}\ (\text{if } t \longrightarrow u) \qquad \frac{t \mathrel{\triangleright} u}{t\ v \mathrel{\triangleright} u\ v} \qquad \frac{t \mathrel{\triangleright} u}{v\ t \mathrel{\triangleright} v\ u} \qquad \frac{t \mathrel{\triangleright} u}{\textsf{fun}\ x \to t \mathrel{\triangleright} \textsf{fun}\ x \to u} \qquad \frac{t \mathrel{\triangleright} u}{t + v \mathrel{\triangleright} u + v}$$

It is possible to show that a term is a redex with respect to the relation ▷ if and only if one of its subterms is a redex with respect to ⟶.
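The relation ▷ can also be sketched in OCaml, reusing the `term` type above. This is our illustration, under two stated simplifications: `subst` is a naive substitution, adequate as long as the substituted term is closed (which is the case when reducing closed programs with the weak strategies of Sect. 2.3; the capture-avoiding substitution of Sect. 1.2.5 lifts this restriction), and `step` picks one particular ▷-step, trying the head redex first and then the subterms from left to right.

```ocaml
let apply_op o p q = match o with
  | '+' -> p + q
  | '-' -> max 0 (p - q)        (* n - m = 0 if n < m *)
  | '*' -> p * q
  | _ -> p / q                  (* raises Division_by_zero on q = 0 *)

let rec subst x v = function
  | Var y -> if y = x then v else Var y
  | Nat n -> Nat n
  | Fun (y, t) -> if y = x then Fun (y, t) else Fun (y, subst x v t)
  | App (t, u) -> App (subst x v t, subst x v u)
  | Op (o, t, u) -> Op (o, subst x v t, subst x v u)
  | Ifz (t, u, w) -> Ifz (subst x v t, subst x v u, subst x v w)
  | Fix (y, t) -> if y = x then Fix (y, t) else Fix (y, subst x v t)
  | Let (y, t, u) -> Let (y, subst x v t, if y = x then u else subst x v u)

(* The rules of ⟶ applied at the root of the term. *)
let head_step = function
  | App (Fun (x, t), u) -> Some (subst x u t)            (* β-reduction *)
  | Op (o, Nat p, Nat q) -> Some (Nat (apply_op o p q))
  | Ifz (Nat 0, t, _) -> Some t
  | Ifz (Nat _, _, u) -> Some u
  | Fix (x, t) as f -> Some (subst x f t)
  | Let (x, t, u) -> Some (subst x t u)
  | _ -> None

(* Close ⟶ under the congruence: head first, then leftmost subterm. *)
let rec step t =
  match head_step t with
  | Some _ as r -> r
  | None ->
      let pair mk t1 t2 =
        match step t1 with
        | Some t1' -> Some (mk t1' t2)
        | None -> Option.map (fun t2' -> mk t1 t2') (step t2)
      in
      (match t with
       | App (t1, t2) -> pair (fun a b -> App (a, b)) t1 t2
       | Op (o, t1, t2) -> pair (fun a b -> Op (o, a, b)) t1 t2
       | Fun (x, u) -> Option.map (fun u' -> Fun (x, u')) (step u)
       | Ifz (t1, t2, t3) ->
           (match step t1 with
            | Some t1' -> Some (Ifz (t1', t2, t3))
            | None ->
                (match step t2 with
                 | Some t2' -> Some (Ifz (t1, t2', t3))
                 | None -> Option.map (fun t3' -> Ifz (t1, t2, t3')) (step t3)))
       | _ -> None)

(* Iterating step, as in Sect. 1.3.3, reduces fact applied to 3 to Nat 6:
   this head-first order never reduces inside the discarded branch of an
   ifz whose condition is already a number. *)
let rec normalize t = match step t with None -> t | Some t' -> normalize t'
```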
### 2.2.4 An Example

To illustrate PCF's small-step semantic rules, let us compute the factorial of 3.

(fix f fun n -> ifz n then 1 else n * (f (n - 1))) 3

▷ (fun n -> ifz n then 1 else n * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (n - 1))) 3

▷ ifz 3 then 1 else 3 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (3 - 1))

▷ 3 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (3 - 1))

▷ 3 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) 2)

▷ 3 * ((fun n -> ifz n then 1 else n * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (n - 1))) 2)

▷ 3 * (ifz 2 then 1 else 2 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (2 - 1)))

▷ 3 * (2 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (2 - 1)))

▷ 3 * (2 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) 1))

▷ 3 * (2 * ((fun n -> ifz n then 1 else n * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (n - 1))) 1))

▷ 3 * (2 * (ifz 1 then 1 else 1 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (1 - 1))))

▷ 3 * (2 * (1 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (1 - 1))))

▷ 3 * (2 * (1 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) 0)))

▷ 3 * (2 * (1 * ((fun n -> ifz n then 1 else n * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (n - 1))) 0)))

▷ 3 * (2 * (1 * (ifz 0 then 1 else 0 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (0 - 1)))))

▷ 3 * (2 * (1 * 1)) ▷ 3 * (2 * 1) ▷ 3 * 2 ▷ 6

### 2.2.5 Irreducible Closed Terms

A term t is irreducible if it cannot be reduced by ▷, that is, if there is no term u such that t ▷ u.

We can now define the relation "the term u is the result of the evaluation of the term t", where t is a closed term, by: t ↪ u if and only if t ▷* u and u is irreducible. In this case, the term u must be closed. Finally, the relation "the program p with inputs e1, ..., en produces the output s" is simply written p e1 ... en ↪ s.

Exercise 2.6

(Classification of irreducible closed terms) Show that a term is irreducible and closed if and only if it is of one of the following forms

  * fun x -> t, where t is irreducible and does not contain any free variables except possibly x,

  * n, where n is a number,

  * V1 V2, where V1 and V2 are irreducible closed terms and V1 is not of the form fun x -> t,

  * V1 ⊗ V2, where V1 and V2 are irreducible closed terms and are not both numeric constants,

  * ifz V1 then V2 else V3, where V1, V2 and V3 are irreducible closed terms and V1 is not a number.

Numbers and irreducible closed terms of the form fun x -> t are called values. When the result of a computation is a value, we associate the value to the initial term, and we say that the term evaluates to this value.

Unfortunately, values are not the only possible results. For example, the term (fun x -> x) 1 2 can be reduced to the term 1 2, which is irreducible and closed, and thus the term 1 2 is the result of the computation of (fun x -> x) 1 2. This result is meaningless, because we cannot apply the object 1, which is not a function, to 2. An irreducible closed term that is not a value is said to be stuck. Stuck terms have the form V1 V2, where V1 and V2 are irreducible closed terms and V1 is not a function fun x -> t (for example 1 2); V1 ⊗ V2, where V1 and V2 are irreducible and closed and are not both numbers (for example 1 + (fun x -> x)); and ifz V1 then V2 else V3, where V1, V2 and V3 are irreducible and closed and V1 is not a number (for example, ifz (fun x -> x) then 1 else 2).

Exercise 2.7

Which are the values associated to the terms

and

according to the small-step operational semantics of PCF?
Exercise 2.8

(Static binding) Does the small-step operational semantics of PCF associate the value 10 or the value 11 to the term

The first versions of the language Lisp produced the value 11 instead of 10 for this term. In this case, we say that the binding is dynamic.

### 2.2.6 Non-termination

It is easy to see that the relation ↪ is not total, that is, there are terms t for which there is no term u such that t ↪ u. For example, the term b = fix x x reduces to itself, and only to itself. It therefore never reduces to an irreducible term.

Exercise 2.9

Let b1 = (fix f (fun x -> (f x))) 0. Show all the terms obtained by reducing this term. Does the computation produce a result in this case?

Exercise 2.10

(Curry's fixed point operator) Let t be a term and u be the term (fun y -> (t (y y))) (fun y -> (t (y y))). Show that u reduces to t u.

Let t be a term and v be the term (fun y -> ((fun x -> t) (y y))) (fun y -> ((fun x -> t) (y y))). Show that v reduces to (v/x)t.

Thus, we can deduce that the symbol fix is superfluous in PCF. However, it will no longer be superfluous later, when we add types to PCF.

Write a term u without using the symbol fix and equivalent to b = fix x x. Describe the terms that can be obtained by reduction. Does the computation produce a result in this case?

### 2.2.7 Confluence

Is it possible for a closed term to produce several results? And, in general, can a term reduce to several different irreducible terms? The answer to these questions is negative. In fact, every PCF program is deterministic, but this is not a trivial property. Let us see why.

The term (3 + 4) + (5 + 6) has two subterms which are both redexes. We could then start by reducing 3 + 4 to 7, or 5 + 6 to 11. Indeed, the term (3 + 4) + (5 + 6) reduces to both 7 + (5 + 6) and (3 + 4) + 11. Fortunately, neither of these terms is irreducible, and if we continue the computation we reach, in both cases, the term 18.

To prove that any term can be reduced to at most one irreducible term, we need to prove that whenever two computations originating in the same term produce different terms, these terms can be further reduced to a common term.

This property is a consequence of another property of the relation ▷: confluence. A relation R is confluent if, whenever a R* b1 and a R* b2, there exists some c such that b1 R* c and b2 R* c.

It is not difficult to show that confluence implies that each term has at most one irreducible result. If the term t can be reduced to two irreducible terms u1 and u2, then we have t ▷* u1 and t ▷* u2. Since ▷ is confluent, there exists a term v such that u1 ▷* v and u2 ▷* v. Since u1 is irreducible, the only term v such that u1 ▷* v is u1 itself. Therefore, u1 = v and, similarly, u2 = v. We conclude that u1 = u2. In other words, t reduces to at most one irreducible term.

We will not give here the proof of confluence for the relation ▷. The idea is that when a term t contains two redexes r1 and r2, and t1 is obtained by reducing r1 and t2 is obtained by reducing r2, then we can find the residuals of r2 in t1 and reduce them. Similarly, we can reduce the residuals of r1 in t2, obtaining the same term. For example, by reducing 5 + 6 in 7 + (5 + 6) and reducing 3 + 4 in (3 + 4) + 11, we obtain the same term: 7 + 11.
## 2.3 Reduction Strategies

### 2.3.1 The Notion of a Strategy

Since in PCF each term has at most one result (by the unicity property mentioned above), it does not matter in which order we reduce the redexes of a term: if we reach an irreducible term, it will always be the same. However, it may be the case that one reduction sequence reaches an irreducible term whereas another one does not. For example, let C be the term fun x -> 0 and let b1 be the term (fix f (fun x -> (f x))) 0. The term b1 reduces to b2 = (fun x -> ((fix f (fun x -> (f x))) x)) 0, and then again to b1. The term C b1 contains several redexes, and it can be reduced to 0 and to C b2, which in turn contains several redexes and can be reduced to 0 and C b1 (amongst other terms). By always reducing the innermost redex, we can build an infinite reduction sequence C b1 ▷ C b2 ▷ C b1 ▷ ⋯, whereas reducing the outermost redex produces the result 0.

This example may seem an exception, because it contains a function C that does not use its argument; but note that the ifz construct is similar, and in the example of the factorial function, when computing the factorial of 3 for instance, we can observe the same behaviour: the term ifz 0 then 1 else 0 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (0 - 1)) has several redexes. Outermost reduction produces the result 1 (the other redexes disappear), whereas reducing the redex fix f fun n -> ifz n then 1 else n * (f (n - 1)) produces an infinite reduction sequence. In other words, the term fact 3 can be reduced to 6, but it can also generate reductions that go on forever.

Both C b1 and fact 3 produce a unique result, but not all reduction sequences reach a result.

Since the term C b1 has the value 0 according to the PCF semantics, an evaluator, that is, a program that takes a PCF term as input and returns its value, should produce the result 0 when computing C b1. Let us try to evaluate this term using some current compilers. In Caml, the corresponding program, for instance

let rec f x = f x in (fun x -> 0) (f 0)

does not terminate. In Java, we have the same problem with the analogous program.

Only a small number of compilers, using call by name or lazy evaluation, such as Haskell, Lazy-ML or Gaml, produce a terminating program for this term.

This is because the small-step semantics of PCF does not correspond to the semantics of Caml or Java. In fact, it is too general: when a term has several redexes, it does not specify which one should be reduced first, and it thereby gives a result to every program that can reach an irreducible term by some reduction sequence. An ingredient is missing in this semantic definition: the notion of a strategy, which specifies the order in which redexes must be reduced.

A strategy is a partial function that associates to each term in its domain one of its redex occurrences. Given a strategy s, we can define another semantics, replacing the relation ▷ by a new relation ▷s such that t ▷s u if s t is defined and u is obtained by reducing the redex s t in t. We then define the relation ▷s* as the reflexive-transitive closure of ▷s, and the relation ↪s as before.

Instead of defining a strategy, an alternative is to weaken the reduction rules, in particular the congruence rules, so that only some specific reductions can be performed.

### 2.3.2 Weak Reduction

Before defining outermost or innermost strategies for the term C b1, let us give another example to show that the operational semantics defined above is too liberal, and to motivate the definition of strategies or weaker reduction rules.
Let us apply the program fun x -> x + (4 + 5) to the constant 3. We obtain the term (fun x -> x + (4 + 5)) 3, which contains two redexes. We can thus reduce it to 3 + (4 + 5) or to (fun x -> x + 9) 3. The first reduction is part of the execution of the program, but not the second. Usually, when we execute a function before passing arguments to it, we say that we are optimising or specialising the program.

A weak reduction strategy never reduces a redex that is under a fun. Thus, weak reduction does not specialise programs, it just executes them. It follows that, with a weak strategy, all terms of the form fun x -> t are irreducible.

Alternatively, we can define weak reduction by weakening the reduction rules, more precisely, by discarding the congruence rule

$$\frac{t \mathrel{\triangleright} u}{\textsf{fun}\ x \to t \mathrel{\triangleright} \textsf{fun}\ x \to u}$$

Exercise 2.11

(Classification of weak irreducible closed terms) Show that, under weak reduction, a closed irreducible term must have one of the following forms:

  * fun x -> t, where t has at most x free,

  * n, where n is a number,

  * V1 V2, where V1 and V2 are irreducible closed terms and V1 is not a term of the form fun x -> t,

  * V1 ⊗ V2, where V1 and V2 are irreducible closed terms and are not both numbers,

  * ifz V1 then V2 else V3, where V1, V2 and V3 are irreducible closed terms and V1 is not a number.

What is the difference with Exercise 2.6?

Numbers and closed terms of the form fun x -> t are called values.

### 2.3.3 Call by Name

Let us analyse again the reductions available for the term C b1. We need to decide whether we should evaluate the arguments of the function C before they are passed to the function, or pass the arguments to the function unevaluated.

The call by name strategy always reduces the leftmost redex first, and the weak call by name strategy always reduces the leftmost redex that is not under a fun. Thus, the term C b1 reduces to 0. This strategy is interesting because of the following property, called standardisation: if a term can be reduced to an irreducible term, then the call by name strategy terminates. In other words, ↪n = ↪. Moreover, when we evaluate the term (fun x -> 0) (fact 10) using a call by name strategy, we do not need to compute the factorial of 10. However, if we evaluate the term (fun x -> x + x) (fact 10) using a call by name strategy, we will compute it twice, because this term reduces to (fact 10) + (fact 10). Most call by name evaluators use sharing to avoid this duplication of computation, and in this case we speak of lazy evaluation.

### 2.3.4 Call by Value

Call by value, in contrast, always evaluates the arguments of a function before passing them to the function. It is based on the following convention: we can only reduce a term of the form (fun x -> t) u if u is a value. Thus, when we evaluate the term (fun x -> x + x) (fact 10), we start by reducing the argument to obtain (fun x -> x + x) 3628800, and then we reduce the leftmost redex. This way, we compute the factorial of 10 only once.

All the strategies that evaluate arguments before passing them are in this class: for instance, the strategy that always reduces the leftmost redex amongst those that are authorised. Thus, call by value is not a unique strategy, but a family of strategies.

This convention can also be defined by weakening the β-reduction rule: the term (fun x -> t) u is a redex only if the term u is a value.

A weak strategy is said to implement call by value if it reduces a term of the form (fun x -> t) u only when u is a value and the redex is not under a fun.
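The contrast between the two disciplines can be observed in OCaml itself, which is a call by value language. A small sketch of ours: applying a constant function to a looping term does not terminate, but delaying the argument with Lazy.t restores the call by name behaviour on this example.

```ocaml
let rec loop (x : int) : int = loop x

let const_zero (_ : int Lazy.t) : int = 0

(* const_zero (lazy (loop 0)) evaluates to 0: the thunk is never forced.
   By contrast, (fun _ -> 0) (loop 0) runs forever, because the argument
   is evaluated before the call. *)
let () = assert (const_zero (lazy (loop 0)) = 0)
```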
### 2.3.5 A Bit of Laziness Is Needed

Even under a call by value strategy, the conditional construct ifz must be evaluated in a call by name fashion: in a term of the form ifz t then u else v, we should not evaluate all three arguments. Instead, we should first evaluate t and, depending on the result, evaluate either u or v.

It is easy to see that, if we evaluated the three arguments of an ifz, the evaluation of the term fact 3 would not terminate.

Exercise 2.12

Characterise the irreducible closed terms under weak call by name, then characterise the irreducible closed terms under weak call by value.

## 2.4 Big-Step Operational Semantics for PCF

Instead of defining a strategy, or weakening the reduction rules of the small-step operational semantics, we can control the order in which redexes are reduced by defining a big-step operational semantics.

The big-step operational semantics of a programming language provides an inductive definition of the relation ↪, without first defining ⟶ and ▷.

### 2.4.1 Call by Name

Let us start with the call by name semantics for PCF. Consider a term of the form t u that reduces under call by name to an irreducible term V. We start by reducing the redexes that occur in t until we obtain an irreducible term. If this term is of the form fun x -> t', then the whole term reduces to (fun x -> t') u, and the leftmost redex is the term itself. It reduces to (u/x)t', which in turn reduces to V. We can therefore say that the term t u reduces under call by name to the irreducible term V if t reduces to fun x -> t' and (u/x)t' reduces to V.

This can be expressed as a rule

$$\frac{t \hookrightarrow \textsf{fun}\ x \to t' \qquad (u/x)t' \hookrightarrow V}{t\ u \hookrightarrow V}$$

which will be part of the inductive definition of the relation ↪ (without first defining ⟶ and ▷).

Other rules state that the result of the computation of a term of the form fun is the term itself, that is, we are defining a weak reduction relation,

$$\frac{}{\textsf{fun}\ x \to t \hookrightarrow \textsf{fun}\ x \to t}$$

and that the result of the computation of a term of the form n is the term itself

$$\frac{}{n \hookrightarrow n}$$

Also, there is a rule giving the semantics of the arithmetic operators

$$\frac{t \hookrightarrow p \qquad u \hookrightarrow q}{t \otimes u \hookrightarrow n}\ (\text{if } n = p \otimes q)$$

two rules defining the semantics of the ifz construct

$$\frac{t \hookrightarrow 0 \qquad u \hookrightarrow V}{\textsf{ifz}\ t\ \textsf{then}\ u\ \textsf{else}\ v \hookrightarrow V} \qquad \frac{t \hookrightarrow p \qquad v \hookrightarrow V}{\textsf{ifz}\ t\ \textsf{then}\ u\ \textsf{else}\ v \hookrightarrow V}\ (\text{if } p \text{ is a number different from } 0)$$

a rule defining the semantics of the fixed point operator

$$\frac{((\textsf{fix}\ x\ t)/x)t \hookrightarrow V}{\textsf{fix}\ x\ t \hookrightarrow V}$$

and finally a rule defining the semantics of a let

$$\frac{(t/x)u \hookrightarrow V}{\textsf{let}\ x = t\ \textsf{in}\ u \hookrightarrow V}$$

We can prove, by structural induction on the evaluation relation, that the result of the computation of a term is always a value, that is, a number or a closed term of the form fun. There are no stuck terms. The computation of a term such as ((fun x -> x) 1) 2, which gave rise to the stuck term 1 2 with the small-step semantics, simply produces no result with the big-step semantics, since none of the rules applies to this term. Indeed, there is no rule in the big-step semantics that explains how to evaluate an application whose left part evaluates to a number.

### 2.4.2 Call by Value

The rules defining the call by value semantics are similar, except for the application rule, where we compute the value of the argument before passing it to the function,

$$\frac{t \hookrightarrow \textsf{fun}\ x \to t' \qquad u \hookrightarrow W \qquad (W/x)t' \hookrightarrow V}{t\ u \hookrightarrow V}$$

and the let rule

$$\frac{t \hookrightarrow W \qquad (W/x)u \hookrightarrow V}{\textsf{let}\ x = t\ \textsf{in}\ u \hookrightarrow V}$$

Summarising, we have the following rules

$$\frac{}{\textsf{fun}\ x \to t \hookrightarrow \textsf{fun}\ x \to t} \qquad \frac{}{n \hookrightarrow n} \qquad \frac{t \hookrightarrow \textsf{fun}\ x \to t' \quad u \hookrightarrow W \quad (W/x)t' \hookrightarrow V}{t\ u \hookrightarrow V}$$

$$\frac{t \hookrightarrow p \quad u \hookrightarrow q}{t \otimes u \hookrightarrow n}\ (\text{if } n = p \otimes q) \qquad \frac{((\textsf{fix}\ x\ t)/x)t \hookrightarrow V}{\textsf{fix}\ x\ t \hookrightarrow V} \qquad \frac{t \hookrightarrow W \quad (W/x)u \hookrightarrow V}{\textsf{let}\ x = t\ \textsf{in}\ u \hookrightarrow V}$$

Notice that, even under call by value, we keep the call by name rules for the ifz

$$\frac{t \hookrightarrow 0 \qquad u \hookrightarrow V}{\textsf{ifz}\ t\ \textsf{then}\ u\ \textsf{else}\ v \hookrightarrow V} \qquad \frac{t \hookrightarrow p \qquad v \hookrightarrow V}{\textsf{ifz}\ t\ \textsf{then}\ u\ \textsf{else}\ v \hookrightarrow V}\ (\text{if } p \text{ is a number different from } 0)$$

that is, we do not evaluate the second and third arguments of an ifz until they are needed.
Note also that, even under call by value, we keep the rule

$$\frac{((\textsf{fix}\ x\ t)/x)t \hookrightarrow V}{\textsf{fix}\ x\ t \hookrightarrow V}$$

We must resist the temptation to evaluate the term fix x t to a value W before substituting it in t, because a rule of the form

$$\frac{\textsf{fix}\ x\ t \hookrightarrow W \qquad (W/x)t \hookrightarrow V}{\textsf{fix}\ x\ t \hookrightarrow V}$$

would require, in order to evaluate fix x t, to start by evaluating fix x t itself, which would create a loop, and the term fact 3 would never produce a value—its evaluation would give rise to an infinite computation.

Note finally that other rule combinations are possible. For example, some variants of the call by name semantics use call by value in the let rule.

Exercise 2.13

Which values do we obtain under the big-step semantics for the terms

and

Compare your answer with that of Exercise 2.7.

Exercise 2.14

Does the big-step semantics associate the value 10 or the value 11 to the term

Compare your answer with that of Exercise 2.8.

## 2.5 Evaluation of PCF Programs

A PCF evaluator is a program that takes a closed PCF term as input and produces its value as output. When read in a bottom-up fashion, the rules of the big-step semantics can be seen as the kernel of such an evaluator: to evaluate an application t u, one starts by evaluating u and t, ...; this is easy to program in a language like Caml.
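As a sketch of what such a program can look like (ours, reusing the `term` type, `apply_op` and the naive `subst` from the earlier sketches), here is a call by value evaluator obtained by reading the rules bottom-up:

```ocaml
exception Error of string

let rec eval t =
  match t with
  | Fun _ | Nat _ -> t                        (* values evaluate to themselves *)
  | App (t1, t2) ->
      let w = eval t2 in                      (* call by value: argument first *)
      (match eval t1 with
       | Fun (x, t') -> eval (subst x w t')
       | _ -> raise (Error "application of a non-function"))
  | Op (o, t1, t2) ->
      (match eval t1, eval t2 with
       | Nat p, Nat q -> Nat (apply_op o p q)
       | _ -> raise (Error "arithmetic on non-numbers"))
  | Ifz (t1, t2, t3) ->
      (match eval t1 with
       | Nat 0 -> eval t2                     (* only one branch is evaluated *)
       | Nat _ -> eval t3
       | _ -> raise (Error "ifz on a non-number"))
  | Fix (x, t1) -> eval (subst x t t1)        (* substitute the whole fix term *)
  | Let (x, t1, t2) -> eval (subst x (eval t1) t2)
  | Var _ -> raise (Error "free variable")

(* eval (App (fact, Nat 3)) returns Nat 6. *)
```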
PCF's denotational semantics is more difficult to define. This may seem a paradox: since PCF is a functional language, it should be easy to interpret its programs as functions. However, in PCF, any object can be applied to any object, and nothing stops us writing, for instance, the term fun x -> (x x). In contrast with mathematical functions, PCF functions do not have a domain. For this reason, we will give a denotational semantics for PCF only after we add types, in Chap. 5.

# 3. From Evaluation to Interpretation

Abstract

This chapter introduces an essential notion in the implementation of programming languages: that of environment. It then uses this notion to transform the evaluator built in the previous chapter into an interpreter. Finally, it discusses several optimisations: the use of de Bruijn indices and recursive closures.

## 3.1 Call by Name

Using the rules of the big-step operational semantics, we can build an evaluator for PCF where a term of the form (fun x -> t) u is evaluated by first substituting the variable x by the term u everywhere in the body t of the function. For example, to evaluate the term (fun x -> (x * x) + x) 4, we substitute x by 4 in the term (x * x) + x and then evaluate the term (4 * 4) + 4. Substitutions are costly operations; to increase the efficiency of the evaluator we could instead keep the association x = 4 in a separate structure called an environment, and evaluate the term (x * x) + x in that environment. A program that evaluates terms in this way is called an interpreter.

An environment is a function from variables to terms, with a finite domain. It is in essence the same thing as a substitution, but different notations are used. We write an environment as a list of pairs x₁ = t₁, ..., xₙ = tₙ, where the same variable x may occur several times; in that case the rightmost pair has priority. Thus, in the environment x = 3, y = 4, x = 5, z = 8 we only consider x = 5, not x = 3, which is said to be hidden by the pair x = 5. Finally, if e is an environment and x = t a pair, we denote by e, x = t the list obtained by extending e with the pair x = t.

During the evaluation of a term we might reach a free variable x. In this case, we will look for the term associated to this variable in the environment. It can be shown that, if we start with a closed term, then each time we reach a variable we will find an associated term in the environment.

In fact, the situation is slightly more complicated, because in addition to the term u associated to the variable in the environment, we will also need to find the environment associated to u. A pair of a term and an environment is called a thunk. We will write it 〈u, e〉.

Similarly, when we interpret a term of the form fun x -> t in an environment e, the result cannot simply be the term fun x -> t, because it might contain free variables, and when interpreting the term t we will need the thunks associated to these variables in e. We therefore introduce a new notion of value, called a closure, consisting of a term, which must be of the form fun x -> t, and an environment e. We will write such values 〈x, t, e〉. Values are no longer a subset of terms, and we will have to define a language of values independently from the language of terms.

As a consequence, we will need to rewrite the rules for the call by name big-step operational semantics of PCF, in order to consider a relation of the form e ⊢ t ↪ V, read "t is interpreted as V in e", where e is an environment, t a term and V a value. When the environment e is empty, this relation is written ⊢ t ↪ V. The rules that extend the environment are the application rule, which adds a pair consisting of a variable x and a thunk 〈u, e〉; the let rule, which adds a pair consisting of the variable x and the thunk 〈t, e〉; and the fix rule, which adds a pair consisting of the variable x and the thunk 〈fix x t, e〉. In the latter rule, the term t is duplicated: one of the copies is interpreted, and the other is kept in the environment for any recursive calls arising from the interpretation of the first one.

Exercise 3.1

Write a call by name interpreter for PCF.

Exercise 3.2

Which values will be obtained for the following terms according to the interpretation rules given above for PCF?

and

Compare with Exercises 2.7 and 2.13.

Exercise 3.3

Will the interpretation rules for PCF compute the value 10 or the value 11 for the term

Compare with Exercises 2.8 and 2.14.
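A sketch of such an interpreter is given below (cf. Exercise 3.1). The constructor names are assumptions, not the book's code; environments are association lists where the pair added last is found first, which implements the priority of the rightmost pair.

```ocaml
(* A sketch of the call by name interpreter of Sect. 3.1: environments map
   variables to thunks <u, e>, functions are interpreted as closures <x, t, e>. *)
type term =
  | Var of string | Fun of string * term | App of term * term
  | Num of int | Op of char * term * term
  | Ifz of term * term * term | Fix of string * term
  | Let of string * term * term

type value =
  | Vnum of int
  | Vclos of string * term * env            (* closure <x, t, e> *)
and thunk = Thunk of term * env             (* thunk <u, e> *)
and env = (string * thunk) list

let rec interp (e : env) : term -> value = function
  | Num n -> Vnum n
  | Fun (x, t) -> Vclos (x, t, e)
  | Var x ->                                (* interpret the thunk found in e *)
      let Thunk (u, e') = List.assoc x e in
      interp e' u
  | App (t, u) ->                           (* u is frozen in a thunk *)
      (match interp e t with
       | Vclos (x, t', e') -> interp ((x, Thunk (u, e)) :: e') t'
       | _ -> failwith "application of a non-function")
  | Op (o, t, u) ->
      (match interp e t, interp e u with
       | Vnum m, Vnum n ->
           Vnum (match o with
                 | '+' -> m + n | '-' -> m - n | '*' -> m * n | _ -> m / n)
       | _ -> failwith "arithmetic on non-numbers")
  | Ifz (t, u, v) ->
      (match interp e t with
       | Vnum 0 -> interp e u
       | Vnum _ -> interp e v
       | _ -> failwith "ifz applied to a non-number")
  | Fix (x, t) as f -> interp ((x, Thunk (f, e)) :: e) t   (* t duplicated *)
  | Let (x, t, u) -> interp ((x, Thunk (t, e)) :: e) u
```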
## 3.2 Call by Value

The situation is simpler with a call by value semantics. Indeed, when interpreting a term of the form (fun x -> t) u, we start by interpreting the term u. The result is a value, that is, a number or a closure, and it suffices to bind the variable x to this value in the environment. Similarly, to interpret a term of the form let x = t in u, we start by interpreting the term t. The result is a value and it suffices to bind the variable x to this value in the environment. Thus, environments associate values to variables, rather than thunks, that is, terms whose interpretation is suspended; we no longer need the notion of a thunk.

However, the rule for fix, unlike the application rule or the let rule, requires a variable to be substituted by a term of the form fix x t, which is not a value, and evaluating such a term before substituting it or storing it in the environment would give rise to infinite computations (as mentioned above). Environments will therefore have to contain extended values, which are either values or thunks consisting of a term of the form fix x t and an environment e. When we access such an extended value, we interpret it if it is a thunk. This leads us to the following rules

$$\frac{}{e \vdash n \hookrightarrow n} \qquad \frac{}{e \vdash \mathtt{fun}\ x \rightarrow t \hookrightarrow \langle x, t, e\rangle}$$

$$\frac{}{e \vdash x \hookrightarrow V}\ \text{if } e \text{ contains } x = V \qquad \frac{e' \vdash \mathtt{fix}\ y\ t \hookrightarrow V}{e \vdash x \hookrightarrow V}\ \text{if } e \text{ contains } x = \langle\mathtt{fix}\ y\ t, e'\rangle$$

$$\frac{e \vdash t \hookrightarrow \langle x, t', e'\rangle \qquad e \vdash u \hookrightarrow W \qquad (e', x = W) \vdash t' \hookrightarrow V}{e \vdash t\ u \hookrightarrow V}$$

$$\frac{e \vdash t \hookrightarrow W \qquad (e, x = W) \vdash u \hookrightarrow V}{e \vdash \mathtt{let}\ x = t\ \mathtt{in}\ u \hookrightarrow V} \qquad \frac{(e, x = \langle\mathtt{fix}\ x\ t, e\rangle) \vdash t \hookrightarrow V}{e \vdash \mathtt{fix}\ x\ t \hookrightarrow V}$$

$$\frac{e \vdash t \hookrightarrow p \qquad e \vdash u \hookrightarrow q}{e \vdash t \otimes u \hookrightarrow p \otimes q} \qquad \frac{e \vdash t \hookrightarrow 0 \qquad e \vdash u \hookrightarrow V}{e \vdash \mathtt{ifz}\ t\ \mathtt{then}\ u\ \mathtt{else}\ v \hookrightarrow V} \qquad \frac{e \vdash t \hookrightarrow n \qquad e \vdash v \hookrightarrow V}{e \vdash \mathtt{ifz}\ t\ \mathtt{then}\ u\ \mathtt{else}\ v \hookrightarrow V}\ (n \neq 0)$$

Exercise 3.4

When we compute the value of the term (fact 3), where the function fact is defined by fix f fun n -> ifz n then 1 else n * (f (n - 1)), we start by calling recursively the function fact with argument 2, which creates an association between the variable n and the value 2. When we come back from the recursive call to compute the value of n and perform the multiplication, is the variable n associated to the value 2 or the value 3? Why?

Exercise 3.5

Write a call by value interpreter for PCF.
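Here is a sketch of this call by value interpreter (cf. Exercise 3.5), with the same hedges as before: the constructor names are assumptions, and extended values are either genuine values or fix-thunks, interpreted when accessed.

```ocaml
(* A sketch of the call by value interpreter of Sect. 3.2: environments bind
   variables to extended values. Names are assumptions, not the book's code. *)
type term =
  | Var of string | Fun of string * term | App of term * term
  | Num of int | Op of char * term * term
  | Ifz of term * term * term | Fix of string * term
  | Let of string * term * term

type value = Vnum of int | Vclos of string * term * env
and xvalue = Val of value | Fixthunk of string * term * env   (* <fix x t, e> *)
and env = (string * xvalue) list

let rec interp (e : env) : term -> value = function
  | Num n -> Vnum n
  | Fun (x, t) -> Vclos (x, t, e)
  | Var x ->
      (match List.assoc x e with
       | Val v -> v
       | Fixthunk (y, t, e') -> interp e' (Fix (y, t)))  (* interpreted on access *)
  | App (t, u) ->
      let w = interp e u in                  (* the argument is evaluated first *)
      (match interp e t with
       | Vclos (x, t', e') -> interp ((x, Val w) :: e') t'
       | _ -> failwith "application of a non-function")
  | Op (o, t, u) ->
      (match interp e t, interp e u with
       | Vnum m, Vnum n ->
           Vnum (match o with
                 | '+' -> m + n | '-' -> m - n | '*' -> m * n | _ -> m / n)
       | _ -> failwith "arithmetic on non-numbers")
  | Ifz (t, u, v) ->
      (match interp e t with
       | Vnum 0 -> interp e u
       | Vnum _ -> interp e v
       | _ -> failwith "ifz applied to a non-number")
  | Fix (x, t) -> interp ((x, Fixthunk (x, t, e)) :: e) t
  | Let (x, t, u) -> interp ((x, Val (interp e t)) :: e) u
```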
## 3.3 An Optimisation: de Bruijn Indices

In the big-step operational semantic rules, environments are lists of pairs consisting of a variable and an extended value. We could replace this structure by a pair of lists of the same length, one containing the variables and the other the values. Thus, the list x = 12, y = 14, z = 16, w = 18 could be replaced by the list of variables x, y, z, w and the list of extended values 12, 14, 16, 18. To find the extended value associated to a variable, we just need to search through the first list to find the variable's position, and then find in the other list the element at the same position. The position of a variable in the first list is a number, called the de Bruijn index of the variable in the environment. In general, we associate the number 0 to the last element of the list, the rightmost one; 1 to the previous one; ...; and n - 1 to the first element of the list, the leftmost one.

The list of variables needed for the interpretation of each subterm can be computed before starting the interpretation. In fact, we can associate a de Bruijn index to each occurrence of a variable before interpreting the term. For example, if we interpret the term fun x -> fun y -> (x + (fun z -> fun w -> (x + y + z + w)) (2 * 8) (14 + 4)) (5 + 7) (20 - 6), the occurrence of the variable y will necessarily be interpreted in an environment of the form x = ., y = ., z = ., w = ., that is, to find the value associated to y we need to find the value with index 2. We can then associate this index to the variable from the start.

To compute the de Bruijn indices of the variables, we simply traverse the term maintaining a variable environment, that is, a list of variables, and associate the index p to the variable x in the environment e if p is the position of the variable x in the environment e, starting from the end.

 * |x|e = xₚ where p is the position of x in the environment e

 * |t u|e = |t|e |u|e

 * |fun x -> t|e = fun x -> |t|e,x

 * |n|e = n

 * |t + u|e = |t|e + |u|e

 * |t - u|e = |t|e - |u|e

 * |t * u|e = |t|e * |u|e

 * |t / u|e = |t|e / |u|e

 * |ifz t then u else v|e = ifz |t|e then |u|e else |v|e

 * |fix x t|e = fix x |t|e,x

 * |let x = t in u|e = let x = |t|e in |u|e,x

For example, the term above will be written fun x -> fun y -> (x₁ + (fun z -> fun w -> (x₃ + y₂ + z₁ + w₀)) (2 * 8) (14 + 4)) (5 + 7) (20 - 6).

It is easy to show that an occurrence of a subterm translated in the variable environment x₁, ..., xₙ will always be interpreted in an environment of the form x₁ = ., ..., xₙ = . For this reason, to find the value of the variable associated to the index p we just look up the pth element of the environment.

This suggests an alternative way to interpret a term: we start by computing the de Bruijn index of each occurrence of a variable; once the indices are known, we no longer need to keep the list of variables in the environment. The environment is simply a list of extended values. Similarly, we can dispose of variable names in closures and in thunks. Indeed, variable names are now useless, and we could for instance rewrite the term above as follows: fun _ -> fun _ -> (_₁ + (fun _ -> fun _ -> (_₃ + _₂ + _₁ + _₀)) (2 * 8) (14 + 4)) (5 + 7) (20 - 6).

The big-step operational semantic rules can then be reformulated accordingly: environments become lists of extended values, and the rule for a variable with index n fetches the nth element of the environment, starting from the end.

Exercise 3.6

Write a program to replace each variable by its de Bruijn index. Write an interpreter for this language.

Exercise 3.7

Write the rules of the call by name big-step operational semantics using de Bruijn indices.

We will highlight the advantages of this notation, which eliminates variable names, when we study compilation in the next chapter.

In the meantime, notice that two terms have the same de Bruijn translation if and only if they are α-equivalent. This gives us a new definition of alphabetical equivalence. Replacing variables by indices that indicate the position where they are bound can be seen as a radical point of view that highlights the fact that bound variables are "dummies".
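The translation itself is a straightforward traversal; the following sketch (cf. Exercise 3.6) makes the variable environment explicit as a list of names, most recent first, so that the position counted from the head is exactly the position counted from the end of the book's left-to-right lists. The constructor names are assumptions.

```ocaml
(* A sketch of the de Bruijn translation of Sect. 3.3. *)
type term =                                   (* named syntax, as before *)
  | Var of string | Fun of string * term | App of term * term
  | Num of int | Op of char * term * term
  | Ifz of term * term * term | Fix of string * term
  | Let of string * term * term

type dterm =                                  (* nameless syntax *)
  | DVar of int                               (* a de Bruijn index *)
  | DFun of dterm | DApp of dterm * dterm
  | DNum of int | DOp of char * dterm * dterm
  | DIfz of dterm * dterm * dterm | DFix of dterm
  | DLet of dterm * dterm

(* position of x in the variable environment, starting from the end *)
let rec index x = function
  | [] -> failwith "free variable"
  | y :: e -> if x = y then 0 else 1 + index x e

let rec debruijn (e : string list) : term -> dterm = function
  | Var x -> DVar (index x e)
  | Fun (x, t) -> DFun (debruijn (x :: e) t)
  | App (t, u) -> DApp (debruijn e t, debruijn e u)
  | Num n -> DNum n
  | Op (o, t, u) -> DOp (o, debruijn e t, debruijn e u)
  | Ifz (t, u, v) -> DIfz (debruijn e t, debruijn e u, debruijn e v)
  | Fix (x, t) -> DFix (debruijn (x :: e) t)
  | Let (x, t, u) -> DLet (debruijn e t, debruijn (x :: e) u)
```

For example, debruijn [] (Fun ("x", Fun ("y", Var "x"))) yields DFun (DFun (DVar 1)).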
## 3.4 Construction of Functions via Fixed Points

In most programming languages, only functions can be recursively defined. The fix construct then applies only to terms of the form fun, and we can replace the symbol fix by a symbol fixfun, where fixfun f x -> t binds two variables in its argument. The call by value big-step semantic rule for fixfun can be derived from the rules given above for fix and fun

$$\frac{}{e \vdash \mathtt{fixfun}\ f\ x \rightarrow t \hookrightarrow \langle x, t, (e, f = \langle\mathtt{fixfun}\ f\ x \rightarrow t, e\rangle)\rangle}$$

In this case, we can define simpler variations of the rules for the call by value interpreter.

### 3.4.1 First Variation: Recursive Closures

We will distinguish closures of the form 〈x, t, (e, f = 〈fixfun f x -> t, e〉)〉, which we will write 〈f, x, t, e〉 and call recursive closures.

The rule that we have given to interpret the construction fixfun f x -> t can then be reformulated as follows

$$\frac{}{e \vdash \mathtt{fixfun}\ f\ x \rightarrow t \hookrightarrow \langle f, x, t, e\rangle}$$

When we interpret an application t u under a call by value semantics, if the term t is interpreted as the recursive closure 〈f, x, t', e'〉, that is, 〈x, t', (e', f = 〈fixfun f x -> t', e'〉)〉, and the term u as the value W, then the application rule requires us to interpret the term t' in the environment e', f = 〈fixfun f x -> t', e'〉, x = W.

We can anticipate the interpretation of the thunk 〈fixfun f x -> t', e'〉 that appears in this environment: by the rule above, its value is the recursive closure 〈f, x, t', e'〉 itself. In the case of recursive closures, the application rule can then be specialised as follows

$$\frac{e \vdash t \hookrightarrow \langle f, x, t', e'\rangle \qquad e \vdash u \hookrightarrow W \qquad (e', f = \langle f, x, t', e'\rangle, x = W) \vdash t' \hookrightarrow V}{e \vdash t\ u \hookrightarrow V}$$

Thunks are no longer used in this rule; thus, under call by value, by introducing recursive closures we eliminate thunks and we no longer need the rule to interpret them.

A final simplification: standard closures 〈x, t, e〉 can be replaced by recursive closures 〈f, x, t, e〉 where f is an arbitrary variable that does not occur in t. We can then discard the application rule for the case of standard closures.

Finally, we obtain the rules

$$\frac{}{e \vdash \mathtt{fun}\ x \rightarrow t \hookrightarrow \langle f, x, t, e\rangle} \qquad \frac{}{e \vdash \mathtt{fixfun}\ f\ x \rightarrow t \hookrightarrow \langle f, x, t, e\rangle}$$

$$\frac{e \vdash t \hookrightarrow \langle f, x, t', e'\rangle \qquad e \vdash u \hookrightarrow W \qquad (e', f = \langle f, x, t', e'\rangle, x = W) \vdash t' \hookrightarrow V}{e \vdash t\ u \hookrightarrow V}$$

where f is an arbitrary variable, different from x, that does not occur in t or e.

Exercise 3.8

Write a call by value interpreter for PCF, using recursive closures.

Exercise 3.9

How will the rules of the big-step operational semantics with recursive closures change if variables are replaced by de Bruijn indices—see Sect. 3.3?
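The following sketch (cf. Exercise 3.8) implements these rules. It assumes source variables are never named "_", so that "_" can play the role of the arbitrary f; all names are assumptions, not the book's code.

```ocaml
(* A sketch of the call by value interpreter with recursive closures
   (Sect. 3.4.1): every function value is a closure <f, x, t, e>, no thunks. *)
type term =
  | Var of string | Fun of string * term | App of term * term
  | Num of int | Op of char * term * term
  | Ifz of term * term * term
  | Fixfun of string * string * term             (* fixfun f x -> t *)
  | Let of string * term * term

type value = Vnum of int | Vclos of string * string * term * env
and env = (string * value) list

let rec interp (e : env) : term -> value = function
  | Num n -> Vnum n
  | Var x -> List.assoc x e                      (* environments contain values *)
  | Fun (x, t) -> Vclos ("_", x, t, e)           (* f arbitrary, not in t *)
  | Fixfun (f, x, t) -> Vclos (f, x, t, e)       (* a recursive closure *)
  | App (t, u) ->
      let w = interp e u in
      (match interp e t with
       | Vclos (f, x, t', e') as c ->
           (* bind f to the closure itself, then x to the argument *)
           interp ((x, w) :: (f, c) :: e') t'
       | _ -> failwith "application of a non-function")
  | Op (o, t, u) ->
      (match interp e t, interp e u with
       | Vnum m, Vnum n ->
           Vnum (match o with
                 | '+' -> m + n | '-' -> m - n | '*' -> m * n | _ -> m / n)
       | _ -> failwith "arithmetic on non-numbers")
  | Ifz (t, u, v) ->
      (match interp e t with
       | Vnum 0 -> interp e u
       | Vnum _ -> interp e v
       | _ -> failwith "ifz applied to a non-number")
  | Let (x, t, u) -> interp ((x, interp e t) :: e) u
```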
### 3.4.2 Second Variation: Rational Values

In the rule

$$\frac{}{e \vdash \mathtt{fixfun}\ f\ x \rightarrow t \hookrightarrow \langle x, t, (e, f = \langle\mathtt{fixfun}\ f\ x \rightarrow t, e\rangle)\rangle}$$

we can anticipate the interpretation of the thunk 〈fixfun f x -> t, e〉. Of course, the value of this thunk is the term 〈x, t, (e, f = 〈fixfun f x -> t, e〉)〉 where the thunk occurs again. We could decide to interpret it again, and again....

As previously said, this kind of interpretation of a term of the form fix f t before substituting it or storing it in the environment leads to an infinite computation. Here, it leads to the construction of the infinite value 〈x, t, (e, f = 〈x, t, (e, f = 〈x, t, (e, f = 〈x, t, (e, f = ...)〉)〉)〉)〉, which is an infinite term, but a rational one. There are well-known techniques for the representation of rational trees in the computer's memory; here, we could represent this value by a finite cyclic structure.

Using the notation FIX X 〈x, t, (e, f = X)〉 for this rational value, we can replace the rule above by

$$\frac{}{e \vdash \mathtt{fixfun}\ f\ x \rightarrow t \hookrightarrow \mathrm{FIX}\ X\ \langle x, t, (e, f = X)\rangle}$$

and again thunks will no longer be needed.

Note that it is sometimes better to represent such a rational value in an equivalent way, with the cycle going through the environment rather than the value; in this case we would define rational environments instead.

Exercise 3.10

Write a call by value interpreter for PCF using rational values.

Exercise 3.11

How do these big-step operational semantic rules change if we replace variables by their de Bruijn indices—see Sect. 3.3?

Exercise 3.12

Could the technique of rational values be used to design an interpreter for the full PCF, that is, where we could define via fixed points not only functions but also arbitrary objects? Hint: what is the rational representation of the value of the term fix x x?

To summarise, in this section we have seen that if a variable x has an occurrence in the term t, the reduction rule fix x t ⟶ (fix x t/x)t can be applied an infinite number of times starting from the term fix x t, because the term (fix x t/x)t again contains the term fix x t as a subterm. This corresponds to the replacement, in a recursive definition f = G(f), of f by G(f) an infinite number of times, which leads to the infinite program f = G(G(G(...))). In a sense, this explains the intuition that recursive programs are infinite programs. For example, the term fact could be written fun x -> ifz x then 1 else x * (ifz x - 1 then 1 else (x - 1) * (ifz x - 2 then 1 else (x - 2) * ⋅⋅⋅)). This replacement must only be done on demand: in a lazy way.

We have seen that there are several ways to express this behaviour in the semantics of PCF—and finally in the code of a PCF interpreter: substitute x by fix x t and freeze this redex if it is under a fun or an ifz; store this redex as a thunk or a recursive closure and "unfreeze" the thunk on demand; or represent the term f = G(G(G(...))) as a rational tree and traverse it on demand. A final method could be to use the encoding of fix given in Exercise 2.10, and only reduce this term (which requires the duplication of a subterm) when needed.

Exercise 3.13

(An extension of PCF with pairs) We extend PCF with the following constructions: t,u represents the pair where the first component is t and the second is u; fst t and snd t are, respectively, the first and second component of the pair t. Write small-step and big-step operational semantic rules for this extension of PCF. Write an interpreter for this extension of PCF.

Exercise 3.14

(An extension of PCF with lists) We extend PCF with the following constructions: nil denotes the empty list; cons n l denotes a list where the first element is the natural number n and l is the rest of the list; ifnil t then u else v checks whether a list is empty or not; hd l returns the first element of the list l; and tl l returns the list l without its first element. Write small-step and big-step operational semantic rules for this extension of PCF. Write an interpreter for this extension of PCF. Write a program to implement a sorting algorithm over these lists.

Exercise 3.15

(An extension of PCF with trees) We extend PCF with the following constructions: L n denotes a tree that consists of one leaf labelled by the natural number n; N t u denotes a tree with two subtrees t and u; ifleaf t then u else v checks whether its first argument is a tree of the form L n or N t u; content t denotes the content of the tree t if it is a leaf; left t and right t denote, respectively, the left and right subtrees of t if it is not a leaf. Write small-step and big-step operational semantic rules for this extension of PCF. Write an interpreter for this extension of PCF.

# 4. Compilation

Abstract

In this chapter the interpreter is transformed into a compiler. The emphasis is put on the construction of an abstract machine, whose language is the target language of the compilation. The chapter ends with the bootstrapping of this compiler.
When a computer comes out of the factory, it is not capable of interpreting a PCF term, nor even a Caml or Java program. For a computer to run a PCF, Caml or Java program, we need an interpreter for the language, written in the machine language of the computer. In the previous chapter we described the principles underlying PCF interpretation, and we wrote an interpreter in a high-level language, such as Caml. We could continue this line of thought, and try to write an interpreter in machine language....

One possibility is to leave the realm of interpretation and move towards compilation. An interpreter takes a PCF term as input and returns its value. A compiler, instead, is a program that takes a PCF term as input and returns a program, in machine language, whose execution returns the value of the term. In other words, a PCF compiler is a program that translates PCF terms into machine language, that is, into a language that can be directly executed by the machine.

One of the advantages of using a compiler is that the program is translated once and for all, when it is compiled, rather than each time it is executed. Once compiled, execution is usually faster. Another advantage comes from the fact that a compiler can compile itself (this is called bootstrapping, see Exercise 4.4), whereas an interpreter cannot interpret itself.

The implementation of a compiler should be guided by the rules of the operational semantics of the language (as was the case for the interpreter). To simplify, we will focus on a fragment of PCF where only functions can be defined recursively, and we will use the big-step semantics with recursive closures—see Sect. 3.4.

The machine language that we will use is not a commercial one: it is the machine language of an imaginary computer. This kind of machine is called an abstract machine. We will write a program to simulate the behaviour of this machine. The use of an abstract machine is not only motivated by pedagogical reasons; there are practical reasons too: the main compilers for Caml and Java, for instance, use abstract machines. Compiled programs are executed by a program that simulates the workings of the abstract machine, or are further translated (in a second compilation phase) into the machine language of a concrete machine.

## 4.1 An Interpreter Written in a Language Without Functions

In Chap. 2, we gave a big-step operational semantics for PCF and used it to derive an interpreter for this language. For example, the rule

$$\frac{e \vdash u \hookrightarrow n \qquad e \vdash t \hookrightarrow m}{e \vdash t + u \hookrightarrow m + n}$$

results in the following piece of Caml code for the PCF interpreter.
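The original listing is not preserved in this copy; the following fragment is a plausible reconstruction, consistent with the discussion that follows (the names interp, env and w are taken from that discussion; the constructors and the rest are assumptions).

```ocaml
(* A plausible reconstruction of the missing listing, not the book's exact
   code: the branch of the interpreter dealing with an addition. *)
type term = Num of int | Sum of term * term   (* ... other constructs elided *)

let rec interp env t =
  match t with
  | Num n -> Num n
  | Sum (t, u) ->
      let w = interp env u in        (* the value w of u is computed first *)
      let v = interp env t in        (* then the value of t *)
      (match v, w with
       | Num m, Num n -> Num (m + n)
       | _ -> failwith "addition of non-numbers")
```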
Since Caml allows us to write local definitions, we can compute the value of the term interp env t and recover the value w after this computation, even if the variable w is bound to other values during the computation.

If we tried to write the interpreter in machine language, or in any language that does not permit local definitions, we would need to devise a mechanism to memorise the value w, for example a stack: we would interpret the term u, put the result on the stack, then interpret the term t, and finally pop the top of the stack and add it to the result of the interpretation.

In this way, to interpret the term ((((1 + 2) + 3) + 4) + 5) + 6 we need to put the number 6, then the number 5, ..., then the number 2 on the stack, then pop the number on the top of the stack (that is, 2) and add it to the number 1, then pop the number 3 and add it to the previous result, then ... pop the number 6 and add it to the previous result, to obtain the final result: 21.

## 4.2 From Interpretation to Compilation

This interpreter can be decomposed into two programs. The first one can be seen as an object with two fields: a field that contains a natural number, called the accumulator, and a field that contains a list of natural numbers, called the stack. We have the following operations

 * Ldi n: puts the number n in the accumulator,

 * Push: puts the contents of the accumulator on the top of the stack,

 * Add: adds the top of the stack and the accumulator, leaves the result in the accumulator, and pops the top of the stack.

This object is our abstract machine, and the three instructions above constitute its machine language. The fields are called registers.

The second program takes a PCF term as input and, depending on the term, produces machine instructions, which are executed by the machine one by one. If t is a PCF term, we denote by |t| the sequence of abstract machine instructions generated by this program during the interpretation of the term. For instance, for the term ((((1 + 2) + 3) + 4) + 5) + 6, the machine instructions generated are: Ldi 6, Push, Ldi 5, Push, Ldi 4, Push, Ldi 3, Push, Ldi 2, Push, Ldi 1, Add, Add, Add, Add, Add.

Exercise 4.1

Which instructions will be executed by the abstract machine when interpreting the term 1 + (2 + (3 + (4 + (5 + 6))))?

This way of sharing the work resembles the behaviour of a car driver and a passenger in an unfamiliar city: the passenger reads the map and gives instructions to the driver, who follows the instructions without really knowing where the car is.

If the passenger could generate the instructions just by looking at the map, it would be possible to record the list of instructions on a compact disk, which the driver could then listen to in the car. In this scenario, the passenger does not need to be in the car to guide the driver. Similarly, the interpreter could leave the sequence |t| of instructions in a file, and the file could then be executed later by the abstract machine. We have just transformed the interpreter into a compiler.

In general, we consider that the abstract machine contains, in addition to the accumulator and the stack, a third register: the code, the list of instructions that have to be executed. At the beginning, the abstract machine looks for an instruction in the code register, executes it, then looks for another instruction... until the code register becomes empty. As we will see, the fact that the execution of an instruction may add new instructions to the code register will allow us to write loops and recursive definitions.
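A minimal sketch of this three-instruction machine, with the instruction names above (the Caml representation is an assumption):

```ocaml
(* A sketch of the machine of Sect. 4.2: an accumulator, a stack of
   natural numbers, and a code register. *)
type instr = Ldi of int | Push | Add

(* execute the code, returning the final contents of the accumulator *)
let rec exec (acc : int) (stack : int list) = function
  | [] -> acc
  | Ldi n :: c -> exec n stack c               (* load n in the accumulator *)
  | Push :: c -> exec acc (acc :: stack) c     (* push the accumulator *)
  | Add :: c ->
      (match stack with
       | m :: s -> exec (m + acc) s c          (* add the top, then pop it *)
       | [] -> failwith "empty stack")

(* the compiled form of ((((1 + 2) + 3) + 4) + 5) + 6 from the text *)
let result =
  exec 0 []
    [Ldi 6; Push; Ldi 5; Push; Ldi 4; Push; Ldi 3; Push;
     Ldi 2; Push; Ldi 1; Add; Add; Add; Add; Add]
(* result = 21 *)
```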
## 4.3 An Abstract Machine for PCF

### 4.3.1 The Environment

So far we have only compiled a fragment of PCF: numbers and addition. Can this principle be generalised to the full language?

First, recall that in PCF a term has to be interpreted in an environment. In addition to the accumulator, the stack, and the code, our abstract machine needs a fourth register: the environment. The machine must also include an instruction Extend x to extend the environment, adding the definition x = V where V is the content of the accumulator, and an instruction Search x to look for the value associated to x in the environment and put it in the accumulator.

When the machine executes the code generated by the compilation of several nested applications, the environment will change several times, and at the end of the execution the initial environment should be restored. The abstract machine therefore needs instructions Pushenv and Popenv to put the contents of the environment register on the stack and to recover them. These operations are often further decomposed into several operations that push and pop individual elements of the environment, but here we will not decompose them in this way.

### 4.3.2 Closures

In PCF it is also necessary to define closures as values. In addition to the instruction Ldi n, we will need an instruction Mkclos(f, x, t), with two variables f and x and a term t as arguments. This instruction builds the closure 〈f, x, t, e〉, where e is the content of the environment register, and puts the closure in the accumulator.

### 4.3.3 PCF Constructs

It is not difficult to compile a term of the form fun x -> t or fixfun f x -> t: we simply generate the instruction Mkclos(f, x, t) to build a closure, which is the value of this kind of term.

In the same way, it is easy to compile a term of the form x: we just generate the instruction Search x to look for the value associated to x in the environment.

Let us now consider the compilation of a term of the form t u. The corresponding big-step semantics rule is

$$\frac{e \vdash t \hookrightarrow \langle f, x, t', e'\rangle \qquad e \vdash u \hookrightarrow W \qquad (e', f = \langle f, x, t', e'\rangle, x = W) \vdash t' \hookrightarrow V}{e \vdash t\ u \hookrightarrow V}$$

To interpret the term t u in the environment e, we start by interpreting u in the environment e, which returns the value W. We then interpret the term t in the environment e, obtaining the closure 〈f, x, t', e'〉, and finally we interpret t' in the environment (e', f = 〈f, x, t', e'〉), x = W, to obtain the final result.

Now, let us see how an interpreter running on an abstract machine deals with this term: to interpret the term t u, the abstract machine starts by interpreting u, and puts the result on the stack. Then, it interprets the term t, obtaining the closure 〈f, x, t', e'〉, and puts in the environment register the environment e', f = 〈f, x, t', e'〉, x = W, where W is the value at the top of the stack, which is then removed from the stack. Finally, the machine interprets the term t'. To ensure that the contents of the environment register are restored at the end of these operations, they should be put on the stack at the beginning of the interpretation, and recovered from the stack at the end.

Let us now consider the compilation process for such a term. The interpretation of the term u is replaced by the execution of the sequence |u| of instructions, and similarly the interpretation of the term t is replaced by the execution of the sequence |t| of instructions. The interpretation of t' has to be replaced by the execution of the sequence |t'| of instructions. However, there is a difficulty here: t' is not a subterm of t u; it is provided by the closure resulting from the interpretation of t. We therefore need to modify the notion of closure, and replace the term t in 〈f, x, t, e〉 by a sequence i of instructions. Thus, terms of the form fun x -> t and fixfun f x -> t should not be compiled into Mkclos(f, x, t); instead, they should be compiled into Mkclos(f, x, |t|), building the closure 〈f, x, |t|, e〉 where e is the content of the environment register.
Finally, we need to include in the machine an instruction Apply that takes a closure 〈f, x, i, e〉 from the accumulator, puts the environment e, f = 〈f, x, i, e〉, x = W, where W is the top of the stack, in the environment register, discards the top of the stack, and adds the sequence i of instructions at the front of the code register.

The term t u can then be compiled as the sequence of instructions Pushenv, |u|, Push, |t|, Apply, Popenv.

Summarising, the abstract machine has the set of instructions Ldi n, Push, Add, Extend x, Search x, Pushenv, Popenv, Mkclos(f, x, i) and Apply. To complete it, we just need to add the arithmetic operations Sub, Mult, Div and the test Test(i, j) to compile the operators -, *, / and ifz.

### 4.3.4 Using de Bruijn Indices

To simplify the machine we can use de Bruijn indices—see Sect. 3.3. Recall that the instruction Search x is generated by the compilation of variables, and we have already seen that it is possible to determine the index of each variable occurrence statically. We can then compile a variable x using an instruction Search n, where n is a number, instead of Search x.

De Bruijn indices can be computed at the same time as the compilation is performed: it suffices to compile a term in a variable environment, and to compile the variable x in the environment e by the instruction Search n, where n is the position of the variable x in the environment e, starting from the end.

This mechanism allows us to dispose of variables in environments, closures, and the instructions Mkclos and Extend. Our abstract machine then includes the instructions Ldi n, Push, Extend, Search n, Pushenv, Popenv, Mkclos i, Apply, Test(i, j), Add, Sub, Mult and Div.

### 4.3.5 Small-Step Operational Semantics

The machine state, that is, the contents of its registers, is a tuple consisting of a value (the accumulator), a list where each element is either a value or a list of values (the stack), a list of values (the environment), and a sequence of instructions (the code).

A small execution step consists of fetching an instruction from the code register and executing it. The small-step semantics of the machine is easily defined:

 * (a, s, e, (Mkclos i, c)) ⟶ (〈i, e〉, s, e, c)

 * (a, s, e, (Push, c)) ⟶ (a, (a, s), e, c)

 * (a, s, e, (Extend, c)) ⟶ (a, s, (e, a), c)

 * (a, s, e, (Search n, c)) ⟶ (V, s, e, c) if V is the nth value in e (starting from the end)

 * (a, s, e, (Pushenv, c)) ⟶ (a, (e, s), e, c)

 * (a, (e', s), e, (Popenv, c)) ⟶ (a, s, e', c)

 * (〈i, e'〉, (W, s), e, (Apply, c)) ⟶ (〈i, e'〉, s, (e', 〈i, e'〉, W), i c)

 * (a, s, e, (Ldi n, c)) ⟶ (n, s, e, c)

 * (n, (m, s), e, (Add, c)) ⟶ (n + m, s, e, c)

 * (n, (m, s), e, (Sub, c)) ⟶ (n - m, s, e, c)

 * (n, (m, s), e, (Mult, c)) ⟶ (n * m, s, e, c)

 * (n, (m, s), e, (Div, c)) ⟶ (n / m, s, e, c)

 * (0, s, e, (Test(i, j), c)) ⟶ (0, s, e, i c)

 * (n, s, e, (Test(i, j), c)) ⟶ (n, s, e, j c) if n is a number different from 0

An irreducible state is a tuple where the fourth component—the contents of the code register—is empty. If i is a sequence of instructions and the state (0, [ ], [ ], i) reduces to an irreducible state of the form (V, _, _, [ ]), then we say that V is the result of the execution of i, and we write i ⇒ V.
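These transitions translate almost line by line into a simulator for the machine. A sketch, with the same hedges as before (the representation of values and the constructor names are assumptions):

```ocaml
(* A sketch of the abstract machine of Sect. 4.3.5, with de Bruijn indices. *)
type instr =
  | Ldi of int | Push | Extend | Search of int
  | Pushenv | Popenv | Mkclos of instr list | Apply
  | Test of instr list * instr list
  | Add | Sub | Mult | Div

type value =
  | Num of int
  | Clos of instr list * value list      (* a closure <i, e> *)
  | Env of value list                    (* a saved environment, on the stack *)

let rec exec (a : value) (s : value list) (e : value list) (c : instr list) =
  match c, a, s with
  | [], _, _ -> a                                     (* the machine stops *)
  | Ldi n :: c, _, _ -> exec (Num n) s e c
  | Push :: c, _, _ -> exec a (a :: s) e c
  | Extend :: c, _, _ -> exec a s (a :: e) c
  | Search n :: c, _, _ -> exec (List.nth e n) s e c  (* nth from the end *)
  | Pushenv :: c, _, _ -> exec a (Env e :: s) e c
  | Popenv :: c, _, Env e' :: s -> exec a s e' c
  | Apply :: c, Clos (i, e'), w :: s ->
      exec a s (w :: Clos (i, e') :: e') (i @ c)      (* enter the closure *)
  | Test (i, _) :: c, Num 0, _ -> exec a s e (i @ c)
  | Test (_, j) :: c, Num _, _ -> exec a s e (j @ c)
  | Add :: c, Num n, Num m :: s -> exec (Num (n + m)) s e c
  | Sub :: c, Num n, Num m :: s -> exec (Num (n - m)) s e c
  | Mult :: c, Num n, Num m :: s -> exec (Num (n * m)) s e c
  | Div :: c, Num n, Num m :: s -> exec (Num (n / m)) s e c
  | _ -> failwith "ill-formed machine state"
```

Here environments are lists with the most recently added value at the head, so the nth value "starting from the end" of the book's notation is simply List.nth e n.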
+ +## 4.4 Compilation of PCF + +We can now give the compilation rules for PCF + + * |x|e =Search n where n is the position of x in the environment e + + * |t u|e =Pushenv, |u|e,Push, |t|e,Apply,Popenv + + * |fun x -> t|e =Mkclos |t|e,_,x + + * |fixfun f x -> t|e =Mkclos |t|e, f, x + + * |n|e =Ldi n + + * |t + u|e = |u|e,Push, |t|e,Add + + * |t - u|e = |u|e,Push, |t|e,Sub + + * |t * u|e = |u|e,Push, |t|e,Mult + + * |t / u|e = |u|e,Push, |t|e,Div + + * |ifz t then u else v|e = |t|e,Test(|u|e,|v|e) + + * |let x = t in u|e =Pushenv, |t|e,Extend, |u|e, x,Popenv + +For example, the compilation of + +generates the sequence of instructions Pushenv, Mkclos [Search0, Test([Ldi1], [Pushenv, Ldi1, Push, Search0, Sub, Push, Search1, Apply, Popenv, Push, Search0, Mult])], Extend, Pushenv, Ldi6, Push, Search0, Apply, Popenv, Popenv and the result of its execution is the number 720. + +The correctness of the compilation, and of the semantics of the abstract machine, can be stated as follows: if V is a numeric value, then ⊢ t ↪ V if and only if |t| ⇒ V. + +Exercise 4.2 + +Write an abstract machine and a compiler for PCF. + +The state of the abstract machine at the beginning of the 14th execution step for the program Pushenv, Ldi1, Extend, Ldi6, Push, Ldi5, Push, Ldi4, Push, Ldi 3, Push, Ldi2, Push, Search0, Add, Add, Add, Add, Add, Popenv. + +Exercise 4.3 + +We extend PCF with the tree operators described in Exercise 3.15. Write a compiler and an abstract machine for this extension of PCF. + +Exercise 4.4 + +(A bootstrapping compiler) Many kinds of data structures can be represented using the trees described in Exercise 3.15. To start with, we can represent a natural number n as a tree L n. The character c can be represented by the tree L n where n is a code, for instance the ASCII code of the character c. If t1, t2,...,tn are trees, the list t1,t2, ...,tn can be represented by the tree N(t1, N(t2, ..., N(tn, L 0)...)). Finally, values of a type defined by constructors that are themselves representable could be defined by enumerating the constructors and representing the value C(V1,V2, ...,Vn) by the list L p,t1,t2, ...,tn where p is the number associated to the constructor C and t1, t2, ...,tn represent the values V1, V2, ...,Vn. + +We could, in particular, represent in this way programs written in the extended PCF language, or in the language of the abstract machine in Exercise 4.3. Modify the compiler and the abstract machine in Exercise 4.3 to accept programs represented by binary trees. The abstract machine will take two inputs: a compiled program, represented by a tree, and a value, and will apply the program to the value. + +Translate the compiler in Exercise 4.3 to PCF. After writing the compiler, compile it with the compiler defined in Exercise 4.3. The result is the first compiler executed by the PCF abstract machine. Compile this compiler (it will compile itself). Verify that the code produced is the same that was obtained with the compiler in Exercise 4.3. If this is true, we can destroy the first compiler and use instead the second: this is the bootstrap process. +Gilles Dowek and Jean-Jacques LévyUndergraduate Topics in Computer ScienceIntroduction to the Theory of Programming Languages10.1007/978-0-85729-076-2_5© Springer-Verlag London Limited 2011 + +# 5. PCF with Types + +Gilles Dowek1 and Jean-Jacques Lévy2 + +(1) + +Labo. 
Exercise 4.2

Write an abstract machine and a compiler for PCF. Give the state of the abstract machine at the beginning of the 14th execution step for the program Pushenv, Ldi 1, Extend, Ldi 6, Push, Ldi 5, Push, Ldi 4, Push, Ldi 3, Push, Ldi 2, Push, Search 0, Add, Add, Add, Add, Add, Popenv.

Exercise 4.3

We extend PCF with the tree operators described in Exercise 3.15. Write a compiler and an abstract machine for this extension of PCF.

Exercise 4.4

(A bootstrapping compiler) Many kinds of data structures can be represented using the trees described in Exercise 3.15. To start with, we can represent a natural number n as a tree L n. The character c can be represented by the tree L n where n is a code, for instance the ASCII code of the character c. If t₁, t₂, ..., tₙ are trees, the list t₁, t₂, ..., tₙ can be represented by the tree N(t₁, N(t₂, ..., N(tₙ, L 0)...)). Finally, values of a type defined by constructors that are themselves representable can be defined by enumerating the constructors and representing the value C(V₁, V₂, ..., Vₙ) by the list L p, t₁, t₂, ..., tₙ where p is the number associated to the constructor C and t₁, t₂, ..., tₙ represent the values V₁, V₂, ..., Vₙ.

We can, in particular, represent in this way programs written in the extended PCF language, or in the language of the abstract machine in Exercise 4.3. Modify the compiler and the abstract machine in Exercise 4.3 to accept programs represented by binary trees. The abstract machine will take two inputs: a compiled program, represented by a tree, and a value, and will apply the program to the value.

Translate the compiler in Exercise 4.3 to PCF. After writing the compiler, compile it with the compiler defined in Exercise 4.3. The result is the first compiler executed by the PCF abstract machine. Compile this compiler (it will compile itself). Verify that the code produced is the same as that obtained with the compiler in Exercise 4.3. If it is, we can discard the first compiler and use the second instead: this is the bootstrap process.

# 5. PCF with Types

Abstract

This chapter opens a new part of the book, dedicated to types. The language PCF is extended by adding types. A type verification algorithm is described, and its application to the static detection of errors is discussed at length. The chapter also describes the denotational semantics of PCF with types.

In Chap. 2, we remarked that, in contrast with mathematical functions, the domain of a PCF function is not specified. For this reason, it is possible to apply the function fun x -> x + 1 to the function fun x -> x + 2, even if this application is meaningless.

It is sometimes convenient to be able to apply any object to any other object. For example, we can apply the identity function fun x -> x to itself, using the term (fun x -> x) (fun x -> x), which reduces to fun x -> x. More generally, the identity function in PCF is defined for any object, whereas in Mathematics it always has to be restricted to a specific domain. The ability to apply an object to itself was essential to show that the fix construct can be simulated in PCF using application and fun—see Exercise 2.10.

However, the unrestricted application of an object to another may raise a number of problems. For example, we saw that the terms 1 2, 1 + (fun x -> x) and ifz (fun x -> x) then 1 else 2 are irreducible closed terms according to the small-step semantics of PCF, but they are not values.

The big-step operational semantics, instead, does not associate any result to a term such as (fun x -> x) 1 2. In practice, if we interpret a term of the form t u where t results in a number instead of a term of the form fun, an error is raised. This error is detected at run time, instead of being detected statically (before execution) as one would expect.

The fact that the domain of a PCF function is not specified also makes it more difficult to give a denotational semantics for PCF.

The goal of this chapter is to define a version of PCF where functions come with associated domains, and to show that if a program is well-formed in this language, its interpretation cannot produce the errors mentioned above. We will also give a simple denotational semantics for this language.

## 5.1 Types

In Mathematics, the domain of a function is a set (any set). For example, we can define a function m from 2ℕ to ℕ that associates to each even number its half. Then, to check whether the expression m (3 + (4 + 1)) is well-formed or not, that is, to check whether the argument is in the domain of the function, we need to check whether 3 + (4 + 1) is even or not. For arbitrary sets, the problem of deciding whether a given element belongs to the set is undecidable in general. Therefore, the problem of checking the validity of terms is also undecidable in general. Besides, to know whether a term such as ifz t then u else v will produce an error or not, we need to know whether the value of t is a natural number or a term of the form fun (the parity of the number is not relevant in this case).

These two remarks lead us to restrict the class of sets used to define the domains of functions. The sets in this restricted class will be called types.
+ +### 5.1.1 PCF with Types + +In PCF, types are inductively defined by + + * nat—that is, ℕ—is a type, + + * if A and B are types then A -> B—that is, the set of all the functions from A to B—is a type. + +Types can then be defined using a language that includes the constant nat and the symbol -> with two arguments that do not bind any variables. Such a term is also called a type. + +Functions in PCF were written fun x -> t, but will now include the type of the variable x. Thus, we will write fun x:nat -> x for the identity function on the natural number, and fun x:(nat -> nat) -> x for the identity function on functions from natural numbers to natural numbers. In general, the symbol fun will now have two arguments, a type and a term; it will bind a variable in the second argument. The typed version of the language PCF is a language with two sorts of objects: terms and types, and the arity of the symbol fun is ((type), (term, term), term). Also the symbols fix and let must now indicate the type of the bound variable. + +Summarising, typed PCF includes + + * a term symbol fun with a type argument and a term argument, which binds a variable in the second argument, + + * a term symbol α with two term arguments, that does not bind any variable, + + * an infinite number of term constants to represent the natural numbers, + + * four term symbols +, -, * and /, each with two arguments which do not bind any variables in their arguments, + + * a term symbol ifz with three term arguments which do not bind any variables, + + * a term symbol fix with a type argument and a term argument, which binds a variable in the second argument, + + * a term symbol let with three arguments, where the first is a type and the others terms, binding a variable in the third argument, + + * a type constant nat, + + * a type symbol -> with two type arguments and which does not bind any variable in its arguments. + +Alternatively, we can define the syntax of the typed version of PCF inductively + +### 5.1.2 The Typing Relation + +We can now define by induction the relation t : A, read "the term t has type A ". More precisely, we will define by induction a ternary relation e ⊢ t : A, as we did for the interpretation relation, where t is a term that might have free variables and e is a typing environment that associates a type to each variable. This is an inductive definition, similar to the inductive definition of PCF's big-step operational semantics. We could imagine that it is the operational semantics of a language with the same syntax as PCF but where the interpretation of a term returns a type instead of a value—for this reason, it is called an abstract interpretation of the term. + +In the first rule only the rightmost declaration for x is taken into account, the others are hidden. + +The language includes variables of various sorts, in particular type variables for which we will use capital letters. Since no symbol can bind a type variable, a closed term will not contain type variables. Moreover, if a closed term t has the type A in the empty environment, then the type A must be closed too. So, type variables are not really used here; they will be used in the next chapter. + +Let e be an environment and t a term. Reasoning by induction on t, we can show that the term t has at most one type in the environment e. + +We can build a type checking algorithm based on the typing rules given above. The algorithm will check whether a term t has a type in an environment e, and if it does, it will give the type as a result. 
Exercise 5.1

Write a type checker for PCF.

Reduction is still confluent in the typed language, and types bring us an additional property: all typed terms that do not contain the operator fix terminate—this is Tait's Theorem. In particular, it is impossible to build a typed term such as (fun x -> (x x)) (fun x -> (x x)), which does not terminate although it does not contain fix.

Exercise 5.2

Write typing rules for the version of PCF that uses de Bruijn indices instead of variable names—see Sect. 3.3.

Exercise 5.3

We extend PCF with the constructs described in Exercise 3.13 to define pairs, and we introduce a symbol × to denote the Cartesian product of two types. Write typing rules for this extension of PCF. Write a type checker for this extension of PCF.

Exercise 5.4

We extend PCF with the constructs described in Exercise 3.14 to define lists, and we introduce a type natlist for these lists. Write typing rules for this extension of PCF. Write a type checker for this extension of PCF.

Exercise 5.5

We extend PCF with the constructs described in Exercise 3.15 to define trees, and we introduce a type nattree for these trees. Write typing rules for this extension of PCF. Write a type checker for this extension of PCF.

## 5.2 No Errors at Run Time

We will now show that the interpretation of a correctly typed term cannot produce a type error at run time. For this we can use either the small-step or the big-step semantics; the proof is slightly different depending on which we use.

### 5.2.1 Using Small-Step Operational Semantics

Using the small-step operational semantics of the language, the property can be formulated as follows: the result of the computation of a typed closed term, if it exists, is a value. In other words, a typed closed term evaluates to a natural number or a closed term of the form fun x -> t; it can never be a stuck term, that is, a term V₁ V₂, where V₁ and V₂ are irreducible closed terms and V₁ is not of the form fun x -> t; a term V₁ ⊗ V₂, where V₁ and V₂ are irreducible closed terms that are not both numbers; or a term ifz V₁ then V₂ else V₃ where V₁, V₂ and V₃ are irreducible closed terms and V₁ is not a number.

The first lemma, which we will not prove here, is usually called subject reduction. It says that if a closed term t of type A reduces in one step to the term u (t ⟶ u), then u also has type A. We can deduce that if a closed term t of type A reduces to u in any number of steps (t ⟶* u), then u also has type A.

The next step in the proof consists of showing that a term of the form fun cannot have the type nat and, similarly, that a numeric constant cannot have a type of the form A -> B. This is done by a simple structural induction over the typing relation.

The proof proceeds by showing that an irreducible closed term t of type nat is a constant representing a natural number, and that an irreducible closed term t of type A -> B has the form fun. This is done by structural induction on t.

Since t is a closed term, it cannot be a variable. Since it is irreducible, it cannot be a fix or a let.

We show that t cannot be an application, an arithmetic operation or a conditional. If t is an application t = u v, then u has a type of the form C -> D. By the induction hypothesis, this term must be of the form fun, and therefore t is a redex, contradicting our assumption (t is irreducible).
If t is an arithmetic operator t = u ⊗ v then u and v have type nat. By induction hypothesis, they are numeric constants and therefore t is a redex, contradicting our assumption (t is irreducible). If t is a term of the form t = ifz u then v else w then u has type nat. By induction hypothesis, u is a numeric constant and therefore t is a redex, contradicting our assumption (t is irreducible). + +An irreducible closed term t is then either a numeric constant or a term of the form fun. If it has type nat, it is a constant; if it has type A -> B, it is a fun. + +If a well-typed closed term can be reduced to an irreducible closed term, this irreducible term will also be well typed, and will therefore be either a numeric constant or a term of the form fun. + +### 5.2.2 Using Big-Step Operational Semantics + +The property is formulated differently using the big-step operational semantics of the language. This is because in this style of semantics only values can be associated to terms (even if the terms are ill typed). One could say that the rules of the big-step operational semantics are incomplete, since they do not specify how to associate a value to an application whose left-hand side has a value that is a numeric constant, or how to associate a value to an arithmetic operation where the value of one of the arguments is a term of the form fun, or a value to a conditional where the first argument has a value that is of the form fun. However, for well-typed terms the rules are complete. In other words, the three examples that we have just mentioned cannot arise. + +We start by showing a type-preservation-by-interpretation lemma, which states that if a closed term t has type A then its value, if it exists, also has type A. This lemma corresponds to the subject reduction lemma of the small-step operational semantics. + +Then we show, as for the small-step semantics, that a term of the form fun cannot have type nat and, similarly, that a numeric constant cannot have a type of the form A -> B. + +Since we know that the value of a term is either a number or a term of the form fun, we deduce that the value of a term of type nat is a numeric constant, and the value of a term of type A -> B is a term of the form fun. Therefore, when interpreting a well-typed term, the left-hand side of an application will always be interpreted as a term of the form fun, the arguments of arithmetic operators will always be interpreted as numeric constants, and the first argument of an ifz will always be interpreted as a numeric constant. + +Exercise 5.6 + +(Equivalent semantics) Show that the computation of a well-typed term produces a result under call by name small-step operational semantics if and only if it produces a result under call by name big-step operational semantics. Moreover, the result is the same in both cases. Show that the same property is true of the call by value semantics. + +Does this result hold also for the untyped version of PCF? Hint: what is the result of ((fun x -> x) 1) 2? + +## 5.3 Denotational Semantics for Typed PCF + +### 5.3.1 A Trivial Semantics + +We mentioned above that one of the goals of functional languages is to shorten the distance between the notion of a program and the notion of a function. In other words, the goal is to bring the program closer to its denotational semantics. + +We also said that it was difficult to give a denotational semantics for PCF without types, because functions did not have a domain of definition. 
Now that we have a type system for PCF, it is easier to give a denotational semantics. 

We associate to each type a set

 * 〚nat〛 = ℕ,

 * 〚A -> B〛 = 〚A〛 -> 〚B〛

and to each term t of type A an element 〚t〛 of 〚A〛. If the term t has free variables, we associate meanings to these variables via a semantic environment e.

 * 〚x〛e = a, if e includes the pair x = a,

 * 〚fun x:A -> t〛e = fun a:〚A〛 -> 〚t〛e,x=a,

 * 〚t u〛e = 〚t〛e 〚u〛e,

 * 〚n〛e = n,

 * 〚t + u〛e = 〚t〛e + 〚u〛e, 〚t - u〛e = 〚t〛e - 〚u〛e, 〚t * u〛e = 〚t〛e * 〚u〛e, 〚t / u〛e = 〚t〛e / 〚u〛e,

 * 〚ifz t then u else v〛e = 〚u〛e if 〚t〛e = 0, and 〚v〛e otherwise,

 * 〚let x:A = t in u〛e = 〚u〛e,x=〚t〛e.

This is really trivial: a program is a function, and its semantics is that same function. Achieving this "triviality" is one of the goals in the design of functional languages.

Two remarks are in order. First, division by 0 produces an error in PCF, whereas it is not defined in Mathematics. To be precise, we should add a value error to each set 〚A〛 and adapt the definition given above. Second, this definition says nothing about the construction fix.

### 5.3.2 Termination

The only construct with a non-trivial denotational semantics is fix, because this construct is not usually found in everyday mathematical definitions of functions. Unlike PCF, mathematical definitions may only take fixed points of functions that do have a fixed point, and even then, if there are several fixed points, it is essential to specify which one is taken. We left these issues aside when we defined PCF; it is now time to deal with them.

Consider a function that does not have a fixed point: the function fun x:nat -> x + 1. In PCF, we can build the term fix x:nat (x + 1). Similarly, the function fun f:(nat -> nat) -> fun x:nat -> (f x) + 1 does not have a fixed point, but we can build the term fix f:(nat -> nat) fun x:nat -> (f x) + 1. On the other hand, the function fun x:nat -> x has many fixed points, and still we can build the term fix x:nat x.

When we defined the operational semantics of PCF, we gave a reduction rule

fix x t ⟶ (fix x t/x)t

that explains the idea of a fixed point. Using this rule, we can see that the term a = fix x:nat (x + 1) reduces to a + 1, then to (a + 1) + 1, ... without ever reaching an irreducible term. Similarly, if g = fix f:(nat -> nat) fun x:nat -> (f x) + 1, the term g 0 reduces in two steps to (g 0) + 1, and then to ((g 0) + 1) + 1, ... and again will never reach an irreducible term. The same happens with the term b = fix x:nat x, which reduces to b, and again to b, ... and will never reach an irreducible term. In other words, it appears that in PCF, when we take the fixed point of a function that does not have any, or that has more than one, the program does not terminate.

The situation is similar in Caml and in Java, where the analogous circular definitions loop.

There are even functions, such as fun x:nat -> x + x, which have a unique fixed point but for which the fix construct in PCF produces a non-terminating computation: fix x:nat (x + x).

In other words, to understand the denotational semantics of the fixed point operator, we first need to understand the semantics of terms that do not terminate.

The small-step operational semantics does not associate any result to these terms: there is no term V such that fix x:nat (x + 1) ↪ V. And the big-step operational semantics does not give us more information.
As we have already said, we could complete the relation ↪ by adding a value ⊥ such that fix x:nat (x + 1) ↪ ⊥.

We have the same options in denotational semantics. We could define a partial function 〚 〛 and leave 〚fix x:nat (x + 1)〛 undefined, or we could add a value ⊥ to 〚nat〛 and define 〚fix x:nat (x + 1)〛 = ⊥.

If we include the value ⊥, the interpretation of a term of the form t + u is obtained by interpreting first u and then t, and if one of these terms loops, then the whole term t + u loops too. Thus, the denotational semantics of a term of the form t + u is defined as follows

 * 〚t + u〛 = 〚t〛 + 〚u〛 if 〚t〛 and 〚u〛 are natural numbers,

 * 〚t + u〛 = ⊥ if 〚t〛 = ⊥ or 〚u〛 = ⊥.

We can now remark that the function 〚fun x:nat -> x + 1〛, which did not have a fixed point when ⊥ was not included, now has one: ⊥. This value is precisely the one we take as the semantics of the term fix x:nat (x + 1), which does not terminate. The function 〚fun x:nat -> x〛, which had several fixed points, now has an additional one, ⊥, and we choose this one as the semantics of the term fix x:nat x. The function 〚fun x:nat -> x + x〛, which had a unique fixed point 0, now has two, 0 and ⊥, and again we choose ⊥ as the semantics of the term fix x:nat (x + x), which does not terminate.

All the functions that we mentioned have fixed points now, and when they have more than one, including ⊥, we choose the latter as our privileged value.

### 5.3.3 Scott's Ordering Relation

To make the ideas discussed above more precise, we define an ordering relation on the set 〚nat〛, called Scott's ordering relation: x ≤ y if and only if x = ⊥ or x = y. We then define 〚fix x:nat t〛 as the least fixed point of the function 〚fun x:nat -> t〛, forcing the use of the fixed point ⊥ when more than one fixed point exists. It remains to prove that this least fixed point exists; we will use the fixed point theorem for this. To apply this theorem, we must show that the ordering relation we defined on 〚nat〛 is weakly complete, and that the semantics of a program of type nat -> nat is a continuous function.

More generally, we will build for each type A a set 〚A〛 endowed with a weakly complete ordering relation, and we will show that the semantics of a program of type A -> B is a continuous function from 〚A〛 to 〚B〛.

We start by defining the sets 〚A〛. The set 〚nat〛 is defined as ℕ ∪ {⊥}, with the ordering relation given above. The set 〚A -> B〛 is defined as the set of all continuous functions from 〚A〛 to 〚B〛, with the ordering relation f ≤ g if for all x in 〚A〛, f x ≤ g x.

We can show that these ordering relations are weakly complete. The ordering on 〚nat〛 is weakly complete because any increasing sequence is either constant or of the form ⊥, ⊥, ..., ⊥, n, n, ..., and in both cases it has a limit.

We now show that if the ordering relations on 〚A〛 and 〚B〛 are weakly complete, then so is the ordering on 〚A -> B〛. Consider an increasing sequence fₙ in 〚A -> B〛. By the definition of the ordering on 〚A -> B〛, for all x in 〚A〛, the sequence fₙ x, whose values are in 〚B〛, is also increasing, and therefore has a limit. Let F be the function that associates to x the element limₙ (fₙ x). We can show—but we will not do it here—that the function F is in 〚A -> B〛, that is, that it is a continuous function (this requires a lemma permuting limits). By construction, the function F is greater than all the functions fₙ, and it is the least such function. Therefore it is the limit of the sequence fₙ.
Any increasing sequence thus has a limit, and the ordering relation on 〚A -> B〛 is therefore weakly complete.

Each set 〚A〛 has a least element, written ⊥A. The least element of 〚nat〛 is ⊥, and the least element of 〚A -> B〛 is the constant function that returns the value ⊥B for all arguments.

### 5.3.4 Semantics of Fixed Points

We can now go back to the denotational semantics of PCF, and add to the definition the missing case for fix

 * 〚x〛e = a, if e contains the definition x = a,

 * 〚fun x:A -> t〛e = fun a:〚A〛 -> 〚t〛e,x=a,

 * 〚t u〛e = 〚t〛e 〚u〛e,

 * 〚n〛e = n,

 * 〚t ⊗ u〛e = 〚t〛e ⊗ 〚u〛e if 〚t〛e and 〚u〛e are natural numbers, and ⊥ otherwise,

 * 〚ifz t then u else v〛e = 〚u〛e if 〚t〛e = 0, 〚v〛e if 〚t〛e is a natural number different from 0, and ⊥A, where A is the type of this term, if 〚t〛e = ⊥nat,

 * 〚fix x:A t〛e = FIX (fun a:〚A〛 -> 〚t〛e,x=a), where FIX(f) is the least fixed point of the continuous function f,

 * 〚let x:A = t in u〛e = 〚u〛e,x=〚t〛e.

To show that this definition is correct, we need to prove that if t is a term of type A then 〚t〛 is in 〚A〛; in particular, we need to prove that the function a ↦ 〚t〛e,x=a is continuous. This is true, but we will not prove it here.

Exercise 5.7

What is the semantics of the term fun x:nat -> 0? And the semantics of fix x:nat x and (fun x:nat -> 0) (fix x:nat x)?

Exercise 5.8

What is the value of 〚ifz t then u else v〛e, if 〚t〛e = 0, 〚u〛e = 0 and 〚v〛e = ⊥nat?

We can now state the equivalence theorem for the two semantics. Let t be a closed term of type nat and n a natural number: t ↪ n under call by name if and only if 〚t〛 = n. The direct implication is not difficult to prove, but the converse is not trivial.

Exercise 5.9

Show, using the equivalence theorem, that if t is a closed term of type nat such that 〚t〛 = ⊥, there is no natural number n such that t ↪ n.

Exercise 5.10

Let G be the denotational semantics of the term fun f:(nat -> nat) -> fun n:nat -> ifz n then 1 else n * (f (n - 1)).

The denotational semantics of the term fix f:(nat -> nat) fun n:nat -> ifz n then 1 else n * (f (n - 1)) is the least fixed point of G. By the first fixed point theorem, this is the limit of the sequence Gⁿ(⊥nat->nat). Which function is denoted by ⊥nat->nat? And by Gⁿ(⊥nat->nat)? Identify the limit of this sequence.

Show that for any natural number p, there exists a natural number m such that Gᵐ(⊥nat->nat)(p) = limₙ Gⁿ(⊥nat->nat)(p).

Exercise 5.11

We consider the following elements of the set 〚nat -> nat〛: the function u that maps ⊥ to ⊥ and all other elements to 0; the function vᵢ that maps ⊥ to ⊥, i to 1 and all other elements to 0; and the function wᵢ that maps ⊥ to ⊥, the elements 0, 1, ..., i - 1 to 0 and all other elements to ⊥.

Let F be an increasing function from 〚nat -> nat〛 to 〚nat〛 such that F u = 0 and, for all i, F vᵢ = 1. Show that for all i, F wᵢ = ⊥. Show that the function F is not continuous.

Show that it is not possible to write a PCF function that takes as argument a function g of type nat -> nat and returns 0 if g n = 0 for all n, and 1 otherwise.

Exercise 5.12

(An information-based approach to continuity) It might seem surprising that the notion of continuity is used to define the semantics of PCF, even though PCF works only with natural numbers, not with real numbers. In fact, the set of functions from ℕ to ℕ, or the set of sequences of natural numbers, is very similar to the set of real numbers.
Exercise 5.12

(An information-based approach to continuity) It might seem surprising that the notion of continuity is used to define the semantics of PCF, even though PCF works only with natural numbers, not with real numbers. In fact, the set of functions from ℕ to ℕ, or the set of sequences of natural numbers, is very similar to the set of real numbers.

The intuition is that a real function f is continuous if, to compute the first n decimal places of f x, it is sufficient to know a finite number of decimals of x. Unfortunately, this is technically false if x or f x are decimal numbers. We will say that a decimal number approximates a real number to the nth decimal place if the distance between the two is smaller than 10^-n. Thus, the number π has two approximations to the second decimal place, 3.14 and 3.15, and it makes sense to say that the function f is continuous if to compute a decimal approximation of f x to the nth place it is sufficient to have some decimal approximation of x.

The goal of this exercise is to show that, similarly, a function f from sequences of natural numbers to sequences of natural numbers is continuous if to compute the first n terms of f x it is sufficient to have an initial segment of x. If we agree to call a finite initial segment of the sequence a finite approximation, then we can rephrase this as follows: to compute an approximation of f x with n terms, it is sufficient to have a certain approximation of x.

Let u be a sequence of natural numbers, and let U be the element of 〚nat -> nat〛 that associates ⊥ to ⊥ and ui to i.

Let V be the sequence with elements in 〚nat -> nat〛 whose ith element Vi associates ⊥ to ⊥, uj to each j smaller than i, and ⊥ to all other elements; that is, Vi is the finite approximation of U consisting of its first i terms.

Show that the sequence V converges to U. Let F be a continuous function on 〚nat -> nat〛. Show that the sequence F Vi converges to F U. Show that the sequence F Vi p converges to F U p. Show that there exists a natural number k such that F Vk p = F U p. Show that to compute F U p, it suffices to have the first k terms of U. Show that to compute the first n terms of F U it is sufficient to know a finite number of terms of U.

Consider the function that associates to a sequence u the number 0 if u is always 0, and 1 otherwise. Is this function continuous? Can it be written in PCF?

Finally, notice that in these two examples, the approximations—decimal numbers or finite sequences—contain a finite amount of information, whereas the objects that they approximate—real numbers or infinite sequences—contain an infinite amount of information.

Exercise 5.13

(Gödel's System T) To avoid non-terminating computations, we can replace fix by a rec construct to define functions by induction. All the programs in this language terminate, but the language is no longer Turing complete. Still, it is not easy to find a program that cannot be represented in this language; one needs to be an expert logician to build such a program.

The function f defined by f 0 = a and f (n + 1) = g n (f n) is written rec a g. The small-step operational semantic rules for this construct are

rec a g 0 ⟶ a

rec a g n ⟶ g (n - 1) (rec a g (n - 1))

if n is a natural number different from 0.

Program the factorial function in this language. Give typing rules for rec. Give a denotational semantics for this language.
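As an illustration of Exercise 5.13, here is a small OCaml sketch of the recursor. The encoding of rec as an ordinary recursive function is ours; the point is that rec only ever calls itself on smaller numbers, so every application terminates.

```ocaml
(* rec a g: the function f with f 0 = a and f (n+1) = g n (f n).
   Structural recursion on n guarantees termination. *)
let rec recursor (a : int) (g : int -> int -> int) (n : int) : int =
  if n = 0 then a
  else g (n - 1) (recursor a g (n - 1))

(* Factorial in System T style: fact (n+1) = (n+1) * fact n,
   i.e. g n r = (n + 1) * r. *)
let fact = recursor 1 (fun n r -> (n + 1) * r)

let () = assert (fact 5 = 120)
```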
Gilles Dowek and Jean-Jacques LévyUndergraduate Topics in Computer ScienceIntroduction to the Theory of Programming Languages10.1007/978-0-85729-076-2_6© Springer-Verlag London Limited 2011

# 6. Type Inference

Gilles Dowek1 and Jean-Jacques Lévy2

(1)

Labo. d'Informatique, École polytechnique, route de Saclay, 91128 Palaiseau, France

(2)

Centre de Recherche Commun, INRIA-Microsoft Research, Parc Orsay Université, 28 rue Jean Rostand, 91893 Orsay Cedex, France

Gilles Dowek (Corresponding author)

Email: gilles.dowek@polytechnique.edu

Jean-Jacques Lévy

Email: jean-jacques.levy@inria.fr

Abstract

This chapter continues with types, but with a much more operational orientation. Powerful type inference algorithms are described, in particular one with polymorphic typing.

In many programming languages, for instance Java and C, programmers must declare a type for each of the variables used in the program, writing for example fun x:nat -> x + 1. However, if we know that + can only work with numbers, it is not difficult to show that in the term fun x -> x + 1 the variable x has to be of type nat. We can then let the computer infer the types, rather than asking the programmer to write them. This is the goal of a type inference algorithm.

## 6.1 Inferring Monomorphic Types

### 6.1.1 Assigning Types to Untyped Terms

We will now use the original syntax of PCF, where variables are not explicitly typed. Instead of writing fun x:nat -> x + 1, we will write fun x -> x + 1 as in Chap. 2.

We can now define the language of terms and the language of types independently. The language of terms in PCF is defined as in Chap. 2 and the language of types consists of

  * a constant nat, and

  * a symbol -> with two arguments, which does not bind any variable in its arguments.

As before, the relation e ⊢ t : A (read "the term t has type A in the environment e") can be defined by induction

$$\frac{}{e \vdash x : A}\ \text{if } e \text{ contains } x : A$$

$$\frac{e \vdash u : A \quad e \vdash t : A \rightarrow B}{e \vdash t\ u : B}$$

$$\frac{(e, x : A) \vdash t : B}{e \vdash \textsf{fun}\ x \rightarrow t : A \rightarrow B}$$

$$\frac{}{e \vdash n : \textsf{nat}}$$

$$\frac{e \vdash u : \textsf{nat} \quad e \vdash t : \textsf{nat}}{e \vdash t \otimes u : \textsf{nat}}$$

$$\frac{e \vdash t : \textsf{nat} \quad e \vdash u : A \quad e \vdash v : A}{e \vdash \textsf{ifz}\ t\ \textsf{then}\ u\ \textsf{else}\ v : A}$$

$$\frac{(e, x : A) \vdash t : A}{e \vdash \textsf{fix}\ x\ t : A}$$

$$\frac{e \vdash t : A \quad (e, x : A) \vdash u : B}{e \vdash \textsf{let}\ x = t\ \textsf{in}\ u : B}$$
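Before describing the algorithm, it may help to fix a concrete representation. Here is a hedged OCaml sketch of the two languages just defined, untyped PCF terms and monomorphic types; the constructor names are ours.

```ocaml
(* Untyped PCF terms (Chap. 2 syntax: variables carry no type). *)
type term =
  | Var of string
  | Fun of string * term            (* fun x -> t *)
  | App of term * term
  | Nat of int
  | Op of string * term * term      (* t ⊗ u, e.g. Op ("+", t, u) *)
  | Ifz of term * term * term
  | Fix of string * term
  | Let of string * term * term

(* Types: the constant nat, arrows, and the type variables X, Y, ...
   introduced by the inference algorithm. *)
type ty =
  | Tnat
  | Arrow of ty * ty
  | Tvar of string
```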
Some terms, for example the term fun x -> x, may have more than one type in this system. For instance, we can derive the judgement ⊢ fun x -> x : nat -> nat and also the judgement ⊢ fun x -> x : (nat -> nat) -> (nat -> nat). A closed term may have a type with free variables; for example, the term fun x -> x has type X -> X in the empty environment.

We can prove that if a closed term t has, in the empty environment, a type A which contains variables, then t also has type θA for any substitution θ. For example, if we substitute the variable X by the type nat -> nat in X -> X, we obtain the type (nat -> nat) -> (nat -> nat), and this is one of the possible types for the term fun x -> x.

### 6.1.2 Hindley's Algorithm

We can now describe the type inference algorithm. We will first describe a version of the algorithm that has two phases. The first phase is similar to the type checking algorithm: it traverses the term, recursively, checking that the type constraints are satisfied, and computes the type of the term. There are however two important differences. First, when we are trying to type a term of the form fun x -> t in an environment e, since we do not know the type of the variable x, we need to create a type variable X, extend the environment e with the declaration x : X, and type the term t in this extended environment. The second difference is that when typing an application t u, after computing types A and B for u and t, respectively, we cannot simply check that the type B has the form A -> C. Indeed, these two types might contain variables. For this reason, at this point we generate an equation between types: B = A -> X, for a fresh variable X. The second phase of the type inference algorithm solves these equations.

Let us illustrate the idea with an example: to type the term fun f -> 2 + (f 1) we must type the term 2 + (f 1) in the environment f : X. For this, we need to type the term 2, which has type nat, and the term f 1. The term 1 has type nat and the term f has type X. We generate the equation X = nat -> Y and the type of f 1 is Y. Once the terms 2 and f 1 are typed, we generate the equations nat = nat and Y = nat, and the type of the term 2 + (f 1) is nat. Finally, the type of the term fun f -> 2 + (f 1) is X -> nat and the equations that we need to solve are

X = nat -> Y, nat = nat, Y = nat

This system of equations has a unique solution X = nat -> nat, Y = nat, and therefore the only type that we can assign to the term fun f -> 2 + (f 1) is (nat -> nat) -> nat.

We can describe the first part of the algorithm using a set of rules in the style of the big-step operational semantics (as we did for the type checking algorithm), but in this case the result of the interpretation of a term will not be a value or a type: it will be a pair of a type and a set of equations on types. We write e ⊢ t ↪ A, E to denote the relation between the environment e, the term t, the type A and the set of equations E. The rules mirror the typing rules of Sect. 6.1.1, except that instead of checking equalities between types they record them as equations.

In the application rule, the variable X is an arbitrary variable that does not occur in e, A, B, E and F. In the rules for fun and fix, it is an arbitrary variable that does not occur in e.

Let t be a closed term and let A and E be the type and the set of equations computed by this algorithm, that is, we have ⊢ t ↪ A, E. A substitution σ = B1/X1, ..., Bn/Xn is a solution of E if, for each equation C = D in E, the types σC and σD are identical. We can show that if a substitution σ is a solution of the set E, then the type σA is a type for t in the empty environment. In general, if e ⊢ t ↪ A, E, then for any solution σ of E, σA is a type for t in the environment σe. Conversely, if A' is a type for t in the empty environment, then there exists a substitution σ such that A' = σA and σ is a solution of the set E of equations.
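Here is a hedged OCaml sketch of this first phase, using the term and ty representations introduced above. It returns a type together with a list of equations; fresh type variables come from a counter. This is our reconstruction of the rules, not the book's exact presentation.

```ocaml
let counter = ref 0
let fresh () = incr counter; Tvar ("X" ^ string_of_int !counter)

(* infer e t = (A, E): a type for t in environment e, plus the
   equations E that this type must satisfy. *)
let rec infer (env : (string * ty) list) (t : term) : ty * (ty * ty) list =
  match t with
  | Var x -> (List.assoc x env, [])
  | Nat _ -> (Tnat, [])
  | Fun (x, t1) ->
      let x_ty = fresh () in
      let b, e = infer ((x, x_ty) :: env) t1 in
      (Arrow (x_ty, b), e)
  | App (t1, u1) ->
      let b, e1 = infer env t1 in
      let a, e2 = infer env u1 in
      let x = fresh () in
      (x, (b, Arrow (a, x)) :: e1 @ e2)          (* B = A -> X *)
  | Op (_, t1, u1) ->
      let a, e1 = infer env t1 in
      let b, e2 = infer env u1 in
      (Tnat, (a, Tnat) :: (b, Tnat) :: e1 @ e2)
  | Ifz (t1, u1, v1) ->
      let a, e1 = infer env t1 in
      let b, e2 = infer env u1 in
      let c, e3 = infer env v1 in
      (b, (a, Tnat) :: (b, c) :: e1 @ e2 @ e3)
  | Fix (x, t1) ->
      let x_ty = fresh () in
      let a, e = infer ((x, x_ty) :: env) t1 in
      (x_ty, (x_ty, a) :: e)
  | Let (x, t1, u1) ->
      let a, e1 = infer env t1 in
      let b, e2 = infer ((x, a) :: env) u1 in
      (b, e1 @ e2)
```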
The second part of the algorithm deals with the type equations. The language of types does not have binders: it is a language generated by a constant nat and a symbol -> with two arguments. To solve the type equations, we use Robinson's unification algorithm, which solves equations in an arbitrary language without binders. This algorithm is in some respects similar to Gauss's algorithm for solving systems of equations. It proceeds by a series of transformations, defined as follows

  * if an equation in the system is of the form A -> B = C -> D, it is replaced by the equations A = C and B = D,

  * if an equation in the system is of the form nat = nat, it is removed from the system,

  * if an equation in the system is of the form nat = A -> B or A -> B = nat, the algorithm fails,

  * if an equation in the system is of the form X = X, it is removed from the system,

  * if an equation in the system is of the form X = A or A = X, where X occurs in A and A is different from X, the algorithm fails,

  * if an equation in the system is of the form X = A or A = X, where X does not occur in A and X occurs in other equations of the system, then X is substituted by A in all the other equations of the system.

This algorithm terminates, but the proof is not trivial. If the algorithm fails, then the system does not have a solution. If it terminates without failure, then the final system is of the form X1 = A1, ..., Xn = An, where the Xi are distinct variables that do not occur in the Ai. In this case, the substitution σ = A1/X1, ..., An/Xn is a solution of the initial system. We can prove that this substitution is a principal solution of this system, in other words, for any solution θ of the initial system, there is some substitution η such that θ = η ∘ σ. We write σ = mgu(E) (for most general unifier) for this principal solution.

Let t be a closed term, and let A and E be such that ⊢ t ↪ A, E. Let σ be a principal solution of E. Then the term t has type σA in the empty environment. Moreover, σA is a principal type of t, that is, for any other type B of t, there exists a substitution η such that B = ησA.

### 6.1.3 Hindley's Algorithm with Immediate Resolution

There is a variant of Hindley's algorithm where, instead of waiting until the end of the first phase to start solving the equations, the equations are solved as they are generated. In this case, instead of returning a type and a set of equations, the algorithm returns a type A and a substitution ρ that is a principal solution of the equations. We can also apply the substitution ρ to the type A as it is built.

The algorithm has the following property: if e ⊢ t ↪ A, ρ, then A is a principal type of t in the environment ρe. The rules defining the algorithm are those of the two-phase version, except that they now carry side conditions: the application rule has the side condition σ = mgu(B = ρ′A -> X), the rule for ⊗ the side conditions σ = mgu(A = nat) and σ′ = mgu(B = nat), and the rule for ifz the side conditions σ = mgu(A = nat) and σ′ = mgu(ρ″B = C).

Again, in the application rule X is an arbitrary variable that does not occur in e, A, B, ρ and ρ′, and in the rules for fun and fix, it is a variable that does not occur in e.

Exercise 6.1

Give a principal type for the term fun x -> fun y -> (x (y + 1)) + 2. Describe all of its types.

Give a principal type for the term fun x -> x. Describe all of its types.
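Returning to the unification phase: here is a hedged OCaml sketch of Robinson's algorithm on the ty representation above, written in the common substitution-composing style rather than as literal equation rewriting; unify raises Failure where the rules above fail (a clash, or the occurs check).

```ocaml
(* Apply a substitution (an association list from variable names to types). *)
let rec apply s = function
  | Tnat -> Tnat
  | Arrow (a, b) -> Arrow (apply s a, apply s b)
  | Tvar x -> (try List.assoc x s with Not_found -> Tvar x)

let rec occurs x = function
  | Tnat -> false
  | Arrow (a, b) -> occurs x a || occurs x b
  | Tvar y -> x = y

(* unify returns a most general unifier of a list of equations. *)
let rec unify = function
  | [] -> []
  | (a, b) :: rest ->
      match a, b with
      | Tnat, Tnat -> unify rest
      | Arrow (a1, b1), Arrow (a2, b2) ->
          unify ((a1, a2) :: (b1, b2) :: rest)
      | Tvar x, t | t, Tvar x ->
          if t = Tvar x then unify rest
          else if occurs x t then failwith "no solution (occurs check)"
          else
            (* substitute X by t everywhere else, then compose. *)
            let s = unify (List.map (fun (l, r) ->
                      (apply [(x, t)] l, apply [(x, t)] r)) rest) in
            (x, apply s t) :: s
      | _ -> failwith "no solution (nat vs arrow)"
```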
Exercise 6.2

(Unicity of principal types) A substitution σ is called a renaming if it is an injective map associating a variable to each variable. For example, the substitution y/x, z/y is a renaming. Let A be a type and σ, σ′ two substitutions. Show that if σ′σA = A then σ|FV(A) is a renaming.

Deduce that if A and A' are two principal types of a term t, then there exists a renaming θ, with domain FV(A), such that A' = θA.

Exercise 6.3

In the general case of a language without binders, we can replace the first three rules in Robinson's unification algorithm by the two rules

  * if an equation is of the form f(u1, ..., un) = f(v1, ..., vn), replace it by u1 = v1, ..., un = vn,

  * if an equation is of the form f(u1, ..., un) = g(v1, ..., vp) where f and g are different symbols, fail.

In a language that consists of a symbol + with two arguments and integer constants, does the equation (2 + (3 + X)) = (X + (Y + 2)) have a solution? And the equation X + 2 = 4?

What is the difference between the equations in this language and the equations over integers studied at high school?

Define the high school notion of solution using the small-step operational semantics of PCF. Does the equation X + 2 = 4 have a solution in this case?

## 6.2 Polymorphism

We have seen that the principal type of the term id = fun x -> x is X -> X. This means that the term id has type A -> A for any type A. We could give it a new type ∀X (X -> X) and add a rule stating that if a term t has type ∀X A then it has the type (B/X)A for any type B. A type language that includes a universal quantifier is called polymorphic.

In the system presented in the previous section, the term let id = fun x -> x in id id is not typeable. Indeed, the typing rule for let requires that we type both fun x -> x and id id, and the latter is not typeable because we cannot assign the same type to both occurrences of the variable id. This could be seen as a flaw in the type system, because the term (fun x -> x) (fun x -> x), obtained by replacing id by its definition, is typeable: to type this term it is sufficient to assign the type nat -> nat to the first occurrence of the bound variable x and the type nat to the second.

If we give the type ∀X (X -> X) to the symbol id in the term let id = fun x -> x in id id, we can then use a different type for each occurrence of id in the term id id, and the term becomes typeable.

Typing the term let id = fun x -> x in id id might seem a minor issue, and adding quantifiers to the type language might seem a high price to pay for a marginal increase in power. This impression is wrong, however. In the extension of PCF with lists—see Exercise 3.14—, this feature allows us to write a single sorting algorithm and apply it to all lists, irrespective of the type of their elements: let sort = t in u. Polymorphism entails more code reuse, and therefore more concise programs.

We will therefore give a quantified type to the variables bound in a let, but a standard type to the variables bound in a fun or a fix.

### 6.2.1 PCF with Polymorphic Types

We need to distinguish between types without quantifiers—we will continue to use the word types for these—and quantified types, which we will call type schemes. A scheme has the form ∀X1 ... ∀Xn A where A is a type. We will then define a language with two sorts: a sort for types and a sort for schemes.
Since the sets of terms of each sort are disjoint in a many-sorted language, the set of types cannot be a subset of the set of schemes, and we will need to use a symbol [ ] to inject a type in the sort of the schemes. Thus, if A is a type, [A] will be the scheme consisting of the type A without any quantified variable.

The language of types and schemes is defined by

  * a type constant nat,

  * a type symbol -> with two type arguments, which does not bind any variable in its arguments,

  * a scheme symbol [ ] with one type argument, which does not bind any variable in its argument,

  * a scheme symbol ∀ with one scheme argument, which binds a variable in its argument.

This language includes variables for every sort, in particular scheme variables. However, these variables will not be used.

An environment is now a list associating a scheme to each variable. We define inductively the relation "the term t has the scheme S in the environment e"

$$\frac{}{e \vdash x : S}\ \text{if } e \text{ contains } x : S$$

$$\frac{e \vdash u : [A] \quad e \vdash t : [A \rightarrow B]}{e \vdash t\ u : [B]}$$

$$\frac{(e, x : [A]) \vdash t : [B]}{e \vdash \textsf{fun}\ x \rightarrow t : [A \rightarrow B]}$$

$$\frac{}{e \vdash n : [\textsf{nat}]}$$

$$\frac{e \vdash u : [\textsf{nat}] \quad e \vdash t : [\textsf{nat}]}{e \vdash t \otimes u : [\textsf{nat}]}$$

$$\frac{e \vdash t : [\textsf{nat}] \quad e \vdash u : [A] \quad e \vdash v : [A]}{e \vdash \textsf{ifz}\ t\ \textsf{then}\ u\ \textsf{else}\ v : [A]}$$

$$\frac{(e, x : [A]) \vdash t : [A]}{e \vdash \textsf{fix}\ x\ t : [A]}$$

$$\frac{e \vdash t : S \quad (e, x : S) \vdash u : [B]}{e \vdash \textsf{let}\ x = t\ \textsf{in}\ u : [B]}$$

$$\frac{e \vdash t : S}{e \vdash t : \forall X\ S}\ \text{if } X \text{ does not occur free in } e$$

$$\frac{e \vdash t : \forall X\ S}{e \vdash t : (A/X)S}$$
This inductive definition assigns a scheme to each term, in particular to variables. This is why variables are associated to schemes in the environment. However, when we type a term of the form fun x -> t or fix x t, we type t in an extended environment where the variable x is associated to a scheme [A] without quantifiers. A scheme can be associated to a term t only during the typing of a term of the form let x = t in u, and then this scheme is associated to the variable x.

To introduce quantifiers in the scheme associated to t we use the penultimate rule, which allows us to quantify a variable in the scheme S if the variable does not occur free in e. Thus, in the empty environment, after assigning the scheme [X -> X] to the term fun x -> x we can assign the scheme ∀X [X -> X] to it. Note that in the environment x : [X], after assigning the scheme [X] to the variable x we cannot assign it the scheme ∀X [X].

Finally, note that if we have assigned a quantified scheme to a variable, or to an arbitrary term, we can remove the quantifier and substitute the free variable using the last rule. For example, in the environment x : ∀X [X -> X] we can assign the scheme [nat -> nat] to the variable x.

### 6.2.2 The Algorithm of Damas and Milner

We are now ready to define the inference algorithm. We will solve the equations on the fly, as we did in the second variant of Hindley's algorithm. The algorithm is applied to a term t and an environment e, and it returns a type A and a substitution ρ such that the term t has the scheme [A] in the environment ρe. The only difference with respect to the second variant of Hindley's algorithm is in the first two rules: the rule for a variable x instantiates its scheme, replacing the quantified variables with fresh ones (if e contains x : ∀X1 ... ∀Xn [A] and Y1, ..., Yn are new variables, the type returned is (Y1/X1, ..., Yn/Xn)A), and the rule for let associates to the bound variable the scheme Gen(A, e), obtained by quantifying in [A] all the type variables that are free in [A] but not in e.

We can prove that if t is a closed term, the type A computed by this algorithm is a principal type of t, that is, if ⊢ t : [B] then B is an instance of A.

Exercise 6.4

Consider the extension of PCF with a type symbol list with one argument, which is a type. We write nat list for the type of lists of natural numbers, (nat -> nat) list for the type of lists of functions from natural numbers to natural numbers, and (nat list) list for the type of lists whose elements are lists of natural numbers.

We add the following constructs to the language:

  * a constant nil, of type (A list) for any type A, representing the empty list,

  * cons a l, of type (A list) for any type A such that a has type A and l has type A list, representing the list whose first element is a and whose rest is l,

  * ifnil t then u else v, of type A if t has type B list and u, v are terms of type A, to check whether the list t is empty or not,

  * hd l, of type A if l is of type A list, which returns the first element of the list l,

  * tl l, of type A list if l is of type A list, which returns the list l without its first element.

Write typing rules for this extension of PCF. Write a type checker for this extension of PCF.

Program the function map that associates to a function f and a list t1, ..., tn the list f t1, ..., f tn. What is the type of this function?

Program a sorting algorithm. What is the type of this algorithm?
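A hedged OCaml sketch of the two operations that distinguish this algorithm from Hindley's: instantiating a scheme with fresh variables at each occurrence of a variable, and generalizing a type when typing a let. The scheme representation and the helper names are ours; fresh and apply are the helpers introduced in the earlier sketches.

```ocaml
(* A scheme ∀X1 ... ∀Xn [A]: quantified variable names plus a type. *)
type scheme = { vars : string list; body : ty }

(* Free variables of a type. *)
let rec ftv = function
  | Tnat -> []
  | Tvar x -> [x]
  | Arrow (a, b) -> ftv a @ ftv b

(* Rule for variables: replace each quantified Xi by a fresh Yi. *)
let instantiate (s : scheme) : ty =
  let sub = List.map (fun x -> (x, fresh ())) s.vars in
  apply sub s.body

(* Rule for let: Gen(A, e) quantifies the variables free in A
   but not free in the environment. *)
let generalize (env : (string * scheme) list) (a : ty) : scheme =
  let env_fv = List.concat_map (fun (_, s) ->
    List.filter (fun x -> not (List.mem x s.vars)) (ftv s.body)) env in
  let vs = List.filter (fun x -> not (List.mem x env_fv)) (ftv a) in
  { vars = List.sort_uniq compare vs; body = a }
```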
In the type system described in this chapter, we can use quantified types for variables that are bound in a let. We could try to give a quantified type to variables that are bound in a fun. For example, we could give the type ∀X (X -> X) to the variable x in the term fun x -> x x, which would allow us to type this term. The language obtained in this way is called System F, and was defined by Girard and Reynolds. However, the typing relation of System F is undecidable, as shown by Wells, and we cannot hope to have a type inference algorithm for System F. Similarly, if we allow the variable bound by a fix to be polymorphic, the system becomes undecidable, as shown by Kfoury. Restricting the polymorphic aspects of the system to the let construct can thus be seen as a good compromise: it offers a good level of code reuse while keeping type inference possible.

Gilles Dowek and Jean-Jacques LévyUndergraduate Topics in Computer ScienceIntroduction to the Theory of Programming Languages10.1007/978-0-85729-076-2_7© Springer-Verlag London Limited 2011

# 7. References and Assignment

Gilles Dowek1 and Jean-Jacques Lévy2

(1)

Labo. d'Informatique, École polytechnique, route de Saclay, 91128 Palaiseau, France

(2)

Centre de Recherche Commun, INRIA-Microsoft Research, Parc Orsay Université, 28 rue Jean Rostand, 91893 Orsay Cedex, France

Gilles Dowek (Corresponding author)

Email: gilles.dowek@polytechnique.edu

Jean-Jacques Lévy

Email: jean-jacques.levy@inria.fr

Abstract

This chapter opens the last part of the book, where PCF is extended with new features. This chapter focuses on references and assignment, and thus shifts from functional to imperative programming. A semantics is given for PCF with references, and the interpreter of Chap. 3 is extended.

Consider two numbers: π and the temperature in Paris. Today, the number π has a value between 3.14 and 3.15 and the temperature in Paris is between 16 and 17 degrees. Tomorrow, π will have the same value, but the temperature in Paris will probably change. In Mathematics, numbers are entities that do not change over time: the temperature in Paris is not a number that changes, it is a function that varies over time.

However, formalising the temperature of a system as a function of time is perhaps too general. It does not take into account the fact that the variation in temperature at a given point in time depends, in general, on the temperature at this point, and not on the temperature ten seconds earlier or ten seconds later. In general, a system does not have access to the full temperature function, just to the current value of the function. This is why equations in Physics are generally differential equations, and not arbitrary equations on functions.

In Computer Science, programs also use objects that vary over time. For example, in the program that manages the sale of tickets for a concert, the number of seats available varies over time: it decreases by one each time a ticket is sold. From the mathematical point of view, it is a function of time. However, to know whether it is possible to sell a ticket, or whether booking is no longer possible, the program only needs to know the current value of this function, not the full function: at a certain point t in time, it needs the value of the function at t.

For this reason, when we write such a program, we do not represent the number of places available for the concert as a function, that is, as a term of type nat -> nat—assuming a discrete clock—, which would mean that at each instant t we know the number of seats still available for the concert at each instant t'.
This is clearly impossible, since it requires to know the number of seats available at each instant t' in the future. We cannot express this number by a term of type nat either, because as a number the value of a term of type nat in PCF cannot change over time. We have to introduce another sort of terms for the values that change over time: references, also called variables but we prefer not to use the word variable in this context, since the notion of a reference is very different from the notion of a variable in Mathematics and in functional languages. + +If x is a reference, we can do two things with it, get its current value !x and modify its value x := t, that is, contribute to the construction of the function that we mentioned above, asserting that the value of the function is now, and until further notice, the current value of the term t. + +The issue of equality of "numbers that vary over time" is subtle. We could compare such a number, the temperature in Paris for instance, with a leaf in a tree: small, green and flexible in Spring, it becomes bigger, yellow and brittle in Autumn. There is clearly a change, but we know that it is the same leaf: nobody would believe that the little green leaf disintegrated and suddenly the big yellow leaf appeared ex nihilo. Although there is a transformation, the same leaf remains in the tree from March till October. This is an instance of the old paradox, that something can change while remaining the same. Similarly, the notion of temperature in Paris is always the same, even if the temperature changes over time. On the other hand, we can easily distinguish the temperature in Paris from the temperature in Rome: these are two different things, even if from time to time the temperature is the same in both cities. + +One way to deal with this paradox is to consider the temperature in Paris and the temperature in Rome as functions: a function may take different values at two different points and remain the same function, and two different functions might take the same value at a given point. + +In a program, if x and y are two references and we need to compare them, we should distinguish carefully between their equality as references, that is, whether x and y are the same thing or not—in mathematical terms: whether they are the same function of time—and equality of their contents, that is, whether the numbers !x and !y are the same at a particular point in time. In particular, equality of references implies that if we modify the value of x then the value of y also changes, but this is not the case if they are different references with the same value. + +## 7.1 An Extension of PCF + +We will now extend the language PCF with two new term constructors, written ! and :=. + +The term x := 4 denotes an action: it updates the value associated to the reference x. Compare with the term fact 3, that we have already seen, and which also denotes an action: the computation of the factorial of 3. There is a difference between these two actions: the effect of the computation of the factorial of 3 is a value, whereas the effect of the action x := 4 is a change in the "global state" of the universe. Before this action, the reference x had, for instance, the value 0, and after this action it has the value 4. When we add references to PCF, the interpretation of a term is not just a value, but a value and a new state of the universe. This modification of the state is a side effect of the interpretation of a term. 
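Before looking at the formal rules, here is a hedged OCaml sketch of one way to realise this "global state": a store mapping references to values, threaded through the interpreter. The names (store, alloc, deref, assign) and the minimal value type are ours; they preview the state m that the semantics below threads through its judgements.

```ocaml
(* References are just identifiers; the store maps them to values. *)
type reference = int
type value = VNat of int | VRef of reference
type store = (reference * value) list

(* Allocate a fresh reference r, initialised to v: the side effect of ref t. *)
let alloc (v : value) (m : store) : reference * store =
  let r = 1 + List.fold_left (fun a (r, _) -> max a r) 0 m in
  (r, (r, v) :: m)

(* !t: read the current value of a reference. *)
let deref (r : reference) (m : store) : value = List.assoc r m

(* t := u: the value of r is now, and until further notice, v. *)
let assign (r : reference) (v : value) (m : store) : store =
  (r, v) :: List.remove_assoc r m

let () =
  let r, m1 = alloc (VNat 0) [] in
  let m2 = assign r (VNat 4) m1 in
  assert (deref r m2 = VNat 4)
```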
+ +The formal semantics of references in PCF defines the global state as a function from a finite set R to the set of values of PCF terms. The elements of the set R are called references. In the native programming language of a computer, its machine language, the set of references is fixed: it is the set of memory addresses of the computer. In other languages, the set R is arbitrary. In particular, when we define the semantics of a language, we do not distinguish between sets R and R' of the same cardinality (i.e., with the same number of elements). This means that programmers cannot know the exact set of memory addresses used to store the data. + +In PCF, as well as in most programming languages, the values associated to references may change over time. Moreover, the set R itself may vary over time: it is possible to create a reference during the execution of the program. To do this, the language includes a construct ref. The side effect associated to the interpretation of the term ref t is the creation of a new reference whose initial value is the current value of the term t. The value computed by this interpretation is the reference itself. + +Since the interpretation of the term ref t produces a value which is a reference, it is clear that references must be values in this extension of PCF. + +## 7.2 Semantics of PCF with References + +In the big-step operational semantics of this extension of PCF, the relation is of the form e, m ⊢ t ↪ V, m' where t is the term to be interpreted, e the environment where it will be interpreted, m the global state in which the interpretation will take place, V the value produced by the interpretation, and m' the new global state produced by the interpretation. + +We can now give rules for the three new constructs, ref, ! and := + +if r is any reference not occurring in m' + +The construction t; u whose semantics is obtained by interpreting t, throwing away the value obtained, then interpreting u, is not very interesting in a language without side effects, because in that case the value of the term t; u is always the same as the value of u, assuming t terminates. We can now add it to PCF + +We can also add now constructions whilez, for,... which were of no interest in a language without side effects. + +Exercise 7.1 + +Write an interpreter for the language PCF with references. + +The uncertainty that we mentioned at the beginning of the book regarding the evaluation of nested functions is finally elucidated. + +Exercise 7.2 + +Consider the term + +What is the value of this term? In which order will the arguments be interpreted in PCF? Why? + +Modify the rules given above to obtain the value 2 instead of the value 9 for this term. + +In Sect. 2.5 we remarked: "In the case of an application...". What do you think of this remark? + +What is the value of this term in Caml? + +Consider the following Java program + +What is the value of this term? + +In which order does Caml interpret its arguments? and Java? + +Exercise 7.3 + +Is the value of the term + +10 or 11? Compare with the answer for Exercise 2.8. + +Exercise 7.4 + +Give the big-step operational semantics of the construction whilez. What is the value of the term given below? + +Exercise 7.5 + +(The quirks of references under call by name) Consider the rules given above to define the big-step semantics of references. Do they follow a call by name or a call by value strategy? Give a similar rule for application under call by name, but keep the let in call by value. 
What is the value of the term let n = ref 0 in ((fun x -> x + x) (n := !n + 1; 4)); !n in call by value? And in call by name? What is the value of the term let n = ref 0 in ((fun x -> 2 * x) (n := !n + 1; 4)); !n in call by value? And in call by name?

Exercise 7.6

(Typing references) To type terms in the extension of PCF with references, we extend the language of types with a symbol ref, so that nat ref, for instance, is the type of references to a natural number. Thus, if t is a term of type A ref then !t is a term of type A.

Extend the typing rules given in Sect. 5.1 in order to type the language PCF with references.

Write a type-checking program for PCF with references.

The combination of references and polymorphism is subtle; we will not attempt to mix them in this exercise.

Exercise 7.7

(From imperative to functional programs) Consider a term t defining a function from natural numbers to natural numbers, with p arguments and a free variable n of type nat ref. We associate to this term a function with p + 1 arguments that returns a pair of natural numbers—see Exercise 3.13—such that the image of a1, ..., ap, m is the pair of natural numbers consisting of the value of the term let n = ref m in (t a1 ... ap) and the value of the term !n at the end of the interpretation. Which function will be associated to the term

  * fun z -> (n := !n + z; !n)?

And to the term

  * (fun z -> (n := !n + z; !n)) 7?

And to the term

  * (fun x -> fun y -> x) ((fun z -> (n := !n + z; !n)) 2) ((fun z -> (n := !n + z; !n)) 7)?

Is it possible to program these functions in PCF without references?

More generally,

  * which function is associated to the term fun y1 -> ... -> fun yp -> 2?

  * And to the term fun y1 -> ... -> fun yp -> y1?

  * And to the term fun y1 -> ... -> fun yp -> !n?

  * If t is a term of type nat and f is the function associated to the term fun y1 -> ... -> fun yp -> t, which function is associated to fun y1 -> ... -> fun yp -> n := t?

  * If t and u are terms of type nat, and f and g are the functions associated to the terms fun y1 -> ... -> fun yp -> t and fun y1 -> ... -> fun yp -> u, which function is associated to fun y1 -> ... -> fun yp -> (t + u)?

  * If t and u are terms of type nat, and f and g are the functions associated to the terms fun y1 -> ... -> fun yp -> t and fun y1 -> ... -> fun yp -> u, which function is associated to fun y1 -> ... -> fun yp -> (t; u)?

  * If t is a term of type nat -> ... -> nat -> nat—with q arguments of type nat—, u1, ..., uq are terms of type nat, and f, g1, ..., gq are the functions associated to the terms fun y1 -> ... -> fun yp -> t and fun y1 -> ... -> fun yp -> u1, ..., fun y1 -> ... -> fun yp -> uq, which function is associated to fun y1 -> ... -> fun yp -> (t u1 ... uq)?

Is it possible to program these functions in PCF without references?

Write a program to transform a PCF term containing these symbols and a free variable of type nat ref into a program without it and with the same semantics.

Exercise 7.8

(For those who prefer to write x := x + 1 instead of x := !x + 1) Consider now a finite set of references, and let us extend PCF by introducing a constant for each of these references. These references will be called mutable variables. The symbol := now applies to a mutable variable and a term, written X := t.

If X is a mutable variable, the value that the operational semantics associates to the term X is the value associated to the reference X in the state available at the time of interpretation.
+ +Give a big-step operational semantics for this extension of PCF. + +Write an interpreter for this extension of PCF. + +Exercise 7.9 + +(A minimal imperative language) Consider a language including integer constants, arithmetic operations, mutable variables—see Exercise 7.8—, assignment :=, sequence ;, a conditional ifz and a whilez loop (but without the usual notion of variable, fun, fix, let or application). + +Give rules to define the operational semantics of this language. Write an interpreter for this language. Write a program to compute factorial in this language. What can we program in this language? + +To conclude this chapter, we remark that in most programming languages there are two different ways to program the factorial function. For example, in Java, we can program it recursively + +or iteratively + +Should we prefer the first version or the second? + +Of course, the theory of programming languages does not give us an answer to "moral" questions of the form "Should we...?" We could nevertheless say a few words about the way this question has evolved. + +In the first programming languages—machine languages, assembly languages, Fortran, Basic,...—only the second version could be programmed. Indeed, a program with loops and references is easier to execute in a machine that is itself, in fine, a physical system with a mutable state, than a program that requires evaluating a function defined via a fixed point. + +Lisp was one of the first languages to promote the use of recursive definitions. With Lisp, for the first time, programs did away with references and side effects, and this simplified the semantics of the language, brought it close to mathematical language, allowed programmers to reason over programs in an easier way, and facilitated the task of writing complex programs. For example, it is much easier to write a program to compute the derivative of an algebraic expression using recursion than keeping track of a stack of expressions that are waiting to be treated. It was then natural to contrast the pure functional style of programming with the "impure" imperative one. + +But the first implementations of functional languages were very slow in comparison with those of imperative languages, precisely because, as we have said, it is more difficult to execute a functional program on a machine, which is a physical system, than it is to execute an imperative program. During the 1990's, the compilation techniques for functional languages made such a huge progress that efficiency is no longer a valid argument against functional programming today, except in the domain of intensive computation. + +Moreover, all modern languages include both functional and imperative features, which means that today the only valid argument to justify the choice of a particular style should be its simplicity and ease of use. + +From this point of view, it is clear that not all problems are identical. A program that computes derivatives for functional expressions is easier to express in functional style. In contrast, when we program the Logo turtle it is more natural to talk about the position of the turtle, its orientation,...—that is, its state at a given instant. It is also natural to talk about the actions that the turtle does: to move, to write a line,..., and it is not easy to express all this in a functional way: in fact, it is not natural to think of the turtle's actions as functions over the space of drawings. 
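The two styles contrasted in this chapter's closing discussion can be seen side by side on the factorial example (the Java versions mentioned above are not reproduced here). This is a hedged OCaml sketch: one definition by a fixed point, one by a loop over a reference.

```ocaml
(* Functional style: factorial as a recursive (fixed point) definition. *)
let rec fact_rec n = if n = 0 then 1 else n * fact_rec (n - 1)

(* Imperative style: factorial with a reference and a loop. *)
let fact_iter n =
  let r = ref 1 in
  for i = 1 to n do
    r := !r * i
  done;
  !r

let () = assert (fact_rec 5 = 120 && fact_iter 5 = 120)
```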
+ +There is still one point that remains mysterious: programs, whether functional or imperative, are always functions from inputs to outputs. If imperative programming brought us new ways of defining functions, which in certain cases are more practical from a Computer Science point of view than the mathematical definitions that are typical of functional languages, we could wonder whether they would also be more practical for mathematicians. However, so far the mathematical language has not adopted the notion of reference. +Gilles Dowek and Jean-Jacques LévyUndergraduate Topics in Computer ScienceIntroduction to the Theory of Programming Languages10.1007/978-0-85729-076-2_8© Springer-Verlag London Limited 2011 + +# 8. Records and Objects + +Gilles Dowek1 and Jean-Jacques Lévy2 + +(1) + +Labo. d'Informatique, École polytechnique, route de Saclay, 91128 Palaiseau, France + +(2) + +Centre de Recherche Commun, INRIA-Microsoft Research, Parc Orsay Université, 28 rue Jean Rostand, 91893 Orsay Cedex, France + +Gilles Dowek (Corresponding author) + +Email: gilles.dowek@polytechnique.edu + +Jean-Jacques Lévy + +Email: jean-jacques.levy@inria.fr + +Abstract + +The final chapter of the book is dedicated to object oriented programming languages. An extension of PCF with objects is defined and implemented. But before that, an extension of PCF with records is. Then objects are introduced as records with functional fields. + +## 8.1 Records + +In the equations describing the movement of two bodies that exert a force on each other, for example, a star and a planet, their positions are represented by three coordinates (functions of time). This leads to a system of differential equations with six variables. However, instead of "flattening" them, we can group them in two packages of three variables each, obtaining a system of differential equations with vector variables. There are mathematical tools to pack several values into one: the notion of a pair, which can be iterated to build tuples, and the notion of a finite sequence. + +In programming languages we also need tools to pack several values into one. The tools that we have for this are the notion of a pair, the notion of an array, the notion of a record, the notion of an object and the notion of a module. The components of those structures are called fields. + +### 8.1.1 Labelled Fields + +To represent the position of an object on Earth by latitude, longitude and altitude, we can use a tuple with three components: the first one is the latitude of the object, the second its longitude and the third its altitude. If we decide that the tuple (a,b,c) is the pair (a,(b,c)), then the element in the left-hand side is the latitude, the one in the left-hand side of the right-hand side component is its longitude and the one on the right of the right-hand side component is its altitude. There are several other combinations, and our choice here is clearly arbitrary. + +If instead we decide that the tuple (a,b,c) is represented by a function from {0,1,2} to ℝ that associates a to 0, b to 1 and c to 2, then the latitude of the object is the real number associated by this function to 0, its longitude is the number associated to 1 and its altitude is the number associated to 2. Again, there are other alternatives, and our choice is arbitrary. + +There is no reason to place these values in a specific position in the tuple, or to associate them with one number rather than another. 
Moreover, if in a program we need to change the data structure to add another field, we will have to update the program in several places. These modifications are likely to introduce errors, and we might end up confusing longitude and temperature...

Since it is more convenient for programmers to identify the fields by a name—"latitude", "longitude",...—instead of a position or a number, programming languages offer this possibility. This leads to a notion of tuple with labelled fields, called a record. From a mathematical point of view, a record is a function whose domain is an arbitrary finite set (rather than an initial segment of ℕ), and the elements of this domain are the labels of the record.

The idea of referring to the fields by name instead of by their position in the tuple can also be used in the context of a function call. In some experimental languages, instead of writing f(4,2) we write f(abscissa = 4, ordinate = 2) or, equivalently, f(ordinate = 2, abscissa = 4).

### 8.1.2 An Extension of PCF with Records

To extend PCF with records, we add three symbols to the language: a symbol {} to build records, a symbol . to access a field in a record, and a symbol <- to build a new record, identical to one previously constructed except for the value of one field.

Before introducing these symbols we need to introduce a new sort for labels and an infinite set of constants, one for each label. Notice that there is no symbol that binds a variable of sort label, so there will be no such variables in a closed term. Moreover, the language does not include any other symbol to build terms of sort label, just the constants. Therefore, in a closed term the only subterms of sort label are constants. We can then add to PCF

  * a symbol {} with 2n arguments that does not bind any variables; the arguments at odd positions are labels and the ones at even positions are terms,

  * a symbol . with two arguments, where the first is a term and the second a label, which does not bind any variable,

  * a symbol <- with three arguments, where the first is a term, the second a label and the third a term, which does not bind any variable.

Exercise 8.1

In the definition of language that we gave in Chap. 1, each symbol has a fixed number of arguments. We cannot have, then, a symbol like {} which may have, for instance, 6 or 8 arguments. How could we fix the definition given above to make it compatible with the notion of language defined in Chap. 1? Hint: What is a list?

The term {}(l1, t1, ..., ln, tn) will be written {l1 = t1, ..., ln = tn}, the term .(t, l) will be written t.l and the term <-(t, l, u) will be written t(l <- u).

The small-step operational semantics of PCF will now include the following rules

Similarly, the big-step operational semantics is extended with the following rules

Notice that in these rules the terms of sort label are not interpreted. This is because, as mentioned above, these terms are constants.

Exercise 8.2

Write an interpreter for PCF with records.

Exercise 8.3

The goal of this exercise is to represent a Logo turtle by a record containing an abscissa, an ordinate, and an angle. The turtle should have an internal state so that it can move without changing its identity—see the introduction to Chap. 7. There are two alternatives: the turtle can be defined as a record of references to real numbers, or as a reference to a record of real numbers. Write the function move-forward in both cases. In this exercise we assume that there is a type of real numbers and all the necessary operations.
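For Exercise 8.2, here is a hedged OCaml sketch of the record operations in the call by value style: a record value maps labels to already-interpreted values. The representation by association lists and the names (rvalue, access, update) are our choices.

```ocaml
(* A record value: fields are already interpreted (call by value). *)
type rvalue = RNat of int | Record of (string * rvalue) list

(* {l1 = V1, ..., ln = Vn}.li: access a field. *)
let access (r : rvalue) (l : string) : rvalue =
  match r with
  | Record fields -> List.assoc l fields
  | RNat _ -> failwith "not a record"

(* t(l <- u): a new record, identical except for the field l. *)
let update (r : rvalue) (l : string) (v : rvalue) : rvalue =
  match r with
  | Record fields -> Record ((l, v) :: List.remove_assoc l fields)
  | RNat _ -> failwith "not a record"

let () =
  let p = Record [("a", RNat 7); ("b", RNat 2)] in
  assert (access p "a" = RNat 7);
  assert (access (update p "a" (RNat 9)) "a" = RNat 9)
```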
In the big-step operational semantics that we gave for PCF with records, the interpretation of the term {a = 3 + 4, b = 2} requires performing the addition of 3 and 4. In contrast, once the value {a = 7, b = 2} is built, an access to the field a does not require any arithmetic operation.

An alternative would be to delay the addition and consider the term {a = 3 + 4, b = 2} as a value that is interpreted as itself. In this case, we will need to interpret the term 3 + 4 each time the field a is accessed. We could say that this semantics is a call by name one, as opposed to the semantics we gave above, which follows the call by value strategy.

In call by name, the rules of the operational semantics are

Exercise 8.4

Write an interpreter for PCF with records following the call by name semantics.

If we compare these two semantics of records, we are led to make the same comments as for the semantics of functions in call by value vs. call by name: the interpretation of let x = {a = fact 10, b = 4} in x.b requires the computation of the factorial of 10 in call by value, but not in call by name. On the other hand, the interpretation of let x = {a = fact 10, b = 4} in x.a + x.a under call by name triggers the computation of the factorial of 10 twice. The interpretation of let x = {a = fix y y, b = 4} in x.b produces an infinite loop under call by value, whereas it successfully returns 4 under call by name. Finally, when we also have references, the side effects of the interpretation of a field may be repeated several times if we access the field several times—see Exercise 7.5.

For example, if we build a record x with a field a that is a reference to a natural number, initially 0, and a function inc that increases this number by one, and then we write a term that increases this value and returns it, we obtain

Under call by value, this term produces the result 1, as one expects. However, a call by name interpretation will access the field a of the record x three times, that is, it will interpret the term ref 0 three times, creating three references that point to the value 0. The third reference, created by the interpretation of the term !(x.a), is never updated, and therefore the interpretation of the program above under call by name produces the result 0.

To make sure that the call by value and the call by name interpretations produce the same result, we should avoid side effects—such as the creation of a reference in the example above—during the interpretation of fields. We can rewrite the term as follows

which guarantees that the value will be 1, whether in call by value or call by name.

Exercise 8.5

(Types for records) Consider a type person for records with three fields: surname, name and telephone. Show that we can program the three functions x(surname <- y), x(name <- y) and x(telephone <- y) without using the symbol <-, which means that this symbol is superfluous.

Would this symbol still be superfluous if we had a type contactable including all the records which contain at least the field telephone?

If we have a type person and a type contactable, do we still have unicity of types?

## 8.2 Objects

Programs usually deal with various kinds of data, often structured as records. For example, a company's computer system might deal with order forms from customers, invoices, pay slips....
A customer order might be represented as a record including the identification of the object ordered, the quantity requested... To print the data there are several alternatives. We could write a unique function print that starts by checking which kind of data we want to print—order form, pay slip...—and then prints it in a different format depending on the kind of data. Or we could write several functions: print_order_form, print_pay_slip... Alternatively, we could define a record print where each field is a printing function. Yet another option would be to make each printing function a part of the type. Such a data type is called a class, and its elements are called objects. + +In the most radical object-oriented programming style, each object, for instance, each order form, includes a different function print. An order form is then a record that contains, in addition to the standard fields—identification of the item requested, number of items ordered,...—a field print defining the printing function that should be used to print the object. + +Some languages, for instance Java, associate a print function to each class rather than each object. Thus, all the objects in the class share the printing function—whether static or dynamic. If we do not want to share the printing function for two objects t and u in the same class C, we need to define two sub-classes T and U of C, which inherit all the fields of C but redefine print differently. + +### 8.2.1 Methods and Functional Fields + +An object is simply a record where some fields are functions. In Java, where functions are not first-class objects, we must distinguish the fields that are functions from those that are not; the functional ones are called methods. + +In a language where functions are first-class objects, like PCF, this distinction is not necessary. Objects are then simply records, and we can program in an object-oriented style in the extension of PCF with records defined previously in this chapter. + +Exercise 8.6 + +The program that manages the sale of tickets for a concert is an object with the following fields + + * a reference to a natural number: the number of orchestra seats available, + + * a reference to a natural number: the number of balcony seats available, + + * a function that takes an object and a natural number as arguments—0 for orchestra and 1 for balcony—and returns the number 0 or the number 1 to indicate whether the booking is closed or there are still seats in that area, + + * a function that takes an object and a natural number as arguments—0 for orchestra and 1 for balcony—, and reserves a seat by decreasing the number of seats available in that area; by convention it returns the value 0. + +Program this object in PCF with records. + +Typing systems for records and objects are out of the scope of this book. We will only say that if we give type A to the object defined in Exercise 8.6, then A must be the Cartesian product of nat ref, nat ref, A -> nat -> nat and A -> nat -> nat. We cannot define the type A as (nat ref) × (nat ref) × (A -> nat -> nat) × (A -> nat -> nat), because this is a circular definition. To define this type, we need to introduce a fixed point operator on types. + +If X -> Y denotes the space of functions from X to Y and B is a set with at least two elements, then the recursive equation A = (A -> B) does not have a solution. Indeed, it follows from Cantor's theorem that the cardinal of the set A -> B is strictly greater than that of A. 
The equation A = (nat ref) × (nat ref) × (A -> nat -> nat) × (A -> nat -> nat) does not have a solution either. As with the construction fix in PCF, it is not trivial to give a denotational semantics for the fixed point operator on types.

### 8.2.2 What Is "Self"?

If t is the object built in Exercise 8.6, to know whether the booking is closed or there are still orchestra tickets, we need to interpret the term t.free t 0. Indeed, the function t.free takes an object u and a natural number n and indicates whether the field associated to n in u—orchestra if n = 0, balcony if n = 1—is zero or not. In other words, the method free is static, as defined for example in Java.

We now want the method free of the object t to apply to the object t itself, that is, we want to invoke it by interpreting the term t#free 0 instead of t.free t 0. In other words, we want this method to be dynamic.

One way to achieve this is to consider the term t#l as an abbreviation for t.l t. The difficulty here is that if t is an object and l a label in this object, we can only use the term t#l if the field l is a function of type A -> ... where A is the type of t itself. In other words, we can only use the term t#l if l is the label of a method. If l is the label of a field that is not a method, we still need to write t.l.

To avoid this distinction, we can state that all fields are functions. If a field a of an object t has the value 3, we transform it into a field with the functional value fun s -> 3. Thus, the term t#a, that is, t.a t, or (fun s -> 3) t, is interpreted as the value 3.

The first argument of each method in the object is then a bound variable, which is usually called self or this. In fact, most programming languages use a special variable self or this, which is implicitly bound in the object, and which denotes the object itself.

When all the methods in a record are terms of the form fun x -> ..., they can be interpreted as themselves, and we can simplify the rule

by using

Similarly, the rule

specialises to

and finally the rule

can be replaced by

To force all fields to be functions, we can modify the language of records, passing from a record language to an object-oriented language. The symbol {} now binds a variable in each even argument—the terms—, the symbol . is replaced by the symbol #, and the symbol <- now binds a variable in its third argument.

The term {}(l1, s1 t1, ..., ln, sn tn) is written {l1 = ς s1 t1, ..., ln = ς sn tn}, the term #(t, l) is written t#l and the term <-(t, l, s u) is written t(l <- ς s u). The rules of the big-step operational semantics are now

Exercise 8.7

Write an interpreter for the language PCF with objects.

Exercise 8.8

(Late binding) Consider the term

Is the value of this term 10 or 11? Compare this result with that of Exercise 2.8.

### 8.2.3 Objects and References

The standard definition of an object includes a notion of internal state, which evolves over time. Thus, it combines the notions of object and reference, which are clearly separate in the definition of functional object given above.

In a language with objects and references, when a non-functional field a = u is transformed into a = fun x -> u, the interpretation of fun x -> u does not produce the side effects produced by the interpretation of u. It is only when we access the field that the side effects become visible. Thus, the behaviour is similar to that of records under call by name. The term

is interpreted as the value 0, and not 1 as the term

in call by value. We need to rewrite this term as follows

if we want the interpretation to be the value 1.
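To make the encoding t#l = t.l t concrete, here is a hedged OCaml sketch of an object as a record whose fields are all functions of the object itself ("self"). The obj type and the send helpers are ours, loosely modelled on the booking object of Exercise 8.6.

```ocaml
(* An object whose fields are all functions of self. *)
type obj = {
  seats : obj -> int;         (* a "non-method" field, also a function of self *)
  free : obj -> int -> int;   (* a method: free self n *)
}

(* t#l is the abbreviation for t.l t. *)
let send_seats t = t.seats t
let send_free t n = t.free t n

let o = {
  seats = (fun _ -> 3);
  (* free consults the object through self, hence late binding. *)
  free = (fun self n -> if send_seats self > n then 1 else 0);
}

let () = assert (send_free o 0 = 1)
```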
We need to rewrite this term as follows [displayed term omitted in this extraction] if we want the interpretation to be the value 1.

Exercise 8.9

When we interpret a term of the form t#l, how many times is the term t interpreted? If the interpretation of t includes side effects, how many times will they take place? How can we force the term t to be interpreted only once?

Gilles Dowek and Jean-Jacques Lévy, Undergraduate Topics in Computer Science: Introduction to the Theory of Programming Languages, DOI 10.1007/978-0-85729-076-2_9, © Springer-Verlag London Limited 2011

# 9. Epilogue

Gilles Dowek1 and Jean-Jacques Lévy2

(1) Labo. d'Informatique, École polytechnique, route de Saclay, 91128 Palaiseau, France

(2) Centre de Recherche Commun, INRIA-Microsoft Research, Parc Orsay Université, 28 rue Jean Rostand, 91893 Orsay Cedex, France

Gilles Dowek (Corresponding author) Email: gilles.dowek@polytechnique.edu

Jean-Jacques Lévy Email: jean-jacques.levy@inria.fr

Abstract

The Epilogue discusses the goals of the theory of programming languages: is its purpose to build tools that describe existing languages, or to define new ones?

The first goal of this book was to present the main tools used to define the semantics of a programming language: small-step operational semantics, big-step operational semantics, and denotational semantics.

We have stressed the fact that these three tools have the same purpose. In the three cases, the goal is to define a relation ↪ between a program, an input value and an output value. Since the goal is to define a relation, the question that arises naturally is: how do we define relations in mathematical language?

The answer is the same in the three cases: the means to achieve the goal is the fixed point theorem. However, the similarity is superficial, because the fixed point theorems are used in different ways in the three semantics. By giving rise to inductive definitions, and hence to reflexive-transitive closures, the fixed point theorem plays a major rôle in operational semantics. In contrast, it plays a minor rôle in denotational semantics, because it is only used to give the meaning of the construction fix. The denotational semantics of a language without a fixed point construction, such as Gödel's System T—see Exercise 5.13—can be defined without using the fixed point theorem.

To highlight the differences, we can look at the rôle of derivations. To establish that a term t has the value V in operational semantics, it is sufficient to exhibit a derivation or a sequence of reductions, that is, a finite object. In contrast, in denotational semantics the meaning of a term of the form fix is given as the least fixed point of a function, that is, a limit. For this reason, to establish that the value of a term t is V we sometimes need to compute the limit of a sequence, that is, we sometimes need to deal with an infinite object.

Operational semantics have an advantage over denotational ones, because the relation ↪ can be defined in a more concrete way operationally. But on the other hand, operationally we can only define relations that are recursively enumerable, whereas denotationally we can define arbitrary relations. For this reason, in operational semantics we cannot complete the definition of the relation ↪ by adding a value ⊥ for the terms that do not terminate, because the resulting relation is not recursively enumerable; it cannot be effectively defined by induction. In contrast, denotationally it is not a problem to add such a value.
We see here the dilemma that arises from the undecidability of the halting problem: we cannot complete the relation ↪ by adding ⊥ for the non-terminating terms and at the same time define it inductively. We have to choose between completing the relation and defining it inductively, which leads to two different semantics. Readers who have taken logic courses will recognise here the same issues that distinguish the truth judgements that are inductively defined, by the existence of a proof, from those that are defined by their validity in a model.

The second goal of this book was to give the semantics of some programming language features: explicit definitions of functions, functions defined by fixed points, assignment, records, objects.... Here again, since the goal is to define functions, it is useful to start by looking at the ways in which functions are defined in Mathematics. In general, the comparison between the mathematical language and programming languages is fruitful, since the mathematical language is the closest thing we have to a programming language. This comparison shows some common points, but also some differences.

The purpose of the study of programming language features is not to be exhaustive, but to show some informative examples. The point to remember is that, in the same way that Zoology is not the study of all the animal species one after the other, the study of programming languages should not consist of studying all languages one after the other. Languages should be organised according to their main features.

We could continue this study by defining data types and exceptions. The study of data types would give us the opportunity to use the fixed point theorem again, as well as Robinson's unification algorithm, of which matching is a particular case. Going forward in this direction, we could study the notion of backtracking, which leads to Prolog. Other important points that we have left aside are the polymorphic typing of references, the notion of array, imperative objects, modules, type systems for records and objects (in particular the notion of sub-type), concurrency....

The final goal of this book was to present a number of applications of these tools, in particular the design and implementation of interpreters and compilers, and also the implementation of type inference systems. The main point here is that the structure of a compiler is derived directly from the operational semantics of the language to be compiled. The next step would be the study of implementation techniques for abstract machines, which would lead us to the study of memory management and garbage collection. We could also study program analysis, and design systems to deduce properties of programs in an automatic or interactive way, for instance, the property that states that the value returned by a sorting algorithm is a sorted list.

The last point that remains to be discussed is the rôle of the theory of programming languages, and in particular whether its purpose is to describe the existing programming languages or to propose new ones.

Astronomers study the galaxies that exist, and do not build new ones, whereas chemists study the existing molecules and build new ones. We know that in the latter case, the order in which theories and production techniques appear may vary: the transformation of mass into energy was achieved a long time after the theory of relativity, whereas the steam engine appeared before the principles of thermodynamics were established.
The theory of programming languages has enabled the development of new features, such as static binding, type inference, polymorphic types, garbage collection, ..., which are now available in commercial languages. In contrast, other features, such as assignments and objects, were introduced into programming languages in an ad hoc way, and the theory has been slow to follow. The development of a formal semantics for these constructs led in turn to new proposals, such as the recent extensions of Java with polymorphic types.

The theory of programming languages has neither an exclusively descriptive rôle nor an exclusively leading rôle. It is this going backwards and forwards between the description of existing features and the design of new ones that gives the theory of programming languages its dynamism.

Gilles Dowek and Jean-Jacques Lévy, Undergraduate Topics in Computer Science: Introduction to the Theory of Programming Languages, DOI 10.1007/978-0-85729-076-2, © Springer-Verlag London Limited 2011

References

1. Abadi, M., Cardelli, L.: A Theory of Objects. Springer, Berlin (1998)

2. Dybvig, R.K.: The Scheme Programming Language, 2nd edn. Prentice Hall, New York (1996). www.scheme.com/tspl2d/

3. Gunter, C.A.: Semantics of Programming Languages: Structures and Techniques. MIT Press, Cambridge (1992)

4. Kahn, G.: Natural semantics. In: Proceedings of the Symposium on Theoretical Aspects of Computer Science, STACS, Passau (1987)

5. Mitchell, J.C.: Foundations for Programming Languages. MIT Press, Cambridge (1996)

6. Mitchell, J.C.: Concepts in Programming Languages. Cambridge University Press, Cambridge (2002)

7. Peyton Jones, S., Lester, D.: Implementing Functional Languages: A Tutorial. Prentice Hall, New York (1992)

8. Pierce, B.C.: Types and Programming Languages. MIT Press, Cambridge (2002)

9. Plotkin, G.D.: LCF considered as a programming language. Theor. Comput. Sci. 5, 223–255 (1977)

10. Plotkin, G.D.: A structural approach to operational semantics. Technical Report DAIMI FN-19, Computer Science Department, Aarhus University, Aarhus, Denmark, September 1981

11. Reynolds, J.C.: Theories of Programming Languages. Cambridge University Press, Cambridge (1998)

12. Scott, D.: Continuous Lattices. Lecture Notes in Math., vol. 274, pp. 97–136. Springer, Berlin (1972)

13. Weis, P., Leroy, X.: Le langage Caml, 2nd edn. Dunod, Paris (1999)

14. Winskel, G.: The Formal Semantics of Programming Languages. MIT Press, Cambridge (1993)
Index

A

Abstract machine

Algorithm

Damas and Milner

Hindley's

Robinson's

α-equivalence

Alphabetic equivalence

Arity

Array

B

β-reduction

Binding

dynamic

late

static

C

Call by name

Call by value

Church numeral

Closed set

Closure

recursive

Compiler

bootstrapping

Composition

Confluence

Constant

Continuous function

D

De Bruijn index

Definition

explicit

inductive

Derivation

Deterministic

E

Environment

semantic

typing

Evaluate

Evaluator

F

Fields

Fixed point

Curry

first theorem

function construction via

in PCF

second theorem

Functionalisation

H

Height

I

Interpreter

L

Label

Language

Limit

List

M

Method

dynamic

static

Module

N

Number of arguments

O

Object

Ordering

Scott's

strongly complete

weakly complete

P

Pair

PCF (Programming language for computable functions)

Polymorphism

Position numerals

R

Record

in call by name

in call by value

Redex

Reduction

call by name

call by value

lazy

weak

Reference

Register

accumulator

code

environment

stack

Renaming

Result

Rule

S

Semantics

big-step operational

denotational

small-step operational

Side effect

Solution

principal

Sort

Strategy

Subject reduction

Substitution

System

F

T

T

Term

closed

irreducible

stuck

Thunk

Tree

Type

checking

inference

principal

Type preservation

by interpretation

Type scheme

U

Unification

V

Value

extended

rational

Variable

capture

environment

mutable

diff --git a/kag/examples/csqa/builder/data/joe_celko_s_sql_programming_style.txt b/kag/examples/csqa/builder/data/joe_celko_s_sql_programming_style.txt
new file mode 100644
index 00000000..344c34c9
--- /dev/null
+++ b/kag/examples/csqa/builder/data/joe_celko_s_sql_programming_style.txt
@@ -0,0 +1,4811 @@

Joe Celko's SQL Programming Style

Joe Celko's SQL Programming Style

Joe Celko

Morgan Kaufmann

Publishing Director: Michael Forster
Publisher: Diane Cerra
Publishing Services Manager: Andre Cuello
Senior Production Editor: George Morrison
Editorial Assistant: Asma Stephan
Cover Design: Side by Side Studios
Cover Image: Side by Side Studios
Composition: Multiscience Press, Inc.
Copyeditor: Multiscience Press, Inc.
Proofreader: Multiscience Press, Inc.
Indexer: Multiscience Press, Inc.
Interior printer: The Maple-Vail Book Manufacturing Group
Cover printer: Phoenix Color Corp.

Morgan Kaufmann Publishers is an imprint of Elsevier. 500 Sansome Street, Suite 400, San Francisco, CA 94111

This book is printed on acid-free paper.

Designations used by companies to distinguish their products are often claimed as trademarks or registered trademarks. In all instances in which Morgan Kaufmann Publishers is aware of a claim, the product names appear in initial capital or all capital letters. Readers, however, should contact the appropriate companies for more complete information regarding trademarks and registration.

No part of this publication may be reproduced, stored in a retrieval system, or transmitted in any form or by any means, electronic, mechanical, photocopying, scanning, or otherwise, without prior written permission of the publisher.
Permissions may be sought directly from Elsevier's Science & Technology Rights Department in Oxford, UK: phone: (+44) 1865 843830, fax: (+44) 1865 853333, e-mail: permissions@elsevier.com.uk. You may also complete your request on-line via the Elsevier homepage () by selecting "Customer Support" and then "Obtaining Permissions".

Library of Congress Cataloging-in-Publication Data

Application submitted.

ISBN: 0-12-088797-5

For information on all Morgan Kaufmann publications, visit our Web site at www.mkp.com or www.books.elsevier.com

Printed in the United States of America

05 06 07 08 5 4 3 2 1

To Eve Astrid Andersson, Miss American Π and April Wilson, who rubs me the right way

Table of Contents

Instructions for online access

Cover

Title Page

Introduction

1.1 Purpose of the Book

1.2 Acknowledgments

1.3 Corrections, Comments, and Future Editions

Chapter 1: Names and Data Elements

1.1 Names

1.2 Follow the ISO-11179 Standards Naming Conventions

1.3 Problems in Naming Data Elements

Chapter 2: Fonts, Punctuation, and Spacing

2.1 Typography and Code

2.2 Word Spacing

2.3 Follow Normal Punctuation Rules

2.4 Use Full Reserved Words

2.5 Avoid Proprietary Reserved Words if a Standard Keyword Is Available in Your SQL Product

2.6 Avoid Proprietary Statements if a Standard Statement Is Available

2.7 Rivers and Vertical Spacing

2.8 Indentation

2.9 Use Line Spacing to Group Statements

Chapter 3: Data Declaration Language

3.1 Put the Default in the Right Place

3.2 The Default Value Should Be the Same Data Type as the Column

3.3 Do Not Use Proprietary Data Types

3.4 Place the PRIMARY KEY Declaration at the Start of the CREATE TABLE Statement

3.5 Order the Columns in a Logical Sequence and Cluster Them in Logical Groups

3.6 Indent Referential Constraints and Actions under the Data Type

3.7 Give Constraints Names in the Production Code

3.8 Put CHECK() Constraints Near What They Check

3.9 Put Multiple Column Constraints as Near to Both Columns as Possible

3.10 Put Table-Level CHECK() Constraints at the End of the Table Declaration

3.11 Use CREATE ASSERTION for Multi-table Constraints

3.12 Keep CHECK() Constraints Single Purposed

3.13 Every Table Must Have a Key to Be a Table

3.14 Do Not Split Attributes

3.15 Do Not Use Object-Oriented Design for an RDBMS

Chapter 4: Scales and Measurements

4.1 Measurement Theory

4.2 Types of Scales

4.3 Using Scales

4.4 Scale Conversion

4.5 Derived Units

4.6 Punctuation and Standard Units

4.7 General Guidelines for Using Scales in a Database

Chapter 5: Data Encoding Schemes

5.1 Bad Encoding Schemes

5.2 Encoding Scheme Types

5.3 General Guidelines for Designing Encoding Schemes

5.4 Multiple Character Sets

Chapter 6: Coding Choices

6.1 Pick Standard Constructions over Proprietary Constructions

6.2 Pick Compact Constructions over Longer Equivalents

6.3 Use Comments

6.4 Avoid Optimizer Hints

6.5 Avoid Triggers in Favor of DRI Actions

6.6 Use SQL Stored Procedures

6.7 Avoid User-Defined Functions and Extensions inside the Database

6.8 Avoid Excessive Secondary Indexes

6.9 Avoid Correlated Subqueries

6.10 Avoid UNIONS

6.11 Testing SQL

Chapter 7: How to Use Views

7.1 VIEW Naming Conventions Are the Same as Tables

7.2 VIEWs Provide Row- and Column-Level Security

7.3 VIEWs Ensure Efficient Access Paths

7.4 VIEWs Mask Complexity from the User

7.5 VIEWs Ensure Proper Data Derivation
7.6 VIEWs Rename Tables and/or Columns

7.7 VIEWs Enforce Complicated Integrity Constraints

7.8 Updatable VIEWs

7.9 Have a Reason for Each VIEW

7.10 Avoid VIEW Proliferation

7.11 Synchronize VIEWs with Base Tables

7.12 Improper Use of VIEWs

7.13 Learn about Materialized VIEWs

Chapter 8: How to Write Stored Procedures

8.1 Most SQL 4GLs Are Not for Applications

8.2 Basic Software Engineering

8.3 Use Classic Structured Programming

8.4 Avoid Portability Problems

8.5 Scalar versus Structured Parameters

8.6 Avoid Dynamic SQL

Chapter 9: Heuristics

9.1 Put the Specification into a Clear Statement

9.2 Add the Words "Set of All . . ." in Front of the Nouns

9.3 Remove Active Verbs from the Problem Statement

9.4 You Can Still Use Stubs

9.5 Do Not Worry about Displaying the Data

9.6 Your First Attempts Need Special Handling

9.7 Do Not Think with Boxes and Arrows

9.8 Draw Circles and Set Diagrams

9.9 Learn Your Dialect

9.10 Imagine That Your WHERE Clause Is "Super Ameba"

9.11 Use the Newsgroups and Internet

Chapter 10: Thinking in SQL

10.1 Bad Programming in SQL and Procedural Languages

10.2 Thinking of Columns as Fields

10.3 Thinking in Processes, Not Declarations

10.4 Thinking the Schema Should Look Like the Input Forms

Resources

Military Standards

Metadata Standards

ANSI and ISO Standards

U.S. Government Codes

Retail Industry

Code Formatting and Naming Conventions

Bibliography

Reading Psychology

Programming Considerations

Index

About the author

Introduction

I AM NOT trying to teach you to program in SQL in this book. You might want to read that again. If that is what you wanted, there are better books. This ought to be the second book you buy, not the first.

I assume that you already write SQL at some level and want to get better at it. If you want to learn SQL programming tricks, get a copy of my other book, _SQL for Smarties_ (3rd edition, 2005). I am trying to teach the reader how to work in logical and declarative terms, instead of in a procedural or OO manner—"Query Eye for the Database Guy," if you will forgive a horrible contemporary pun.

Few, if any, SQL programmers came to SQL before learning and writing for years in a procedural or object-oriented language. They then got one particular SQL product and were told to learn it on their own or with a book that has a title like "SQL for Brain-Dead Morons," "Learn SQL in Ten Easy Lessons or Five Hard Ones," or worse.

This is absurd! It takes at least five years to learn to be a master carpenter or chef. Why would you believe people could become SQL gurus in a weekend? What they become is bad SQL programmers, who speak SQL in dialect from the local SQL product with a strong accent from their previous languages. You might want to read "Teach Yourself Programming in Ten Years" by Peter Norvig (www.norvig.com/21-days.html) or "No Silver Bullet" by Fred Brooks, _Computer_, 20(4): 10–19, April 1987, to get a reality check.

The horrible part is that these people often don't know they are bad programmers. At one extreme, the entire shop where they work is just as bad, and they never see anything else. At the other extreme, if anyone tries to tell them about their problems, they become defensive or angry. If you look at postings on SQL newsgroups, many programmers just want to get a kludge for an immediate problem and not actually obtain a true long-term solution.
If these were woodworking newsgroups, their questions would be the equivalent of "What are the best kind of rocks to use to pound screws into fine furniture?" When someone tells them to use large chunks of granite, they are happy, but if you try to tell them about screwdrivers, they explode into a rage.

You might want to read an essay on this phenomenon: "Unskilled and Unaware of It: How Difficulties in Recognizing One's Own Incompetence Lead to Inflated Self-Assessments" by Justin Kruger and David Dunning (Department of Psychology, Cornell University, www.apa.org/journals/psp/psp7761121.html).

Or look at the actual and self-assessments of American high school students in mathematics and sciences that were part of the Bush administration's No Child Left Behind Act.

# 1.1 Purpose of the Book

So how did we old farts learn to be better programmers when dinosaurs walked the earth? One of the best helpers we had in the late 1970s when the structured programming revolution came along was a series of books entitled "[Pascal | FORTRAN | COBOL | BASIC] with Style: Programming Proverbs" by Henry Ledgard and some of his colleagues at MIT. The covers were done like a Victorian novel with angels, scrolls, and old-style typographical elements. And like a Victorian novel, the books were subtitled "Principles of Good Programming with Numerous Examples to Improve Programming Style and Proficiency." These books and others made a big difference for most of us because they taught us how to think like good programmers.

My goals in this book are to improve SQL programming style and proficiency. To be more exact:

1. To _help an individual programmer write Standard SQL without an accent or a dialect._ It is difficult to unlearn old habits but not impossible, and it is best to learn the right way from the start. Amateurs write code for themselves. A professional writes code to be maintained and used by other people. My rule of thumb has been that you need to have a full year of SQL programming before you have your epiphany and suddenly see the world in three-valued logic, data models, and sets.

2. _To give an SQL shop a coding standard for internal use._ I have tried carefully to give a rationale for each of my rules, and I have given exceptions to those rules when I could think of them. You may disagree with some of my choices, but you will have to provide research and examples to defend your position. It is not good enough to simply declare: "Well, that's the way we wrote code in FooTran, so it must be the will of God!" as an argument.

If you are the team leader, you now have a book (and author) that you can hold up and blame for anything that your people do not like. Even if I am later shown to be wrong about something, you will have been consistent. It is much easier to repair errors if they were made consistently.

3. _To give programmers the mental tools to approach a new problem with SQL as their tool._ I tell people it takes about a year to "get it" and drop your procedural programming habits.

# 1.2 Acknowledgments

Craig Mullins provided the structure of the chapter on VIEWs in an article in www.DBAzine.com. The formatting style is taken from a house style I have used in CMP magazines and other publications for more than a decade. Peter Gulutzan provided the data for the naming conventions in actual products from an article in www.DBAzine.com. The affix conventions in Chapter 1 are based on internal standards from Teradata Corporation.
The scales and measurements and the encoding schemes material appeared in several of my old magazine columns in _DBMS_ and _Database Programming & Design_ before they were collected into a chapter in my book _Data and Databases_ (Morgan Kaufmann Publishers). I have tried to give credit in the text, but so many people have participated in the newsgroups over the years that I know I am forgetting someone.

And, obviously, thanks to Henry Ledgard and his "Programming Proverbs" series for the inspiration.

I would also like to thank all of the newbie programmers who wrote bad code. It sounds a bit sarcastic, but it is not meant to be. Many of the newbies are programmers who were thrown into a DBA or SQL programmer job by management without training or an experienced mentor. I do not want to blame the victims unless they are really not working on getting better. Your errors in syntax, semantics, and style showed me how you were thinking. Diagnosis is the first step to treatment.

# 1.3 Corrections, Comments, and Future Editions

Corrections and additions for future editions can be sent to Morgan-Kaufmann publishers directly or to me at my e-mail address, jcelko212@earthlink.net.

CHAPTER 1 Names and Data Elements

This is the old joke:

"When I was a kid, we had three cats."

"What were their names?"

"Cat, cat, and cat."

"That sounds screwed up. How did you tell them apart?"

"Who cares? Cats don't come when you call them anyway!"

YOUR DATA WILL not come when it is called either if you do not give it a name that is always distinct and recognizable. This is an important part of any database project. Bad names for the data elements make the code difficult, or even impossible, to read.

I am not kidding about impossible to read. In the old days, software companies used to deliberately scramble source code names and remove formatting to hide the algorithm from the buyers. The tradition seems to linger on, even if not by intent. In August 2004, a SQL newsgroup had a posting in which all of the names were one letter and a long string of digits.

There are now ISO-11179 metadata standards that describe rules for naming data elements and for registering standards. Because they are an ISO standard, they are what you should be using in SQL as well as everywhere else.

That standard, a bit of typography, and some common sense will give you the rules you need to get started.

# 1.1 Names

In the early days, every programmer had his or her own personal naming conventions. Unfortunately, they were often highly creative. My favorite was a guy who picked a theme for his COBOL paragraph names: one program might use countries, another might use flowers, and so forth. This is obviously weird behavior even for a programmer, but many programmers had personal systems that made sense to themselves but not to other people.

For example, the first FORTRAN I used allowed only six-letter names, so I became adept at using and inventing six-letter names. Programmers who started with weakly typed or typeless languages like to use Hungarian notation (see Leszynski and Reddick). Old habits are hard to give up.

When software engineering became the norm, every shop developed its own naming conventions and enforced them with some kind of data dictionary. Perhaps the most widespread set of rules was MIL STD 8320.1, set up by the U.S. Department of Defense, but it never became popular outside of the federal government.
This was a definite improvement over the prior nonsystem, but each shop varied quite a bit; some had formal rules for name construction, whereas others simply registered whatever the first name given to a data element was.

Today, we have ISO-11179 standards, which are becoming increasingly widespread, required for certain government work, and being put into data repository products. Tools and repositories of standardized encoding schemes are being built to this standard. Given this and XML as a standard exchange format, ISO-11179 will be the way that metadata is referenced in the future.

## 1.1.1 Watch the Length of Names

#### Rationale:

The SQL-92 standards have a maximum identifier length of 18 characters. This length came from the older COBOL standards. These days, SQL implementations allow longer names, but if you cannot say it in 18 characters, then you have a problem. Table 1.1 shows the maximum length for names of the most important SQL schema objects according to ISO and several popular SQL products.

Table 1.1 _Identifier lengths_ [table body not reproduced in this extraction]

The numbers in the table are either bytes or characters. A maximum character length can be smaller than a maximum byte length if you use a multibyte character set.

Do not use super-long names. People have to read them, type them, and print them out. They also have to be able to understand those names when they look at the code, search for them in the data dictionary, and so forth. Finally, the names need to be shared in host programs that might not allow the same maximum length.

But do not go to the other extreme of highly condensed names that are impossible to read without weeks of study. The old Bachman design tool was used to build DB2 databases back when column length was limited to 18 bytes. Sometimes the tool would change the logical attribute name to a physical column name by removing all of the vowels. Craig Mullins referred to this as "Bachman having a vowel movement on my DDL." This is a bad approach to getting the name to fit within a smaller number of characters.

#### Exceptions:

These exceptions would be on a case-by-case basis and probably the result of legacy systems that had different naming restrictions.

## 1.1.2 Avoid All Special Characters in Names

#### Rationale:

Special characters in a name make it difficult or impossible to use the same name in the database and the host language programs or even to move a schema to another SQL product.

Table 1.2 shows the characters allowed in names by the standards and popular SQL products.

Table 1.2 _Identifier character sets_ [table body not reproduced in this extraction]

Generally, the first character of a name must be a letter, whereas subsequent characters may be letters, digits, or _ (underscore). Any database management system (DBMS) might also allow $, #, or @, but no DBMS allows all three, and in any case the special characters are not usable everywhere (Microsoft attaches special meaning to names that begin with @ or # and Oracle discourages special characters in the names of certain objects).

But what is a letter? In the original SQL, all letters had to be uppercase Latin, so there were only 26 choices. Nowadays the repertoire is more extensive, but be wary of characters outside the Latin-1 character set for the following reasons:

1. _IBM cannot always recognize a letter._ It just accepts that any multibyte character except space is a letter and will not attempt to determine whether it's uppercase or lowercase.

2. _IBM and Oracle use the database's character set and so could have a migration problem with exotic letters._ Microsoft uses Unicode and so does not have this problem.
Intermediate SQL-92 does not allow an identifier to end in an underscore. It is also not a good idea to put multiple underscores together; modern printers make it difficult to count the number of underscores in a chain.

#### Exceptions:

None

## 1.1.3 Avoid Quoted Identifiers

#### Rationale:

This feature was added to SQL-92. Its main use has been to alias column names to make printouts look like reports. This kludge defeats the purpose of a tiered architecture. Instead, it destroys portability of the code and invites poorly constructed names. Table 1.3 shows the characteristics of delimited identifiers.

Table 1.3 _Quoted identifier character sets_ [table body not reproduced in this extraction]

If you find the character-set restrictions of names onerous, you can avoid them by putting identifiers inside double quotes. The result is a delimited identifier (or quoted identifier in Oracle terminology). Delimited identifiers may start with, and contain, any character. It is a bit uncertain how one can include the double quote (") character. The standard way is to double it, as in "Empl""oyees", but that's not always documented.

Support for delimited names is nearly universal, with only two major exceptions: (1) IBM will not allow nonalphanumeric characters for labels and variable names inside stored procedures, and (2) Microsoft will not allow quoted identifiers if the QUOTED_IDENTIFIER switch is off. The reason for the first exception is, perhaps, that IBM converts SQL procedures into another computer language before compilation. Suppose you make a table with a delimited identifier, for example (the book's display is not reproduced here; this reconstruction matches the discussion of "t" in the next section):

CREATE TABLE "t" (a INTEGER NOT NULL); -- reconstructed example

Now try to get that table with a regular identifier, thus:

SELECT a FROM t; -- reconstructed example

Will this work? According to the SQL standard, it should not, but with Microsoft, it might. The reason is case sensitivity, which we discuss in Section 1.1.4.

The quoted identifiers do not work well with host languages, especially when they have spaces or special characters. For example, this is a valid insertion statement: [displayed statement omitted in this extraction] ADO generates the following code: [displayed statement omitted in this extraction] which is a syntax error.

#### Exceptions:

If you need to communicate a result to someone who cannot read or understand the properly constructed column names in Latin-1, then use quoted aliases to format the output. I have done this for Polish and Chinese speakers.

I also use quoted names inside documentation so that they will immediately read as the name of a schema object and not a regular word in the sentence.

The usual reason for this error is that the programmer confuses a data element name with a display header. In traditional procedural languages, the data file and the application are in the same tier; in SQL, the database is totally separate from the front end where the data is displayed.

## 1.1.4 Enforce Capitalization Rules to Avoid Case-Sensitivity Problems

#### Rationale:

Case-sensitivity rules vary from product to product.

Standard SQL, IBM, and Oracle will convert regular identifiers to uppercase but will not convert delimited identifiers to uppercase. For Microsoft, the case-sensitivity rule has nothing to do with whether the name is regular or delimited. Instead, identifiers depend on the default collation. If the default collation is case insensitive, then t equals T. If it's case sensitive, then t does not equal T.

To sum up, there are two case-sensitivity problems.
The first is that the delimited identifier "t" and the regular identifier t differ if one follows the SQL standard. The second is that Microsoft does not follow the SQL standard. These problems make it difficult for one naming convention to fit everyone.

#### Exceptions:

I will give a simple set of rules based on principles of readability and typography, but there are other possible conventions:

1. Avoid delimited identifiers so you have no problems.

2. IBM uses only uppercase. Unfortunately, this is difficult to read and looks like you are still programming on a punchcard system.

3. Microsoft and Oracle use lowercase except where it would look odd. Unfortunately, the definition of looking odd is not at all precise. Sometimes reserved words are uppercased, sometimes lowercased, and so forth.

# 1.2 Follow the ISO-11179 Standards Naming Conventions

This is a fairly new ISO standard for metadata, and it is not well understood. Fortunately, the parts that a SQL programmer needs to know are pretty obvious and simple. The real problem is in the many ways that people violate them. A short summary of the NCITS L8 Metadata Standards Committee rules for data elements can be found on the committee's public sites, in a related PDF file, and in a draft document. [The URLs are omitted in this extraction.]

The ISO-11179 standard is broken down into six sections. [The list of sections is omitted in this extraction.]

## 1.2.1 ISO-11179 for SQL

#### Rationale:

Although the formal standards are good, they are very general. It is handy to have a set of rules aimed at the SQL developer in his or her own language. Some of the interpretations given here are the consensus of experts, as taken from newsgroups and private e-mails.

Taking the rules from Section ISO-11179-4, a scalar data element should do the following:

1. Be unique (within any data dictionary in which it appears).

2. Be stated in the singular.

3. State what the concept is, not only what it is not.

4. Be stated as a descriptive phrase or sentence(s).

5. Contain only commonly understood abbreviations.

6. Be expressed without embedding definitions of other data elements or underlying concepts.

7. Tables, sets, and other collections shall be named with a collective, class, or plural name.

8. Procedures shall have a verb in their name.

9. A copy (alias) of a table shall include the base table name as well as the role it is playing at that time.

This formalism is nice in theory, but names are subject to constraints imposed by software limitations in the real world, such as maximum name length and character sets. Another problem is that one data element may have many names depending on the context in which it is used. It might be called something in a report and something else in an electronic data interchange (EDI) file, and it might be different from the name in the database. But you want to avoid using multiple names in the same database, and you should be able to detect them with metadata tools. Furthermore, you want to avoid using multiple names in different databases in the same enterprise. Unfortunately, this is much more difficult to detect without very good data dictionary tools. The data dictionary should include the external names and their context.

#### Exceptions:

The curse of legacy databases, legacy file systems, and other traditions can make this very difficult. If there is a common, well-understood name for a data element, then you can use this name instead of a constructed name.
For example, "us_postal_code" is formally correct, but "zip_code" is well understood, and you can argue for simply "zip" or "zip4" as a name because it is a familiar term. + +## 1.2.2 Levels of Abstraction + +Name development begins at the conceptual level. An object class represents an idea, abstraction, or thing in the real world, such as tree or country. A property is something that describes all objects in the class, such as height or identifier. This lets us form terms such as "tree height" or "country identifier" from the combination of the class and the property. + +The level in the process is the logical level. A complete logical data element must include a form of representation for the values in its data value domain (the set of possible valid values of a data element). The representation term describes the data element's representation class. The representation class is equivalent to the class word of the prime/class naming convention with which many data administrators are familiar. This gets us to "tree height measure," "country identifier name," and "country identifier code" as possible data elements. + +There is a subtle difference between "identifier name" and "identifier code," and it might be so subtle that we do not want to model it, but we would need a rule to drop the property term in this case. The property would still exist as part of the inheritance structure of the data element, but it would not be part of the data element name. + +Some logical data elements can be considered generic elements if they are well defined and are shared across organizations. Country names and country codes are well defined in the ISO 3166 standard, "Codes for the Representation of Names of Countries," and you might simply reference this document. + +Note that this is the highest level at which true data elements, by the definition of ISO-11179, appear: They have an object class, a property, and a representation. + +The next is the application level. This is usually done with a quantifier that applies to the particular application. The quantifier will either subset the data value domain or add more restrictions to the definition so that we work with only those values needed in the application. + +For example, assume that we are using ISO-3166 country codes, but we are only interested in Europe. This would be a simple subset of the standard, but it will change slowly over time. However, the subset of countries with more than 20 centimeters of rain this year will vary greatly in a matter of weeks. + +Changes in the name to reflect this fact will be accomplished by addition of qualifier terms to the logical name. For example, if a view were to list all of the countries with which a certain organization had trading agreements, the query data element might be called "trading_partner_country_name" to show its role in the context of the VIEW or query that limits it. The data value domain would consist of a subset of countries listed in ISO-3166. + +The physical name is the lowest level. These are the names that actually appear in the database table column headers, file descriptions, EDI transaction file layouts, and so forth. They may be abbreviations or use a limited character set because of software restrictions. However, they might also add information about their origin or format. + +In a registry, each of the data element names and name components will always be paired with its context so that we know the source or usage of the name or name component. 
## 1.2.3 Avoid Descriptive Prefixes

#### Rationale:

Another silly convention among newbies is to use prefixes that describe something about the appearance of the data element in the current table. In the old days, when we worked with sequential file systems, the physical location of the file was very important.

The "tbl-" prefix is particularly silly. Before you counter that this prefix answers the question of what something is, remember that SQL has only one data structure. What else could it be? Do you put "n-" in front of every noun you write? Do you think this would make English easier to read? It is like infants announcing that everything is "thingie!" as they grab them.

"_To be something is to be something in particular; to be nothing in particular or anything in general is to be nothing._" —Aristotle

The next worst affix is the <table name>. Why does a data element become something totally different from table to table? For example, "orders_upc" and "inventory_upc" are both UPC codes no matter where they appear, but by giving them two names, you are saying that they are totally, logically different things in your data model.

A total nightmare is the combination of "id" in a base table (vague name) with a reference in a second table using the base table name as a prefix in the foreign key or non-foreign-key references. The queries fill up with code like "Orders.ID = OrderID," which quickly becomes a game of looking for the period and trying to figure out what a thousand different "ID" columns mean in the data dictionary.

Affixes like "vw" for views tell you how the virtual table is implemented in the schema, but this has nothing to do with the data model. If I later decide to replace the view with a base table, do I change the name? The bad news is that a table often already exists with the same root name, which makes for more confusion.

Equally silly and dangerous are column names that are prefixed with the data type. This is how it is physically represented and not what it means in the data model. The data dictionary will be trashed, because you have no idea if there are "intorder_nbr," "strorder_nbr," and perhaps even "forder_nbr," all trying to be the simple "order_nbr" at the same time. The user can also look at the data declaration language (DDL) and see the data type, defaults, and constraints if he or she does not remember them.

The final affix problem is telling us that something is the primary key with a "PK_" or a foreign key with an "FK_" affix. That is how it is used in that particular table; it is not a part of its fundamental nature. The user can also look at the DDL and see the words "PRIMARY KEY" or "FOREIGN KEY .. REFERENCES.." in the column declarations.

The strangest version of this is a rule on a Web site for a company that specializes in Oracle programming.
_CK_" for CHECK() constraints. This not only gives you no help in determining the errors that caused the violation, but it also limits you to one and only one constraint per column per table, and it leaves you to ask about constraints that use two or more columns. + +The same rules and warnings about affixes apply to all schema objects. You will see "usp_" for user-defined stored procedures, "trig_" for triggers, and so forth. In MS SQL Server, this is a serious problem, because the prefix "sp_" is used for system procedures and has special meaning in the architecture. + +If the schema object does something (triggers, procedures), then use a format for the name; the subject of the sentence is understood to be the procedure. We will go into more details on this topic in Chapter 8. + +#### Exceptions: + +You can find other opinions at: + + + +There was also a series of articles at: + +http://www.sqlservercentral.com/​columnists/​sjones/​codingstandardspart2formatting.asp + +http://www.sqlservercentral.com/​columnists/​sjones/​codingstandardspart1formatting.asp + +## 1.2.4 Develop Standardized Postfixes + +This list of postfixes is built on Teradata's internal standards and common usage. The Teradata standards are given in the Appendix. + +"_id" = identifier. It is unique in the schema and refers to one entity anywhere it appears in the schema. Never use "
_id"; that is a name based on location and tells you this is probably not a real key at all. Just plain "id" is too vague to be useful to anyone and will screw up your data dictionary when you have to find a zillion of them, all different, but with the same data element name and perhaps the same oversized data type. + +"_date" or "dt" = date, temporal dimension. It is the date of something—employment, birth, termination, and so forth; there is no such column name as just a date by itself. + +"_nbr" or "num" = tag number. This is a string of digits that names something. Do not use "_no" because it looks like the Boolean yes/no value. I prefer "nbr" to "num" because it is used as a common abbreviation in several European languages. + +"_name" or "nm" = alphabetic name. This explains itself. It is also called a nominal scale. + +"_code" or "_cd" = a code is a standard maintained by a trusted source, usually outside of the enterprise. For example, the ZIP code is maintained by the U.S. Postal Service. A code is well understood in its context, so you might not have to translate it for humans. + +"_size" = an industry standard or company scale for a commodity, such as clothing, shoes, envelopes, or machine screws. There is usually a prototype that defines the sizes kept with a trusted source. + +"_tot" = a sum, an aggregated dimension that is logically different from its parts. + +"_seq" = sequence, ordinal numbering. This is not the same thing as a tag number, because it cannot have gaps. + +"_tally" = a count of values. Also called an absolute scale. + +"_cat" = category, an encoding that has an external source that has distinct groups of entities. There should be strong, formal criteria for establishing the category. The classification of Kingdom in Biology is an example. + +"_class" = an internal encoding that does not have an external source that reflects a subclassification of the entity. There should be strong formal criteria for the classification. The classification of plants in Biology is an example. + +"_type" = an encoding that has a common meaning both internally and externally. Types are usually less formal than a class and might overlap. For example, a driver's license might be typed for motorcycles, automobiles, taxis, trucks, and so forth. + +The differences among type, class, and category are an increasing strength of the algorithm for assigning the type, class, or category. A category is distinct; you will not often have to guess if something is animal, vegetable, or mineral to put it in one of those categories. + +A class is a set of things that have some commonality; you have rules for classifying an animal as a mammal or a reptile. You may have some cases for which it is more difficult to apply the rules, such as the platypus, an egg-laying mammal that lives in Australia, but the exceptions tend to become their own classification—monotremes in this example. + +A type is the weakest of the three, and it might call for a judgment. For example, in some states a three-wheeled motorcycle is licensed as amotorcycle, but in other states, it is licensed as an automobile, and in some states, it is licensed as an automobile only if it has a reverse gear. + +The three terms are often mixed in actual usage. Stick with the industry standard, even if it violates the aforementioned definitions. + +"_status" = an internal encoding that reflects a state of being, which can be the result of many factors. For example, "credit_status" might be computed from several sources. 
+ +"_addr" or "_loc" = an address or location for an entity. There can be a subtle difference between an address and a location. + +"_img" = an image data type, such as.jpg,.gif, and so forth. + +Then an application might have some special situations with units of measurement that need to be shown on an attribute or dimension. And _always_ check to see if there is an ISO standard for a data element. + +## 1.2.5 Table and View Names Should Be Industry Standards, Collective, Class, or Plural Nouns + +#### Rationale: + +Industry standards should always be used. People in that industry will understand the name, and the definition will be maintained by the organization that sets those standards. + +For example, the North American Industry Classification System (NAICS) has replaced the old Standard Industrial Classification (SIC) system in the United States. This new code was developed jointly by the United States, Canada, and Mexico to provide new comparability in statistics about business activity across North America. The names "NAICS" and "naics_code" are clear to people who do business statistics, even though they look weird to the rest of us. + +If an industry standard is not right for your situation, then try to base your names on that standard. For example, if I am dealing only with automobiles made in Mexico, I could have a table named "VIN_Mexico" to show the restriction. Moving down the priority list, if I cannot find an industry standard, I would look for a collective or class name. I would never use a singular name. + +Collective or class table names are better than singular names because a table is a set and not a scalar value. If I say "Employee," the mental picture is of Dilbert standing by himself—one generic employee. If I say "Employees," the mental picture is of the crew from Dilbert—acollection of separate employees. If I say "Personnel," the mental picture is suddenly more abstract—a class without particular faces on it. + +It is legal in SQL to give a table and a column the same name, but it is a really bad idea. First of all, the column's name would be in violation of the rules we just discussed because it would lack a qualifier, but it would also mean that either the table name is not a set or the column name is not a scalar. + +#### Exceptions: + +Use a singular name if the table actually has one and only one row in it. The one example I can think of is a table for constants that looks like this: + +The insertion creates one row, so the table ought to have a singular name. The "lock" column assures you that there is always only one row. Another version of this is to create a VIEW that cannot be changed using SQL-99 syntax. + +The advantage is that this view cannot be changed; the disadvantage is that this view cannot be changed. + +## 1.2.6 Correlation Names Follow the Same Rules as Other Names . . . Almost + +#### Rationale: + +Correlation names are names. They should be derived from the base table or view name, the column name, or from the expression thatcreates them. The nice part is that the readers have the context in front of them, so you can often use a more abbreviated name. + +A correlation name is more often called an _alias,_ but I will be formal. In SQL-92, they can have an optional AS operator, and it should be used to make it clear that something is being given a new name. + +This explicitly means that you do not use an alphabetical sequence unrelated to the base table name. 
This horrible practice is all too common and makes maintaining the code much more difficult. Consider looking at several statements where the table "Personnel" is aliased as "A" in one, "D" in another, and "Q" in a third because of its position in a FROM clause.

Column correlation names for a computed data element should name the computed data element in the same way that you would name a declared column. That is, try to find a common term for the computation. For example, "(salary + COALESCE(commission, 0.00)) AS total_pay" makes sense to the reader.

A simple table or view correlation name should have a short, simple name derived from the base table name or descriptive of the role that copy of the table is playing in the statement (e.g., "SELECT .. FROM Personnel AS Management, Personnel AS Workers" as the two uses of the table in the query).

Now to explain the "almost" part of this section's title. In the case of multiple correlation names on the same table, you may find it handy to postfix abbreviated names with a number (e.g., "SELECT .. FROM Personnel AS P1, Personnel AS P2"). The digit is to tell the reader how many correlation names are used in the statement for that table.

In effect, these are "correlation pronouns"—a shorthand that makes sense in a local context. They are used for the same reason as pronouns in a natural language: to make the statement shorter and easier to read.

A table expression alias should have a short, simple name derived from the logical meaning of the table expression.

Although not required, the correlation name on a table expression can be followed by a list of new column names in parentheses. If this list is missing, the correlation name inherits the names from the base tables or views in the table expression. In the case of a simple table correlation name, such a list would probably be redundant because we usually want to use the original column names.

In the case of a table expression correlation name, such a list would probably be a good idea to avoid ambiguous column names. It also forces the programmer to trim the expression of extraneous columns that were not actually needed in the query.

#### Exceptions:

If there is no obvious, clear, simple name for the table correlation name, then use an invented name, such as a single letter like X. Likewise, if a computation has no immediate name, then you might use an invented name.

## 1.2.7 Relationship Table Names Should Be Common Descriptive Terms

#### Rationale:

Tables and views can model relationships, usually one-to-many or many-to-many, as well as entities. If the relationship has a common name that is understood in the context, then use it. There is a tendency for newbies to concatenate the names of the tables involved to build a nonce word. For example, name a table "Marriages," because that is the common term for that relationship, rather than "ManWoman," "HusbandsWives," or something really weird. Likewise, "Enrollment" makes more sense than "Students_Courses"; once you start looking for the names, they come easily.

This concatenation falls apart when the relationship is not a simple binary one, such as an escrow on a house that has a buyer, a seller, and a lender.

#### Exceptions:

If there is no common term for the relationship, you will need to invent something, and it might well be a concatenation of table names.
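Pulling the fragments quoted in section 1.2.6 into a single runnable statement, here is a hedged sketch of the convention; the Personnel column names are invented for the example:

SELECT Workers.emp_name,
       (Workers.salary + COALESCE(Workers.commission, 0.00)) AS total_pay
  FROM Personnel AS Workers,
       Personnel AS Management
 WHERE Workers.boss_emp_nbr = Management.emp_nbr; -- role names, not arbitrary letters

The role names "Workers" and "Management" carry meaning that single letters like A, D, or Q cannot.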
## 1.2.8 Metadata Schema Access Objects Can Have Names That Include Structure Information

This rule does not apply to the schema information tables, which come with standardized names. It is meant for naming indexes and other things that deal directly with storage and access. The postfix "_idx" is acceptable.

#### Rationale:

This is simply following the principle that a name should tell you what something is. In the case of indexes and other things that deal directly with storage and access, that is what they are. They have nothing to do with the data model.

#### Exceptions:

This does not apply to schema objects that are seen by the user. Look for the rules for the other schema objects as we go along.

# 1.3 Problems in Naming Data Elements

Now that we have talked about how to do it right, let's spend some time on common errors in names that violate the rules we set up.

## 1.3.1 Avoid Vague Names

#### Rationale:

_"That sounds vaguely obscene to me! I can't stand vagueness!"_ —Groucho Marx

At one extreme the name is so general that it tells us nothing. The column is a reserved word such as "date," or it is a general word like "id," "amount," "date," and so forth. Given a column called "date," you have to ask, "date of what?" An appointment? Birth? Hire? Termination? Death? The name begs the question on the face of it.

At the other extreme, the name is made useless by telling us a string of qualifiers that contradict each other. Consider the typical newbie column name like "type_code_id" as an example. If it is an identifier, then it is unique for every entity that has it, like the vehicle identification number (VIN) on an automobile. If it is a code, then what is the trusted source that maintains it, like a ZIP code? It is drawn from a domain of values that is not unique. If it is a type, then what is the taxonomy to which it belongs? Why not go all the way and call it "type_code_id_value" instead?

Why did we not find a mere "customer_type" that would have been understood on sight?

#### Exceptions:

None

Improperly formed data element names seem to be the result of ignorance and object-oriented (OO) programming. In particular, OO programmers put "_id" on every primary key in every table and have problems understanding that SQL is a strongly typed language in which things do not change their data types in programs. The names get absurd at times. Consider a lookup table for colors (the book's display is not reproduced here; this reconstructed sketch shows the kind of DDL being criticized):

CREATE TABLE Colors -- reconstructed sketch of the bad design
(color_value_id INTEGER NOT NULL PRIMARY KEY,
 color_value VARCHAR(15) NOT NULL);

But what does "_value_id" mean? Names like this are generated without thought or research. Assume that we are using the Pantone color system in the database, so we have a trusted source and a precise description—we did the research! This might have been written as follows (again a reconstructed sketch):

CREATE TABLE Colors -- reconstructed sketch of the researched design
(pantone_nbr CHAR(7) NOT NULL PRIMARY KEY,
 color_description VARCHAR(25) NOT NULL);

## 1.3.2 Avoid Names That Change from Place to Place

#### Rationale:

The worst possible design flaw is changing the name of an attribute on the fly, from table to table. As an example, consider this slightly cleaned-up piece of actual code from a SQL newsgroup: [the query is omitted in this extraction]

Those full table names are difficult to read, but the newbie who wrote this code thinks that the table name must _always_ be part of the column name. That is the way that a file worked in early COBOL programs.

This means that if you have hundreds of tables, each appearance of the same attribute gets a new name, so you can never build a proper data dictionary. Did you also notice that it is not easy to see underscores, commas, and periods?
The cleaned-up version clearly shows a simple star schema centered on the IPC table.

I have no idea what a URN is, but it looks like a standard identifier of some kind. Look at all of the kinds of "URNs" (i.e., URN, IPCURN, and OffenseURN) in the original version of the query. It gives you the feeling of being in a crematorium gift shop.

As you walk from room to room in your house, do you also change your name based on your physical location? Of course not! The name we seek identifies the entity, not the location.

#### Exceptions:

Aliases inside a query can temporarily give a new name to an occurrence of a data element. These aliases are temporary and disappear at the end of the statement. We discussed the rules for this in Section 1.2.6.

## 1.3.3 Do Not Use Proprietary Exposed Physical Locators

#### Rationale:

The most basic idea of modern data modeling is to separate the logical model and the physical implementation from each other. This allows us to reuse the model on different platforms and not be tied to just one platform.

In the old days, the logical and physical implementations were fused together. I will explain this in more detail in the next chapter, but for now the rule is to never use proprietary physical locators. We want to have portable code. But the real problem is that a proprietary physical locator violates the basic idea of a key in the relational model.

When new SQL programmers use IDENTITY, GUID, ROWID, or other auto-numbering vendor extensions to get a key that can be used for locating a given row, they are imitating a magnetic tape's sequential access. It lets them know the order in which a row was added to the table—just like individual records went onto the end of the magnetic tape!

We will spend more time discussing this flaw in Chapter 3.

#### Exceptions:

You might want to fake a sequential file when you are using a SQL table structure for some purpose other than a relational database management system (RDBMS). For example, tables used for staging and scrubbing data outside the "Real Schema" do not have any data integrity issues.

# CHAPTER 2 Fonts, Punctuation, and Spacing

CODE IS USUALLY set in a monospace font. After more than a century of manual typewriters and decades of punchcards, we find that it is actually easier to read code in a monospace font than in a proportional font. Punctuation marks get the same spacing as a letter in a monospace font but would be lost in a proportional font.

# 2.1 Typography and Code

Your brain and eyes do not follow code the same way that they follow text, process mathematics, read maps, or look at pictures. In fact, there are a lot of individual differences in human brains.

Some people like text editors that use colors for various syntax elements in a programming language. Other people get headaches from colored program editors and want to see black-and-white text. Likewise, a newspaper that put nouns in red, verbs in green, and other such things would simply not work. Yet black-and-white maps are much more difficult to read than those with colors. Why? This has to do with color perception and how fast you can switch between the left and right halves of your brain.

There is a test for brain damage in which the examiner flashes cards with words printed in various colored inks (e.g., the word "RED" written in green ink). The examiner asks the subject for the word or the color and times the responses.
The rate is fairly constant over the subject's lifetime, so a change is a symptom of some physical or chemical change. Now, try reading this phrase:

    Paris in the
    the spring

Almost nobody reading this for the first time catches the fact that the word "the" appears twice. The point is that there is a vertical component to how we read text in chunks of words.

Code on a page is read from left to right and from top to bottom, with a lot of vertical eye movement that you would not have if you were reading pure text.

A few years ago, the following posting made the rounds in newsgroups. I am not sure if it is genuinely from Cambridge University, but it makes its point very nicely:

Aoccrdnig to rscheearch at Cmabrigde Uinervtisy, it deosn't mttaer in waht oredr the ltteers in a wrod are, the only iprmoetnt tihng is taht the frist and lsat ltteer be at the rghit pclae. The rset can be a total mses and you can sitll raed it wouthit porbelm. Tihs is bcuseae the huamn mnid does not raed ervey lteter by istlef, but the wrod as a wlohe.

Because the parser guarantees that running code will not have syntax and spelling errors like those in the above text, the reader knows what token to expect next with far more certainty than in plain text. Not only are words seen as wholes, but they are also anticipated within each statement in the programming language. That is, if I see an "IF" token in Pascal or another member of the Algol family, I anticipate the matching "THEN" that completes the statement.

Let's discuss some basic typographic conventions for programming code, which are based on how people read it.

## 2.1.1 Use Only Upper- and Lowercase Letters, Digits, and Underscores for Names

#### Rationale:

This subset of characters will port to any other programming language. It is very handy to be able to use the same names in both the database and the host languages of the applications.

For example, the octothorpe or number sign (#) is allowed in several SQL products, but it has a special meaning in other programming languages and could not be used in them.

#### Exceptions:

If you are still programming on a machine that uses punchcards, then you have no choice but to use the limited, uppercase-only character set. It is hard to imagine such a situation in the 21st century.

If the SQL implementation requires special symbols for certain names, then you have no choice. For example, temporary table names begin with an octothorpe and parameter names begin with a "petite snail" or "at sign" (@) in the Sybase/SQL Server T-SQL dialects. However, it is a good idea to be sure that the names are unique without the special characters, so you can port the code to a more modern implementation.

Do not use an underscore as the first or last letter in a name. It looks like the name is missing another component. Leading or trailing underscores also get lost visually without letters or digits around them, thanks to laser-quality printers. Likewise, do not use more than one underscore in a row. The old mechanical line printers could not align underscores, so you could eyeball them, whereas laser printers are microscopically precise.

## 2.1.2 Lowercase Scalars Such as Column Names, Parameters, and Variables

#### Rationale:

Words in books and newspapers are written in lowercase letters because they are easier to read than uppercase words. This is basic typography. Using all uppercase letters is the worst choice. Lowercase text is also read faster than uppercase text.
The first measurements are in Woodworth (1938), and Smith and Fisher (1975) have confirmed it. Participants were asked to read comparable passages of text, half completely in uppercase text and half presented in standard lowercase text. In each study, participants read reliably faster with the lowercase text by a 5 percent to 10 percent speed difference.

#### Exceptions:

Unless there is a compelling physical reason, use lowercase. The only compelling physical reason I can think of is that you are still using punchcards in the 21st century.

## 2.1.3 Capitalize Schema Object Names

#### Rationale:

Schema objects include tables, views, stored procedures, and so forth. Capitalized words begin a sentence in languages that use the Latin alphabet. Additionally, capitalization represents proper nouns—like the names of sets being modeled by tables in SQL—in English, German, and other natural languages. This is the way that readers expect to see these names; don't surprise them.

#### Exceptions:

Unless the name naturally begins with a lowercase letter, there is no reason not to capitalize it.

## 2.1.4 Uppercase the Reserved Words

#### Rationale:

Uppercase words are seen as a unit, rather than being read as a series of syllables or letters. The eye is drawn to them, and they act to announce a statement or clause. That is why headlines and warning signs work.

Typographers use the term _bouma_ for the shape of a word. The term appears in Paul Saenger's book (1975). Imagine each letter on a rectangular card that just fits it, so you see the ascenders, descenders, and baseline letters as various-sized "Lego blocks" that are snapped together to make a word.

The bouma of an uppercase word is always a simple, dense rectangle, and it is easy to pick out of a field of lowercase words. Consider a statement written entirely in lowercase, then the same statement with its reserved words in uppercase: see how quickly you can find each clause when the keywords stand out, reading from left to right. Next, if you put each clause on a line of its own, you can read the code faster still. Sketches of all three forms appear at the end of this subsection.

We will deal with rules for the vertical components later.

#### Exceptions:

None

Keywords come in two types, reserved and nonreserved words. The reserved words are part of the SQL language; the nonreserved words are metadata names that appear in the environment and will not cause syntax errors in an actual SQL program. They are also not very likely to be used in a real application.

Vendors will also have proprietary reserved words, which should also be uppercased.
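Here is a minimal sketch of the three forms (an invented query; any simple statement shows the effect). First, everything in lowercase:

```sql
select city_name, sum(order_amt) from Orders group by city_name having sum(order_amt) > 100.00;
```

versus the same statement with the reserved words in uppercase:

```sql
SELECT city_name, SUM(order_amt) FROM Orders GROUP BY city_name HAVING SUM(order_amt) > 100.00;
```

and again with each clause on a line of its own:

```sql
SELECT city_name, SUM(order_amt)
  FROM Orders
 GROUP BY city_name
HAVING SUM(order_amt) > 100.00;
```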
## 2.1.5 Avoid the Use of CamelCase

#### Rationale:

The eye tends to look for a word in its usual lowercase or capitalized form, so CamelCase words tend to lead the eye to the pieces rather than to the whole word. In particular, a CamelCase word that begins with a lowercase letter will be scanned starting at the first uppercase letter and then scanned backward to get the first syllable.

Another problem is that you need to agree on how to mix the cases. For example, should it be "upcCode," "UpcCode," "UPCcode," or "UPCCode"? In practice, you can wind up with several versions of the same name.

It is even more difficult to read text in alternating case; that is, where the letters of a word change from uppercase to lowercase multiple times within a word (e.g., "AlTeRnAtInG cAsE"). The bouma shape is different from the same word in its lowercase form. Alternating case has been shown to be more difficult than either lowercase or uppercase text in a variety of studies.

Smith (1969) showed that it slowed the reading speed of a passage of text. Mason (1978) showed that the time to name a word was slowed.

Pollatsek, Well, and Schindler (1975) showed that word matching was hindered. Meyer and Gutschera (1975) showed that category decision times increased.

#### Exceptions:

If the word naturally appears in CamelCase, such as "MacDonald," then use it. If you begin the object name with an uppercase letter, then you can optionally use it. However, never use CamelCase for a scalar.

# 2.2 Word Spacing

Put one space between language tokens and do not jam things into a stream. For example, do write "foobar = 21" instead of "foobar=21," as you will often see. Many programmers who grew up with punchcards were taught to use minimal white space to save the limited number of columns. For example, FORTRAN II does not need any spaces at all in its code, nor does the original IBM job control language (JCL) for the IBM/360 family. Modern programming languages are not this restricted, and we now have the ability to write code as if people were more important than computers.

#### Rationale:

We are now living in the 21st century, and you can add white space for readability without running over the edge. That is a screen and not a punchcard in front of you.

#### Exceptions:

You might have to wrap exceptionally long lines. This is not as big a problem in a concise language like SQL as it was in a verbose language like COBOL.

# 2.3 Follow Normal Punctuation Rules

#### Rationale:

Try to follow the rules that you would for English punctuation, because people are used to reading English and their eyes expect certain conventions.

1. In SQL in particular, you need to follow the rule about having a space after a comma, because the comma and the period are easy to confuse or to miss visually. Compare a SELECT list written as "a,b,c" with one written as "a, b, c."

2. Put commas at the end of a line, not the start. A comma, semicolon, question mark, or period is a visual signal that something has just ended, not that it is starting. Having a comma at the start of a line will make the eye tick leftward as it looks for that missing word that was expected before the comma.

Instead, put comma-separated lists on one line so they can be read left to right instead of vertically. If you split the list into two or more lines, see that each line contains related data elements.

3. Put a new line or at least a space after a semicolon to separate statements.

4. Put a space between words even when you could crowd them together.

#### Exceptions:

If SQL does not work the same way as English, then you have to follow the SQL syntax rules.

Many of the code-formatting habits people have go back to habits they were taught by programmers who grew up with punchcard data processing. Because we have video terminals and text editors today, a lot of those habits no longer have any basis.

The practice of putting a comma in front of a single variable on a single line goes back to punchcards. It was often difficult for programmers to get to a keypunch machine to create their decks of cards. In this format, you could pull or insert a card to change your code. There is no excuse for this practice since we now have video terminals.

English and European languages are read left to right and then top to bottom. This scanning pattern is so deeply learned that we arrange schematics, comic books, maps, and other graphics the same way.
To see how much changing that order can throw you off, try to read a Japanese or Chinese comic book. The panels are in right-to-left order, and the word balloons are read top to bottom. This is why typographers have a rule that you do not set long lists of words vertically, one word to a line: shown such a list, about one-third of readers fail to spot a misspelled word in it. Likewise, it is difficult to locate duplicates and errors in those long vertical lists of names. SQL formatting can use vertical alignment to advantage, but only for things that should be chunked together.

# 2.4 Use Full Reserved Words

#### Rationale:

SQL allows you to skip some reserved words and to abbreviate others. Try to use the full forms to document the program. This is a good thing in COBOL, and it works in SQL as well.

For example, an alias can be written with or without an AS operator. That is, "Personnel AS P1" is equivalent to "Personnel P1" in a FROM clause, and "(salary + commission) AS total_pay" is equivalent to "(salary + commission) total_pay" in a SELECT list. But the AS reserved word makes it easier to see that there is an alias and not a comma in these situations.

Technically, you can abbreviate INTEGER to INT and DECIMAL to DEC, but the full names are preferred. The abbreviations look like the reserved word "into" or the month "Dec" in English.

#### Exceptions:

The exception is to use the shorter forms of the character data types. That is, CHAR(n) instead of CHARACTER(n), VARCHAR(n) instead of CHARACTER VARYING(n), NCHAR(n) instead of NATIONAL CHARACTER(n), and NVARCHAR(n) instead of NATIONAL CHARACTER VARYING(n). The full names are too long to be comfortable to a reader. Even COBOL, the most verbose programming language on earth, allows some abbreviations.

# 2.5 Avoid Proprietary Reserved Words if a Standard Keyword Is Available in Your SQL Product

#### Rationale:

Sticking to standards will make your code readable to other SQL programmers who might not know your dialect. It also means that your code can run on other products without being rewritten.

Standard code will protect you from failure when the proprietary syntax is dropped or modified. That unwelcome surprise occurred in several products when the vendors added the Standard SQL versions of OUTER JOINs and deprecated their old proprietary versions. In particular, SQL Server programmers had to unlearn their *= syntax and semantics for outer joins.

The other disadvantage of proprietary features is that they change over time and have no standard behavior. For example, the BIT data type in SQL Server changed its NULL-ability between product releases. Oracle could not tell an empty string from a NULL. There are lots of other examples. Because there is no external standard to appeal to, a vendor is free to do anything it wishes.

#### Exceptions:

If your SQL product does not yet support standard syntax for something, then you have no choice. This is true for temporal functions. They were late getting to Standard SQL, so the early vendors made up their own syntax and internal temporal models.

# 2.6 Avoid Proprietary Statements if a Standard Statement Is Available

#### Rationale:

This rule ought to be obvious. Sticking to standards will make your code readable to other SQL programmers who might not know your dialect. It also means that your code can run on other products without being rewritten. Standard code will protect your code from failure when the proprietary syntax is dropped or modified.
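As a hedged illustration of that risk (the table names are invented; the *= form is the old T-SQL syntax mentioned above, shown purely as history), compare the deprecated outer join with the standard form that replaced it:

```sql
-- old proprietary T-SQL outer join, later deprecated by the vendor
SELECT C.cust_name, O.order_nbr
  FROM Customers AS C, Orders AS O
 WHERE C.cust_nbr *= O.cust_nbr;

-- Standard SQL outer join; runs on any conforming product
SELECT C.cust_name, O.order_nbr
  FROM Customers AS C
       LEFT OUTER JOIN
       Orders AS O
       ON C.cust_nbr = O.cust_nbr;
```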
In fact, a vendor can actually give you proprietary features that are unpredictable! In the "Books On Line" interactive manual that comes with Microsoft SQL Server, we get a warning in the REMARKS section about the proprietary "UPDATE .. FROM .." syntax that tells us:

The results of an UPDATE statement are undefined if the statement includes a FROM clause that is not specified in such a way that only one value is available for each column occurrence that is updated (in other words, if the UPDATE statement is not deterministic). For example, given the UPDATE statement in the following script, both rows in table S meet the qualifications of the FROM clause in the UPDATE statement, but it is undefined which row from S is used to update the row in table T.

This replaces a prior behavior found in the Sybase and Ingres family where the UPDATE .. FROM would do multiple updates, one for each joined row in the second table.

In older versions of Sybase/SQL Server, if a base table row is represented more than once in the embedded query, then that row is operated on multiple times instead of just once. This is a total violation of relational principles, but it's easy to do with the underlying physical implementation. Here is a quick example, reconstructed in the sketch later in this section: create a one-row table T1 and a four-row table T2, and then try to update T1 by doubling all the rows that have a match in T2.

The FROM clause gives you a CROSS JOIN, so you get a series of four actions on the same row (1 => 2 => 4 => 8 => 16). These are pretty simple examples, but you get the idea. There are subtle things with self-joins and the diseased mutant T-SQL syntax that can hang you in loops by changing things, or you can have tables that depend on the order of the rows for their results, and so forth.

SQL Server and Sybase used different fixes for this problem in later versions of their products. Sybase did a hidden "SELECT DISTINCT" in the implied query, and SQL Server gets an unpredictable row. Standard SQL is consistent and clear about aliases, views, and derived tables, as well as being a highly orthogonal language.

If the UPDATE clause could take an alias, according to the Standard SQL model, then you would create a copy of the contents of that base table under the alias name, then update that copy, and delete it when the statement was over—in effect doing nothing to the base table.

If the UPDATE clause could take a FROM clause, according to the Standard SQL model, then you would create a result set from the table expression, then update that copy, and delete it when the statement was over—in effect doing nothing to the base tables.

Because this syntax is so proprietary, inconsistent with the standard model, and ambiguous, why does it exist? In the original Sybase product, the physical model made this "extension" relatively easy to implement, and there were no standards or a good understanding of the relational model back then. Programmers got used to it, and then it was almost impossible to fix.
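Here is a hedged reconstruction of that example (the table names T1 and T2 come from the text; the column name is invented):

```sql
CREATE TABLE T1 (a INTEGER NOT NULL);
INSERT INTO T1 VALUES (1);

CREATE TABLE T2 (a INTEGER NOT NULL);
INSERT INTO T2 VALUES (1), (2), (3), (4);

-- proprietary UPDATE..FROM; the bare FROM clause is a CROSS JOIN,
-- so the single row of T1 is paired with all four rows of T2
UPDATE T1
   SET a = 2 * T1.a
  FROM T1, T2;
```

Under the old Sybase behavior, the one row of T1 is updated once per joined row, so it is doubled four times (1 => 2 => 4 => 8 => 16); under the later SQL Server rule, the result is simply undefined.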
When I lived in Indianapolis in the mid-1970s, my neighbor had graduated from General Motors' private college and gone to work for the company. His first job was investigating industrial accident reports. We were having a beer one night, and he got to telling war stories from the various General Motors plants he had been to for his job. His conclusion after a year on that job was that all industrial accidents are bizarre suicide attempts. People would go to the machine shop and build clever devices to short around the safety features on their equipment so they could work a little faster.

For example, if you make a clamp that holds in one of the two safety switches that operates a small stamping machine, you can push the other button with one hand and work material with your free hand. Well, you can do this until that free hand is crushed just above the wrist and squirts across the back wall of the shop anyway. Trading speed for safety and correctness will eventually catch up with you.

#### Exceptions:

If your SQL product does not yet support standard syntax for something, then you have no choice. For example, Oracle did not support the CASE expression, but its DECODE() function is quite close to it and can be substituted in older versions of Oracle.

# 2.7 Rivers and Vertical Spacing

When you look at a magazine or newspaper, you will notice that the text is set in a column that is even on both sides. This is called justified text, as opposed to ragged right or ragged left text. Extra spacing is added to each line to justify the text, but if this extra spacing appears in the same location on several rows, you get rivers.

A river is a vertical open space in text, and it is considered to be bad typography. You want to read text from left to right, top to bottom, with a visual break at the indentation or new line that marks the start of a paragraph. A river pulls your eye downward and makes the text more difficult to read.

It is easy to set up what typographers call rivers in program code in a monospace font, because you can add spacing as needed. In code, however, that same downward river effect aligns the statement on a vertical axis and makes the program easier to read; the sketch at the end of section 2.8 shows a statement set along such a river. Take the river out, and the statement turns back into a wall of text.

# 2.8 Indentation

When you have to indent in block-structured 3GL programming languages, use three spaces. A single space is too short to be read as anything but a word separator. Two spaces will work because that is what you were probably taught to use in typing classes at the end of a sentence, but three spaces or a new line is clearly a paragraph to the reader.

Indenting five or more spaces actually hurts readability. The eye has to skip over too far to grab the code. In particular, the use of an eight-space tab character is historical. The early Teletype machines had 80 characters per line and set tabs at eight spaces for mechanical reasons. That became the definition when we moved to electronic terminals.

The rule for SQL is that rivers override what we were doing in the old 3GL languages.

#### Rationale:

What we need in data manipulation language (DML) is a balance of indentation and the use of rivers to show the logical nesting. In the sketch below, note how the subquery has a river to hold it together and that the subquery is placed against the river.

#### Exceptions:

A subquery is always inside parentheses, so one can make a case that the closing parenthesis should align vertically with its mate.

The advantage is that you can quickly find the limits of the subquery, but at the cost of extra lines that hold only one or two tokens.

When you have a group of related columns in the SELECT clause list or other places, then use the three-space rule to indent the members of the group when you have to go to a second line. In the sketch below, the customer columns are on one line, while the 10 payments are split over three lines with an indentation to group them.
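The following is a minimal sketch pulling these layout rules together (the table and column names are invented): the keywords line up along a vertical river, the subquery sits against the river, and the group of payment columns is indented three spaces under the customer columns.

```sql
SELECT C.cust_name, C.cust_addr,
          C.payment_01, C.payment_02, C.payment_03, C.payment_04,
          C.payment_05, C.payment_06, C.payment_07, C.payment_08,
          C.payment_09, C.payment_10
  FROM Customers AS C
 WHERE C.cust_nbr
       IN (SELECT O.cust_nbr
             FROM Orders AS O
            WHERE O.order_amt > 100.00);
```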
# 2.9 Use Line Spacing to Group Statements

#### Rationale:

Use one new line between related statements and two new lines between separate steps in the same process.

Clusters of related code on a page show the reader which statements perform each step of a process. It is also a good idea to introduce each step with a high-level comment, but we will get into that later.

As an experiment to demonstrate how important visual clustering is, make some flash cards with some red circles on them. On one set of flash cards, arrange the spots in the patterns in which they appear on a double-nine set of dominoes. On a second set of flash cards, put the spots on at random.

Show the cards to your subjects for one second each and call out the number of the card. Ask them to write down the number of spots on each card. When there is no arrangement, most people start having problems at five spots, and almost nobody can handle eight or more randomly arranged spots. However, nine spots in a three-by-three arrangement present no problems. Even the 10 spots on a playing card are easy to count because they are broken into two clusters of five spots.

#### Exceptions:

The double spacing between steps can be optional if it breaks up the flow of the code.

# CHAPTER 3 Data Declaration Language

_"[I need] Data! Data! Data! I can't make bricks without clay."_
—Sherlock Holmes (fictional detective of author Sir Arthur Conan Doyle)

_"Smart data structures and dumb code works a lot better than the other way round."_
—Eric S. Raymond

I BELIEVE THAT MOST of the bad SQL queries in the world are the result of bad schema design. A bad schema can be ambiguous, require extra work to fetch data, and not return valid results even when good data was input into it.

Let's start with the syntax rules that should be followed when writing data declaration language (DDL), and then in the following chapters, talk about the content and semantics of the DDL.

# 3.1 Put the Default in the Right Place

#### Rationale:

The DEFAULT constraint appears after the data type, and the NOT NULL constraint appears after the DEFAULT value.

The SQL-92 standard requires that ordering, but most products allow you to place the DEFAULT either after the data type or after the NOT NULL constraint. A NULL-able column can also have a DEFAULT value, so the standard makes sense. Because we need a consistent pattern, let's go with the standard. Because NOT NULL is so common, it can be left on the same line as the DEFAULT and data type.

#### Exceptions:

None

# 3.2 The Default Value Should Be the Same Data Type as the Column

#### Rationale:

That rule sounds obvious, but programmers do not follow it. You will see columns with decimal places defaulted to integer zero, columns of CHAR(n) defaulted to strings of less than (n) characters, and columns of TIMESTAMP defaulted to DATE. The result in many SQL products was an implicit type conversion whenever a default value was used. Why incur that overhead, when you could get it right in the first place?

#### Exceptions:

None

# 3.3 Do Not Use Proprietary Data Types

#### Rationale:

Proprietary data types do not port to other products or from one release to another of the same product. Standard SQL has more than enough data types to model most of the things you will find in the real world.

As an example, only the SQL Server/Sybase family has a MONEY data type. It adds currency symbols and commas to a numeric string for display, but it has different rules for doing computations than the NUMERIC or DECIMAL data types. The front end has to handle the currency symbols and commas and be sure that the basic math is correct. Why do something in the DDL only to undo it in the front end?
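A hedged sketch pulling rules 3.1 through 3.3 together (the table is invented): each DEFAULT follows its data type, NOT NULL follows the DEFAULT, each default value matches its column's type, and DECIMAL does the work that a proprietary MONEY type would otherwise do.

```sql
CREATE TABLE Invoices
(invoice_nbr CHAR(10) NOT NULL PRIMARY KEY,
 invoice_date DATE DEFAULT CURRENT_DATE NOT NULL,
 total_amt DECIMAL(12,2) DEFAULT 0.00 NOT NULL);
```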
Even worse, machine-level things like a BIT or BYTE data type have no place in a high-level language like SQL. SQL is a high-level language; it is abstract and defined without regard to physical implementation. This basic principle of data modeling is called _data abstraction._

Bits and bytes are the lowest units of hardware-specific, physical implementation you can get. Are you on a high-end or low-end machine? Does the machine have 8-, 16-, 32-, 64-, or 128-bit words? Twos-complement or ones-complement math? Hey, the standards allow decimal-based machines, so bits do not exist at all! What about NULLs? To be a data type, you have to have NULLs, so what is a NULL bit? By definition, a bit is on or off and has no NULL.

What does the implementation of the host languages do with bits? Did you know that +1, +0, -0, and -1 are all used for Booleans, but not consistently? And that is across all of the host languages: present, future, and not yet defined. Surely no good programmer would ever write nonportable code by getting down to such a low level as bit fiddling!

You might also ask if zero is used for "successful completion" in the functions of the host language or the vendor's own 4GL. There are two situations in practice. Either the bits are individual attributes, or they are used as a vector to represent a single attribute. In the case of a single attribute, the encoding is limited to two values, which do not port to host languages or other SQLs, cannot be easily understood by an end user, and cannot be expanded.

In the second case, what some newbies, who are still thinking in terms of second- and third-generation programming languages or even punchcards, do is build a vector for a series of yes/no status codes, failing to see the status vector as a single attribute. Did you ever play the children's game "20 Questions" when you were young?

Imagine you have six components for a loan approval, so you allocate bits in your second-generation model of the world. You have 64 possible vectors, but only 5 of them are valid (i.e., you cannot be rejected for bankruptcy and still have good credit). For your data integrity, you can:

1. Ignore the problem. This is actually what most newbies do. When the database becomes a mess without any data integrity, they move on to the second solution.

2. Write elaborate ad hoc CHECK() constraints with user-defined functions or proprietary bit-level library functions that cannot port and that run like cold glue.

Now we add a seventh condition to the vector: Which end does it go on? Why? How did you get it in the right place on all the possible hardware that it will ever use? Did the code that references a bit in a word by its position still do it right after the change?

You need to sit down and think about how to design an encoding of the data that is high level, general enough to expand, abstract, and portable. For example, is that loan approval a hierarchical code? A concatenation code? A vector code? Did you provide codes for unknown, missing, and N/A values? It is not easy to design such things!

#### Exceptions:

Very, very special circumstances where there is no alternative at the present time might excuse the use of proprietary data types. In 20 years of consulting on SQL programming, I have never found a situation that could not be handled by a basic data type or a CREATE DOMAIN statement.
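As a minimal sketch of that last point (the domain name and rules are invented), a CREATE DOMAIN statement can package a standard data type with its default and constraints, which is usually all a "special" proprietary type was buying you:

```sql
-- a portable stand-in for a proprietary MONEY type
CREATE DOMAIN Money_Amt AS DECIMAL(12,2)
    DEFAULT 0.00
    CHECK (VALUE >= 0.00);
```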
Next, consider porting a proprietary data type by building a user-defined distinct type that matches the proprietary data type. This is not always possible, so check your product. If the data type is exotic, such as Geo/Spatial data, sound, images, or documents, you should probably do the job in a specialized system and not in SQL.

# 3.4 Place the PRIMARY KEY Declaration at the Start of the CREATE TABLE Statement

#### Rationale:

Having the key as the first thing you read in a table declaration gives you important information about the nature of the table and how you will find the entities in it. For example, if I have a table named "Personnel" and the first column is "ssn," I immediately know that we track employees via their Social Security numbers.

#### Exceptions:

In the case of a compound primary key, the columns that make up the key might not fit nicely into the next rule (3.5). If this is the case, then put a comment by each component of the primary key to make it easier to find.

# 3.5 Order the Columns in a Logical Sequence and Cluster Them in Logical Groups

#### Rationale:

The physical order of the columns within a table is not supposed to matter in the relational model. Their names, and not their ordinal positions, identify columns, but SQL has ordinal positions for columns in tables in default situations. The SELECT * and INSERT INTO statements use the order of declaration in their default actions.

This rule is obvious; people prefer a logical ordering of things to a random mix. For example, the columns for an address are best put in their expected order: name, street, city, state, and postal code.

#### Exceptions:

Thanks to columns being added after the schema is in place, you might not be able to arrange the table as you would like in your SQL product. Check to see if your product allows column reordering.

If you have a physical implementation that uses the column ordering in some special way, you need to take advantage of it. For example, DB2 for z/OS logs changes from the first byte changed to the last byte changed, unless the row is variable; then it logs from the first byte changed to the end of the row. If the change does not cause the length of the variable row to change size, it goes back to logging from the first byte changed to the last byte changed. The DBA can take advantage of this knowledge to optimize performance by placing:

 * Infrequently updated nonvariable columns first
 * Infrequently updated variable-length columns next
 * Frequently updated columns last
 * Columns that are frequently modified together next to each other

Following this approach will cause DB2 to log the least amount of data most of the time. Because the log can be a significant bottleneck for performance, this approach is handy. You can always create the table and then create a view for use by developers that resequences the columns into the logical order if it is that important.

# 3.6 Indent Referential Constraints and Actions under the Data Type

#### Rationale:

The idea is to make the full column declaration appear as one visual unit when you read down the CREATE TABLE statement. In particular, put the ON DELETE and ON UPDATE clauses on separate lines.

The standard does not require that they appear together in any particular order. As an arbitrary decision, I am going to tell you to use alphabetical order, so ON DELETE comes before ON UPDATE if both are present.
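Here is a minimal sketch of the rule (the tables are invented, and it assumes a Customers table exists): the REFERENCES clause and its actions indent under the data type, with ON DELETE before ON UPDATE.

```sql
CREATE TABLE Orders
(order_nbr INTEGER NOT NULL PRIMARY KEY,
 cust_nbr INTEGER NOT NULL
    REFERENCES Customers (cust_nbr)
    ON DELETE CASCADE
    ON UPDATE CASCADE);
```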
#### Exceptions:

None

# 3.7 Give Constraints Names in the Production Code

#### Rationale:

The constraint name will show up in error messages when it is violated. This gives you the ability to create meaningful messages and easily locate the errors.

The syntax is simply "CONSTRAINT <constraint name>," and the name should be a clear statement, in name form, of what has been violated: for example, "valid_upc_format" or "nonnegative_unit_price," as in the sketch at the end of section 3.8.2.

If you do not provide a name, the SQL engine will probably provide a machine-generated name that is very long, impossible to read, and will give you no clue about the nature of your problem.

#### Exceptions:

You can leave off constraint names on PRIMARY KEY, UNIQUE, and FOREIGN KEY constraints, because most SQL products will give an explicit error message about them when they are violated. The exception to this exception is that Oracle will use the system-generated name when it displays the execution plans.

You can leave off constraint names during development work. However, remember that constraint names are global, not local, because the CREATE ASSERTION statement would have problems otherwise.

# 3.8 Put CHECK() Constraints Near What They Check

#### Rationale:

Put a single-column CHECK() constraint on its column and multicolumn constraints near the columns involved.

We want as much information about a column on that column as possible. Having to look in several places for the definition of a column can only cost us time and accuracy. Likewise, put multicolumn constraints as near to the columns involved as is reasonable.

#### Exceptions:

If your SQL product has a CREATE DOMAIN statement, you will include DEFAULT and CHECK() constraints in the domain declaration, so the use of the DOMAIN is enough. Multicolumn constraints on columns that are far apart should be moved to the end of the table declaration. This will give you one place to look for the more complex constraints, rather than trying to look all over the DDL statement.

It can also be argued that none of this really matters, because most of the time we should be going to the schema information tables to retrieve the constraint definitions, not the DDL. Constraints may have been removed or added with subsequent ALTER statements, and the system catalog will have the correct, current state, whereas the DDL may not.

## 3.8.1 Consider Range Constraints for Numeric Values

#### Rationale:

The whole idea of a database is that it is a single trusted repository for all of the data in the enterprise. This is the place where the business rules must be enforced.

The most common constraint on numbers in a data model is that they are not less than zero. Now look at actual DDL and see how often you find that constraint. Programmers are lazy and do not bother with this level of detail.

#### Exceptions:

When the column really can take any value whatsoever.

## 3.8.2 Consider LIKE and SIMILAR TO Constraints for Character Values

#### Rationale:

Again, the whole idea of a database is that it is a single trusted repository for all of the data in the enterprise. This is the place where the business rules must be enforced.

An encoding will have a format that can be validated with a LIKE or SIMILAR TO predicate. Now look at actual DDL and see how often you find that constraint. This is not as portable an option as numeric range checking, and many programmers who did not use UNIX in their youth have problems with regular expressions, but it is still important.

#### Exceptions:

When the column really can take any value whatsoever.
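The following is a minimal sketch of rules 3.7, 3.8.1, and 3.8.2 together (the table, the simplified UPC format, and the constraint names are invented for illustration): a named range constraint on a number and a named format constraint on an encoding.

```sql
CREATE TABLE Products
(upc CHAR(12) NOT NULL PRIMARY KEY
    CONSTRAINT valid_upc_format
       CHECK (upc SIMILAR TO '[0-9]{12}'),
 unit_price DECIMAL(10,2) NOT NULL
    CONSTRAINT nonnegative_unit_price
       CHECK (unit_price >= 0.00));
```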
## 3.8.3 Remember That Temporal Values Have Duration

There is no such thing as a point in time. You can ask Einstein or go back to the Greek philosopher Zeno and his famous paradoxes. Temporal values have duration, and you need to remember that they have a start and finish time, either explicitly or implicitly, that includes all of the continuum bound by them. The implicit model is a single column, and the explicit model uses a pair of temporal values.

For example, when you set a due date for a payment, you usually mean any point from the start of that day up to but not including the start of the following day. When you say an employee worked on a given date, you usually mean the event occurred during an eight-hour duration within that day.

Remember that you can use a DEFAULT CURRENT_TIMESTAMP on a temporal column and that a NULL can be used as a marker for "eternity" in the finish time column. A CHECK() constraint can round off time values to the start of the nearest year, month, day, hour, minute, or second as needed.

## 3.8.4 REAL and FLOAT Data Types Should Be Avoided

Most commercial applications do not need floating-point math. SQL has NUMERIC and DECIMAL data types that can be set to a great deal of scale and precision and do not have floating-point rounding errors. There will be exceptions for scientific and statistical data.

# 3.9 Put Multiple Column Constraints as Near to Both Columns as Possible

#### Rationale:

Do not make the reader look in multiple physical locations to find all of the columns involved in the constraint. You do not have to indent this constraint, but it is a good idea to split it onto two lines: one with the CONSTRAINT clause and one with the CHECK() clause.

#### Exceptions:

This is not always physically possible, especially when many columns are involved.

# 3.10 Put Table-Level CHECK() Constraints at the End of the Table Declaration

#### Rationale:

These constraints are not yet well supported in SQL products, but they are legal SQL-92 syntax. Their predicates involve the entire table as a whole rather than just single rows. This implies that they will involve aggregate functions.

#### Exceptions:

None

# 3.11 Use CREATE ASSERTION for Multi-table Constraints

#### Rationale:

Put multiple-table CHECK() constraints in CREATE ASSERTION statements rather than in a table declaration.

These constraints are not yet well supported in SQL products, but they are legal SQL-92 syntax. Their predicates involve several different tables, not just one table. This implies that they are at a higher level and should be modeled there. The practical consideration is that all constraints are TRUE on an empty table, so the CREATE ASSERTION statement lets you control that possibility. The assertion name acts as the constraint name.

#### Exceptions:

If the SQL product does not support the CREATE ASSERTION statement, then this cannot be done. Otherwise, violating this rule would require a strong reason having to do with the schema design.

# 3.12 Keep CHECK() Constraints Single Purposed

#### Rationale:

Put simple CHECK() constraints in their own clauses rather than writing one long constraint with multiple tests.

When you give a constraint a name, that name will appear in error messages and can help the user to correct data. If all of the validation is in one single CHECK() clause, what name would you give it? For example, imagine a single validation for an address line that looks for correct capitalization, extra spaces, and a length over five characters. About all you can call it is "bad address line" and hope the user can figure out how to fix it. However, if there were separate checks for capitalization, extra spaces, and a length over five characters, then those constraint names would be obvious and give the user a clue as to the actual problem.
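A minimal sketch of that idea (the table and constraint names are invented): one test per named constraint, so the error message itself tells the user what is wrong.

```sql
CREATE TABLE Mailing_Addresses
(addr_line VARCHAR(35) NOT NULL
    CONSTRAINT addr_line_long_enough
       CHECK (CHAR_LENGTH(addr_line) > 5)
    CONSTRAINT addr_line_has_no_double_spaces
       CHECK (addr_line NOT LIKE '%  %')
    CONSTRAINT addr_line_starts_with_capital
       CHECK (SUBSTRING(addr_line FROM 1 FOR 1) BETWEEN 'A' AND 'Z'));
```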
#### Exceptions:

If your SQL product supports the SIMILAR TO predicate (a version of grep() based on the POSIX standard in Standard SQL), then you might consider having a longer regular expression with OR-ed patterns that fall under a general constraint name.

If you do not want to give details about errors to users for security reasons, then you can use a single constraint with a vague name. This would be a strange situation.

# 3.13 Every Table Must Have a Key to Be a Table

#### Rationale:

This is the very definition of a table. The problem is that many newbies do not understand what a key really is. A key must be a subset of the attributes (columns) in the table. There is no such thing as a universal, one-size-fits-all key. Just as no two sets of entities are the same, the attributes that make them unique have to be found in the reality of the data. God did not put a 17-letter Hebrew number on the bottom of everything in creation.

Here is my classification of the types of keys (Table 3.1).

Table 3.1 _Types of keys_

1. A _natural key is a subset of attributes that occurs in a table and acts as a unique identifier._ The user sees them. You can go to the external reality and verify them. You would also like to have some validation rule. Example: UPC codes on consumer goods are easily seen (read the package bar code), and you validate them with a scanner, a manual check-digit calculation, or a manufacturer's Web site.

2. _An artificial key is an extra attribute added to the table that is seen by the user._ It does not exist in the external reality but can be verified for syntax or check digits inside itself. Example: the open codes in the UPC scheme that a user can assign to his or her own products. The check digit still works the same way, but you have to verify the codes inside your own enterprise.

If you have to construct a key yourself, it takes time to design it, to invent a validation rule, and so forth. There is a chapter on that topic in this book: Chapter 5 discusses the design of encoding schemes.

3. _An exposed physical locator is not based on attributes in the data model and is exposed to the user._ There is no way to predict it or verify it. The system obtains a value through some physical process in the storage hardware that is totally unrelated to the logical data model. Example: IDENTITY columns in the T-SQL family; other proprietary, nonrelational auto-numbering devices; and cylinder and track locations on the hard drive used in Oracle.

Technically, these are not really keys at all, because they are attributes of the physical storage and are not even part of the logical data model, but they are handy for lazy, non-RDBMS programmers who don't want to research or think! This is the worst way to program in SQL.

4. _A surrogate key is system generated to replace the actual key behind the covers where the user never sees it._ It is based on attributes in the table. Example: Teradata hashing algorithms, pointer chains.
The fact that you can never see or use them for DELETE and UPDATE or create them for INSERT is vital. When users can get to them, they will screw up the data integrity by getting the real keys and these physical locators out of sync. The system must maintain them.

Notice that people get exposed physical locators and surrogates mixed up; they are totally different concepts.

## 3.13.1 Auto-Numbers Are Not Relational Keys

In an RDBMS, the data elements exist at the schema level. You put tables together from attributes, with the help of a data dictionary, to model entities in SQL.

But in a traditional 3GL application, the names are local to each file because each application program gives them names and meaning. Fields and subfields had to be completely specified to locate the data. There are important differences between a file system and a database, a table and a file, a row and a record, and a column and a field. If you do not have a good conceptual model, you hit a ceiling and cannot get past a certain level of competency.

In 25 words or less, it is "logical versus physical," but it goes beyond that. A file system is a loose collection of files, which have a lot of redundant data in them. A database system is a single unit that models the entire enterprise as tables, constraints, and so forth.

## 3.13.2 Files Are Not Tables

Files are independent of each other, whereas tables in a database are interrelated. You open an entire database, not single tables within it, but you do open individual files. An action on one file cannot affect another file unless they are in the same application program; tables can interact without your knowledge via DRI actions, triggers, and so on.

The original idea of a database was to collect data in a way that avoided redundant data in too many files and did not depend on a particular programming language.

A file is made up of records, and records are made up of fields. A file is ordered and can be accessed by a physical location, whereas a table is not. Saying "first record," "last record," and "next n records" makes sense in a file, but there is no concept of a "first row," "last row," and "next row" in a table.

A file is usually associated with a particular language—ever try to read a FORTRAN file with a COBOL program? A database is language independent; the internal SQL data types are converted into host language data types.

A field exists only because of the program reading it; a column exists because it is in a table in a database. A column is independent of any host language application program that might use it.

In a procedural language, "READ a, b, c FROM FileX;" does not give the same results as "READ b, c, a FROM FileX;" and you can even write "READ a, a, a FROM FileX;" so you overwrite your local variable. In SQL, "SELECT a, b, c FROM TableX" returns the same data as "SELECT b, c, a FROM TableX" because things are located by name, not position.

A field is fixed or variable length and can repeat with an OCCURS in COBOL, a struct in C, and so on. A field can change data types (union in C, VARIANT in Pascal, REDEFINES in COBOL, EQUIVALENCE in FORTRAN).

A column is a scalar value, drawn from a single domain (domain = data type + constraints + relationships) and represented in one and only one data type. You have no idea whatsoever how a column is physically represented internally, because you never see it directly.
Consider temporal data types: in SQL Server, DATETIME (their name for the TIMESTAMP data type) is a binary number internally (a UNIX-style system clock representation), but TIMESTAMP is a string of digits in DB2 (a COBOL-style time representation). When you have a field, you have to worry about that physical representation. SQL says not to worry about the bits; you think of data in the abstract.

Fields have no constraints, no relationships, and no data type; each application program assigns such things, and they don't have to assign the same ones! That lack of data integrity was one of the reasons for RDBMS.

Rows and columns have constraints. Records and fields can have anything in them and often do! Talk to anyone who has tried to build a data warehouse about that problem. My favorite is finding the part number "I hate my job" in a file during a data warehouse project.

Dr. Codd (1979) defined a row as a representation of a single simple fact. A record is usually a combination of a lot of facts. That is, a file is not normalized; you stuff data into it and hope that you have everything you need for an application. When the system needs new data, you add fields to the end of the records. That is how we got records that were measured in Kbytes.

## 3.13.3 Look for the Properties of a Good Key

#### Rationale:

A checklist of desirable properties for a key is a good way to do a design inspection. There is no need to be negative all the time.

1. _Uniqueness._ The first property is that the key be unique. This is the most basic property it can have, because without uniqueness it cannot be a key by definition. Uniqueness is necessary, but not sufficient.

Uniqueness has a context. An identifier can be unique in the local database, unique in the enterprise across databases, or unique universally. We would prefer the last of those three options. We can often get universal uniqueness with industry-standard codes such as VINs. We can get enterprise uniqueness with things like telephone extensions and e-mail addresses. An identifier that is unique only in a single database is workable but pretty much useless, because it will lack the other desired properties.

2. _Stability._ The second property we want is stability or invariance. The first kind of stability is within the schema, and this applies to both key and nonkey columns. The same data element should have the same representation wherever it appears in the schema. It should not be CHAR(n) in one place and INTEGER in another. The same basic set of constraints should apply to it. That is, if we use the VIN as an identifier, then we can constrain it to be only for vehicles from Ford Motors; we cannot change the format of the VIN in one table and not in all others.

The next kind of stability is over time. You do not want keys changing frequently or in unpredictable ways. Contrary to a popular myth, this does not mean that keys cannot ever change. As the scope of their context grows, they should be able to change.

On January 1, 2005, the United States added one more digit to the UPC bar codes used in the retail industry. The reason was globalization and erosion of American industrial domination. The global bar-code standard will be the European Article Number (EAN) Code. The American Universal Product Code (UPC) turned 30 years old in 2004 and was never so universal after all.

The EAN was set up in 1977 and uses 13 digits, whereas the UPC has 12 digits, of which you see 10 broken into two groups of 5 digits on a label.
The Uniform Code Council, which sets the standards in North America, has the details for the conversion worked out.

More than 5 billion bar-coded products are scanned every day on earth. Bar coding has made data mining in retail possible and saved millions of hours of labor. Why would you make up your own code and stick labels on everything? Thirty years ago, consumer groups protested that shoppers would be cheated if price tags were not on each item, labor protested possible job losses, and environmentalists said that laser scanners in the bar-code readers might damage people's eyes. The neo-Luddites have been with us a long time.

For the neo-Luddite programmers who think that changing a key is going to kill you, let me quote John Metzger, chief information officer of A&P. The grocery chain had 630 stores in 2004, and the grocery industry works on 1 percent to 3 percent profit margins—the smallest margins of any industry that is not taking a loss. A&P has handled the new bar-code problem as part of a modernization of its technology systems. "It is important," Mr. Metzger said, "but it is not a shut-the-company-down kind of issue."

Along the same lines, the ISBN in the book trade is being changed to 13 digits, and VINs are being redesigned.

3. _Familiarity._ It helps if the users know something about the data. This is not quite the same as validation, but it is related. Validation can tell you if the code is properly formed via some process; familiarity can tell you if it feels right because you know something about the context. Thus, ICD codes for disease would confuse a patient but not a medical records clerk.

4. _Validation._ Can you look at the data value and tell that it is wrong, without using an external source? For example, I know that "2004-02-30" is not a valid date because no such day exists on the Common Era calendar. Check digits and fixed-format codes are one way of obtaining this validation.

5. _Verifiability._ How do I verify a key? This also comes in context and in levels of trust. When I cash a check at the supermarket, the clerk is willing to believe that the photo on the driver's license I present is really me, no matter how ugly it is. Or rather, the clerk used to believe it was me; the Kroger grocery store chain is now putting an inkless fingerprinting system in place, just like many banks have done.

When I get a passport, I need a birth certificate and fingerprinting. There is a little less trust here. When I get a security clearance, I also need to be investigated. There is a lot less trust.

A key without a verification method has no data integrity and will lead to the accumulation of bad data.

6. _Simplicity._ A key should be as simple as possible, but no simpler. People, reports, and other systems will use the keys. Long, complex keys are more subject to error; storing and transmitting them is not an issue anymore, the way it was 40 or 50 years ago.

One person's simple is another person's complex. For an example of a horribly complex code that is in common international usage, look up the International Bank Account Number (IBAN). A country code at the start of the string determines how to parse the rest of the string, and it can be up to 34 alphanumeric characters in length. Why? Each country has its own account numbering systems, currencies, and laws, and they seldom match.
In effect, the IBAN is a local banking code hidden inside an international standard (see the European Committee for Banking Standards Web site for publications).

More and more programmers who have absolutely no database training are being told to design a database. They are using GUIDs, IDENTITY, ROWID, and other proprietary auto-numbering features in SQL products to imitate either a record number (the sequential file system mindset) or an OID (the OO mindset) because they don't know anything else. This magical, universal, one-size-fits-all numbering is totally nonrelational, depends on the physical state of the hardware at a particular time, and is a poor attempt at mimicking a magnetic tape file system.

Experienced database designers tend toward intelligent keys they find in industry-standard codes, such as UPC, VIN, GTIN, ISBN, and so on. They know that they need to verify the data against the reality they are modeling. A trusted external source is a good thing to have.

The reasons given for this poor programming practice are many, so let me go down the list:

**Q:** Couldn't a natural compound key become very long?

**A1:** So what? This is the 21st century, and we have much better computers than we did in the 1950s, when key size was a real physical issue. What is funny to me is the number of idiots who replace a natural two- or three-integer compound key with a huge GUID, which no human being or other system can possibly understand, because they think it will be faster and easy to program.

**A2:** This is an implementation problem that the SQL engine can handle. For example, Teradata is a SQL product designed for very large database (VLDB) applications that uses hashing instead of B-tree or other indexes. They guarantee that no search requires more than two probes, no matter how large the database. A tree index requires more and more probes as the size of the database increases.

**A3:** A long key is not always a bad thing for performance. For example, if I use (city, state) as my key, I get a free index on just (city). I can also add extra columns to the key to make it a super-key when such a super-key gives me a covering index (i.e., an index that contains all of the columns required for a query, so that the base table does not have to be accessed at all).

**Q:** Can't I make things really fast on the current release of my SQL software?

**A1:** Sure, if I want to lose all of the advantages of an abstract data model and SQL's set-oriented programming, carry extra data, and destroy the portability of the code. Look at any of the newsgroups and see how difficult it is to move the various exposed physical locators even between releases of the same product.

The auto-numbering features are a holdover from the early SQLs, which were based on contiguous-storage file systems. The data was kept in physically contiguous disk pages, in physically contiguous rows, made up of physically contiguous columns. In short, just like a deck of punchcards or a magnetic tape. Most programmers still carry that mental model, too.

But physically contiguous storage is only one way of building a relational database, and it is not the best one. The basic idea of a relational database is that the user is not supposed to know how or where things are stored at all, much less write code that depends on the particular physical representation in a particular release of a particular product on particular hardware at a particular time.
**Q:** Can't I make things really fast on the current release of my SQL software?

**A1:** Sure, if I am willing to lose all of the advantages of an abstract data model and SQL set-oriented programming, carry extra data, and destroy the portability of my code. Look at any of the newsgroups and see how difficult it is to move the various exposed physical locators even between releases of the same product.

The auto-numbering features are a holdover from the early SQLs, which were based on contiguous-storage file systems. The data was kept in physically contiguous disk pages, in physically contiguous rows, made up of physically contiguous columns. In short, just like a deck of punchcards or a magnetic tape. Most programmers still carry that mental model, too.

But physically contiguous storage is only one way of building a relational database, and it is not the best one. The basic idea of a relational database is that the user is not supposed to know how or where things are stored at all, much less write code that depends on the particular physical representation in a particular release of a particular product on particular hardware at a particular time.

The first practical consideration is that auto-numbering is proprietary and nonportable, so you know that you will have maintenance problems when you change releases or port your system to other products. Newbies actually think they will never port code! Perhaps they only work for companies that are failing and will be gone. Perhaps their code is such a disaster that nobody else wants their application.

But let's look at the logical problems. First, try to create a table with two columns and try to make them both auto-numbered. If you cannot declare more than one column to be of a certain data type, then that thing is not a data type at all, by definition. It is a property that belongs to the physical table, not to the logical data in the table.

Next, create a table with one column and make it an auto-number. Now try to insert, update, and delete different numbers from it. If you cannot insert, update, and delete rows, then it is not really a table by definition.

Finally, create a simple table with one hidden auto-number column and a few other columns, using statements like the ones in the sketch below. Put a few rows into the table and notice that the auto-numbering feature sequentially numbered them in the order they were presented. If you delete a row, the gap in the sequence is not filled in, and the sequence continues from the highest number that has ever been used in that column in that particular table. This is how we did record numbers in preallocated sequential files in the 1950s, by the way. A utility program would then pack or compress the records that were flagged as deleted or unused to move the empty space to the physical end of the physical file.

But we now use a statement with a query expression in it, as the sketch also shows. Because a query result is a table, and a table is a set that has no ordering, what should the auto-numbers be? The entire, whole, completed set is presented to Foobar all at once, not a row at a time. There are (n!) ways to number (n) rows, so which one do you pick? The answer has been to use whatever the physical order of the result set happened to be. That nonrelational phrase "physical order" again!

But it is actually worse than that. If the same query is executed again, but with new statistics or after an index has been dropped or added, the new execution plan could bring the result set back in a different physical order. Can you explain from a logical model why the same rows in the second query get different auto-numbers? In the relational model, they should be treated the same if all the values of all the attributes are identical.

Using auto-numbering as a primary key is a sign that there is no data model, only an imitation of a sequential file system. Because this magic, all-purpose, one-size-fits-all pseudo identifier exists only as a result of the physical state of a particular piece of hardware, at a particular time, as read by the current release of a particular database product, how do you verify that an entity has such a number in the reality you are modeling? People run into this problem when they have to rebuild their database from scratch after a disaster.

You will see newbies who design tables with nothing but an auto-number as the key, like the last table in the sketch below. Now input data and submit the same row a thousand times or a million times. Your data integrity is trashed. The natural key was sitting in the other columns all along. Another problem is that if a natural key exists (which it must, if the data model is correct), then the rows can be updated either through the key or through the auto-number. But because there is no way to reconcile the auto-number and the natural key, you have no data integrity.
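The statements elided above might have looked something like this minimal sketch. All of the table and column names are mine, and IDENTITY stands in for whatever auto-numbering feature the product offers, so the exact syntax will vary:

```sql
-- A hidden auto-number plus one real column.
CREATE TABLE Foobar
(id INTEGER IDENTITY NOT NULL,
 item_name VARCHAR(20) NOT NULL);

-- Rows are numbered 1, 2, 3 in arrival order.
INSERT INTO Foobar (item_name) VALUES ('alpha');
INSERT INTO Foobar (item_name) VALUES ('beta');
INSERT INTO Foobar (item_name) VALUES ('gamma');

-- Delete a row: the gap stays, and the next single-row insert gets 4, not 2.
DELETE FROM Foobar WHERE item_name = 'beta';
INSERT INTO Foobar (item_name) VALUES ('delta');

-- A statement with a query expression: the whole set arrives at once,
-- so which of the (n!) possible numberings should the new rows get?
INSERT INTO Foobar (item_name)
SELECT part_name FROM Parts;  -- Parts is hypothetical

-- The newbie design: nothing but an auto-number as the key, so the
-- same row can be submitted a million times. The natural key was
-- (item_name) all along.
CREATE TABLE FoobarNoKey
(id INTEGER IDENTITY NOT NULL PRIMARY KEY,
 item_name VARCHAR(20) NOT NULL);  -- should have been the key, or at least UNIQUE
```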
To demonstrate, consider a typical newbie schema like the Personnel table sketched below. I call these designers "id-iots" because they always name the auto-number column "id" in every table. Now change a row in Personnel, first through the "id" column and then through the natural key; finally, rebuild the row from scratch, as the sketch shows. What happened to the tables that referenced Personnel? Imagine a company bowling team table that also had the "id" column and the "ssn" of the players. I need cascaded DRI actions if the "ssn" changes, but I only have the "id," so I have no idea how many "ssn" values the same employee can have. The "id" column is at best redundant, but now we can see that it is also dangerous.
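A minimal sketch of such an "id-iot" schema and the statements just described; the names are mine, and IDENTITY again stands in for the product's auto-numbering feature:

```sql
CREATE TABLE Personnel
(id INTEGER IDENTITY NOT NULL PRIMARY KEY,  -- the "id-iot" column
 ssn CHAR(9) NOT NULL,                      -- the natural key, left undeclared
 last_name VARCHAR(20) NOT NULL);

-- Change a row through the auto-number ...
UPDATE Personnel SET last_name = 'Smith' WHERE id = 42;

-- ... or through the natural key; nothing reconciles the two paths.
UPDATE Personnel SET last_name = 'Smith' WHERE ssn = '123456789';

-- Rebuild the row from scratch after a disaster: the same employee now
-- gets a brand-new "id," and every table that referenced the old value
-- is silently orphaned.
DELETE FROM Personnel WHERE ssn = '123456789';
INSERT INTO Personnel (ssn, last_name) VALUES ('123456789', 'Smith');
```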
Finally, an appeal to authority, with a quote from Dr. Codd (1979): "Database users may cause the system to generate or delete a surrogate, but they have no control over its value, nor is its value ever displayed to them."

This means that a surrogate ought to act like an index: created by the user, managed by the system, and never seen by a user. That means it is never used in queries, DRI, or anything else that a user does.

Codd also wrote the following:

There are three difficulties in employing user-controlled keys as permanent surrogates for entities.

1. The actual values of user-controlled keys are determined by users and must therefore be subject to change by them (e.g., if two companies merge, the two employee databases might be combined, with the result that some or all of the serial numbers might be changed).

2. Two relations may have user-controlled keys defined on distinct domains (e.g., one of them uses Social Security numbers, while the other uses employee serial numbers) and yet the entities denoted are the same.

3. It may be necessary to carry information about an entity either before it has been assigned a user-controlled key value or after it has ceased to have one (e.g., an applicant for a job and a retiree).

These difficulties have the important consequence that an equi-join on common key values may not yield the same result as a join on common entities. A solution—proposed in part [4] and more fully in [14]—is to introduce entity domains, which contain system-assigned surrogates. Database users may cause the system to generate or delete a surrogate, but they have no control over its value, nor is its value ever displayed to them . . . (Codd, 1979).

#### Exceptions:

If you are using the table as a staging area for data scrubbing or some other purpose than as a database, then feel free to use any kind of proprietary feature you wish to get the data right. We did a lot of this in the early days of RDBMS. Today, however, you should consider using ETL and other software tools that did not exist even a few years ago.

# 3.14 Do Not Split Attributes

#### Rationale:

Attribute splitting consists of taking an attribute and modeling it in more than one place in the schema. This violates Domain-Key Normal Form (DKNF) and makes programming insanely difficult. There are several ways to do this, discussed in the following sections.

## 3.14.1 Split into Tables

The values of an attribute are each given their own table. If you were to do this with gender and have a "MalePersonnel" and a "FemalePersonnel" table, you would quickly see the fallacy. But if I were to split data by years (temporal values), by location (spatial values), or by department (organizational values), you might not see the same problem.

In order to get any meaningful report, these tables would have to be UNION-ed back into a single "Personnel" table. The bad news is that constraints to prevent overlaps among the tables in the collection can be forgotten or wrong.

Do not confuse attribute splitting with a partitioned table, which is maintained by the system and appears as a whole to the users.

## 3.14.2 Split into Columns

The attribute is modeled as a series of columns that make no sense until all of the columns are reassembled (e.g., having a measurement in one column and the unit of measure in a second column). The solution is to pick one scale and keep all measurements in it.

Look at section 3.3 on BIT data types for one of the worst offenders. You will also see attempts at formatting long text columns by splitting (e.g., having two 50-character columns instead of one 100-character column so that the physical display code in the front end does not have to calculate a word-wrap function). When you get a 25-character-wide printout, though, you are in trouble.

Another common version of this is to program dynamic domain changes in a table. That is, one column contains the domain, which is metadata, for another column, which is data.

Glenn Carr posted a horrible example of a column in a table changing its domain on the fly on September 29, 2004, on the SQL Server programming newsgroup. His goal was to keep football statistics; what follows is a simplification of his original schema design. I have removed about a dozen other errors in design so that we can concentrate on just the shifting domain problem.

The "stat_field_id" column held the names of the statistics whose values are given in the "number_value" column of the same row. A better name for this column would have been "yardage_or_completions_or_interceptions_or_.." because that is what it has in it. A rewrite is sketched below.

We found by inspection that a player is identified by a (league_id, player_nbr) pair. Player_id was originally another IDENTITY column in the Players table. I see sports games where the jersey of each player has a number; let's use that for identification. If reusing jersey numbers is a problem, then I am sure that leagues have some standard in their industry for this, and I am sure that it is not an auto-incremented number that was set by the hardware in Mr. Carr's machine.

What he was trying to find were composite statistics, such as "Yards per Completion," which is trivial in the rewritten schema. The hardest part of the code is avoiding a division by zero in a calculation. Using the original design, you had to write elaborate self-joins that had awful performance. I leave this as an exercise to the reader.
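The rewrite referred to above might have looked something like this minimal sketch; the particular statistics columns are my guesses, not Mr. Carr's actual schema:

```sql
-- One row per player per game; each statistic keeps its own column,
-- so each column keeps a single domain.
CREATE TABLE PlayerGameStats
(league_id INTEGER NOT NULL,
 player_nbr INTEGER NOT NULL,
 game_date DATE NOT NULL,
 completions INTEGER DEFAULT 0 NOT NULL,
 yardage INTEGER DEFAULT 0 NOT NULL,
 PRIMARY KEY (league_id, player_nbr, game_date));

-- "Yards per Completion" is now trivial; the CASE expression avoids the
-- division by zero mentioned in the text.
SELECT league_id, player_nbr,
       CASE WHEN SUM(completions) = 0
            THEN 0.0
            ELSE SUM(yardage) * 1.0 / SUM(completions)
       END AS yards_per_completion
  FROM PlayerGameStats
 GROUP BY league_id, player_nbr;
```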
#### Exceptions:

This is not really an exception. You can use a column to change the scale, but not the domain, used in another column. For example, I record temperatures in degrees Absolute, Celsius, or Fahrenheit and put the standard abbreviation code in another column. But I have to have a VIEW for each scale used so that I can show Americans everything in Fahrenheit and the rest of the world everything in Celsius. I also want people to be able to update through those views in the units their equipment gives them.

A more complex example would be the use of the ISO currency codes with a decimal amount in a database that keeps international transactions. The domain is constant; the second column is always currency, never shoe size or body temperature. When I do this, I need to have a VIEW that will convert all of the values to the same common currency: euros, yen, dollars, or whatever. But now there is a time element, because the exchange rates change constantly. This is not an easy problem.

## 3.14.3 Split into Rows

The attribute is modeled as a flag and a value on each row of the same table. The classic example is temporal: a list of events in which each row carries a timestamp and a flag marking it as the start or the stop of an event. Time is measured by duration, not by instants, so the correct DDL gives each event one row with a start and an end; see the sketch at the end of this section.

#### Exceptions:

None. These are simply bad schema designs that are often the result of confusing the physical representation of the data with the logical model. This tends to be done by older programmers carrying old habits over from file systems.

For example, in the old days of magnetic tape files, the tapes were dated, and processing was based on the one-to-one correspondence between time and a physical file. Creating tables with temporal names like "Payroll_Jan," "Payroll_Feb," and so forth just mimics magnetic tapes.

Another source of these errors is mimicking paper forms or input screens directly in the DDL. The most common is an order detail table that includes a line number because the paper form or screen for the order has a line number. Customers buy products that are identified in the inventory database by SKU, UPC, or other codes, not by a physical line number on a form in the front end of the application. But the programmer splits the quantity attribute into multiple rows.
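A minimal sketch of the DDL that section 3.14.3 calls for; the table and column names are mine:

```sql
-- Wrong: the duration attribute is split across rows, with a flag column
-- telling you how to read each timestamp.
CREATE TABLE EventsSplit
(event_name VARCHAR(20) NOT NULL,
 event_flag CHAR(5) NOT NULL CHECK (event_flag IN ('start', 'stop')),
 event_time TIMESTAMP NOT NULL);

-- Right: time is a duration, so each event is one row with a pair of
-- timestamps, and a CHECK() keeps the period well formed.
CREATE TABLE Events
(event_name VARCHAR(20) NOT NULL PRIMARY KEY,
 start_time TIMESTAMP NOT NULL,
 end_time TIMESTAMP NOT NULL,
 CHECK (start_time <= end_time));
```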
# 3.15 Do Not Use Object-Oriented Design for an RDBMS

#### Rationale:

Many years ago, the INCITS H2 Database Standards Committee (née ANSI X3H2 Database Standards Committee) had a meeting in Rapid City, South Dakota. We had Mount Rushmore and Bjarne Stroustrup as special attractions. Mr. Stroustrup did his slide show about Bell Labs inventing C++ and OO programming for us, and we got to ask questions.

One of the questions was how we should put OO stuff into SQL. His answer was that Bell Labs, with all its talent, had tried four different approaches to this problem and came to the conclusion that you should not do it. OO was great for programming but deadly for data.

## 3.15.1 A Table Is Not an Object Instance

Tables in a properly designed schema do not appear and disappear like instances of an object. A table represents a set of entities or a relationship. For them to appear (CREATE TABLE) and disappear (DROP TABLE) is like living in a world of magic, where whole new species of creatures are created by any user, on the fly. Likewise, there are no OIDs in SQL. GUIDs, auto-numbering, and all of those proprietary exposed physical locators will not work in the long run.

I have watched people try to force OO models into SQL, and it falls apart in about a year. Every typo becomes a new attribute, class queries that would have been so easy in a relational model become multitable monster outer joins, redundancy grows at an exponential rate, and constraints are virtually impossible to write, so you can kiss data integrity goodbye.

In a thread discussing OO versus relational modeling, entitled "impedance mismatch," in the comp.databases.theory newsgroup in October 2004, one experienced programmer reported:

I'm here to tell you what you already know—you are 100 percent correct. I am stuck with working with an OO schema superimposed on an RDBMS. The amount of gymnastics that I need to go through to do what should be the simplest query is unimaginable. It took six man-hours (me and one of the OO developers for three hours) to come up with a query that was the equivalent of a simple one-table SELECT.

The data needed consisted of basic information: the name of the office location, address, manager, and phone. The final query was almost a full page long and required the joining of all the various tables for each data element (as each data element is now an object, and each object has its own attributes, so it requires its own table), plus, of course, the monster object-linking tables needed to obtain the correct instance of each object.

By the way, which instance is the correct one? Why, the latest one, of course, unless it is marked as not being the one to use, in which case look for the one that is so marked. And the marking indicator is not always the same value, as there are several potential values. These object-linking tables are the biggest in the entire database: millions of rows in each of them in just one year's time to keep track of fewer than 80,000 entity instances. Self-joins on these monster tables and a few smaller ones are needed in some cases.

Fortunately, there are extracts that run nightly to transform the data into a relational schema set up for reporting, but not all the data is there, or it is wrong, so sometimes I need to go through the above.

## 3.15.2 Do Not Use EAV Design for an RDBMS

The Entity-Attribute-Value (EAV) design flaw is particularly popular among newbies who come from the agile or extreme school of software development. This school used to be called "Code first, design and think later" when it was first popular.

The idea is that you have one huge table with three columns of metadata: entity name, attribute name, and attribute value. This lets your users invent new entities as they use the database. If the American wants to create something called a "tire" and the British speaker wants to create something called a "tyre," then they are both free to do so.

The values have to be recorded in the most general data type in the SQL engine, so you use a lot of VARCHAR(n) columns in the EAV model. Now try to put a constraint on the column, as in the sketch below.
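A minimal sketch of the EAV pattern and of why a constraint on it has to fight the design; all names are mine:

```sql
-- One huge table of metadata instead of a schema.
CREATE TABLE EAV
(entity_name VARCHAR(30) NOT NULL,
 attribute_name VARCHAR(30) NOT NULL,
 attribute_value VARCHAR(100) NOT NULL);  -- everything becomes a string

-- Even "a birth date is shaped like a date" has to be smuggled in one
-- attribute name at a time; numeric and referential rules are hopeless.
ALTER TABLE EAV ADD CONSTRAINT birth_date_shape
CHECK (attribute_name <> 'birth_date'
       OR attribute_value LIKE '____-__-__');
```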
#### Exceptions:

None. There are better tools for collecting free-form data.

CHAPTER 4 Scales and Measurements

BEFORE YOU CAN put data into a database, you actually need to think about how it will be represented and manipulated. Most programmers have never heard of measurement theory or thought about the best way to represent their data. Although this topic is not specifically about SQL style, it gives a foundation for decisions that have to be made in the design of any schema.

# 4.1 Measurement Theory

_Measure all that is measurable and attempt to make measurable that which is not yet so._

—Galileo (1564–1642)

Measurement theory is a branch of applied mathematics that is useful in data analysis. Measurements are not the same as the attribute being measured. Measurement is not so much assigning numbers to things or their attributes as it is assigning to things a structural property that can be expressed in numbers or other computable symbols. This structure is the scale used to take the measurement; the numbers or symbols represent units of measure.

Strange as it might seem, measurement theory came from psychology, not mathematics or computer science. In particular, S. S. Stevens originated the idea of levels of measurement and the classification of scales. Scales are classified into types by the properties they do or do not have. The properties with which we are concerned are the following:

1. _A natural origin point on the scale._ This is sometimes called a zero, but it does not have to be literally a numeric zero. For example, if the measurement is the distance between objects, the natural zero is zero meters—you cannot get any closer than that. If the measurement is the temperature of objects, the natural zero is zero degrees Kelvin—nothing can get any colder than absolute zero. However, consider time: It goes from an eternal past into an eternal future, so you cannot find a natural origin for it.

2. _Meaningful operations can be performed on the units._ It makes sense to add weights together to get a new weight. However, adding names or shoe sizes together is absurd.

3. _A natural ordering of the units._ It makes sense to speak about an event occurring before or after another event, or a thing being heavier, longer, or hotter than another thing, but the alphabetical order imposed on a list of names is arbitrary, not natural—a foreign language, with different names for the same objects, would impose another ordering.

4. _A natural metric function on the units._ A metric function has nothing to do with the metric system of measurements, which is more properly called SI, for Système International d'unités in French. Metric functions have the following three properties:

a. The metric between an object and itself is the natural origin of the scale. We can write this in a semimathematical notation as M(a, a) = 0.

b. The order of the objects in the metric function does not matter. Again in the notation, M(a, b) = M(b, a).

c. There is a natural additive function that obeys the rule that M(a, b) + M(b, c) ≥ M(a, c), which is also known as the _triangular inequality._

This notation is meant to be more general than just arithmetic. The zero in the first property is the origin of the scale, not just a numeric zero. The third property, defined with a plus sign and a greater-than-or-equal-to sign, is a symbolic way of expressing general ordering relationships. The greater-than-or-equal-to sign refers to a natural ordering on the attribute being measured. The plus sign refers to a meaningful operation in regard to that ordering, not just arithmetic addition.

The special case of the third property, where the relation is always an equality, is desirable to people because it means that they can use numbers for units and do simple arithmetic with the scales. This is called a _strong metric property._ For example, human perceptions of sound and light intensity follow a cube root law—that is, if you double the intensity of light, the perception of the intensity increases by only 20 percent (Stevens, 1957). The actual formula is "Physical intensity to the 0.3 power equals perceived intensity" in English. Knowing this, designers of stereo equipment use controls that work on a logarithmic scale internally but that show evenly spaced marks on the control panel of the amplifier.

It is possible to have a scale that has any combination of the metric properties. For example, instead of measuring the distance between two places in meters, measure it in units of effort. This is the old Chinese system, which had uphill and downhill units of distance.

Does this system of distances have the property that M(a, a) = 0? Yes. It takes no effort to get to where you already are located. Does it have the property that M(a, b) = M(b, a)? No. It takes less effort to go downhill than to go uphill.
Does it have the property that M(a, b) + M(b, c) ≥ M(a, c)? Yes. The effort needed to go directly to a place can never be more than the effort of making an extra stop along the way.

## 4.1.1 Range and Granularity

Range and granularity are properties of the way the measurements are made. Because we have to store data in a database within certain limits, these properties are important to a database designer. The types of scales are unrelated to whether you use discrete or continuous variables. Although measurements are always discrete because of finite precision, attributes can be conceptually either discrete or continuous regardless of measurement level. Temperature is usually regarded as a continuous attribute, so temperature measurement to the nearest degree Kelvin is a ratio-level measurement of a continuous attribute. However, quantum mechanics holds that the universe is fundamentally discrete, so temperature may actually be a discrete attribute. In ordinal scales for continuous attributes, ties are impossible (or have probability zero). In ordinal scales for discrete attributes, ties are possible. Nominal scales usually apply to discrete attributes. Nominal scales for continuous attributes can be modeled but are rarely used.

## 4.1.2 Range

A scale also has other properties that are of interest to someone building a database. First, scales have a range: What are the highest and lowest values that can appear on the scale? It is possible to have a finite or an infinite limit on either the lower or the upper bound. Overflow and underflow errors are the result of range violations inside the database hardware.

Database designers do not have infinite storage, so we have to pick a subrange to use in the database when we have no upper or lower bound. For example, few computer calendar routines will handle geologic time periods, but then few companies have bills that have been outstanding for that long either, so we do not mind.

## 4.1.3 Granularity, Accuracy, and Precision

Look at a ruler and a micrometer. They both measure length, using the same scale, but there is a difference. A micrometer is more precise because it has a finer granularity of units. Granularity is a static property of the scale itself—how many notches there are on your ruler. In Europe, all industrial drawings are done in millimeters; the United States has been using 1/32nd of an inch.

Accuracy is how close the measurement comes to the actual value. Precision is a measure of how repeatable a measurement is. Both depend on granularity, but they are not the same things. Human nature says that a number impresses according to the square of the number of decimal places. Hence, some people will use a computer system to express things to as many decimal places as possible, even when it makes no sense. For example, civil engineering in the United States uses decimal feet for road design. Nobody can build a road any more precisely than that, but many civil engineering students turn in work that is expressed in ten-thousandths of a foot. You don't use a micrometer on asphalt! A database often does not give the user a choice of precision for many calculations. In fact, the SQL standards leave the number of decimal places in the results of many arithmetic operations to be defined by the implementation.

The ideas are easier to explain with handgun targets, which are scales to measure the ability of the shooter to put bullets in the center of a target.
A bigger target has a wider range compared with a smaller target. A target with more rings has a higher granularity.

Once you start shooting, a group of shots that are closer together is more precise because the shots were more repeatable. A shot group that is closer to the center is more accurate because the shots were closer to the goal. Notice that precision and accuracy are not the same thing! If I have a good gun whose sights are off, I can get a tight cluster that is not near the bull's-eye.

# 4.2 Types of Scales

The lack or presence of precision and accuracy determines the kind of scale you should choose. Scales are either quantitative or qualitative. Quantitative scales are what most people mean when they think of measurements, because these scales can be manipulated and are usually represented as numbers. Qualitative scales attempt to impose an order on an attribute, but they do not allow for computations—just comparisons.

## 4.2.1 Nominal Scales

The simplest scales are the nominal scales. They simply assign a unique symbol, usually a number or a name, to each member of the set that they attempt to measure. For example, a list of city names is a nominal scale.

Right away we are into philosophical differences, because many people do not consider listing to be measurement. Because no clear property is being measured, that school of thought would tell us this cannot be a scale.

There is no natural origin point for a set, and likewise there is no ordering. We tend to use alphabetic ordering for names, but it makes just as much sense to use frequency of occurrence or increasing size or almost any other attribute that does have a natural ordering.

The only meaningful operation that can be done with such a list is a test for equality—"Is this city New York or not?"—and the answer will be TRUE, FALSE, or UNKNOWN. Nominal scales are common in databases because they are used for unique identifiers, such as names and descriptions.

## 4.2.2 Categorical Scales

The next simplest scales are the categorical scales. They place an entity into a category that is assigned a unique symbol, usually a number or a name. For example, the class of animals might be categorized as reptiles, mammals, and so forth. The categories have to be within the same class of things to make sense.

Again, many people do not consider categorizing to be measurement. The categories are probably defined by a large number of properties, and there are two potential problems with them. The first problem is that an entity might fall into more than one category. For example, a platypus is a furry, warm-blooded, egg-laying animal. Mammals are warm-blooded but give live birth and optionally have fur. The second problem is that an entity might not fall into any of the categories at all. If we find a creature with chlorophyll and fur on Mars, we do not have a category of animals in which to place it.

The two common solutions are either to create a new category of animals (monotremes for the platypus and echidna) or to allow an entity to be a member of more than one category. There is no natural origin point for a collection of subsets, and, likewise, there is no ordering of the subsets. We tend to use alphabetic ordering for names, but it makes just as much sense to use frequency of occurrence or increasing size or almost any other attribute that does have a natural ordering.
The only meaningful operation that can be done with such a scale is a test for membership—"Is this animal a mammal or not?"—and the answer will be TRUE, FALSE, or UNKNOWN.

## 4.2.3 Absolute Scales

An absolute scale is a count of the elements in a set. Its natural origin is zero, or the empty set. The count is the ordering (a set of five elements is bigger than a set of three elements, and so on). Addition and subtraction are metric functions. Each element is taken to be identical and interchangeable. For example, when you buy a dozen Grade A eggs, you assume that for your purposes any Grade A egg will do the same job as any other Grade A egg. Again, absolute scales are common in databases because they are used for quantities.

## 4.2.4 Ordinal Scales

Ordinal scales put things in order but have no origin and no operations. For example, geologists use a scale to measure the hardness of minerals called the Mohs Scale of Hardness (MSH). It is based on a set of standard minerals, which are ordered by relative hardness (talc = 1, gypsum = 2, calcite = 3, fluorite = 4, apatite = 5, feldspar = 6, quartz = 7, topaz = 8, sapphire = 9, diamond = 10).

To measure an unknown mineral, you try to scratch the polished surface of one of the standard minerals with it; if it scratches the surface, the unknown is harder. Notice that I can get two different unknown minerals with the same measurement that are not equal to each other and that I can get minerals that are softer than my lower bound or harder than my upper bound. There is no origin point, and operations on the measurements make no sense (e.g., if I add 10 talc units, I do not get a diamond).

Perhaps the most common use we see of ordinal scales today is to measure preferences or opinions. You are given a product or a situation and asked to decide how much you like or dislike it, how much you agree or disagree with a statement, and so forth. The scale is usually given a set of labels such as "strongly agree" through "strongly disagree," or the labels are ordered from 1 to 5.

Consider pairwise choices between ice cream flavors. Saying that vanilla is preferred over wet leather in our taste test might well be expressing a universal truth, but there is no objective unit of likeability to apply. The lack of a unit means that such things as opinion polls that try to average such scales are meaningless; the best you can do is a bar graph of the number of respondents in each category.

Another problem is that an ordinal scale may not be transitive. _Transitivity_ is the property of a relationship in which if R(a, b) and R(b, c), then R(a, c). We like this property and expect it in the real world, where we have relationships like "heavier than," "older than," and so forth. This is the result of a strong metric property.

But an ice cream taster, who has just found out that the shop is out of vanilla, might prefer squid over wet leather, wet leather over wood, and wood over squid, so there is no metric function or linear ordering at all. Again, we are into philosophical differences, because many people do not consider a nontransitive relationship to be a scale.

## 4.2.5 Rank Scales

Rank scales have an origin and an ordering but no natural operations. The most common example of this would be military ranks. Nobody is lower than a private, and that rank is a starting point in your military career, but it makes no sense to somehow combine three privates to get a sergeant.
Rank scales have to be transitive: A sergeant gives orders to a private, and because a major gives orders to a sergeant, he or she can also give orders to a private. You will see ordinal and rank scales grouped together in some of the literature if the author does not allow nontransitive ordinal scales. You will also see the same fallacies committed when people try to do statistical summaries of such scales.

## 4.2.6 Interval Scales

Interval scales have a metric function, ordering, and meaningful operations among the units but no natural origin. Calendars are the best example; some arbitrary historical event is the starting point for the scale, and all measurements are related to it using identical units or intervals. Time, then, extends from a past eternity to a future eternity.

The metric function is the number of days between two dates. Look at the three properties: (1) M(a, a) = 0: there are zero days between today and today; (2) M(a, b) = M(b, a): there are just as many days from today to next Monday as there are from next Monday to today; and (3) M(a, b) + M(b, c) ≥ M(a, c): the number of days from today to next Monday plus the number of days from next Monday to Christmas is the same as the number of days from today until Christmas. Ordering is natural and strong: 1900-July-1 occurs before 1993-July-1. Aggregations of the basic unit (days) into other units (weeks, months, and years) are also arbitrary.

Please do not think that the only metric function is simple math; there are log-interval scales, too. The measurements are assigned numbers such that ratios between the numbers reflect ratios of the attribute. You then use formulas of the form (c × m^d), where c and d are constants, to do transforms and operations. For example, density = (mass/volume), fuel efficiency expressed in miles per gallon (mpg), the decibel scale for sound, and the Richter scale for earthquakes are exponential, so their functions involve logarithms and exponents.

## 4.2.7 Ratio Scales

Ratio scales are what people think of when they think about a measurement. Ratio scales have an origin (usually zero units), an ordering, and a set of operations that can be expressed in arithmetic. They are called ratio scales because all measurements are expressed as multiples or fractions of a certain unit or interval.

Length, mass, and volume are examples of this type of scale. The unit is what is arbitrary: The weight of a bag of sand is still weight whether it is measured in kilograms or in pounds. Another nice property is that the units are identical: A kilogram is still a kilogram whether it is measuring feathers or bricks.

# 4.3 Using Scales

Absolute and ratio scales are also called extensive scales because they deal with quantities, as opposed to the remaining scales, which are intensive because they measure qualities. Quantities can be added and manipulated together, whereas qualities cannot. Table 4.1 describes the different types of scales and their attributes.

Table 4.1 _Scale properties_

| Type of scale | Natural origin | Natural ordering | Natural functions |
| --- | --- | --- | --- |
| Nominal | No | No | Equality test only |
| Categorical | No | No | Membership test only |
| Absolute | Yes | Yes | Counting arithmetic |
| Ordinal | No | Yes | Comparisons only |
| Rank | Yes | Yes | Comparisons only |
| Interval | No | Yes | Arithmetic on differences |
| Ratio | Yes | Yes | Full arithmetic |

The origin for the absolute scale is numeric zero, and the natural functions are simple arithmetic. However, things are not always this simple. Temperature has an origin point at absolute zero, and its natural functions average heat over mass. This is why you cannot defrost a refrigerator, which is at 0 degrees Celsius, by putting a chicken whose body temperature is 35 degrees Celsius inside of it. The chicken does not have enough mass relative to heat.
However, a bar of white-hot steel will do a nice job.

# 4.4 Scale Conversion

Scales can be put in a partial order based on the permissible transformations: nominal scales permit any one-to-one mapping, ordinal scales any monotone function, interval scales only linear functions, ratio scales only constant multiples, and absolute scales only the identity.

An attribute might not fit exactly into any of these scales. For example, you can mix nominal and ordinal information in a single scale, such as in questionnaires that have several nonresponse categories. It is common to have scales that mix ordinal and interval scales by assuming the attribute is really a smooth monotone function. Subjective rating scales ("strongly agree," "agree," . . . "strongly disagree") have no equally spaced intervals between the ratings, but there are statistical techniques to ensure that the difference between two intervals is within certain limits. A binary variable is at least an interval scale, and it might be a ratio or absolute scale if it means that the attribute exists or does not exist.

The important principle of measurement theory is that you can convert from one scale to another only if they are of the same type and measure the same attribute. Absolute scales do not convert, which is why they are called absolute scales. Five apples are five apples, no matter how many times you count them or how you arrange them on the table. Nominal scales are converted to other nominal scales by a mapping between the scales.

That means you look things up in a table. For example, I can convert my English city names to Polish city names with a dictionary. The problem comes when there is not a one-to-one mapping between the two nominal scales. For example, English uses the word "cousin" to identify the offspring of your parents' siblings, and tradition treats them all pretty much alike.

Chinese language and culture have separate words for the same relations based on the genders of your parents' siblings and the age relationships among them (e.g., the oldest son of your father's oldest brother is a particular type of cousin, and you have different social obligations to him). Something is lost in translation.

Ordinal scales are converted to ordinal scales by a monotone function. That means you preserve the ordering when you convert. Looking at the MSH for geologists, I can pick another set of minerals, plastics, or metals to scratch, but rock samples that were definitely softer than others are still softer. Again, there are problems when there is not a one-to-one mapping between the two scales. My new scale may be able to tell the difference between rocks, whereas the MSH could not.

Rank scales are converted to rank scales by a monotone function that preserves the ordering, like ordinal scales. Again, there are problems when there is not a one-to-one mapping between the two scales. For example, different military branches have slightly different ranks that don't quite correspond to each other.

In both the nominal and the ordinal scales, the problem was that things that looked equal on one scale were different on another. This has to do with range and granularity, which were discussed in section 4.1.1 of this chapter.

Interval scales are converted to interval scales by a linear function; that is, a function of the form y = a × x + b. This preserves the ordering but shifts the origin point when you convert. For example, I can convert temperature from degrees Celsius to degrees Fahrenheit using the formula F = (9.0/5.0 × C) + 32.

Ratio scales are converted to ratio scales by a constant multiplier, because both scales have the same ordering and origin point. For example, I can convert from pounds to kilograms using the formula k = 0.4536 × p. This is why people like to use ratio scales.
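The VIEWs mentioned earlier are where these conversion formulas belong. A minimal sketch, with hypothetical table and view names:

```sql
-- Store everything in one scale: degrees Celsius.
CREATE TABLE Readings
(sensor_id INTEGER NOT NULL PRIMARY KEY,
 temp_celsius DECIMAL(6,2) NOT NULL);

-- The American office sees the interval-scale conversion y = a*x + b;
-- nobody is aware the conversion is being done for them.
CREATE VIEW FahrenheitReadings (sensor_id, temp_fahrenheit)
AS SELECT sensor_id, (9.0 / 5.0 * temp_celsius) + 32.0
     FROM Readings;
```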
# 4.5 Derived Units

Many of the scales that we use are not primary units but rather derived units. These measures are constructed from primary units, such as miles per hour (time and distance) or square miles (distance and distance). You can use only ratio and interval scales to construct derived units.

If you use an absolute scale with a ratio or interval scale, you are dealing with statistics, not measurements. For example, using weight (ratio scale) and the number of people in New York (absolute scale), we can compute the average weight of a New Yorker, which is a statistic, not a unit of measurement.

The SI measurements use a basic set of seven units (i.e., meter for length, kilogram for mass, second for time, ampere for electrical current, degree Kelvin for temperature, mole for molecules, and candela for light) and construct derived units from them. ISO standard 2955 ("Information processing—Representation of SI and other units for use in systems with limited character sets") has a notation for expressing SI units in ASCII character strings. The notation uses parentheses, spaces, multiplication (shown by a period), division (shown by a solidus, or slash), and exponents (shown by numerals immediately after the unit abbreviation). There are also names for most of the standard derived units. For example, "100 kg.m/s2" is 100 newtons (the unit of force), written as "100 N" instead.

# 4.6 Punctuation and Standard Units

A database stores measurements as numeric data represented in a binary format, but when the data is input or output, a human being wants readable characters and punctuation. Punctuation identifies the units being used and can appear as a prefix, postfix, or infix symbol. It can also be implicit or explicit.

If I write $25.15, you know that the unit of measure is the dollar because of the explicit prefix dollar sign. If I write 160 lbs., you know that the unit of measure is pounds because of the explicit postfix abbreviation for the unit. If I write 1989 MAR 12, you know that this is a date because of the implicit infix separation among month, day, and year, achieved by changing from numerals to letters, and the optional spaces. The ISO and SQL defaults represent the same date, using explicit infix punctuation, as 1989-03-12 instead. Likewise, a column header on a report that gives the units used is explicit punctuation.

Databases do not generally store punctuation. The sole exception might be the proprietary MONEY or CURRENCY data type found in many SQL implementations as a vendor extension. Punctuation wastes storage space, and the units can be represented in some internal format that can be used in calculations. Punctuation is only for display.

It is possible to put the units in a column next to a numeric column that holds their quantities, but this is awkward and wastes storage space. If everything is expressed in the same unit, the units column is redundant. If things are expressed in different units, you have to convert them to a common unit to do any calculations. Why not store them in a common unit in the first place? The DBA has to be sure that all data in a column of a table is expressed in the same units before it is stored.
There are some horror stories about multinational companies sending the same input programs used in the United States to their European offices, where SI and English measurements were mixed into the same database without conversion.

Ideally, the DBA should be sure that data is kept in the same units in all the tables in the database. If different units are needed, they can be provided in a VIEW that hides the conversions (thus the office in the United States sees English measurements and the European offices see SI units and date formats; neither is aware of the conversions being done for it).

# 4.7 General Guidelines for Using Scales in a Database

The following are general guidelines for using measurements and scales in a database, not firm, hard rules. You will find exceptions to all of them.

1. _In general, the more unrestricted the permissible transformations on a scale are, the more restricted the statistics._ Almost all statistics are applicable to measurements made on ratio scales, but only a limited group of statistics may be applied to measurements made on nominal scales.

2. _Use CHECK() clauses on table declarations to make sure that only the allowed values appear in the database._ If you have the CREATE DOMAIN feature of SQL-92, use it to build your scales, as in the sketch after this list. Nominal scales would have a list of possible values; other scales would have range checking. Likewise, use the DEFAULT clauses to be sure that each scale starts with its origin value, a NULL, or a default value that makes sense.

3. _Declare at least one more decimal place than you think you will need for your smallest units._ In most SQL implementations, rounding and truncation will improve with more decimal places.

The downside of SQL is that precision and the rules for truncation and rounding are implementation dependent, so a query with calculations might not give the same results on another product. However, SQL is more merciful than older file systems, because the DBA can ALTER a numeric column so it will have more precision and a greater range without destroying existing data or queries. Host programs may have to be changed to display the extra characters in the results, however.

You also need to consider laws and accounting rules that deal with currencies. The European Union has rules for computing with euros, and the United States has similar rules for dollars in the Generally Accepted Accounting Principles (GAAP).

4. _Try to store primary units rather than derived units._ This is not always possible, because you might not be able to measure anything but the derived unit. Look at your tire gauge; it is set for pascals (newtons per square meter) and will not tell you how many square meters you have on the surface of the tire or the force exerted by the air, and you simply cannot figure these things out from the pascals given. A set of primary units can be arranged in many different ways to construct any possible derived unit desired. Never store both the derived and the primary units in the same table. Not only is this redundant, but it opens the door to possible errors when a primary-unit column is changed and the derived units based on it are not updated. Also, most computers can recalculate the derived units much faster than they can read a value from a disk drive.

5. _Use the same punctuation whenever a unit is displayed._ For example, do not mix ISO and ANSI date formats, or express weight in pounds and kilograms in the same report. Ideally, everything should be displayed in the same way in the entire application system.
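A minimal sketch of guideline 2, using the SQL-92 CREATE DOMAIN feature; the domain and table names are mine:

```sql
-- A nominal scale: a list of allowed values and a sensible default.
CREATE DOMAIN egg_grade AS CHAR(2)
DEFAULT 'A'
CHECK (VALUE IN ('AA', 'A', 'B'));

-- A ratio scale: range checking, with the natural origin as the default.
CREATE DOMAIN weight_kg AS DECIMAL(8,3)
DEFAULT 0.000
CHECK (VALUE >= 0.000);

CREATE TABLE Shipments
(shipment_nbr INTEGER NOT NULL PRIMARY KEY,
 grade egg_grade NOT NULL,
 net_weight weight_kg NOT NULL);
```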
CHAPTER 5 Data Encoding Schemes

YOU DO NOT put data directly into a database. You convert it into an encoding scheme first, then put the encoding into the rows of the tables. Words have to be written in an alphabet and belong to a language; measurements are expressed as numbers. We are so used to seeing words and numbers that we no longer think of them as encoding schemes.

We also often fail to distinguish among the possible ways to identify (and therefore to encode) an entity or property. Do we encode the person receiving medical services or the policy that is paying for them? That might depend on whether the database is for the doctor or for the insurance company. Do we encode the first title of a song or the alternate title, or both? Or should we include the music itself in a multimedia database? And should it be as an image of the sheet music or as an audio recording?

Nobody teaches people how to design these encoding schemes, so they are all too often done on the fly. Where standardized encoding schemes exist, they are too often ignored in favor of some ad hoc scheme. Beginning programmers have the attitude that encoding schemes do not really matter because the computer will take care of it, so they don't have to spend time on the design of their encoding schemes. This attitude has probably gotten worse with SQL than it was before. The new database designer thinks that an ALTER statement can fix any bad things he or she did at the start of the project.

Yes, the computer can take care of a lot of problems, but the data entry and validation programs become complex and difficult to maintain. Database queries that have to follow the same convoluted encodings will cost both computer time and money, and a human being still has to use the code at some point. Bad schemes result in errors in data entry and misreading of outputs and can lead to incorrect data models.

# 5.1 Bad Encoding Schemes

To use an actual example, the automobile tag system for a certain southern state started as a punchcard system written in COBOL. Many readers are likely too young to remember punchcard (keypunch) machines. A punchcard is a piece of stiff paper on which a character is represented as one or more rectangular holes made into one of 80 vertical columns on the card. Contiguous groups of columns make up fixed-length fields of data. The keypunch machine has a typewriter-like keyboard; it automatically feeds cards into the punch as fast as a human being can type. The position, length, and alphabetic or numeric shift for each field on the card can be set by a control card in the keypunch machine to save the operator keystrokes. This is a fixed format and a fast input method, but making changes to a program once it is in place is difficult.

The auto tag system had a single card column for a single-position numeric code to indicate the type of tag: private car, chauffeured car, taxi, truck, public bus, and so forth. As time went on, more tag types were added for veterans of assorted wars, for university alumni, and for whatever other lobbyist group happened to have the political power to pass a bill allowing it a special auto tag.

Soon there were more than 10 types, so a single-digit system could not represent them.
There was room on the punchcard to change the length of the field to two digits, but COBOL uses fixed-length fields, so changing the card layout would require changes in the programs and in the keypunch procedures.

The first new tag code was handled by letting the data-entry clerk press a punctuation-mark key instead of changing from numeric lock to manual shift mode. Once that decision was made, it was followed for each new code thereafter, until the scheme looked like everything on the upper row of keys on a typewriter.

Unfortunately, different makes and models of keypunch machines have different punctuation marks in the same keyboard position, so each deck of cards had to have a special program to convert its punches to the original model IBM 026 keypunch codes before the master file was updated. This practice continued even after all of the original machines had been retired to used-equipment heaven.

The edit programs could not check for a simple numeric range to validate input but had to use a small lookup routine with more than 20 values in it. That does not sound like much until you realize that the system had to handle more than 3 million records in the first quarter of the year. The error rate was high, and each batch needed to know which machine had punched the cards before it could use a lookup table.

If the encoding scheme had been designed with two digits (00 to 99) at the beginning, all of the problems would have been avoided. If I were to put this system into a database today, using video terminals for data entry, the tag type could be an INTEGER, and it could hold as many tag types as I would ever need. This is part of the legacy database problem.

The second example was reported in _Information Systems Week_ in 1987. The first sentence told the whole story: "The chaos and rampant error rates in New York City's new Welfare Management System appear to be due to a tremendous increase in the number of codes it requires in data entry and the subsequent difficulty for users in learning to use it." The rest of the article explained how the new system attempted to merge several old existing systems. In the merger, the error rates increased from 2 percent to more than 20 percent because the encoding schemes used could not be matched up and consolidated.

How do you know a bad encoding scheme when you see one? One bad feature is the failure to allow for growth. Talk to anyone who had to reconfigure a fixed-length record system to allow for the change from the old ZIP codes to the current ZIP+4 codes in their address data. SQL does not have this as a physical problem, but it can show up as a logical problem.

Another bad property is ambiguous encodings in the scheme. Perhaps the funniest example of this problem was the Italian telephone system's attempt at a "time of day" service. It used a special three-digit number, like the 411 information number in the United States, but the three digits they picked were also those of a telephone exchange in Milan, so nobody could call into that exchange without getting the time signal before they completed their call.

This happens more often than you would think, but the form that it usually takes is that of a miscellaneous code that is too general. Very different cases are then encoded as identical, and the user is given incorrect or misleading information when a query is performed.

A bad encoding scheme lacks codes for missing, unknown, not applicable, or miscellaneous values.
The classic story is the man who bought a prestige auto tag reading "NONE" and got thousands of traffic tickets as a result. The police had no special provision for a missing tag on the tickets, so when a car had no tag, they wrote "none" in the field for the tag number. The database simply matched his name and address to every unpaid missing-tag ticket on file at the time.

Before you say that the NULL in SQL is a quick solution to this problem, think about how NULL is ignored in many SQL functions. The SQL query "SELECT tag_nbr, SUM(fine) FROM tickets GROUP BY tag_nbr;" will give the total fines on each car, but it also puts all of the missing tags into one group (i.e., one car), although we want to see each one as a separate case, because it is unlikely that there is only one untagged car in all of California.

There are also differences among "missing," "unknown," "not applicable," "miscellaneous," and erroneous values that are subtle but important. For example, the International Classification of Disease uses 999.999 for miscellaneous illness. It means that we have diagnosed the patient, know that he or she has an illness, and cannot classify it—a scary condition for the patient—but this is not quite the same thing as a missing disease code (just admitted, might not even be sick), an inapplicable disease code (pregnancy complications in a male), an unknown disease code (sick and awaiting lab results), or an error in the diagnosis (the patient's temperature is recorded as 100 degrees Celsius, not Fahrenheit).

# 5.2 Encoding Scheme Types

The following is my classification system for encoding schemes and suggestions for using each of them. You will find some of these same ideas in library science and other fields, but I have never seen anyone else attempt a classification system for data processing.

## 5.2.1 Enumeration Encoding

An enumeration encoding arranges the attribute values in some order and assigns a number or a letter to each value. Numbers are usually a better choice than letters, because they can be increased without limit as more values are added. Enumeration schemes are a good choice for a short list of values but a bad choice for a long list. It is too difficult to remember a long list of codes, and soon any natural ordering principle is violated as new values are tacked onto the end.

A good heuristic is to order the values in some natural manner, if one exists in the data, so that table lookup will be easier. Chronological order (1 occurs before 2) or procedural order (1 must be done before 2) is often a good choice. Another good heuristic is to order the values from most common to least common. That way you will have shorter codes for the most common cases. Other orderings could be based on physical characteristics such as largest to smallest, rainbow-color order, and so on.

After arguing for a natural order in the list, I must admit that the most common scheme is alphabetical order, because it is simple to implement on a computer and makes it easy for a person to look up values in a table. ANSI standard X3.31, "Structure for the Identification of Counties of the United States for Information Interchange," encodes county names within a state by first alphabetizing the names and then numbering them from one to whatever is needed.

## 5.2.2 Measurement Encoding

A measurement encoding is given in some unit of measure, such as pounds, meters, volts, or liters. This can be done in one of two ways.
Either the column contains an implied unit of measure and the numbers represent the quantity in that unit, or the column explicitly contains the unit. The most common example of the second case would be money fields, where a dollar sign is used in the column; you know that the unit is dollars, not pounds or yen, by the sign.

Scales and measurement theory are a whole separate topic and are discussed in detail in Chapter 4.

## 5.2.3 Abbreviation Encoding

Abbreviation codes shorten the attribute values to fit into less storage space, but the reader easily understands them. The codes can be either of fixed length or of variable length, but computer people tend to prefer fixed length. The most common example is the two-letter postal state abbreviations (e.g., CA for California, AL for Alabama), which replaced the old variable-length abbreviations (Calif. for California, Ala. for Alabama).

A good abbreviation scheme is handy, but as the set of values becomes larger, the possibility for misunderstanding increases. The three-letter codes for airport baggage are pretty obvious for major cities: LAX for Los Angeles, SFO for San Francisco, BOS for Boston, ATL for Atlanta. But nobody can figure out the abbreviations for the smaller airports.

As another example, consider the ISO 3166 country codes, which come in two-letter, three-letter, and nonabbreviation numeric forms. The RIPE Network Coordination Centre maintains these codes.

## 5.2.4 Algorithmic Encoding

Algorithmic encoding takes the value to be encoded and puts it through an algorithm to obtain the encoding. The algorithm should be reversible, so that the original value can be recovered. Although it is not required, the encoding is usually shorter (or at least of known maximum size) and more uniform in some useful way compared with the original value. Encryption is the most common example of an algorithmic encoding scheme, but it is so important that it needs to be considered as a topic by itself.

Computer people are used to using Julianized dates, which convert a date into an integer. As an aside, please note that astronomers use the _Julian Date_, which is a large number that represents the number of days since a particular heavenly event. The Julianized date is a number between 1 and 365 or 366, which represents the ordinal position of the day within the year. Algorithms take up computer time in both data input and output, but the encoding is useful because it allows searching or calculations to be done that would be difficult using the original data. Julianized dates can be used for computations; Soundex names give a phonetic matching that would not be possible with the original text.

Another example is hashing functions, which convert numeric values into other numeric values for placing them in storage and retrieving them. Rounding numeric values before they go into the database is also a case of algorithmic encoding.

The difference between an abbreviation and an algorithm is not that clear. An abbreviation can be considered a special case of an algorithm, which tells you how to remove or replace letters. The tests to tell them apart are as follows:

1. When a human being can read it without effort, it is an abbreviation.

2. An algorithmic encoding is not easily human readable.

3. An algorithmic encoding might return the same code for more than one value, but an abbreviation is always one-to-one.
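A minimal sketch of the Julianized-date encoding; EXTRACT(DOY FROM ...) is PostgreSQL's spelling of the day-of-year function, so treat the exact syntax as product specific:

```sql
-- Encode: a calendar date becomes its ordinal position within the year.
SELECT EXTRACT(DOY FROM DATE '2005-12-01') AS julianized_date;  -- 335

-- Decode: the encoding is reversible, given the year, which is what
-- makes it algorithmic rather than a lossy transformation.
SELECT DATE '2005-01-01' + (335 - 1) AS calendar_date;          -- 2005-12-01
```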

## 5.2.5 Hierarchical Encoding Schemes

A hierarchy partitions the set of values into disjoint categories, then partitions those categories into subcategories, and so forth until some final level is reached. Such schemes are shown either as nested sets or as tree charts. Each category has some meaning in itself, and the subcategories refine meaning further.

The most common example is the ZIP code, which partitions the United States geographically. Each digit, as you read from left to right, further isolates the location of the address: first by postal region, then by state, then by city, and finally by the post office that has to make the delivery. For example, given the ZIP code 30310, we know that the 30000 to 39999 range means the southeastern United States. Within the southeastern codes, we know that the 30000 to 30399 range is Georgia and that 30300 to 30399 is metropolitan Atlanta. Finally, the whole code, 30310, identifies substation A in the West End section of the city. The ZIP code can be parsed by reading it from left to right, reading first one digit, then two, and then the last two digits.

Another example is the Dewey Decimal Classification (DDC) system, which is used in public libraries in the United States. The 500-number series covers "Natural Sciences"; within that, the 510s cover "Mathematics"; and, finally, 512 deals with "Algebra" in particular. The scheme could be carried further, with decimal fractions for kinds of algebra.

Hierarchical encoding schemes are great for large data domains that have a natural hierarchy. They organize the data for searching and reporting along that natural hierarchy and make it easy, but there can be problems in designing these schemes. First, the tree structure does not have to be neatly balanced, so some categories may need more codes than others and hence create more breakdowns. Eastern and ancient religions are shortchanged in the Dewey Decimal Classification system, reflecting a prejudice toward Christian and Jewish writings. Asian religions were pushed into a very small set of codes, yet today the Library of Congress has more books on Buddhist thought than on any other religion on earth.

Second, you might not have made the right choices as to where to place certain values in the tree. For example, in the Dewey Decimal system, books on logic are encoded as 164, in the philosophy section, and not under the 510s, mathematics. In the 19th century, there was no mathematical logic. Today, nobody would think of looking for logic under philosophy. Dewey was simply following the conventions of his day, and, like today's programmers, he found that the system specifications changed while he was working.

## 5.2.6 Vector Encoding

A vector is made up of a fixed number of components. These components can be ordered or unordered, but they are always present. They can be of fixed or variable length. The components can be dependent or independent of each other, but the code applies to a single entity and makes sense only as a whole unit. Punctuation, symbol-set changes, or position within the code can determine the components of the vector.

The most common example is a date, whose components are month, day, and year. The parts have some meaning by themselves, but the real meaning is in the vector—the date—as a whole because it is a complete entity. The different date formats used in computer systems give examples of all the options.
The three components can be written in year-month-day order, month-day-year order, or just about any other way you wish.

The limits on the values for the day depend on the year (is it a leap year or not?) and the month (28, 29, 30, or 31 days?). The components can be separated by punctuation (12/1/2005, using slashes and American date format), symbol-set changes (2005 DEC 01, using digits-letters-digits), or position (20051201, using positions 1 to 4, 5 to 6, and 7 to 8 for year, month, and day, respectively).

Another example is the ISO code for tire sizes, which is made up of a wheel diameter (scaled in inches), a tire type (abbreviation code), and a width (scaled in millimeters). Thus, 15R155 means a 15-inch radial tire that is 155 millimeters wide, whereas 15SR155 is a steel-belted radial tire with the same dimensions. Despite the mixed American and ISO units, this is a general physical description of a tire in a single code.

Vector schemes are informative and allow you to pick the best scheme for each component, but they have to be disassembled to get to the components (many database products provide special functions to do this for dates, street addresses, and people's names). Sorting by components is difficult unless you want them in the order given in the encoding; try to sort the tire sizes by construction, width, and diameter instead of by diameter, construction, and width.

Another disadvantage is that a bad choice in one component can destroy the usefulness of the whole scheme. Another problem is extending the code. For example, if the standard tire number had to be expanded to include thickness in millimeters, where would that measurement go? Another number would have to be separated by a punctuation mark. It could not be inserted into a position inside the code without giving ambiguous codes. The code cannot be easily converted to a fixed-position vector encoding without changing many of the database routines.

## 5.2.7 Concatenation Encoding

A concatenation code is made up of a variable number of components that are concatenated together. As in a vector encoding, the components can be ordered or unordered, dependent on or independent of each other, and determined by punctuation, symbol-set changes, or position.

A concatenation code is often a hierarchy that is refined by additions to the right. These are also known as _facet codes_ in Europe. Or the code can be a list of features, any of which can be present or missing. The order of the components may or may not be important.

Concatenation codes were popular in machine shops at the turn of the 20th century: A paper tag was attached to a piece of work, and workers at different stations would sign off on their parts of the manufacturing process. Concatenation codes are still used in parts of the airplane industry, where longer codes represent subassemblies of the assembly in the head (also called the root or parent) of the code.

Another type of concatenation code is a quorum code, which is not ordered. These codes say that n out of k marks must be present for the code to have meaning. For example, three out of five inspectors must approve a part before it passes.

The most common use of concatenation codes is in keyword lists in the header records of documents in textbases. The author or librarian assigns each article in the system a list of keywords that describes the material covered by the article.
The keywords are picked from a limited, specialized vocabulary that belongs to a particular discipline.

Concatenation codes fell out of general use because their variable length made them more difficult to store in older computer systems, which used fixed-length records (think of a punchcard). The codes had to be ordered and stored as left-justified strings to sort correctly.

These codes could also be ambiguous if they were poorly designed. For example, is the head of 1234 the 1 or the 12 substring? When concatenation codes are used in databases, they usually become a set of yes/no checkboxes, represented as adjacent columns in the file. This makes them Boolean vector codes, instead of true concatenation codes.

# 5.3 General Guidelines for Designing Encoding Schemes

These are general guidelines for designing encoding schemes in a database, not firm, hard rules. You will find exceptions to all of them.

## 5.3.1 Existing Encoding Standards

The use of existing standard encoding schemes is always recommended. If everyone uses the same codes, data will be easy to transfer and collect uniformly. Also, someone who sat down and did nothing else but work on this scheme probably did a better job than you could while trying to get a database up and running.

As a rule of thumb, if you don't know the industry in which you are working, ask a subject-area expert. Although that sounds obvious, I have worked on a media library database project where the programmers actively avoided talking to the professional librarians who were on the other side of the project. As a result, recordings were keyed on GUIDs and there were no Schwann catalog numbers in the system. If you cannot find an expert, then Google for standards. First, check to see if ISO has a standard, then check the U.S. government, and then check industry groups and organizations.

## 5.3.2 Allow for Expansion

Allow for expansion of the codes. The ALTER statement can create more storage when a single-character code becomes a two-character code, but it will not change the spacing on the printed reports and screens. Start with at least one more decimal place or character position than you think you will need. Visual psychology makes "01" look like an encoding, whereas "1" looks like a quantity.

## 5.3.3 Use Explicit Missing Values to Avoid NULLs

#### Rationale:

Avoid using NULLs as much as possible by putting special values in the encoding scheme instead. SQL handles NULLs differently than values, and NULLs don't tell you what kind of missing value you are dealing with.

All-zeros are often used for missing values and all-nines for miscellaneous values. For example, the ISO gender codes are 0 = Unknown, 1 = Male, 2 = Female, and 9 = Not Applicable. "Not applicable" means a lawful person, such as a corporation, which has no gender.

Versions of FORTRAN before the 1977 standard read blank (unpunched) columns in punchcards as zeros, so if you did not know a value, you skipped those columns and punched them later, when you did know. Likewise, using encoding schemes with leading zeros was a security trick to prevent blanks in a punchcard from being altered. The FORTRAN 77 standard fixed its "blank versus zero" problem, but it lives on in SQL in poorly designed systems that cannot tell a NULL from a blank string, an empty string, or a zero.

The use of all-nines or all-Z's for miscellaneous values will make those values sort to the end of the screen or report.
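
A minimal sketch of this style of declaration, using the ISO codes above (the table and column names are illustrative):

```sql
CREATE TABLE Personnel
(emp_nbr  INTEGER NOT NULL PRIMARY KEY,
 emp_name VARCHAR(35) NOT NULL,
 sex_code INTEGER DEFAULT 0 NOT NULL      -- 0 = unknown: explicit, not NULL
     CHECK (sex_code IN (0, 1, 2, 9)));   -- 9 = not applicable (lawful persons)
```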

NULLs sort either always to the front or always to the rear, but which way they sort is implementation defined.

#### Exceptions:

Sometimes NULLs cannot be avoided. For example, consider the column "termination_date" in the case of a newly hired employee. The use of a NULL makes computations easier and correct. The code simply leaves the NULL date or uses COALESCE (termination_date, CURRENT_TIMESTAMP) as is appropriate.

## 5.3.4 Translate Codes for the End User

As much as possible, avoid displaying pure codes to users, and try to provide a translation for them. Translation in the front end is not required for all codes, if they are common and well known to users. For example, most people do not need to see the two-letter state abbreviation written out in words. At the other extreme, however, nobody could read the billing codes used by several long-distance telephone companies.

A part of translation is formatting the display so that it can be read by a human being. Punctuation marks, such as dashes, commas, currency signs, and so forth, are important. However, in a tiered architecture, display is done in the front end, not the database. Trying to put leading zeros or add commas to numeric values in the database is a common newbie error. Suddenly, everything is a string and you lose all temporal and numeric computation ability.

These translation tables are one kind of auxiliary table; we will discuss other types later. They do not model an entity or relationship in the schema but are used like a function call in a procedural language. The general form for these tables is a simple read-only table with the encoding as the key and its definition or translation as the dependent column (a sketch appears in section 5.3.5).

Sometimes you might see the definition as part of the primary key or a CHECK() constraint on the "encode" column, but because these are read-only tables, which are maintained outside of the application, we generally do not worry about having to check their data integrity in the application.

### 5.3.4.1 One True Lookup Table

Sometimes a practice is both so common and so stupid that it gets a name, and, much like a disease, if it is really bad, it gets an abbreviation. I first ran into the One True Lookup Table (OTLT) design flaw in a thread on a CompuServe forum in 1998, but I have seen it rediscovered in newsgroups every year since.

Instead of keeping each encoding and its definition in a table of its own, we put all of the encodings in one huge table. The schema for this table is roughly a "code_type" column, a "code_value" column, and a "code_definition" column, all declared as VARCHAR(n) or VARCHAR(m). In practice, _m_ and _n_ are usually something like 255 or 50—default values particular to their SQL product.

The rationale for having all encodings in one table is that it would let the programmer write a single front-end program to maintain all of the encodings. This method really stinks, and I strongly discourage it. Without looking at the following paragraphs, sit down and make a list of all the disadvantages of this method and see if you found anything that I missed. Then read the following list:

1. _Normalization._ The real reason that this approach does not work is that it is an attempt to violate first normal form. I can see that these tables have a primary key and that all of the columns in a SQL database have to be scalar and of one data type, but I will still argue that it is not a first normal form table. The fact that two domains use the same data type does not make them the same attribute. The extra "code_type" column changes the domain of the other columns and thus violates first normal form because the column is not atomic. A table should model one set of entities or one relationship, not hundreds of them.
As Aristotle said, "To be is to be something in particular; to be nothing in particular is to be nothing."

2. _Total storage size._ The total storage required for the OTLT is greater than the storage required for the one-encoding, one-table approach because of the redundant encoding type column. Imagine having the entire International Classification of Diseases (ICD) and the Dewey Decimal system in one table. With separate tables, only the small encoding tables actually needed are brought into main storage; the entire OTLT has to be paged in and out of main storage to jump from one encoding to another.

3. _Data types._ All encodings are forced into one data type, which has to be a string of the largest length that any encoding—present and future—uses in the system, but VARCHAR(n) is not always the best way to represent data. The first thing that happens is that someone inserts a huge string that looks right on the screen but has trailing blanks or an odd character on the far right side of the column. The table quickly collects garbage.

CHAR(n) data often has advantages for access and storage in many SQL products. Numeric encodings can take advantage of arithmetic operators for ranges, check digits, and so forth with CHECK() clauses. Dates can be used as codes that are translated into holidays and other events. Data types are not a one-size-fits-all affair. If one encoding allows NULLs, then all of them must in the OTLT.

4. _Validation._ The only way to write a CHECK() clause on the OTLT is with a huge CASE expression that branches on the "code_type" column, with one WHEN clause per encoding scheme in the system. This means that validation is going to take a long time, because every change will have to be considered by all the WHEN clauses in this oversized CASE expression until the SQL engine finds one that tests TRUE. You also need to add a CHECK() clause to the "code_type" column to be sure that the user does not create an invalid encoding name.

5. _Flexibility._ The OTLT is created with one column for the encoding, so it cannot be used for (n)-valued encodings where (n > 1). For example, if I want to translate (longitude, latitude) pairs into a location name, I would have to carry an extra column.

6. _Maintenance._ Different encodings can use the same value, so you constantly have to watch which encoding you are working with. For example, both the ICD and the Dewey Decimal system have three digits, a decimal point, and three digits.

7. _Security._ To avoid exposing rows in one encoding scheme to unauthorized users, the OTLT has to have VIEWs defined on it that restrict users to the "code_type"s they are allowed to update. At this point, some of the rationale for the single table is gone, because the front end must now handle VIEWs in almost the same way it would handle multiple tables. These VIEWs also have to have the WITH CHECK OPTION clause, so that users do not make a valid change that is outside the scope of their permissions.

8. _Display._ You have to CAST() every encoding for the front end. This can be a lot of overhead and a source of errors when the same monster string is CAST() to different data types in different programs.

## 5.3.5 Keep the Codes in the Database

A part of the database should have all of the codes stored in tables. These tables can be used to validate input, to translate codes in displays, and as part of the system documentation.
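
The general form mentioned in section 5.3.4 is a sketch like this (the names are illustrative; the "date effective" pair anticipates the point below about codes that change over time):

```sql
CREATE TABLE DiseaseCodes
(disease_code         CHAR(7) NOT NULL,
 disease_description  VARCHAR(50) NOT NULL,
 effective_start_date DATE NOT NULL,
 effective_end_date   DATE,               -- NULL = still in effect
 PRIMARY KEY (disease_code, effective_start_date));
```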

I was amazed to go to a major hospital in Los Angeles in mid-1993 and see the clerk still looking up codes in a dog-eared looseleaf notebook instead of bringing them up on her terminal screen. The hospital was still using an old IBM mainframe system, which had dumb 3270 terminals, rather than a client/server system with workstations. There was not even a help screen available to the clerk.

The translation tables can be downloaded to the workstations in a client/server system to reduce network traffic. They can also be used to build picklists on interactive screens and thereby reduce typographical errors. Changes to the codes are thereby propagated in the system without anyone having to rewrite application code. If the codes change over time, the table for a code should include a pair of "date effective" columns. This will allow a data warehouse to correctly read and translate old data.

# 5.4 Multiple Character Sets

Some DBMS products can support ASCII, EBCDIC, and Unicode. You need to be aware of this, so you can set proper collations and normalize your text.

The predicate "<string> IS [NOT] NORMALIZED" in SQL-99 determines if a Unicode string is in one of four normal forms (i.e., D, C, KD, and KC). The use of the words _normal form_ here is not the same as in a relational context. In the Unicode model, a single character can be built from several other characters. Accent marks can be put on basic Latin letters. Certain combinations of letters can be displayed as ligatures (ae becomes æ). Some languages, such as Hangul (Korean) and Vietnamese, build glyphs from concatenating symbols in two dimensions. Some languages have special forms of one letter that are determined by context, such as the terminal sigma in Greek or the accented u in Czech. In short, writing is more complex than putting one letter after another.

The Unicode standard defines the order of such constructions in their normal forms. You can still produce the same results with different orderings and sometimes with different combinations of symbols, but it is handy when you are searching such text to know that it is normalized rather than trying to parse each glyph on the fly. You can find details about normalization and links to free software at www.unicode.org.

CHAPTER 6 Coding Choices

_"Caesar: Pardon him, Theodotus. He is a barbarian and thinks the customs of his tribe and island are the laws of nature."_

—_Caesar and Cleopatra_, by George Bernard Shaw, 1898

THIS CHAPTER DEALS WITH writing good DML statements in Standard SQL. That means they are portable and can be optimized well by most SQL dialects. I define _portable_ to mean one of several things. The code is standard and can be run as-is on other SQL dialects; standard implies portable. Or the code can be converted to another SQL dialect in a simple mechanical fashion, or the feature used is so universal that all or most products have it in some form; portable does not imply standard. You can get some help with this concept from the X/Open SQL Portability Guides.

A major problem in becoming a SQL programmer is that people do not unlearn the procedural or OO programming they had to learn for their first languages. They do not learn how to think in terms of sets and predicates, and so they mimic the solutions they know in their first programming languages. Jerry Weinberg (1978) observed this fact more than 25 years ago in his classic book, _The Psychology of Computer Programming_. He was teaching PL/I.
For those of you younger readers, PL/I was a language from IBM that was a hybrid of FORTRAN, COBOL, and ALGOL, and that enjoyed a brief craze. Weinberg found that he could tell the first programming languages of the students by how they wrote PL/I. My personal experience (1989) was that I could guess the nationality of the students in my C and Pascal programming classes because of their native spoken language.

Another problem in becoming a SQL programmer is that people tend to become SQL dialect programmers and think that their particular product's SQL is some kind of standard. In 2004, I had a job interview for a position where I was being asked to evaluate different platforms for a major size increase in the company's databases. The interviewer kept asking me "general SQL" questions based on the storage architecture of the only product he knew.

His product is not intended for Very Large Database (VLDB) applications, and he had no knowledge of Nucleus, Teradata, Model 204, or other products that compete in the VLDB arena. He had spent his career tuning one version of one product and could not make the jump to anything different, even conceptually. His career is about to become endangered.

There is a place for the specialist dialect programmer, but dialect programming should be a last resort in special circumstances and never the first attempt. Think of it as cancer surgery: You do massive surgery when there is a bad tumor that is not treatable by other means; you do not start with it when the patient comes in with acne.

# 6.1 Pick Standard Constructions over Proprietary Constructions

There is a fact of life in the IT industry called the Code Museum Effect, which works like this: First, each vendor adds a feature to its product. The feature is deemed useful, so it gets into the next version of the standard with slightly different syntax or semantics, but the vendor is stuck with its proprietary syntax. Its users have written code based on it, and they do not want to redo it. The solutions are the following:

1. _Never implement the standard and just retain the old syntax._ The problem is that you cannot pass a conformance test, which can be required for government and industry contracts. SQL programmers who know the standard from other products cannot read, write, or maintain your code easily. In short, you have the database equivalent of last year's cell phone.

2. _Implement the standard, but retain the old syntax, too._ This is the usual solution for a few releases. It gives the users a chance to move to the standard syntax but does not break the existing applications. Everyone is happy for a while.

3. _Implement the standard and deprecate the old syntax._ The vendor is ready for a major release, which lets it redo major parts of the database engine. Changing to the standard syntax and not supporting the old syntax at this point is a good way to force users to upgrade their software and help pay for that major release.

A professional programmer would be converting his or her old code at step two to avoid being trapped in the Code Museum when step three rolls around. Let's be honest: Massive code conversions do not happen until after step three occurs in most shops, and they are a mess, but you can start to avoid the problems by always writing standard code in a step two situation.

## 6.1.1 Use Standard OUTER JOIN Syntax

#### Rationale:

Here is how the standard OUTER JOINs work in SQL-92.
Assume you are given two tables, Table1 and Table2, and the OUTER JOIN expression "Table1 LEFT OUTER JOIN Table2 ON Table1.a = Table2.a". We call Table1 the "preserved table" and Table2 the "unpreserved table" in the query. What I am going to give you is a little different from, but equivalent to, the ANSI/ISO standards.

1. We build the CROSS JOIN of the two tables. Scan each row in the result set.

2. If the predicate tests TRUE for that row, then you keep it. You also remove all rows derived from it from the CROSS JOIN.

3. If the predicate tests FALSE or UNKNOWN for that row, then keep the columns from the preserved table, convert all the columns from the unpreserved table to NULLs, and remove the duplicates.

You can execute this by hand: Write out "Table1 CROSS JOIN Table2", apply the rules above, and compare what remains with the result of "Table1 LEFT OUTER JOIN Table2". The basic rule is that every row in the preserved table is represented in the results in at least one result row.

### 6.1.1.1 Extended Equality and Proprietary Syntax

Before the standard was set, vendors all had a slightly different syntax with slightly different semantics. Most of them involved an extended equality operator based on the original Sybase implementation. There are limitations and serious problems with the extended equality, however. Consider the two classic Chris Date tables of suppliers and shipments, and a Sybase-style extended equality OUTER JOIN that adds a local predicate, "(qty < 200)", on a column of the unpreserved table. If the engine does the OUTER JOIN first and then applies the "(qty < 200)" predicate, you get one result; doing it in the opposite order (filtering first, then preserving rows) produces a different result.

Sybase does it one way, Oracle does it another, and Centura (née Gupta) lets you pick which one to use—the worst of both nonstandard worlds! In SQL-92, you have a choice and can force the order of execution: Either do the predicate after the join, in the WHERE clause, or do it in the joining itself, as part of the ON clause.

Another problem is that you cannot show the same table as preserved and unpreserved in the extended equality version, but it is easy in SQL-92. For example, you can join an enrollment table to itself to find the students who have taken Math 101 and might have taken Math 102.

#### Exceptions:

None. Almost every vendor, major and minor, has the ANSI infixed OUTER JOIN operator today. You will see various proprietary notations in legacy code, and you can convert them by following the discussion given previously.

## 6.1.2 Infixed INNER JOIN and CROSS JOIN Syntax Is Optional, but Nice

SQL-92 introduced the INNER JOIN and CROSS JOIN operators to match the OUTER JOIN operators and complete the notation; other infixed JOIN operators are not widely implemented but exist for completeness. The functionality of the INNER JOIN and CROSS JOIN existed in the FROM clause before and did not give the programmer anything new, as the OUTER JOINs did.

#### Rationale:

The CROSS JOIN is a handy piece of documentation that is much harder to miss seeing than a simple comma. Likewise, writing out INNER JOIN instead of the shorthand JOIN helps document the code.

However, many INNER JOIN operators can be visually confusing, and you might consider using the older syntax. The older syntax lets you put all of the predicates in one place and group them in some manner for readability. A rule of thumb is the "rule of five" in human psychology. This says that we have problems handling more than five things at once, get serious problems with seven, and break down at nine (Miller 1956).

So when you have fewer than five tables, the infixed operators are fine, but they are questionable for more than five INNER JOIN-ed tables. Trying to associate ON clauses to INNER JOIN operators is visually difficult.
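
A minimal sketch of the syntax choices, with illustrative table names:

```sql
-- Old style: join conditions and search arguments together in WHERE.
SELECT O.order_nbr, C.cust_name
  FROM Orders AS O, Customers AS C
 WHERE O.cust_nbr = C.cust_nbr
   AND O.order_date >= DATE '2005-01-01';

-- New style: everything in the FROM clause with infixed operators.
SELECT O.order_nbr, C.cust_name
  FROM Orders AS O
       INNER JOIN Customers AS C
       ON O.cust_nbr = C.cust_nbr
      AND O.order_date >= DATE '2005-01-01';

-- Mixed style: join conditions in the ON clause, search arguments
-- in the WHERE clause, where they are easy to find and change.
SELECT O.order_nbr, C.cust_name
  FROM Orders AS O
       INNER JOIN Customers AS C
       ON O.cust_nbr = C.cust_nbr
 WHERE O.order_date >= DATE '2005-01-01';
```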

In particular, a Star Schema has an easily recognized pattern of joins from the fact table to each dimension table: The WHERE clause is a vertical list of "fact-table column = dimension-table key" predicates, one per dimension. The reader can look down the right-hand side of the WHERE clause and see the dimensions in a vertical list.

One style that is popular is to put the join conditions in the FROM clause with INNER JOIN syntax, and then do the search arguments in the WHERE clause. Some newbies believe that this is required, but it is not. However, if the search arguments change, having them in one place is handy.

A quick heuristic when using old-style joins is that the number of tables in the FROM clause should be one more than the number of join conditions in the WHERE clause. This shows that you do not have cycles in the joins. If the difference between the number of tables and the number of join conditions is more than one, then you might have an unwanted CROSS JOIN caused by a missing join condition. The old, new, and mixed styles are contrasted in the sketch shown above.

#### Exceptions:

The infixed join operators must be used if there is an OUTER JOIN in the FROM clause. The reason is that the order of execution matters with OUTER JOINs, and you can control it better with parentheses and predicates if they are all together.

As a rule of thumb, when you have a FROM clause with five or more tables in it, the traditional syntax is probably easier to read than trying to visually match the ON clauses to the proper tables and correlation names. This rule of five is mentioned in other places as a limit on human data processing ability.

## 6.1.3 Use ISO Temporal Syntax

#### Rationale:

The only display format allowed for temporal data in Standard SQL is based on ISO-8601, and it is the "yyyy-mm-dd hh:mm:ss.sssss" style. The Federal Information Processing Standards (FIPS) require at least five decimal places of precision in the seconds. Anything else is ambiguous and not acceptable if you want to work with other software that follows ISO standards.

Standard SQL defines a minimal set of simple temporal math operators. All of them are available in all SQL products, but the syntax varies. For example, in the T-SQL dialect, the function call "DATEADD (DD, 13, birthdate)" adds 13 days to the date in birthdate. The Standard SQL syntax for the same calculation is "birthdate + INTERVAL '13' DAY" instead.

You can set the display to ISO-8601 in every SQL product, and you can do 99.99 percent of your temporal work without any proprietary temporal functions. The problem is that porting code can be a bother. You need to make a set of notes about any differences between your dialect and the standard.

#### Exceptions:

None. Display formatting is always done in the client layer of a tiered architecture. This is a basic programming principle and has nothing to do with SQL per se. Failure to follow this principle is usually the result of a newbie who came to SQL from a traditional monolithic language with a strong coupling between the application, the display, and the file system.

## 6.1.4 Use Standard and Portable Functions

#### Rationale:

Standard SQL is not a computational language, so it does not have the function library of FORTRAN or a statistical package. SQL is not a text manipulation language, so it does not have the function library of ICON or Snobol. All you have is simple four-function math and basic string operators in SQL-92. Vendors have always provided more than just the basic operators, so you can write portable code that assumes other math and string functions.
The most common extra math functions are modulus, rounding and truncation, powers, and logarithms. The most common extra string functions are replacement, reversal, and repetition.

#### Exceptions:

If your dialect has a function built into it that would require a huge amount of code or a really long running time to implement by hand, then use the proprietary function and comment it for porting.

# 6.2 Pick Compact Constructions over Longer Equivalents

_"Entia non sunt multiplicanda praeter necessitatem."_ (No more things should be presumed to exist than are absolutely necessary.)

—William Occam (c. 1280–1349)

_"Everything should be made as simple as possible, but not simpler."_

—Attributed to Albert Einstein

Writing code in as short, clear, and compact a form as possible is just good software engineering for any programming language. Modules that clearly do one function are easier to modify and to understand. Systems with fewer modules are easier to maintain.

SQL can replace hundreds of lines of procedural code with a few statements. You ought to be predisposed to think of short, clean solutions instead of kludges. However, old habits are hard to kill. Many newbies still think in terms of logical tests based on Boolean logic and simple AND-OR-NOT expressions that they know from their first programming languages.

## 6.2.1 Avoid Extra Parentheses

#### Rationale:

Newbies see generated SQL code that has to have extra levels of parentheses to execute safely and think that this is the way to write code. A simple query whose generated WHERE clause wraps every comparison, and every pairing of comparisons, in its own parentheses is not so bad to read, but by the time you have more than five predicates and useless nesting of parentheses, the code is difficult to read, and a missing parenthesis is a real pain to locate. Let LISP programmers use them; they really need parentheses.

#### Exceptions:

Parentheses in moderation can make nested predicates easier to read; compare "(a = 1 AND b = 2) OR (a = 3 AND b = 4)" versus the same predicate left entirely to the precedence rules of AND and OR. In the following section, we will also see how to use a CASE expression for situations like this one.

## 6.2.2 Use CASE Family Expressions

The CASE expression is an expression and not a control statement; that is, it returns a value of one data type. Because SQL is declarative, there is no flow of control for it to modify, like the CASE statements in other languages. The number of newbies who do not understand the difference between an expression and a statement is frightening.

The idea and the syntax came from the ADA programming language. Here is the formal BNF syntax for a <case specification>; I recommend always giving the ELSE clause, so that you can change it later when you find something explicit to return.

<case specification> ::= <simple case expression> | <searched case expression>

<searched case expression> ::= CASE <searched when clause>... [ELSE <result>] END

<searched when clause> ::= WHEN <search condition> THEN <result>

### 6.2.2.2 Simple CASE Expression

The <simple case expression> is defined as a searched CASE expression in which all of the WHEN clauses are made into equality comparisons against the <case operand>.

### 6.2.2.3 Other CASE Expressions

The SQL-92 standard defines other functions in terms of the CASE expression:

1. COALESCE (<value exp #1>) is equivalent to (<value exp #1>)

2. COALESCE (<value exp #1>, <value exp #2>) is equivalent to:

CASE WHEN <value exp #1> IS NOT NULL THEN <value exp #1> ELSE <value exp #2> END

Then we can recursively define it for (n) expressions, where (n >= 3), in the list, with COALESCE (<value exp #1>, <value exp #2>, ..., <value exp #n>) equivalent to:

CASE WHEN <value exp #1> IS NOT NULL THEN <value exp #1> ELSE COALESCE (<value exp #2>, ..., <value exp #n>) END

Likewise, NULLIF (<value exp #1>, <value exp #2>) is equivalent to:

CASE WHEN <value exp #1> = <value exp #2> THEN NULL ELSE <value exp #1> END

Use the most compact form of these CASE expressions, and do not expand them out to their definitions.

## 6.2.3 Avoid Redundant Expressions

#### Rationale:

Most modern SQL engines are pretty smart. This was not always the case, so older SQL programmers will sometimes add redundant predicates to a WHERE clause.
For example, if none of the columns in the table Foobar is NULL-able, then in a query such as "SELECT * FROM Foobar WHERE a = b AND b = c AND a = c", one of the three search conditions is redundant, because it can be deduced from the other two. Redundant predicates only confuse the human readers and do not give information to a good optimizer.

#### Exceptions:

If your SQL has a bad optimizer and needs the extra help, then add redundant predicates.

## 6.2.4 Seek a Compact Form

#### Rationale:

Many of the earlier SQL engines could not use an index on a column if it were in an expression, and they did not do any algebraic optimizations. Today, we still do this bit of cleanup work because the simpler, algebraically reduced form of an expression is easier to maintain and to read. Move the constants to one side of the comparison and leave the bare column on the other, so that, for example, "(salary * 12) >= 120000.00" becomes "salary >= 10000.00".

#### Exceptions:

If your SQL has a really good optimizer, and the complicated form is easier for a human being to read for some reason, then use it. Sometimes there is no simple form.

### 6.2.4.1 Use BETWEEN, Not AND-ed Predicates

#### Rationale:

Consider the simple search condition "low_score <= score AND score <= high_score", which can be written as "score BETWEEN low_score AND high_score". The BETWEEN is more compact and gives the reader information about the relationship among the three columns that might not be so obvious amid a longer list of search conditions.

#### Exceptions:

This rule makes sense from a readability standpoint, but it does not always stand up in terms of performance. Consider DB2 for z/OS, in which "<column> BETWEEN <value1> AND <value2>" is both indexable and a stage one predicate. Without explaining what a stage one predicate is, it is preferred for performance.

However, "<value> BETWEEN <column1> AND <column2>" is both stage two and nonindexable, but formulating the same condition using two <= predicates could be both stage one and indexable and therefore preferable for performance. Likewise, the same caveat applies to "<column1> BETWEEN <column2> AND <column3>" predicates. This will differ from DBMS to DBMS and platform to platform. As optimizers get better, this will be less and less true.

### 6.2.4.2 Use IN(), Not OR-ed Predicates

#### Rationale:

The IN() predicate was first introduced in the Pascal programming language. In SQL it has two forms: the list and the subquery. The list form has a comma-separated list of values or expressions on the right-hand side. The predicate returns a TRUE result if there is a match in that list with the left-hand side of the predicate. It is shorthand for a list of OR-ed predicates. For example, consider "x = 1 OR x = 2 OR x = 3", which can be written as "x IN (1, 2, 3)". The IN() is more compact and shows the reader the values as a set, which might not be so obvious amid a longer list of search conditions. The list can also consist of scalar expressions, but that is not common.

#### Exceptions:

Watch out for NULLs! The IN() predicate is defined as a chain of OR-ed predicates, thus "x IN (a, b, c)" means "(x = a OR x = b OR x = c)". Therefore, a NULL in the list can never produce a match; "x = NULL" tests UNKNOWN, not TRUE. We are now in SQL's three-valued logic. Remember that a NULL is not the same thing as an UNKNOWN; SQL-92 has no Boolean data type; and you cannot use AND, OR, and NOT on a NULL.

The NOT IN() predicate is defined as the negation of the IN(), so "x NOT IN (a, b, c)" means "NOT (x = a OR x = b OR x = c)", which becomes "(x <> a AND x <> b AND x <> c)". Now put in a NULL for one of the list elements: "(x <> a AND x <> NULL AND x <> c)" can never test TRUE; at best it is UNKNOWN, and the row is rejected.

If you wish to have a match on a NULL in a list, then you can COALESCE() the NULLs to the left-hand expression, thus: "COALESCE (x, 'missing') IN (a, 'missing', c)", where 'missing' is a dummy value standing in for the NULL in the list; this is a little cleaner than "(x IN (a, c) OR x IS NULL)".

### 6.2.4.3 Use CASE Expressions, Not Complex Nested Predicates

An advanced trick in the WHERE clause is to use a CASE expression for a complex predicate with material implications.
If you forgot your freshman logic, a material implication is written as an arrow with two tails, and it means "p implies q" or "if p is true, then q is true" in English.

The use of a function that returns one or zero when given a predicate as its parameter is called a _characteristic function_ in logic and set theory.

Review the rules for the CASE expression in section 6.2.2 first, so you understand it. The order of execution of the WHEN clauses can be used to optimize performance and avoid redundant tests. You can also nest CASE expressions inside the WHEN and THEN clauses of a containing CASE expression and display the logic as an indented tree structure.

The goal of this technique is to replace pages of long lists of simple theta expressions inside horrible levels of parentheses and to provide some short-circuit evaluation as a bonus. When the nesting is too messy to understand, stop and reconsider your logic. Decision table tools, such as Logic Gem, are an excellent way to do this.

# 6.3 Use Comments

#### Rationale:

The best documentation for maintaining a program has always been comments in the code. Perhaps it is easier for procedural language programmers to add comments because they are explaining in a narrative fashion what their program is doing. Unfortunately, procedural language comments are often redundant if you can read the code. How much help did you ever get from a comment like "score = score + 1; -- add one to score", which gives you no information about what the variable score means and why it is incremented?

In Standard SQL, a comment begins with two dashes (--) and ends with a new line, because the first SQL engines were on IBM mainframes and used punchcards. This format is a poor choice with modern computers that can store free-form text. Word wrap in program text can split a comment and give you errors. Because SQL supports the unary minus operator, this notation is ambiguous in some rare situations and makes the compiler work extra hard. Later standards added the C-style /* and */ pairs, and many vendors have similar comment brackets. They are a better choice.

SQL programmers do not like to put comments in their code, not even redundant or useless ones. My guess is that because SQL does a lot of work in one statement and programmers have been taught to comment the code at the statement execution level rather than explain the purpose of the code, the higher level of abstraction confuses them. They are not inclined to put comments at the clause level because the appearance of the code can be crowded.

Get over it. You need a high-level descriptive comment on a block of SQL, and then more detailed comments on a few important clauses. Try to keep the comments aimed at non-SQL programmers and in plain English. For example, don't say "relational division of motor pool vehicles by available drivers" on the assumption that the reader will know what a relational division is. Try "list all drivers who can drive all the vehicles in the motor pool" instead. The other trick is to reference the documentation for the schema and the applications. This assumes that they are current and useful, however.

If you have the time, another guru-level trick is to save the best of the various statements you tried that worked but did not perform as well as the final choice as comments. In SQL, what was the best answer in one situation is often no longer the best answer. Instead of making the next programmer start from scratch, share your notes.
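
A minimal sketch of the commenting style being recommended (the tables are illustrative; QualifiedDrivers holds one row per driver/vehicle-type pair, and MotorPool holds one row per vehicle type):

```sql
/* List all drivers who can drive all the vehicles in the motor pool
   (relational division). See the fleet-scheduling documentation. */
SELECT D1.driver_nbr
  FROM QualifiedDrivers AS D1,
       MotorPool AS M1
 WHERE D1.vehicle_type = M1.vehicle_type   -- keep only pool vehicles
 GROUP BY D1.driver_nbr
HAVING COUNT(*) = (SELECT COUNT(*)         -- driver covers the whole pool
                     FROM MotorPool);
```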

#### Exceptions:

In a well-designed schema with good data element names, much of the code is easy for an experienced SQL programmer to read. You can skip comments on single statements if their intent is really obvious, but remember that one programmer's "obvious" is another's "what the heck?" when you code.

## 6.3.1 Stored Procedures

Always start a stored procedure with a comment that gives at least the author, the date, and the update history. This is simply basic software management. After that, add a high-level description of the function of this module. The procedure name will be in a "<verb>_<object>" format. Each parameter should have a comment as needed.

## 6.3.2 Control Statement Comments

Comments on control statements, such as IF-THEN-ELSE, BEGIN-END, and WHILE-DO loops, will look much like comments in any procedural program. Complicated SQL statements need a comment at the top and often comments at the clause level.

## 6.3.3 Comments on Clauses

This point is difficult to generalize, but things that act as a unit might need a comment. For example, a derived table for which there is no good alias might need a comment to explain what it contains. A series of predicates that define a complicated join might be prefaced with a comment to explain what they are doing at a higher level.

# 6.4 Avoid Optimizer Hints

#### Rationale:

Many products have proprietary syntax for sending parameters to the optimizer to change the execution plan for a statement. Because each physical implementation is different, this syntax will not be portable, but there are other problems, too.

First, the optimizer is usually smarter than the programmer and finds a good plan. People cannot handle computations that involve tens of parameters very well. Second, once a hint is put on a statement, it stays there permanently, long after the reason for the hint is gone. A typical example would be a query hint set up for a skewed statistical distribution; as the database grows, the distribution becomes more normal, or skewed in the opposite direction. The hint that used to be so helpful is now a handicap.

#### Exceptions:

If you do have a skewed statistical distribution or other weirdness in your data that is destroying performance, then use a hint. Set up a review of all statements with hints to see if they actually need to be maintained. Reviews should occur when a new release of the database is installed (the optimizer might be better) or when the statistics of one or more of the tables change (the data might be better), but if the performance is acceptable, then do not use hints.

# 6.5 Avoid Triggers in Favor of DRI Actions

#### Rationale:

Although there is an ANSI/ISO standard for triggers, their syntax and semantics are still highly proprietary. Triggers are blocks of procedural code that are executed (fired) when a database event occurs to a table. This code is usually in a proprietary 3GL language. A database event is something that changes the data—an insert, update, or delete.

The full ANSI version of triggers does not fire on an insertion, but some vendor products do. The full ANSI version of triggers allows more than one trigger on a table and can fire them in a sequence either before or after the database event. Most vendor products do not have that much control over their triggers. On the other hand, the syntax and semantics for DRI actions are well defined and standardized.
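
For example, a minimal sketch of declarative referential actions (the names are illustrative, and a Customers table keyed on cust_nbr is assumed to exist):

```sql
CREATE TABLE Orders
(order_nbr INTEGER NOT NULL PRIMARY KEY,
 cust_nbr  INTEGER NOT NULL
     REFERENCES Customers (cust_nbr)
     ON UPDATE CASCADE      -- renumbering a customer propagates here
     ON DELETE CASCADE);    -- removing a customer removes the orders
```

The engine maintains the relationship declaratively; no procedural trigger code is needed, and the optimizer knows exactly what the actions mean.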

A newbie posted a topic under the title "Need Help with a Calculation Trigger" on the forums of the SQL Server Central Web site in November 2004. This person was having trouble setting up a trigger to check the units of a "number field [sic]"; the real problem was that the poster did not know that a column is not a field.

For some reason, the column was declared as FLOAT and was called length. The trouble was that some people were entering a length in meters, some in centimeters, and some in millimeters. The poster was trying to code a trigger that would fire on UPDATE or INSERT to check the value of length. If it is greater than 20, chances are the number is in millimeters and should be divided by 10. If the number is less than 2, then the number is probably in meters and should be multiplied by 100.

However, this is the wrong answer, because it is in procedural code. The right answer is in the DDL, with something like "length_cm DECIMAL (8,2) NOT NULL CHECK (length_cm BETWEEN 2.00 AND 20.00)", where the unit of measure is part of the column name and the constraint rejects out-of-range values at the door. Triggers tend to fix errors on the fly; the goal is not to permit them in the first place.

#### Exceptions:

Some things should be done with triggers because you cannot do them with DRI. In particular, the INSTEAD OF trigger has to be used for updatable views. This trigger is attached to a VIEW, and instead of taking actions on the VIEW, it changes the base tables from which the VIEW is built, so that the user sees those changes reflected in the VIEW.

Heuristics tend to favor stored procedures over triggers. A trigger fires every time its database event occurs, which puts it out of your control and adds that overhead to each database event. A stored procedure has to be deliberately executed, which puts it completely in your control. Furthermore, the syntax for triggers is proprietary despite the standards, so they do not port well.

# 6.6 Use SQL Stored Procedures

Every SQL product has some kind of 4GL language that allows you to write stored procedures that reside in the database and that can be invoked from a host program. Although there is a SQL/PSM standard, in the real world, only Mimer and IBM have implemented it at the time of this writing. Instead, each vendor has a proprietary 4GL, such as T-SQL for the Sybase/SQL Server family, PL/SQL from Oracle, Informix-4GL from Informix, and so forth. For more details on these languages, I recommend that you get a copy of Jim Melton's excellent book on the subject, _Understanding SQL's Stored Procedures_ (ISBN 1-55860-461-8, out of print). The advantages stored procedures have are considerable, including the following:

* _Security._ The users can only do what the stored procedure allows them to do, whereas dynamic SQL or other ad hoc access to the database allows them to do anything to the database. The safety and security issues ought to be obvious.
* _Maintenance._ The stored procedure can be easily replaced and recompiled with an improved version. All of the host language programs that call it will benefit from the improvements that were made and not be aware of the change.
* _Network traffic._ Because only parameters are passed, network traffic is lower than passing SQL code to the database across the network.
* _Consistency._ If a task is always done with a stored procedure, then it will be done the same way each time. Otherwise, you have to depend on all programmers (present and future) getting it right. Programmers are not evil, but they are human. When you tell someone that a customer has to be at least 18 years of age, one programmer will code "age > 18" and another will code "age >= 18" without any evil intent.
You cannot expect everyone to remember all of the business rules and write flawless code forever.
* _Modularity._ Once you have a library of stored procedures, you can reuse them to build other procedures. Why reinvent the wheel every week?

Chapter 8 is a general look at how to write stored procedures in SQL. If you look at any of the SQL newsgroups, you will see awful code. Apparently, programmers are not taking basic software engineering courses anymore, or they think that the old rules do not apply to a vendor's 4GL language.

# 6.7 Avoid User-Defined Functions and Extensions inside the Database

#### Rationale:

SQL is a set-oriented language and wants to work with tables rather than scalars, but programmers will try to get around this model of programming to return to what they know by writing user-defined functions in other languages and putting them into the database.

There are two kinds of user-defined functions and extensions. Some SQL products allow functions written in another standard language to become part of the database and to be used as if they were just another part of SQL. Others have a proprietary language in the database that allows the user to write extensions.

Even the SQL/PSM allows you to write user-defined functions in any of the ANSI X3J standard programming languages that have data-type conversions and interfaces defined for SQL. There is a LANGUAGE clause in the CREATE PROCEDURE statement for this purpose.

Microsoft has its common language runtime (CLR), which takes this one step further and embeds code from any compiler that can produce a CLR module in its SQL Server. Illustra's "data blade" technology is now part of Informix, IBM has "extenders" to add functionality to the basic RDBMS, and Oracle has various "Cartridges" for its product.

The rationale behind all of these various user-defined functions and extensions is to make the vendor's product more powerful and to avoid having to buy another package for nontraditional data, such as temporal and spatial information. However, user-defined functions are difficult to maintain, destroy portability, and can affect data integrity.

#### Exceptions:

You might have a problem that can be solved with such tools, but this is a rare event in most cases; most data processing applications can be done just fine with standard SQL. You need to justify such a decision and be ready to do the extra work required.

## 6.7.1 Multiple Language Problems

Programming languages do not work the same way, so by allowing multiple languages to operate inside the database, you can lose data integrity. Just as quick examples: How does your language compare strings? The Xbase family ignores case and truncates the longer string, whereas SQL pads the shorter string and is case sensitive. How does your language handle the MOD() function when one or both arguments are negative? How does your language handle rounding and truncation? By hiding the fact that there is an interface between the SQL and the 3GL, you hide the problems without solving them.

## 6.7.2 Portability Problems

The proprietary user-defined functions and extensions will not port to another product, so you are locking yourself into one vendor. It is also difficult to find programmers who are proficient in several languages to even maintain the code, much less port it.

## 6.7.3 Optimization Problems

The code from a user-defined function is not integrated into the compiler. It has to be executed by itself when it appears in an expression.
As a simple example of this principle, most compilers can do algebraic simplifications, because they know about the standard functions. They cannot do this with user-defined functions for fear of side effects. Also, 3GL languages are not designed to work on tables. You have to call the functions at the row level, which can be costly.

# 6.8 Avoid Excessive Secondary Indexes

First, not all SQL products use indexes: Nucleus is based on compressed bit vectors, Teradata uses hashing, and so forth. However, tree-structured indexes of various kinds are common enough to be worth mentioning. The X/Open SQL Portability Guides give a basic syntax that is close to that used in various dialects with minor embellishments. The user may or may not have control over the kind of index the system builds.

A primary index is an index created to enforce PRIMARY KEY and UNIQUE constraints in the database. Without them, your schema is simply not a correct data model, because no table would have a key.

A secondary index is an optional index created by the DBA to improve performance. The schema will return the same answers without its secondary indexes as it does with them, but perhaps not in a timely fashion—or even within the memory of living humans.

Indexes are one thing that the optimizer considers in building an execution plan. When and how the index is used depends on the kind of index, the query, and the statistical distribution of the data. A slight change to any of these could result in a new execution plan later. With that caveat, we can speak in general terms about tree-structured indexes.

If more than a certain percentage of a table is going to be used in a statement, then the indexes are ignored and the table is scanned from front to back. Using the index would involve more overhead than filtering the rows of the target table as they are read.

The fundamental problem is that redundant or unused indexes take up storage space and have to be maintained whenever their base tables are changed. They slow down every update, insert, or delete operation to the table. Although this event is rare, indexes can also fool the optimizer into making a bad decision. There are tools for particular SQL products that can suggest indexes based on the actual statements submitted to the SQL engine. Consider using one.

# 6.9 Avoid Correlated Subqueries

#### Rationale:

In the early days of SQL, the optimizers were not good at reducing complex SQL expressions that involved correlated subqueries. They would blindly execute loops inside loops, scanning the innermost tables repeatedly. The example used to illustrate this point was a pair of queries that produce the same results, where "x" is not NULL-able and table Foo is much larger than table Bar: "SELECT x FROM Foo WHERE x IN (SELECT x FROM Bar)" versus the correlated "SELECT x FROM Foo WHERE EXISTS (SELECT * FROM Bar WHERE Bar.x = Foo.x)".

In older SQL engines, the EXISTS() predicate would materialize a JOIN on the two tables and take longer. The IN() predicate would put the smaller table into main storage and scan it, perhaps sorting it to speed the search. This is not quite as true anymore. Depending on the particular optimizer and the access method, correlated subqueries are not the monsters they once were. In fact, some products let you create indexes that prejoin tables, so they are the fastest way to execute such queries.

However, correlated subqueries are confusing for people to read, and not all optimizers are that smart yet. For example, consider a table that models loans and payments with a status code for each payment. This is a classic one-to-many relationship.
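
The problem is to select the loans where all of the payments have a status code of 'F'. A minimal sketch of both approaches (the table and column names are illustrative, and payment_status is assumed NOT NULL):

```sql
-- Correlated version: works backward from the many side, scanning
-- Payments again for every candidate row.
SELECT DISTINCT P1.loan_nbr
  FROM Payments AS P1
 WHERE (SELECT COUNT(*)
          FROM Payments AS P2
         WHERE P2.loan_nbr = P1.loan_nbr
           AND P2.payment_status <> 'F') = 0;

-- Flattened version: starts from the one side with a simple GROUP BY.
SELECT loan_nbr
  FROM Payments
 GROUP BY loan_nbr
HAVING MIN(payment_status) = 'F'
   AND MAX(payment_status) = 'F';
```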

The first, correlated query is backward. It works from the many side of the relationship to the one side, but with a little thought and starting from the one side, you can get the second answer, in which a simple GROUP BY does all the work. The self-reference and correlation are complicated for both humans and machines. Most optimizers are not smart enough to flatten the first query into the second.

#### Exceptions:

If you have a problem that is easier to understand with correlated subqueries and your optimizer is good, then don't be so afraid of them.

# 6.10 Avoid UNIONs

#### Rationale:

UNIONs are usually not well optimized. Because they require that redundant duplicates be discarded, they force most SQL engines to do a sort before presenting the result set to the user. If possible, use UNION ALL instead. You should never have to build a chain of UNIONs from the same base table. That code can be written with OR-ed predicates or CASE expressions.

As an example of a horrible misuse of SQL, Chris White posted a procedure that built dynamic SQL that would then build a report. Aside from the obvious violations of basic software engineering, the output was so huge that it exceeded the text size limits of SQL Server. He was attempting to construct an entire report in the database by using UNIONs to get the 12 lines of the report in the right order, by assigning each line a letter of the alphabet. The whole thing would take several pages to show; even the extract that handled only the printout lines constructed from the General Ledger ran on and on, and it violated many good coding rules along the way. Notice that section, description, and branch were all placeholders to give a slot for columns in the other UNIONs.

That last part of the code could have been reduced to a single, cohesive procedure whose output would then be formatted in the front end.

#### Exceptions:

Sometimes the UNION [ALL] is what you actually want. The other set operations in SQL-92, EXCEPT [ALL] and INTERSECT [ALL], are not widely available yet.

# 6.11 Testing SQL

When you are first writing a schema, you will probably generate some test data. If you look in the literature, there is a thing called an Armstrong set, which is the minimal number of rows that will test all of the constraints in a schema. Although it is difficult to automatically create an Armstrong set, you can do a good job with a little effort.

## 6.11.1 Test All Possible Combinations of NULLs

#### Rationale:

NULLs behave strangely, and if there are problems, there is a good chance that a NULL will be involved. Newbies using graphic tools often leave more NULL-able columns in a single table than a professional would in an entire schema for a Fortune 500 company payroll.

#### Exceptions:

If the number of combinations is excessive, then look at a redesign rather than a stress test. It means you probably have too many NULL-able columns in the schema.

## 6.11.2 Inspect and Test All CHECK() Constraints

#### Rationale:

You can extract the CHECK() constraint predicates from the DDL and look at them. The first thing is to see if the same data element has the same rules in all of the tables. Some attributes will always have the same CHECK() constraints if the model is correct.
Some attributes may have different constraints in different tables. For example, it would be reasonable to have "quantity INTEGER DEFAULT 0 NOT NULL CHECK (quantity >= 0)" almost everywhere that the quantity attribute appears. However, you might find that there is also a "CHECK (quantity > 0)" on a table. Is this an error, or a situation where a zero quantity is disallowed? You need to look and see.

#### Exceptions:

None

## 6.11.3 Beware of Character Columns

#### Rationale:

Character columns seldom have enough constraints on them. The result is that they have extra blanks in them, allow mixed-case letters, and will pretty much hold any kind of garbage that a user wishes to put in them.

My favorite piece of test data for oversized, unconstrained NVARCHAR(n) columns is a collection of Buddhist sutras in Chinese Unicode. At least the users will learn a bit of classic Buddhist thought.

#### Exceptions:

None

## 6.11.4 Test for Size

#### Rationale:

One of the problems with small test data sets is that they will run just fine in the development shop, but when the size of the tables grows larger, you can get gradually degraded performance or catastrophe points. A catastrophe point is when there is a sudden change in performance—the straw that breaks the camel's back. There is usually a physical component to a catastrophe point, such as excessive paging to a hard drive. Frankly, there is not a lot you can do about it except wait and see if it was a fluke or if it happens again.

Gradually degraded performance is the nicer of the two situations. You can monitor the system, see the loss, and take action before anything bad happens. The bad news is that the term _gradual_ can be very short. The query that ran so well on a few thousand rows of test data is a pig when it goes live on several million rows of production data. Try to stress test on a data set that is larger than the current production database. That will let you know that you have some margin of error.

#### Exceptions:

None

CHAPTER 7 How to Use VIEWS

_The Blind Men and the Elephant_

By John Godfrey Saxe (1816–1887)

It was six men of Indostan
To learning much inclined,
Who went to see the Elephant
(Though all of them were blind),
That each by observation
Might satisfy his mind.

The First approached the Elephant,
And happening to fall
Against his broad and sturdy side,
At once began to bawl:
"God bless me! but the Elephant
Is very like a wall!"

The Second, feeling of the tusk,
Cried, "Ho! what have we here
So very round and smooth and sharp?
To me 'tis mighty clear
This wonder of an Elephant
Is very like a spear!"

The Third approached the animal,
And happening to take
The squirming trunk within his hands,
Thus boldly up and spake:
"I see," quoth he, "the Elephant
Is very like a snake!"

The Fourth reached out an eager hand,
And felt about the knee.
"What most this wondrous beast is like
Is mighty plain," quoth he;
"'Tis clear enough the Elephant
Is very like a tree!"

The Fifth, who chanced to touch the ear,
Said: "E'en the blindest man
Can tell what this resembles most;
Deny the fact who can,
This marvel of an Elephant
Is very like a fan!"
The Sixth no sooner had begun
About the beast to grope,
Than, seizing on the swinging tail
That fell within his scope,
"I see," quoth he, "the Elephant
Is very like a rope!"

And so these men of Indostan
Disputed loud and long,
Each in his own opinion
Exceeding stiff and strong,
Though each was partly in the right,
And all were in the wrong!

Moral:

So oft in theologic wars,
The disputants, I ween,
Rail on in utter ignorance
Of what each other mean,
And prate about an Elephant
Not one of them has seen!

VIEWs are virtual tables, defined by SELECT statements stored in the database. The SQL statement that defines the VIEW is executed only when the VIEW is invoked in another statement. The standard says that VIEWs are to act as if they are materialized, but in practice the optimizer will decide either to materialize them as physical tables or to insert the defining SELECT statement into the query as a derived table and compile it from there. There are six basic uses for VIEWs, which we will discuss.

# 7.1 VIEW Naming Conventions Are the Same as Tables

#### Rationale:

A VIEW is a logical table. It consists of rows and columns, exactly the same as a base table. A VIEW can be used in SELECT, UPDATE, DELETE, and INSERT statements in the same way that a base table can. Therefore, it stands to reason that VIEWs should use the same naming conventions as tables. As an aside, the same can be said for aliases, synonyms, derived tables, table-valued functions, or anything else that returns a table.

In particular, there is an absurd naming convention of putting a "v" or "vw" in the first or last position of a VIEW name. My guess is that it comes from programmers who are used to weakly typed languages that use Hungarian notation, or who worked with file systems that had to have prefixes to locate the physical drive for the file. Under ISO-11179 rules, the "vw" prefix implies that the VIEW is a table dealing with Volkswagens.

Individuals who need to differentiate between tables and VIEWs can use the schema information tables to determine which objects are VIEWs and which are base tables. They should be at the system administration level or higher.

INSERT, UPDATE, and DELETE are operations that cannot be performed on certain types of VIEWs. Users who need to perform these operations can be given INSTEAD OF triggers and never know whether they are dealing with a VIEW or a base table.

#### Exceptions:

None

## 7.1.1 Always Specify Column Names

#### Rationale:

When creating a VIEW, SQL provides the option of specifying new column names in the VIEW clause or defaulting to the same column names as the defining SELECT statement. It is always advisable to explicitly specify VIEW column names instead of allowing them to default, even when using the same names as the underlying base tables. This will provide more accurate documentation.

#### Exceptions:

Make sure that the VIEW clause names are correct. If you misspell them, that is what the user sees.

# 7.2 VIEWs Provide Row- and Column-Level Security

One of the most beneficial purposes served by VIEWs is to extend the data security features of SQL. VIEWs can be created that provide a subset of rows, a subset of columns, or a subset of both rows and columns from the base table.

How do VIEWs help provide row- and column-level security?
Consider a "Personnel" table that contains all of the pertinent information regarding an enterprise's employees. Typically, name, address, position, birthdate, and salary information would be contained in such a table. However, not every user will require access to all of this information. Specifically, it may become necessary to shield the salary information from most users. You can accomplish this by creating a VIEW that does not contain the salary column and then granting most users the ability to access the VIEW, instead of the base table. The salary column will not be visible to users of the VIEW. + +Or perhaps you need to implement security at the row level. Consider a table that contains project information. Typically, this would include project name, purpose, start date, and who is responsible for the project. Assume that the security requirements for projects within your organization deem that only the employee who is responsible for the project can access the project data. By storing the authorization ID of the responsible employee in the "projects" table, a VIEW can be created using the CURRENT_USER value. + +Or, if you need to limit access to a team, you can create a table of teams to which only team managers have access. + +Another trick is to use the CURRENT_TIMESTAMP or CURRENT_DATE in VIEWs to get an automatic update to schedules and other time-related events. + +Each time the VIEW is invoked, it will check the clock and see if anything has changed for you. + +# 7.3 VIEWs Ensure Efficient Access Paths + +By coding the appropriate join criteria into the VIEW definition SQL, you can ensure that the correct join predicate will always be used. Of course, this technique becomes more useful as the SQL becomes more complex. + +# 7.4 VIEWs Mask Complexity from the User + +Somewhat akin to coding appropriate access into VIEWs, complex SQL can be coded into VIEWs to mask the complexity from the user. This can be extremely useful when your shop employs novice SQL users (whether those users are programmers, analysts, managers, or typical end users). + +As an example, consider the code for a relational division. Relational division is one of the eight basic operations in Codd's (1979) relational algebra. The idea is that a divisor table is used to partition a dividend table and produce a quotient or results table. The quotient table consists of those values of one column for which a second column had all of the values in the divisor. + +This is easier to explain with an example. We have a table of pilots and the planes they can fly (dividend); we have a table of planes in the hangar (divisor); we want the names of the pilots who can fly every plane (quotient) in the hangar. To get this result, we divide the PilotSkills table by the planes in the hangar. + +Here is one way to write the query: + +This not the sort of thing that newbie SQL programmers can pull out of their hats, but they can write "SELECT pilot FROM QualifiedPilots;" without much trouble. Furthermore, the VIEW definition can be changed, and the user will never know it. Here is another version of relational division: + +# 7.5 VIEWs Ensure Proper Data Derivation + +Another valid usage of VIEWs is to ensure consistent derived data by creating new columns for VIEWs that are based on arithmetic formulae (e.g., creating a VIEW that contains a column named "tot_comp," which is defined by [salary + commission + bonus]). Because this column name is at the table level, it can be used in the SELECT of the invoking SELECT statement. 
This is not the sort of thing that newbie SQL programmers can pull out of their hats, but they can write "SELECT pilot FROM QualifiedPilots;" without much trouble. Furthermore, the VIEW definition can be changed from one version to the other, and the user will never know it.

# 7.5 VIEWs Ensure Proper Data Derivation

Another valid usage of VIEWs is to ensure consistent derived data by creating new columns for VIEWs that are based on arithmetic formulae (e.g., creating a VIEW that contains a column named "tot_comp," which is defined by [salary + commission + bonus]). Because this column name exists at the table level in the VIEW, it can be used in the SELECT list of the invoking SELECT statement. That is, selecting "tot_comp" directly from the base table is illegal, but defining it as a VIEW column and then selecting it from the VIEW is legal. Although this is an easy formula, it is a good idea to have a complicated one in only one place in the schema. It might not be right, but at least it will be consistent.

# 7.6 VIEWs Rename Tables and/or Columns

You can rename columns in VIEWs. This is particularly useful if a table contains arcane or complicated column names. There are some prime examples of such tables in the schema information tables of most SQL products. Additionally, if other tables exist with clumsy table and/or column names, VIEWs can provide a quick solution until you can rename them. In many SQL products, renaming can require dropping and re-creating the tables.

# 7.7 VIEWs Enforce Complicated Integrity Constraints

Consider a schema for a chain of stores that has three tables. The first two, for stores and for personnel, explain themselves. The third table shows the relationship between stores and personnel—namely, who is assigned to which job at which store and when this happened.

Let job_type 0 = "unassigned", 1 = "stockboy", and so on, until we get to 99 = "Store Manager"; we have a rule that each store has one and only one manager. In full SQL-92 you could write a table-level CHECK() constraint for this rule. But many SQL products do not allow CHECK() constraints that apply to the table as a whole, and they do not support the schema-level CREATE ASSERTION statement. So, how to do this? You might use a trigger, which will involve—ugh!—procedural code. Despite the SQL/PSM and other standards, most vendors implement different trigger models and use their proprietary 4GL languages; but, being a fanatic, I want a pure SQL solution.

The answer is to split the assignments into two tables—one for the store managers, keyed on the store number and constrained to job_type 99, and one for all of the other job assignments—and then build a UNION-ed VIEW of the two. The key and job_type constraints in each table, working together, will guarantee only one manager per store. The next step is to add INSTEAD OF triggers to the VIEW, so that the users can insert, update, and delete from it easily.

As an exercise for the reader: How would you ensure that no store has more than two assistant managers?

# 7.8 Updatable VIEWs

The SQL-92 standard is actually conservative about which VIEWs are updatable. They have to be based on the following:

1. A SELECT statement on one and only one table, though the VIEW can be defined on several layers of VIEWs on top of VIEWs.

2. The VIEW must include all of the columns of a UNIQUE or PRIMARY KEY constraint in the base table. This guarantees that all of the rows in the VIEW map back to one and only one row in the base table from which it is derived.

3. All base table columns not shown in the VIEW must have default values or be NULL-able. The reason for that is obvious: you have to delete or insert a complete row into the base table, so the system must be able to construct such a row.

However, other VIEWs are updatable, and some vendors support more than the basic version given in the SQL-92 standard. The VIEW must have an INSERT, UPDATE, and DELETE rule under the covers, which maps its rows back to a single row in the base table(s).

## 7.8.1 WITH CHECK OPTION Clause

Another feature, which is not used enough, is the WITH CHECK OPTION clause on a VIEW. It is a bit tricky when you nest VIEWs inside each other, but the idea is that an UPDATE or INSERT INTO statement cannot leave the scope of the set selected by the updatable VIEW.
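For example, here is a hedged reconstruction in which the NewYorkSalesmen name comes from the text, while the base table and column names are assumptions:

```sql
-- without WITH CHECK OPTION, this UPDATE quietly moves every row
-- out of the VIEW's scope
CREATE VIEW NewYorkSalesmen (emp_nbr, emp_name, city)
AS SELECT emp_nbr, emp_name, city
     FROM Salespersons
    WHERE city = 'New York';

UPDATE NewYorkSalesmen
   SET city = 'Chicago';

-- an alternative definition of the same VIEW, with the check added
CREATE VIEW NewYorkSalesmen (emp_nbr, emp_name, city)
AS SELECT emp_nbr, emp_name, city
     FROM Salespersons
    WHERE city = 'New York'
WITH CHECK OPTION;
```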
Under the first definition, the result would be that "NewYorkSalesmen" is empty when you come back to it. This is probably not desirable. However, if we had defined the updatable VIEW with the WITH CHECK OPTION clause, as in the second definition, the system would test the update for a violation and would reject it.

## 7.8.2 INSTEAD OF Triggers

Because some VIEWs cannot be updated, you can add INSTEAD OF triggers to fool the users. This trigger is executed instead of the INSERT, UPDATE, or DELETE action, thus overriding the actions of the triggering statements. The syntax will vary from product to product, but expect something like this:
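The following is a hedged sketch in DB2-style syntax; the VIEW, table, and column names are invented for illustration, and every product differs in the details:

```sql
-- route an INSERT against a VIEW to its base table
CREATE TRIGGER StoreStaff_Insert
INSTEAD OF INSERT ON StoreStaff      -- StoreStaff is a VIEW
REFERENCING NEW AS New_Row
FOR EACH ROW
INSERT INTO Personnel (emp_nbr, emp_name, store_nbr)
VALUES (New_Row.emp_nbr, New_Row.emp_name, New_Row.store_nbr);
```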
For obvious reasons, only one INSTEAD OF trigger per INSERT, UPDATE, or DELETE statement can be defined on a table or VIEW. However, it is possible to define VIEWs on VIEWs where each VIEW has its own INSTEAD OF trigger. INSTEAD OF triggers are not allowed on updatable VIEWs that have a WITH CHECK OPTION.

You can also define INSTEAD OF triggers on base tables, but this is a bit weird because you already have BEFORE and AFTER triggers there.

# 7.9 Have a Reason for Each VIEW

#### Rationale:

VIEWs should be created only when they achieve a specific, reasonable goal. Each VIEW should have a specific application or business requirement that it fulfills before it is created. That requirement should be documented somewhere, preferably in a data dictionary or possibly as a remark in the VIEW declaration.

#### Exceptions:

None

# 7.10 Avoid VIEW Proliferation

#### Rationale:

The proliferation avoidance rule is based on common sense. Why create something that is not needed? It just takes up space that could be used for something that is needed.

Whenever a SQL object is created, additional entries are placed in the schema information tables. Creating needless schema objects causes what Craig Mullins calls _catalog clutter_. For example, in DB2, every unnecessary VIEW that is created will potentially insert rows into four VIEW-specific schema information tables (i.e., SYSVTREE, SYSVLTREE, SYSVIEWS, and SYSVIEWDEP) and three table-specific schema information tables (i.e., SYSTABLES, SYSTABAUTH, and SYSCOLUMNS).

It is a good idea to use a utility program to see if you have VIEWs that are not referenced anywhere. Another good idea is to see if you have VIEWs that do the same thing, or almost the same thing, so you can remove one of them.

#### Exceptions:

None

# 7.11 Synchronize VIEWs with Base Tables

#### Rationale:

Whenever a base table changes, all VIEWs that depend on that base table should be analyzed to determine if the change affects them. All VIEWs should remain logically pure. The VIEW should remain useful for the specific reason you created it.

For example, say a VIEW was created to control employee access to a project, and we add new badge numbers to the Personnel table. The badge number probably should also be added to the access VIEW. The badge number column can be added to the Personnel table immediately and then to the VIEW at the earliest convenience of the development team.

The synchronization rule requires that strict change impact analysis procedures be in place. Every change to a base table should trigger the usage of these utility programs and maintenance procedures.

#### Exceptions:

None

# 7.12 Improper Use of VIEWs

Over the years, VIEWs have been used for other purposes that made sense at the time but have been rendered obsolete by new DBMS functionality.

## 7.12.1 VIEWs for Domain Support

#### Rationale:

It is a sad fact of life that most RDBMS do not support domains. Domains were in the original relational model and should have been part of SQL from the start. A domain basically identifies the valid range of values that a column can contain. Of course, domains are more complex than this simple explanation. For example, only columns drawn from the same domain should be comparable within a predicate (unless explicitly overridden).

Some of the functionality of domains can be implemented using VIEWs and the WITH CHECK OPTION clause, which ensures the update integrity of VIEWs. This will guarantee that all data inserted or updated using the VIEW will adhere to the VIEW specification.
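A hedged sketch of the contrast; the table, column, and constraint names are assumptions for illustration:

```sql
-- simulating a "currency code" domain with a checked VIEW
CREATE VIEW ValidAccounts (acct_nbr, currency_code)
AS SELECT acct_nbr, currency_code
     FROM Accounts
    WHERE currency_code IN ('USD', 'EUR', 'JPY')
WITH CHECK OPTION;

-- the simpler modern alternative: a CHECK() constraint on the table
ALTER TABLE Accounts
  ADD CONSTRAINT valid_currency_code
      CHECK (currency_code IN ('USD', 'EUR', 'JPY'));
```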
Now, this method of using VIEWs to simulate domains is still viable, but a better technique providing the same functionality is available—namely, CHECK() constraints—and a CHECK() constraint is simpler than creating VIEWs with the WITH CHECK OPTION.

#### Exceptions:

None

## 7.12.2 Single-Solution VIEWs

#### Rationale:

Another past usage for VIEWs was to enable solutions where VIEWs really were the only way to solve a data access problem. Without VIEWs, some complex data access requests could be encountered that were not capable of being coded in SQL alone. However, sometimes a VIEW can be created to implement a portion of the access. Then the VIEW can be queried to satisfy the remainder.

Consider the scenario where you want to report on both detail and summary information from a single table. For instance, what if you would like to report on stock prices? For each stock, provide all stock details, and also report the maximum, minimum, and average prices for that stock. Additionally, report the difference between the average price and each individual price.

After a VIEW with the aggregates is created, a SELECT statement can be issued that joins the VIEW to the base table, thereby providing both detail and aggregate information on each report row.

Situations such as these were ideal for using VIEWs to make data access a much simpler proposition. However, the advent of table expressions (sometimes referred to as in-line VIEWs) makes this usage of VIEWs obsolete. Why? Instead of coding the VIEW, we can take the SQL from the VIEW and specify it directly in the SQL statement that would have called the VIEW. So we can use a table expression to avoid creating and maintaining a VIEW.

#### Exceptions:

If an expression is used in many places and it has a clear meaning in the data model, then create a VIEW.

## 7.12.3 Do Not Create One VIEW Per Base Table

#### Rationale:

A dubious recommendation is often made to create one VIEW for each base table in a SQL application system. This is what Craig Mullins calls "The Big VIEW Myth." It is supposed to insulate application programs from database changes. This insulation is to be achieved by mandating that all programs be written to access VIEWs instead of base tables. When a change is made to the base table, the programs do not need to be modified because they access a VIEW, not the base table.

There is no adequate rationale for enforcing a strict rule of one VIEW per base table for SQL application systems. In fact, the evidence supports not using VIEWs in this manner. Although this sounds like a good idea in principle, indiscriminate VIEW creation should be avoided. The implementation of database changes requires scrupulous analysis regardless of whether VIEWs or base tables are used by your applications.

Consider the simplest kind of schema change: adding a column to a table. If you do not add the column to the VIEW, no programs can access that column unless another VIEW is created that contains the new column. But if you create a new VIEW every time you add a new column, it will not take long for your environment to be swamped with VIEWs. Then you have to ask which VIEW should be used by which program. Similar arguments can be made for removing columns, renaming tables and columns, combining tables, and splitting tables.

In general, if you follow good SQL programming practices, you will usually not encounter situations where the use of VIEWs would have helped program/data isolation anyway. By dispelling "The Big VIEW Myth," you will decrease the administrative burden of creating and maintaining an avalanche of base table VIEWs.

#### Exceptions:

None

# 7.13 Learn about Materialized VIEWs

#### Rationale:

A materialized VIEW is brought into existence in the physical database, where it can be used like any other table. This is implementation dependent, so you have to know what your product does to get the best use of this feature.

All VIEWs are supposed to act as if they are materialized, but in practice the text of the VIEW can often be put into the parse tree of the statement using it and expanded like an in-line macro. When such a VIEW is used in a query, the effect is as if its defining SELECT were a derived table expression inside that query, which the parser then flattens into a single execution plan.

However, if more than one user references a VIEW, it can be cheaper to materialize it once and share the data among all users. If the materialized result set is small enough to fit into main storage, the performance improvements are even greater.

This is actually a common event, because we tend to build VIEWs that summarize data for reporting periods. Thus, lots of users want to get to the same summary VIEWs at the same time. If you plan the VIEWs to take advantage of this usage pattern, you can get major performance improvements.

#### Exceptions:

None

CHAPTER 8 How to Write Stored Procedures

_"Whatever language you write in, your task as a programmer is to do the best you can with the tools at hand. A good programmer can overcome a poor language or a clumsy operating system, but even a great programming environment will not rescue a bad programmer."_

—Kernighan and Pike

EVERY SQL PRODUCT has some kind of 4GL tool that allows you to write stored procedures that reside in the database and can be invoked from a host program. Each 4GL is a bit different, but they are all block-structured languages. They have varying degrees of power and different language models. For example, T-SQL is a simple, one-pass compiled language modeled after the C and Algol languages. It was not intended as an application development language, but rather as a tool for doing short tasks inside a SQL Server database.

At the other extreme, Oracle's PL/SQL is modeled after Ada and SQL/PSM. It is a complicated language that can be used for application development. Likewise, Informix 4GL is an application development language that generates C code, which can be immediately ported to a large number of platforms.
What this means is that anything I say about SQL stored procedures will have to be general. Perhaps the most frightening thing is that I have to go back and teach basic software engineering principles to SQL programmers. If you look at the SQL code posted in newsgroups, much of it is written as if all of the work done in the 1970s and 1980s by Yourdon, DeMarco, Dijkstra, Wirth, and others never happened. Wake up, people! Those rules still apply to any programming language, because they apply to programming.

# 8.1 Most SQL 4GLs Are Not for Applications

#### Rationale:

Most of the proprietary procedural languages added to SQL by vendors were never meant to replace application development languages (note the exceptions below). They were meant to be micro-languages that could be used for procedural operations inside the database.

The classic micro-language has no real input/output (I/O); you can print a message on the standard system output, and that is about all. There is no file control, no complex computation, and no display formatting. These languages were for writing triggers and short cleanup modules in the schema, and the rule of thumb was never to have a procedure over one page or 50 lines long.

This is fine; in a tiered architecture, display and complex computations are done in the host language of the presentation layer. But if you read the SQL newsgroups, you will constantly find newbie programmers who want to do display formatting in the database. They want to add leading zeros in a SELECT statement, concatenate first and last names, put line numbers on the result set to display ranges of those line numbers, and a host of other things. SQL is strictly a data retrieval language and has nothing to do with application presentation layers.

#### Exceptions:

Informix 4GL, Progress, Oracle's PL/SQL, and a few other languages were actually meant for application development. Sometimes the language came before the SQL database, and sometimes vice versa. A proprietary language can be fast to execute, fast to write in, and have lots of nice features. A lot of mainframe packages are implemented in Informix 4GL under the covers, Oracle sells packages written in PL/SQL, and a lot of midsized systems are implemented in Progress. The trade-off is maintaining these proprietary code bases versus maintaining a standard programming language with embedded SQL.

# 8.2 Basic Software Engineering

I am amazed that so many SQL programmers do not know basic software engineering. Working programmers on newsgroups actually have to ask for definitions of cohesion and coupling. Apparently, programmers are not getting the basics of their trade and simply try to pass certification exams instead of actually learning their craft. With some embarrassment, I will now give what should have been covered in a freshman course.

These principles apply to any procedural programming language, but they have slightly different applications in SQL because it is a nonprocedural, set-oriented language with concurrency issues.

## 8.2.1 Cohesion

Cohesion is how well a module does one and only one thing: that it is logically coherent. Modules should have strong cohesion. You ought to name the module in the format "<verb><object>," where the "<object>" is a specific logical unit in the data model.

There are several types of cohesion. They are ranked here from the worst form of cohesion to the best:

1. Coincidental

2. Logical

3. Temporal

4. Procedural

5. Communicational

6. Informational

7. Functional
This scale is an ordinal scale, and a module can have characteristics of more than one type of cohesion in it. Let's define the terms as follows:

* _Coincidental cohesion._ This is the worst kind of cohesion. A module performs several unrelated tasks under one roof. Think of someone pasting random blocks of code together and somehow getting it to compile. This is what you get with dynamic SQL or with passing table names as parameters.

For example, "InsertNewCustomer()" tells you that you are going to be working with the tables related to the customers. However, a procedure called "InsertNewRecord()," which can put a row into any table in the schema, is too general to have good cohesion. It works on bagpipes, marriages, and octopi, or any new table that gets put into the schema later.

Programmers should not be using dynamic SQL, because it has no cohesion and is dangerous. Users who have to provide, say, a table name can also provide extra SQL code that will be executed. For example, instead of passing just the table name, they pass "Foobar; DELETE FROM Foobar; COMMIT;" and destroy the database. But dynamic SQL also says that the programmer is so incompetent that he or she could not write the program and had to give the job to any random user, present or future, to complete on the fly.

This kind of coding is the result of trying to do metadata operations in an application by using the schema information tables. SQL engines have tools for metadata, and the user should not be writing versions of them.

* _Logical cohesion._ Here modules can perform a series of related tasks, but the calling module selects only one. The worst example of this was a posting in 2004 on a SQL Server newsgroup where a programmer had been ordered to put all procedures into one module. A parameter would then pick which of 50-plus modules would be executed, which parameters would be used, and what they would do in context.

OO programmers like to do this for each table, because they can think of each table as some kind of object, so that the procedures look like methods on that object. It isn't an object.

* _Temporal cohesion._ The module performs a series of actions that are related in time. The classic example is to put all startup or shutdown actions in one module. Older COBOL and file system programmers tend to do this because they worked with batch processing systems that did not have concurrency issues.

* _Procedural cohesion._ The module performs a sequence of steps in a process that has to be executed in a specific order. Again, this style is used by file system programmers who are used to batch processing systems. They often write a lot of temporary tables to hold the process steps, the way we used to allocate working tapes.

* _Communicational cohesion._ All elements operate on the same input data set or produce the same output data set. The parts communicate via common data in a global table.

* _Informational cohesion._ This is also called _sequential cohesion_ in the literature. Output from one element in the module serves as input for some other element, but unlike logical cohesion, the code for each action is completely independent.

* _Functional cohesion._ The module performs exactly one function or achieves a single goal. Math functions are the best example of this kind of cohesion. This is what we are trying to achieve, and it is why SQL is also known as a functional language.
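To make the contrast concrete, here is a minimal SQL/PSM-style sketch; the procedure, table, and parameter names are invented for illustration:

```sql
-- Functional cohesion: one action on one logical unit of the model
CREATE PROCEDURE InsertNewCustomer
(IN in_cust_id INTEGER,
 IN in_cust_name VARCHAR(35))
LANGUAGE SQL
INSERT INTO Customers (cust_id, cust_name)
VALUES (in_cust_id, in_cust_name);

-- Coincidental cohesion, for contrast: a generic "insert anything
-- anywhere" procedure built on dynamic SQL has no logical unit at
-- all and invites injection; avoid this pattern entirely:
-- CREATE PROCEDURE InsertNewRecord (IN table_name ..., ...)
--    ... EXECUTE IMMEDIATE ...
```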
Procedural, communicational, informational, and functional cohesion are a bit more complicated in SQL than in 3GL programming because we have transactions. A transaction is logically one step, although it consists of individual SQL statements. What looks like procedural, communicational, or informational cohesion can be much stronger in SQL.

## 8.2.2 Coupling

If modules have to be used in a certain order, then they are strongly coupled. If they can be executed independently of each other and put together like Lego blocks, then they are loosely or weakly coupled. There are several kinds of coupling, which are ranked from worst to best as follows:

1. Content

2. Common

3. Control

4. Stamp

5. Data

The types of coupling are defined as follows:

* _Content coupling._ This occurs when one module directly references the contents of another module. For example, module X branches to a local label in module Y, or module X modifies a statement of module Y. Such modules are inextricably linked to each other. Content coupling is dangerous but is not often supported in SQL 4GL products. The rule here is not to pass a procedure as a parameter in a SQL 4GL.

* _Common coupling._ This occurs when several modules have access to the same global data. In the 3GL languages, this was the use of global variables in the C family and other languages. In SQL, this can happen with the use of common global tables to pass information. It gets to be dangerous when concurrency controls are not done right.

* _Control coupling._ This occurs when one module has control over the logic of another. If module X calls module Y, and Y determines which action X must take, then control coupling is present. The passing of a control switch as an argument is an example of control coupling. In SQL, you do this with subqueries that reference other parts of the schema in predicates that drive control flow.

* _Stamp coupling._ Entire tables are passed to the called module, but only some columns are used. In SQL, the use of "SELECT *" in production code is the prime example.

* _Data coupling._ Two modules are data coupled if all arguments are scalar data elements. Data coupling is a desirable goal because such modules are easier to maintain. Any change in one module or table is less likely to cause a regression fault in the others.

# 8.3 Use Classic Structured Programming

Although I like to say that SQL is short for "Scarcely Qualifies as a Language," the truth is that it came from "Structured English-like Query Language" from the original project at IBM. A lot of current programmers seem to have missed the structured revolution and have reverted to ad hoc programming, but they call it "extreme" or "agile" these days to make sloppy programming sound better.

In classic structured programming, you have three control structures:

1. _Concatenation._ The statements inside brackets are executed in sequential order. In SQL/PSM this is shown with the keyword brackets "BEGIN [ATOMIC] .. END" and often by just "BEGIN .. END" in proprietary 4GLs. The keyword ATOMIC makes the block into a transaction, which we will not discuss in detail here.

2. _Selection._ A Boolean expression determines which one of two blocks of statements is executed. In SQL/PSM this is shown with the keywords "IF .. THEN .. [ELSE ..] END IF;" and in proprietary 4GLs with "IF .. THEN .. [ELSE ..];" or "IF .. [ELSE ..];" but the syntax is always enough alike not to be a problem.

3. _Iteration._ A block of statements is repeatedly executed while a Boolean expression is TRUE. In SQL/PSM this is shown with the keywords "WHILE .. DO .. END WHILE;" and you will see similar "WHILE .. DO .." keywords in many products. Again, the various products are always enough alike not to be a problem.
The important characteristic of all of these control structures is that they have one entry point and one exit point. Any code written using them will also have one entry and one exit point. You do not use a GO TO statement in classic structured programming.

Some languages allowed a RETURN() statement to jump out of functions and set the value of the function call. Some allowed a switch or case expression as a multiway selection control statement. But by sticking as close as possible to classic structured programming, your code is safe, verifiable, and easy to maintain.

## 8.3.1 Cyclomatic Complexity

So is there a heuristic for telling whether I have a bad stored procedure? There are a lot of metrics, actually. In the 1970s, we did a lot of research on software metrics and came up with some good stuff. Here is one that can be computed by hand when you have short procedures to measure.

Tom McCabe (1976) invented the cyclomatic complexity metric. The score is basically the number of decision points in a module plus one or, equivalently, the number of execution paths through the code. Decision points are where a flow graph of the procedure would branch. In a well-structured 4GL program, the keywords of the language will tell us what the decision points are. For us that means IF, WHILE, and each branch of a CASE or SWITCH statement, if your 4GL supports that feature.

If the module has a score of 1 to 5, it is a simple procedure. If the score is between 6 and 10, it might need simplification. If the score is greater than 10, then you really should simplify the module. There are other metrics and methods, but most of them are not as easy to compute on the fly.

# 8.4 Avoid Portability Problems

#### Rationale:

We already talked about writing portable SQL statements, but you also need to write portable 4GL code. Because these languages are proprietary, they will have some features that will not port to other SQL 4GLs. Also, you cannot expect that you will always find programmers who are experts in these languages or who have time to become experts. Plain, simple code in an unfamiliar language can be a great help.

Stick to the classic three control structures. They will always port with only mechanical syntax changes and can be read by any programmer who knows a typical 3GL language. But there are other tricks and heuristics.

## 8.4.1 Avoid Creating Temporary Tables

In some vendor languages, the programmer can create a temporary table on the fly, whereas in Standard SQL temporary tables are created only by someone holding administrative privileges. Use subquery expressions, derived tables, or VIEWs instead. The use of temporary tables is usually a sign of a bad design. Temporary tables are most often used to hold the steps in a procedural process. They replace the scratch or work tapes we used in 1950s magnetic tape file systems.

There are two major types of error handling. The Sybase/SQL Server family uses a sequential code model. After executing each statement, the SQL engine sets a global error variable, and the programmer has to write code to immediately catch this value and take action. The SQL/PSM model uses an interrupt model.
There is a global SQLSTATE (the old SQLCODE is deprecated), which can return multiple values into a cache. These values can trigger actions that were defined in WHENEVER statements associated with blocks of code. Maintaining the error handling part of a module is difficult, so put plenty of comments in it.

Put as much of the code as possible into SQL statements, not into the 4GL. Ideally, a stored procedure ought to be one SQL statement, perhaps with a few parameters. The next best design would be a "BEGIN [ATOMIC] .. END" block with a straight sequence of SQL statements. You lose points for each "IF .. THEN .. ELSE" and lose lots of points for each loop.

## 8.4.2 Avoid Using Cursors

#### Rationale:

A cursor is a way of converting a set into a sequential file so that a host language can use it. There are a lot of options on the Standard SQL cursor, and there are a lot of vendor options, too.

Cursors are difficult to port and generally run much slower than pure nonprocedural SQL statements. By slower, I mean orders of magnitude slower. For safety, the SQL engine has to assume that anything can happen inside a cursor, so it puts the transaction at the highest isolation level it can and locks out other users.

So why do people use them? The overwhelming reason is ignorance of SQL and old habits. The cursors in SQL are modeled after tape file semantics, and people know that kind of procedural programming: declaring a cursor is mounting a tape, OPEN and CLOSE work like opening and closing a file, and FETCH reads the records one at a time.

Add the use of temporary tables as working or scratch tapes, and you can mimic a 1950s tape system statement for statement and never learn to think relationally at all. In 2004, there was an example of this in the SQL Server Programming newsgroup. The newbie had written one cursor to loop through the first table and select rows that met a criterion into a temporary table. A second cursor looped through a second table ordered on a key; inside this loop, a third cursor looped through the temporary table to match rows and do an update. This was a classic 1950s master/transaction tape file merge but written in SQL. The 25 or so statements used in it were replaced by one UPDATE with a scalar subquery expression. It ran almost three orders of magnitude faster.

#### Exceptions:

The only uses I have found are truly exceptional. Cursors can be used to repair poorly designed tables that have duplicate rows, or data that is so trashed you have to look at every row by itself to clean the data before doing an ALTER TABLE to fix such poor design permanently. Here are some reasons to use cursors:

1. Cursors can be used to build metadata tools, but you really should be using what the vendor has provided. Messing directly with schema information tables is dangerous.

2. Cursors can be used to solve NP-complete problems in SQL where you stop with the first answer you find that is within acceptable limits. The "Traveling Salesman" and "Bin Packing" problems are examples, but they are not exactly common database problems and are better solved with a procedural language and backtracking algorithms.

3. In T-SQL and other products that still use physically contiguous storage, calculating a median is probably much faster with a cursor than with any of the set-based solutions, but in other products with different storage or indexing, computing the median is trivial.

4. It is possible to actually write code that is worse than a cursor. Consider this slightly cleaned-up posting by Curtis Justus in the SQL Server Programming newsgroup in November 2004. He had a table of approximately 1 million rows and needed to "do something with each of the rows" in what he called a traditional "For/Each" type of algorithm. The specifications were never explained beyond that. He posted a pseudocode program in T-SQL dialect, essentially a counter-driven loop that fetched and processed one row at a time.

Yes, you are looking at a sequential tape file algorithm from the 1950s written in SQL in the early 21st century. The poster wanted to know if this was the most efficient way to go after the data. The answer, obviously, is that even a cursor would be better than this approach.

You would be surprised by how many newbies rediscover sequential tape processing in SQL. Perhaps even more remarkable was this person's attitude: he was currently getting a fast enough response time, so it did not have to be coded correctly. The lack of portability, the orders-of-magnitude degradation, and the extra lines of code that had to be maintained were simply not regarded as his responsibility as a professional.

## 8.4.3 Prefer Set-Oriented Constructs to Procedural Code

#### Rationale:

The optimizer cannot use control structures from the 4GL to pick an execution plan. Thus, the more logic you can pass to it via pure SQL statements, the better it will perform. The real cost in a stored procedure is in data access: on a typical 1-GHz PC in the summer of 2001, operations in main storage took nanoseconds, while a disk fetch took milliseconds, a gap of several orders of magnitude. If I can save a few disk fetches, I get a much better return on my efforts than if I write faster-executing computations. The seek times have not gotten, and are not going to get, much better in the foreseeable future.

### 8.4.3.1 Use CASE Expressions to Replace IF-THEN-ELSE Control Flow Statements

As an example of how to do this, consider the problem of updating the prices in a bookstore. This is a version of an exercise in an early Sybase SQL training class to show why we needed cursors. We want to take 10 percent off expensive books ($25 or more) and increase the price of inexpensive books by 10 percent to make up the loss. The first impulse of most new SQL programmers is to write two UPDATE statements, one for each price band, but that does not work. A book priced at $25.00 is reduced to $22.50 by the first update. Then it is raised to $24.75 by the second update. Reversing the order of the update statements does not change the problem. The answer given in the course was to use a cursor and update each book one at a time. But by using a CASE expression to replace the IF .. THEN .. ELSE logic, you can do the whole repricing in one statement.
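A hedged sketch of that single UPDATE; the table and column names are assumptions:

```sql
-- both price bands handled in one pass over the table
UPDATE Books
   SET price = CASE WHEN price >= 25.00
                    THEN price * 0.90
                    ELSE price * 1.10
               END;
```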
This requires less code and will run faster. The heuristic is to look for nearly identical SQL statements in the branches of an IF statement and then replace them inside one statement with a CASE expression.

### 8.4.3.2 Use Sequence Tables to Replace Loop Control Flow

A sequence table is a single-column table that contains the integers from 1 to (n), for some value of (n) that is large enough to be useful. One way of generating such a table is a procedural loop that inserts one integer at a time. However, it is faster to write a CROSS JOIN of the table with itself.
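A hedged sketch of the idea; the Sequence name comes from the text, and the rest is assumed:

```sql
CREATE TABLE Sequence
(seq INTEGER NOT NULL PRIMARY KEY CHECK (seq > 0));

-- seed the table with the ten digits
INSERT INTO Sequence (seq)
VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9), (10);

-- one CROSS JOIN turns 10 rows into 100, with no loop at all;
-- repeat the pattern with the new table size as the multiplier
-- (100, then 10,000, ...) to grow the table as large as needed
INSERT INTO Sequence (seq)
SELECT Units.seq + (10 * (Tens.seq - 1))
  FROM Sequence AS Units
       CROSS JOIN Sequence AS Tens
 WHERE Units.seq + (10 * (Tens.seq - 1)) > 10;
```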
This use of CROSS JOINs is another example of how to avoid loops. A weird but useful heuristic is to put the phrase "the set of ..." in front of the nouns in a sentence that describes the problem you are solving. It is bad grammar, but it can help shift your mindset to thinking in terms of sets.

Converting a string with a comma-separated list of values into a proper table with the position and value of each element can be done with a simple WHILE loop that cuts off one substring up to but not including the comma and then converts that substring to an integer. However, the same thing can be done with a Sequence table in a single set-oriented statement.
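A hedged sketch of the set-oriented version; the Sequence table is the one just described, and the InputStrings table and its well-formed, comma-wrapped lists are assumptions:

```sql
-- InputStrings(key_col, input_string) holds CSV lists that start
-- and end with a comma, e.g. ',12,42,' -- assumed well formed
SELECT S1.seq AS place,
       CAST(SUBSTRING(I1.input_string
                      FROM S1.seq + 1
                      FOR S2.seq - S1.seq - 1) AS INTEGER) AS val
  FROM InputStrings AS I1,
       Sequence AS S1,
       Sequence AS S2
 WHERE SUBSTRING(I1.input_string FROM S1.seq FOR 1) = ','
   AND SUBSTRING(I1.input_string FROM S2.seq FOR 1) = ','
   AND S1.seq < S2.seq
   AND NOT EXISTS   -- the bracketing commas must be adjacent ones
       (SELECT *
          FROM Sequence AS S3
         WHERE S3.seq > S1.seq
           AND S3.seq < S2.seq
           AND SUBSTRING(I1.input_string FROM S3.seq FOR 1) = ',');
```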
It makes life easier if the lists in the input strings start and end with a comma. You will also need the Sequence table, which is a set of integers from 1 to (n).

The S1 and S2 copies of Sequence are used to locate bracketing pairs of commas, and the entire set of substrings located between them is extracted and cast as integers in one nonprocedural step. The trick is to be sure that the left-hand comma of the bracketing pair is the closest one to the second comma. The place column tells you the relative position of the value in the input string. The real advantage of the nonprocedural approach comes from modifying this second procedure to handle an entire table whose rows are CSV strings. In fact, the one-row-at-a-time procedure can be replaced with a VIEW instead.

### 8.4.3.3 Use Calendar Tables to Perform Temporal Calculations

#### Rationale:

The first thing to do when you start a new application is to build a Sequence table and a Calendar table. The Calendar table is keyed on a date, and the nonkey columns contain information about that date relative to the enterprise. Is this a workday or a holiday? What is its Julian date number? Which fiscal calendar does it fall in? In short, anything to do with how the enterprise uses time must be detailed.

The table for 20 years of data is only about 7,050 rows, which is nothing. You can look up programming tricks with this table in newsgroups or in Celko (1999).

#### Exceptions:

None

### 8.4.3.4 Consider Auxiliary Tables to Perform Computations

#### Rationale:

If a function or computation returns only a few thousand values, then instead of computing it over and over, put the parameters and the results into an auxiliary table that can be joined to the other tables to get the answer. SQL is good at JOINs but not at computations; play to its strength.

#### Exceptions:

If the computation can be done with simple four-function math, then auxiliary tables could be overkill. If the computation is unpredictable or known to have a huge range, then it might not be possible to put it into an auxiliary table.

# 8.5 Scalar versus Structured Parameters

There are no arrays, lists, or other data structures in Standard SQL-92. There is only one data structure: the table. There are base tables, VIEWs, and derived tables, but the operative word in that list is "table."

Procedural languages depend on other data structures, such as arrays, lists, and records. Newbie programmers who learned to program with such structures want desperately to use them when they get to SQL. The result is that they kludge code with poor performance. Even worse, they use dynamic SQL to construct a statement or an entire program on the fly.

Stored procedure calls expect scalar parameters, not structured or dynamic parameters. By using a few coding tricks, you can still get the advantages of stored procedures and have some flexibility. A typical problem is to pass a list of values to an IN() predicate. The all-too-common kludge is dynamic SQL, which has a string with a list of comma-separated values for the «parameter list».

One answer is to use the code in section 8.4 to put the list into a table and then write a compiled statement against that table. But a better answer is to scrub the list data in the front end and load it into a table with an INSERT INTO statement. The ability to do this will vary with each SQL product, but the standard SQL syntax uses row constructors.
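A hedged sketch of both statements; the Orders and ParamList names, and the host parameters, are assumptions:

```sql
CREATE TABLE ParamList
(param_value INTEGER);  -- scrubbed input list; NULLs pad unused slots

-- the host program binds :p1 .. :p4, padding with NULLs as needed
INSERT INTO ParamList (param_value)
VALUES (:p1), (:p2), (:p3), (:p4);

-- a compiled statement then reads the list from the table
SELECT order_nbr, cust_nbr
  FROM Orders
 WHERE cust_nbr IN (SELECT DISTINCT param_value
                      FROM ParamList
                     WHERE param_value IS NOT NULL);
```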
The VALUES() list has to be a known number of rows, but by putting NULLs or other dummy values in the list, you can get the effect of a dynamic list. You only need to clean them out on the database side, and you can use SELECT DISTINCT to remove duplicate values if needed, as the sketch shows.

# 8.6 Avoid Dynamic SQL

Dynamic SQL is both slow and dangerous. It is also a sign that the programmer did not have a proper design for his or her application and is now turning that job over to any user, present or future. The purpose of dynamic SQL is to build metadata tools, not applications. A metadata tool treats schema objects as schema objects, not as parts of a data model.

## 8.6.1 Performance

A stored procedure will have a cached execution plan in most SQL products, but dynamic SQL has to be prepared repeatedly with each execution. Obviously, this is going to be slower than running compiled code that might already be in main storage. One counterargument is that if the predicates change in some significant way, then recompiling can give a better execution plan. The gist of this execution model is that if I have a predicate with constants instead of parameters, the optimizer can do a better job with it. For example, given a simple query against a personnel table with a parameter ":input_sex_code": if the parameter is male (1, using the ISO sex codes), then a table scan is the best way to process the query; if the parameter is female (2, using the ISO sex codes), then an index is best; if the parameter is anything else, simply return an empty result set.

Obviously, this is implementation dependent. However, more modern optimizers will create several possible execution plans, based on the statistics, and hold them until the parameter is known. In short, we are back to the "Trust the optimizer" rule.

## 8.6.2 SQL Injection

SQL injection is a security attack in which the attacker places SQL code into your procedure and executes it. Whenever you let user input go directly into dynamic SQL in a stored procedure, or into SQL statements generated in client code, you are in danger. A frequently cited example, based on an FAQ at esquel@sommarskog.se, is a function that builds a simple dynamic SQL string from a "custname" parameter. Assume that the input for the parameter comes directly from user input without any filtering or validation, and that a malicious user passes in a value that closes off the intended query and appends destructive statements. The host program will then PREPARE and EXECUTE it, and drop the table for you.

A plain user is not likely to have permissions to drop a table, but an attacker can run all kinds of statements via SQL injection. The attacker looks for inputs that produce a syntax error rather than a runtime error, so that he or she knows there is dynamic SQL on the database side. The attacker then writes the code and, if needed, ends it with semicolons or with the start of a comment that will remove the rest of the query code from compilation. With a little probing, the attacker can find out whether the dynamic SQL is providing a table name and really trash the schema.

The first defense is not to give the users more privileges than are necessary for their jobs. A good heuristic is that plain users should be granted only SELECT privileges on the tables with which they work, but the best defense is not to use dynamic SQL in production code.

CHAPTER 9 Heuristics

THE FOLLOWING TRICKS and heuristics are not exactly mathematically precise scientific methods. In fact, some of them sound pretty weird, but as Larry Constantine once remarked, a method is a list of things that tells you what to do next, when you did not know what to do next, and you hope the method at least gets you to a workable solution, if not a good one.

Let me pick simple programming problems and apply these heuristics as we go along. Consider the "Dance Partner Problem," in which you are given a list of people and their genders, and your task is to pair them into couples.

Then there is the classic Orders problem: given a data model of orders from customers for products from inventory, answer any of several questions. It is not a complete schema, but it will work for demonstration purposes.

# 9.1 Put the Specification into a Clear Statement

This might sound obvious, but the operative word is a _clear_ statement. You need to ask questions at the start. Let me give some examples from actual problem statements having to do with a schema that models a typical orders and order details database:

1. _"I want to see the most expensive item in each order."_ How do I handle ties for the most expensive item? Did you mean the highest unit price or the highest extension (quantity × unit price) on each order?

2. _"I want to see how many lawn gnomes everyone ordered."_ How do I represent someone who never ordered a lawn gnome in the result set? Is that a NULL or a zero? If they returned all of their lawn gnomes, do I show the original order or the net result? Or do I show never having ordered as a NULL and returns as a zero, to preserve information?

3. _"How many orders were over $100?"_ Did you mean strictly greater than $100, or greater than or equal to $100?

In the "Dance Partner" example, we need to ask:

1. How do we pair the couples?

2. What do we do if there are more boys than girls (or vice versa) in the table?

3. Can someone have more than one partner? If so, how do we assign them?

Writing specs is actually harder than writing code. Given a complete, clear specification, the code can almost write itself.

# 9.2 Add the Words "Set of All ..." in Front of the Nouns

The big leap in SQL programming is thinking in sets, not in process steps that handle one unit of data at a time. Phrases like "for each x ..." poison your mental model of the problem. Look for set characteristics, not individual characteristics. For example, given the task of finding all of the orders that contain exactly the same quantity of each item, how would you solve it?

One approach is, for each order, to see if there are two values of quantity that are not equal to each other and then reject that order. This leads to either cursors or a self-join. Alternatively, you can look at each order as a set and test a property of the whole set. Both the self-join version and the set-property version are sketched below; I will not do the cursor version.
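A hedged sketch of both, with an assumed OrderDetails table:

```sql
-- self-join: reject an order when two of its rows disagree
SELECT DISTINCT D1.order_nbr
  FROM OrderDetails AS D1
 WHERE NOT EXISTS
       (SELECT *
          FROM OrderDetails AS D2
         WHERE D2.order_nbr = D1.order_nbr
           AND D2.qty <> D1.qty);

-- set property: the extreme quantities in each order are equal
SELECT order_nbr
  FROM OrderDetails
 GROUP BY order_nbr
HAVING MIN(qty) = MAX(qty);
```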
# 9.3 Remove Active Verbs from the Problem Statement

Words like _traverse_, _compute_, or other verbs that imply a process will poison your mental model. Try to phrase the problem as a "state of being" description instead. This is the same idea as in section 9.2, but with a slight twist. Programmers coming from procedural languages think in terms of actions. They add numbers, whereas a declarative programmer looks at a total. They think of process, whereas we think of completed results.

# 9.4 You Can Still Use Stubs

A famous Sydney Harris cartoon shows the phrase "Then a miracle occurs" in the middle of a blackboard full of equations, and a scientist says to the writer, "I think you should be more explicit here in step 2."

We used that same trick in procedural programming languages by putting in a stub module when we did not know what to do at some point in a program. For example, if you were writing a payroll program and the company had a complex bonus policy that you did not understand or have specifications for, you would write a stub procedure that always returned a constant value and perhaps sent out a message that it had just executed. This allowed you to continue with the parts of the procedure that you did understand.

This is more difficult to do in a declarative language. Procedural language modules can be loosely coupled, whereas the clauses and subqueries of a SELECT statement are a single unit of code. You can set up a "test harness" for procedural language modules; this is more difficult in SQL.

Looking at the "Dance Partner Problem," I might approach it by saying that I need the boys and the girls in two separate subsets, but I don't know how to write the code for that yet. So I stub it with some pseudocode in my text editor. Because this is for a dance, let's pick the pseudocode words from a musical; nobody is going to see this scratch paper, so why not? The «guys» and «dolls» placeholders might expand to multiple columns, subqueries, or just about anything later. Right now they are placemarkers. I also have a "??" placemarker for the relationship between my guys and dolls. I can then go to the next level of nesting and expand the «guys» subquery; the same pattern holds for the «dolls» subquery.

I now need to figure out some way of getting code for the "??" relationship. The first place I look is the columns that appear in the People table. The only thing I can find in that table is gender. I have a rule that tells me guys = 1 and dolls = 2, and I am enforcing it in my subqueries already. (Note: the full ISO sex codes are 0 = unknown, 1 = male, 2 = female, and 9 = lawful persons, corporations, etc.) I could try joining the subsets on gender alone, but it is pretty easy to see that this is a CROSS JOIN in thin disguise. Add something with the names, perhaps? There is no help there. It produces a smaller set of pairs, but you still get multiple couples per person on the dance floor. This is where some experience with SQL helps. One of the customary programming tricks is to use a self-join to get a ranking of the elements in a set based on their collation sequence. Because this works with any table, we can use it in both «guys» and «dolls» to get the final query.

# 9.5 Do Not Worry about Displaying the Data

In a tiered architecture, display is the job of the front end, not the database. Obviously, you do not do rounding, add leading zeros, change case, or pick a date format in the database. The important thing is to pass the front end all of the data it needs to do its job, but it is more than that. You can get your dance partner pairs with the query in section 9.4, but if you do not want to see the pairs on the same row, you can write a more compact query.
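A hedged sketch of that compact form; the People table and its columns are assumptions from the problem statement:

```sql
-- one person per row, ranked alphabetically within each gender;
-- equal ranks across the two genders pair off as dance partners
SELECT P1.person_name, P1.gender,
       (SELECT COUNT(*)
          FROM People AS P2
         WHERE P2.gender = P1.gender
           AND P2.person_name <= P1.person_name) AS couple_nbr
  FROM People AS P1;
```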
Notice that both solutions can leave unpaired people toward the end of the alphabet.

You can add an ORDER BY clause to the cursor that passes the result set to the front-end program in a simple client/server system, but in architectures with multiple tiers, sorting and other display functions might be performed differently in several places. For example, the same data might be displayed in English units sorted by division in the United States but in SI units sorted by country in Europe.

# 9.6 Your First Attempts Need Special Handling

Henry Ledgard (1976) put it very nicely:

Pruning and restoring a blighted tree is almost an impossible task. The same is true of blighted computer programs. Restoring a structure that has been distorted by patches and deletions, or fixing a program with a seriously weak algorithm, isn't worth the time. The best that can result is a long, inefficient, unintelligible program that defies maintenance. The worst that could result, we dare not think of.

This is especially true with SQL, but restarting your first attempts is handled differently in DDL and in DML because of the different natures of the two sublanguages. DDL execution is static once it is put into place, whereas DML is dynamic. That is, if I issue the same CREATE command, it will have the same results each time, but if I issue the same SELECT, INSERT, UPDATE, or DELETE, the execution plan could change each time.

## 9.6.1 Do Not Be Afraid to Throw Away Your First Attempts at DDL

Bad DDL will distort all of the code based on it. Just consider our little "Dance Partner" schema: what if a proprietary BIT data type had been used for gender? The code would not port to other SQL dialects. The host languages would have to handle low-level bit manipulation. It would not interface with other data sources that use the ISO standards.

Designing a schema is hard work. It is unlikely that you will get it completely right in one afternoon. Rebuilding a database will take time and require fixing existing data, but the other choices are worse.

When I lived in Salt Lake City, Utah, a programmer I met at a user group meeting had gotten into this situation: the existing database was falling apart as the workload increased, thanks to poor design at the start. The updates and insertions for a day's work were by then taking almost 24 hours, and the approaching disaster was obvious to the programmers. Management had no real solution, except to yell at the programmers. They used the database to send medical laboratory results to hospitals and doctors.

A few months later, I got to see how an improperly declared column resulted in the wrong quantities of medical supplies being shipped to an African disaster area. The programmer had tried to save a little space by violating first normal form, putting the package sizes into one column and pulling them out with SUBSTRING() operations. The suppliers later agreed to package smaller quantities to help with the fantastic expense of shipping to a war zone. Now the first "subfield" in the quantity column was one unit and not five, but the tightly coupled front end did not know this. Would you like to pick which four children will die because of sloppy programming? Do you see what Ledgard meant by the last sentence of his quote?

## 9.6.2 Save Your First Attempts at DML

Bad DML can run several orders of magnitude slower than good DML. The bad news is that it is difficult to tell what is good and what is bad in SQL.
The procedural programmers had a deterministic environment in which the same program ran the same way every time. SQL decides how to execute a query based on statistics about the data and the resources available, and these can and do change over time. Thus, what is the best solution today could be a poor solution tomorrow.

In 1988, Pascal (1988) published a classic article on the PC database systems of the time. He constructed seven logically equivalent queries for the same database. Both the database and the query set were simple and were run on the same hardware platform to get timings.

The Ingres optimizer was smart enough to find the equivalence, used the same execution plan for every query, and gave the best performance for all of them. The other products of the time gave uneven performances, with the worst timings an order of magnitude or more slower than the best. In the case of Oracle, the worst timing was more than 600 times the best.

I recommend that you save your working attempts so that you can reuse them when the world and/or your optimizer change. The second example for the "Dance Partner" in section 9.5 does a nice job of illustrating this heuristic. Put the code for one of the queries in as a comment, so the maintenance programmer can find it.

# 9.7 Do Not Think with Boxes and Arrows

This is going to sound absolutely insane, but some of us like to doodle when we are trying to solve a problem. Even an informal diagram can be a great conceptual help, especially when you are learning something new. We are visual creatures.

The procedural programmers had the original ANSI X3.5 Flowchart symbols as an aid to their programming. This standard was a first, crude attempt at a visual tool, and it led to the Structure Charts and Data Flow Diagrams (DFDs) of the 1970s. All of these tools are based on "boxes and arrows"—they show the flow of data and/or control in a procedural system. If you use the old tools, you will tend to build the old systems. You might write the code in SQL, but the design will tend toward the procedural.

# 9.8 Draw Circles and Set Diagrams

If you use set-oriented diagrams, you will tend to produce set-oriented solutions. For example, draw a GROUP BY as small, disjoint circles inside a larger containing circle so you see them as subsets of a set. Use a time line to model temporal queries. In a set-oriented model, nothing flows; it exists in a state defined by constraints.

Probably the clearest example of "boxes and arrows" versus "set diagrams" is the Adjacency List model versus the Nested Sets model for trees. You can Google these models or buy a copy of my book _Trees and Hierarchies in SQL for Smarties_ for details. The diagrams for each approach are shown in Figure 9.1.

Figure 9.1 Adjacency List versus Nested Sets trees.

# 9.9 Learn Your Dialect

Although you should always try to write Standard SQL, it is also important to know which constructs your particular dialect and release favor. For example, constructing indexes and keys is important in older products that are based on sequential file structures. At the other extreme, the Nucleus engine from Sand Technology represents the entire database as a set of compressed bit vectors and has no indexing, because in effect everything is automatically indexed.

# 9.10 Imagine That Your WHERE Clause Is "Super Ameba"

That is the weirdest title in this chapter, so bear with me. Your "Super Ameba" computer can split off a new processor at will and assign it a task, in a massively parallel fashion.
Imagine that every row in the working table that was built in the FROM clause is allocated one of these "ameba processors," which will test the WHERE clause search condition on just that row. This is a version of Pournelle's rule: "one task, one processor."

If every row in your table can be independently tested against simple, basic search conditions, then your schema is probably a good relational design. But if a row needs to reference other rows in the same table, consult an outside source, or cannot answer those simple questions, then you probably have some kind of normalization problem.

You have already seen the Nested Sets model and the Adjacency List model for trees. Given one row in isolation from the rest of the table, can you answer a basic node question about the tree being modeled? This leads to asking: what are the basic questions? Here is a short list that applies to trees in graph theory.

1. Is this a leaf node?

2. Is this the root node?

3. How big is the subtree rooted at this node?

4. Given a second node in the same tree, is this node superior, subordinate, or at the same level as my node?

Question 4 is particularly important, because it is the basic comparison operation for hierarchies. As you can see, the Nested Sets model can answer all of these questions and more, whereas the Adjacency List model can answer none of them.

# 9.11 Use the Newsgroups and Internet

The Internet is the greatest resource in the world, so learn to use it. You can find a whole range of newsgroups devoted to your particular product or to more general topics. When you ask a question on a newsgroup, please post DDL, so that people do not have to guess what the keys, constraints, Declarative Referential Integrity, data types, and so forth in your schema are. Sample data is also a good idea, along with clear specifications that explain the results you wanted.

Most SQL products have a tool that will spit out DDL in one keystroke. Unfortunately, the output of these tools is generally less than human-readable. You should prune the real tables down to just what is needed to demonstrate your problem: there is no sense in posting a 100-column CREATE TABLE statement when all you want is two columns. Then clean up the constraints and other things in the output using the rules given in this book. You are asking people to do your job for you for free. At least be polite enough to provide them with sufficient information.

If you are a student asking people to do your homework for you, please be advised that presenting the work of other people as your own is a valid reason for expulsion and/or failure at a university. When you post, announce that this is homework and give the name of your school, your class, and your professor. This will let people verify that your actions are allowed.

CHAPTER 10 Thinking in SQL

_"It ain't so much the things we don't know that get us into trouble. It's the things we know that just ain't so."_

—Artemus Ward (Charles Farrar Browne), American humorist (1834–1867)

THE BIGGEST HURDLE in learning SQL is thinking in sets and logic instead of in sequences and processes. I just gave you a list of heuristics in the previous chapter, but let's take a little time to analyze why mistakes were made. You now have some theory, but can you do diagnostics?

I tried to find common errors that new programmers make, but perhaps the most difficult thing to learn is thinking in sets. Consider the classic puzzle shown in Figure 10.1.
Figure 10.1 Classic block puzzle.

The usual mistake people make is trying to count the 1 × 1 × 2 bricks one at a time. This requires the ability to make a three-dimensional mental model of the boxes, which is really difficult for most of us.

The right approach is to look at the whole block as if it were completely filled in. It is 4 × 5 × 5 = 100 unit cubes, which is 50 of the two-unit bricks. The corner that is knocked off is 3 bricks, which we can count individually, so we must have 47 bricks in the block. The arrangement inside the block does not matter at all.

All of these examples are based on actual postings in a newsgroup that have been translated into SQL/PSM to remove proprietary features. In some cases, I have cleaned up the data element names, and in others I have left them. Obviously, I am guessing at the motivation for each example, but I think I can defend my reasoning.

# 10.1 Bad Programming in SQL and Procedural Languages

As an example of not learning any relational approaches to a problem, consider a posting in the comp.databases.ms-sqlserver newsgroup in January 2005. The title was "How to Find a Hole in Records," which already tells you that the poster is thinking in terms of a file system and not an RDBMS.

The original table declaration had the usual newbie "id" column, without a key or any constraints. The table modeled a year's worth of rows identified by a week-within-year number (1 to 53) and a day-of-the-week number (1 to 7): an auto-numbered id, a week number, and a day number, with nothing to protect the data. By removing the useless, proprietary id column and adding constraints, we then had a proper table (a sketch appears a little further below).

Despite giving some constraints in the narrative specification, the poster never bothered to apply them to the table declaration. Newbies think of a table as a file, not as a set. The only criterion data must meet to be put into a file is that it is written to that file. The file cannot validate anything. The proprietary auto-number acts as a replacement for the nonrelational record number in a sequential file system.

The problem was to find the earliest missing day within each week for inserting a new row. If there was some other value or measurement for that date being recorded, it was not in the specifications. The poster's own T-SQL solution, translated into SQL/PSM with some name changes, stepped through the day numbers one at a time in a loop.

This is a classic imitation of a FOR loop, or counting loop, used in all 3GL programming languages. However, if you look at it for two seconds, you will see that this is bad procedural programming! SQL will not make up for a lack of programming skills. In fact, the bad effects of mimicking 3GL languages in SQL are magnified. The optimizers and compilers in SQL engines are not designed to look for procedural code optimizations. At the very least, the poster should have removed the redundant local variables and gotten rid of the hidden GOTO statements in favor of a simple, classic structured loop.

This points out another weakness in the posting: we were not told how to handle a week that already has all seven days represented. In the original table design, any integer value would have been accepted because of the lack of constraints. In the revised DDL, any weekday value not between 1 and 7 will violate the table's constraints. This is not the best solution, but it at least follows the specs that were given without making too many guesses as to what should have been done.

But can we do this without a loop and get a pure, nonprocedural SQL solution?
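Before answering that, here is a hedged sketch of the repaired DDL; the names WeeklySchedule, week_nbr, and day_nbr are my assumptions standing in for the cleaned-up names:

```sql
-- One row per (week, day) actually worked; the constraints, not
-- the application code, now guard the ranges and the uniqueness.
CREATE TABLE WeeklySchedule
(week_nbr INTEGER NOT NULL
    CHECK (week_nbr BETWEEN 1 AND 53),
 day_nbr INTEGER NOT NULL
    CHECK (day_nbr BETWEEN 1 AND 7),
 PRIMARY KEY (week_nbr, day_nbr));
```

The constraints now protect the data, but the question of replacing the loop still stands.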
Yes, there are several ways. Because the purpose of finding this weekday number is to insert a row into the table, why not do both in one statement, instead of finding the number in a function and then doing the insertion in another procedural step? Think at the level of a whole process and not in sequential steps.

The first answer, a CASE expression that tests the day numbers in order, is ugly looking and difficult to generalize, but it is fast if the optimizer factors out the tabular subquery in the WHEN clauses and computes it once. It also uses no local variables.

The thought process was to get the entire set of weekday numbers present in the week, and then compare them to each value in an ordered list. The CASE expression is just a way to hide that list. Although it is a step forward, it is not yet really a set-oriented solution.

Another version uses a table constructor (the first sketch below, after section 10.2). This is more compact and easy to generalize, and here we are actually using a set-oriented solution! We are subtracting the set of actual days from the set of all possible days, and then looking at the minimum value in the result to get an answer.

You can also use a pure set-operations approach. The set difference operator can remove all of the numbers that are present, so that we can pick the minimum value from the leftovers.

If all seven days are present, we will get an empty set, which will return a NULL for the day_nbr, and the NULL will violate the primary-key constraint.

A third, generalized version uses a Sequence auxiliary table to provide any range of integers desired. Just remember that the DDL also has to match that change.

In the case of only seven values, there is not going to be a huge difference in performance among any of these answers. However, with a huge number of values, the use of hashing or bit-vector indexes would be a noticeable improvement over a loop.

# 10.2 Thinking of Columns as Fields

The original code was actually much worse, because the poster wanted to create and drop tables on the fly. The purpose was to load totals into a summary report table.

Why did the poster create a dozen local variables and then use scalar subqueries to load them? The poster was still thinking in terms of a 3GL programming language. In COBOL or another 3GL, the file containing the Construction Survey data would be read one record at a time, and then each record would be read one field at a time, from left to right. A sequence of IF-THEN statements would look at the fields and increment the appropriate counters. When the entire file had been read, the results would be written to the working file for the survey summary.

The poster looked at each column as if it were a field and asked how to get the value for it, in isolation from the whole. The poster had seen the use of a subquery expression and implemented it that way. The subqueries will not be well optimized, so this will actually run longer than if the poster had used SQL/PSM to mimic the classic COBOL program for this kind of summary.

Without repeating a dozen columns, a set-oriented solution builds each summary row as a whole, in a single pass (the second sketch below).

The trick was to ask what you want in each row of a summary table, as a completed unit of work, and not to start at the column level. The answer is a tally of answers to some questions. The word _tally_ leads you to SUM() or COUNT(), and you remember the trick with the CASE expression.

The final question is: why not use a VIEW to get the summary instead of a procedure?
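The following sketches reconstruct the flavor of the two set-oriented solutions referenced above. They are illustrations, not the original listings: WeeklySchedule comes from the earlier sketch, the Survey and SurveySummary names are invented for section 10.2, and the colon-prefixed names are host-variable placeholders. The table constructor in the FROM clause is Standard SQL, though not every dialect supports it:

```sql
-- Missing-day insertion: subtract the set of days already present
-- in the week from the set of all seven possible days, then take
-- the earliest leftover. An empty difference makes MIN() return
-- NULL, which the key will reject, as described above. Replacing
-- the NOT IN predicate with an EXCEPT gives the pure
-- set-operation flavor of the same statement.
INSERT INTO WeeklySchedule (week_nbr, day_nbr)
SELECT :my_week, MIN(D.day_nbr)
  FROM (VALUES (1), (2), (3), (4), (5), (6), (7)) AS D (day_nbr)
 WHERE D.day_nbr
       NOT IN (SELECT W.day_nbr
                 FROM WeeklySchedule AS W
                WHERE W.week_nbr = :my_week);

-- Section 10.2's summary: tally a whole report row in one pass
-- with the CASE-expression trick, instead of a dozen scalar
-- subqueries loaded into local variables.
INSERT INTO SurveySummary (yes_cnt, no_cnt, no_answer_cnt)
SELECT SUM(CASE WHEN answer = 'yes' THEN 1 ELSE 0 END),
       SUM(CASE WHEN answer = 'no' THEN 1 ELSE 0 END),
       SUM(CASE WHEN answer IS NULL THEN 1 ELSE 0 END)
  FROM Survey;
```

Either SELECT could just as easily sit behind a VIEW, which answers the final question above.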
# 10.3 Thinking in Processes, Not Declarations

This is a simple schema for checking items out of an inventory. The original schema lacked keys and constraints, which had to be added to give us a Users table that carries each user's reservation limit and a Reservations table holding one row for each item a user currently has reserved.

The original narrative specification was:

Each user can reserve a maximum of (n) items. Whenever a user reserves something, the "max_reserves" field [sic] of the user is retrieved and checked. Then a record [sic] is inserted into the Reservations table, and the "max_reserves" field [sic] of the user is updated accordingly. I would like to ask if there is a better way to implement this system, because there is a chance that the user reserves more than the maximum number if he or she is logged in from two computers.

The first proposal was a stored procedure that took the maximum number of items as a parameter, counted the user's current reservations into a local variable, compared the two, and then did the insertion. Passing the maximum number of items as a parameter makes no sense, because you have to look it up anyway; a parameter will let you pass any value you desire. Having a local variable for the count is redundant; SQL is orthogonal, and a scalar subquery can be used wherever a scalar variable can.

Rows are not records, and columns are not fields. SQL is a declarative language, not a procedural one. So a sequence of procedural steps like "retrieve → check → insert → update" does not make sense. Instead, you say that you make a reservation such that the user is not over his or her limit. Think of logic, not process.

Instead of recording the tally of reserved items in local storage, you can get it with a subquery expression (a sketch appears below, near the end of section 10.4). In fact, you might want to have a view to use for reports.

# 10.4 Thinking the Schema Should Look Like the Input Forms

There are several versions of this error. The easiest one is a simple timecard form that gets modeled exactly as it is printed on the paper form, with one row for each punch of the clock.

But to answer even basic questions, you have to match up the in and out times. Dr. Codd (1979) described a row in an RDBMS as containing a fact, but more than that, it should contain a whole fact and not half of one. The "half-fact" that John showed up at the job at 09:00 Hrs has nothing to do with paying him. I need to know that John was on the job from 09:00 to 17:00 Hrs. The correct design holds a whole fact in each row: one row per shift, with both an in_time and an out_time column.

Many new SQL programmers are scared of NULLs, but this is a good use of them. We do not know the future, so we cannot assign a value to the out_time until we have that information.

Another common example is a simple order form that is copied directly into DDL: in skeleton form, an Orders table mirroring the header boxes of the paper form and an OrderDetails table with one row per printed line.

The order total can be computed from the order details, so it is redundant in the Orders table; but the total was a box on the paper form, so the newbie put it in the table.

Nobody is actually buying or shipping a line number. Customers are ordering items, but the lines on the paper form are numbered, so the line numbers went into the OrderDetails table. This is dangerous, because if I repeat the same item on another line, I have to consolidate the lines in the database. Otherwise, quantity discounts will be missed, and I am wasting storage with redundant data.

For example, consider two detail rows, each showing a half-fact: one says that I ordered two pairs of lime green pants on my order #123, and the other says that I ordered three pairs. The whole fact is that I ordered five pairs of lime green pants on my order #123.

In 2004, I pointed this out to a programmer who had such a schema.
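Before that story continues, here is a hedged sketch of the declarative reservation from section 10.3. It assumes a Users (user_id, max_reserves) table and a Reservations (user_id, item_id) table; the names and the colon-prefixed host variables are illustrative only:

```sql
-- Insert the reservation only while the user stays within his or
-- her limit; the tally and the insertion are one unit of work,
-- with no local variables and no procedural steps.
INSERT INTO Reservations (user_id, item_id)
SELECT U.user_id, :my_item
  FROM Users AS U
 WHERE U.user_id = :my_user
   AND (SELECT COUNT(*)
          FROM Reservations AS R
         WHERE R.user_id = U.user_id) < U.max_reserves;
```

Run at an appropriate isolation level, this single statement also closes the two-computers loophole that worried the poster. Now, back to the programmer with the line numbers in her order details.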
She insisted that they needed the line numbers to be able to reproduce the original order exactly as it was keyed in, but then, in a later posting in the same thread, she complained that her people were spending hours every day verifying the quantities of items in the orders they received, because their suppliers did not use the proper model to present a consolidated, sorted display of the data.

APPENDIX Resources

# Military Standards

DoD 8320.1-M-1, "Data Element Standardization Procedures"

DoD Directive 8320.1, "DoD Data Administration"

# Metadata Standards

A short summary of the NCITS L8 Metadata Standards Committee rules for data elements, a PDF version, and the current draft of the standard are available online.

# ANSI and ISO Standards

The SI Basics ("Metric System")

ISO 31, "Quantities and Units" (14 parts)

ISO 1000, "SI Units and Recommendations for the Use of Their Multiples and of Certain Other Units for the Application of the SI"

ISO 2955, "Information Processing—Representation of SI and Other Units for Use in Systems with Limited Character Sets"

A guide to both ISO 31 and ISO 1000 is available for purchase from ISO.

ISO 639-1:2002, "Codes for the Representation of Names of Languages—Part 1: Alpha-2 Code"

ISO 639-2:1998, "Codes for the Representation of Names of Languages—Part 2: Alpha-3 Code"

The language codes are also available online.

ISO 3166, "Codes for the Representation of Names of Countries"

This standard provides a unique two-letter code for each country and a three-letter code for special uses. A three-digit numeric code is also given, intended as an alternative for applications that need to be independent of the alphabet or to save bits in computer storage.

ISO 4217:2001, "Codes for the Representation of Currencies and Funds"

IBAN: International Bank Account Number. See the European Committee for Banking Standards Web site for publications.

ISO 8601, "Data Elements and Interchange Formats—Information Interchange—Representation of Dates and Times"

# U.S. Government Codes

NAICS: North American Industry Classification System. This system replaced the old Standard Industrial Classification (SIC) system.

NAPCS: North American Product Classification System

TIGER: Topologically Integrated Geographic Encoding and Referencing system. This is how the census views geography and reports data. It is available in electronic formats.

DOT: Dictionary of Occupational Titles. This is the U.S. Department of Labor encoding system; some of the codes can be viewed online.

# Retail Industry

**_EAN: European Article Number, now combined with the UPC codes_**

ISO/IEC 15418:1999, "EAN/UCC Application Identifiers and Fact Data Identifiers and Maintenance"

ISO/IEC 15420:2000, "Automatic Identification and Data Capture Techniques—Bar Code Symbology Specification—EAN/UPC"

Bar Code Détente: U.S.
Finally Adds One More Digit, the _New York Times_, July 12, 2004, by Steve Lohr; http://www.nytimes.com/2004/07/12/business/12barcode.html?ex=1090648405&ei=l&en=202cb9baba72e846

**_VIN: Vehicle Identification Number_**

ISO 3779:1983, "Vehicle Identification Number (VIN)"

ISO 4030:1983, "Vehicle Identification Number (VIN)—Location and Attachment"

ISO/TR 8357:1996, "Instructions for the implementation of the assignment of world manufacturer identifier (WMI) codes for vehicle identification number (VIN) systems and for world parts manufacturer identifier (WPMI) codes" (available in English only)

A good news article on the changes that are coming to the VIN: http://www.cars.com/news/stories/070104_storya_dn.jhtml?page=newsstory&aff=national

An explanation of the ISO tire sizes is also available online.

**_ISBN: International Standard Book Number_**

An online converter is available for the new 13-digit ISBN, which is based on the change from 10-digit UPC codes to 13-digit EAN codes in the retail industry on January 1, 2005.

# Code Formatting and Naming Conventions

You can find other opinions in the following sources:

Gulutzan, P. "SQL Naming Conventions."

Bryzek, M. "Constraint Naming Standards."

Celko, J. "Ten Things I Hate about You." http://www.intelligententerprise.com/001205/celkol_l.jhtml?_requestid=304726

ISO/IEC. IS 11179-5, Information Technology Specification and Standardization of Data Elements: Part 5, Naming and Identification Principles for Data Elements. http://metadata-standards.org/Document-library/Draft-standards/11179-Part5-Naming&Identification/

Jones, S. "Standards Part 1—Abbreviated Programming."

Karbowski, J. J. "Naming Standards beyond Programming."

Koch, G., and K. Loney. _Oracle8i: The Complete Reference_ (3rd ed.). Emeryville, CA: Osborne McGraw-Hill, 2000.

Kondreddi, N. V. "Database Object Naming Conventions."

Mullins, C. "What's in a Name?"

Mullins, C.

Sheppard, S. "Oracle Naming Conventions."

Appendix Bibliography

# Reading Psychology

Fisher, D. "Reading and Visual Search," _Memory and Cognition_, 3, 188–196, 1975.

Mason, M. "From Print to Sound in Mature Readers as a Function of Reader Ability and Two Forms of Orthographic Regularity," _Memory and Cognition_, 6, 568–581, 1978.

Meyer, D. E., and K. D. Gutschera. "Orthographic versus Phonemic Processing of Printed Words," Psychonomic Society Presentation, 1975.

Pollatsek, A., A. D. Well, and R. M. Schindler. "Effects of Segmentation and Expectancy on Matching Time for Words and Nonwords," _Journal of Experimental Psychology: Human Perception and Performance_, 1, 328–338, 1975.

Saenger, P. _Space Between Words: The Origins of Silent Reading_. Palo Alto, CA: Stanford University Press, 1975.

# Programming Considerations

Arthur, J. _Measuring Programmer Productivity and Software Quality_. New York: John Wiley & Sons, 1985.

Baecker, R. "Enhancing Program Readability and Comprehensibility with Tools for Program Visualization," _Proceedings of the 10th International Conference on Software Engineering_, 356–366, April 11–15, 1988, Singapore.

Berry, R. E., and A. E. Meekings. "A Style Analysis of C Programs," _Communications of the ACM_, 28(1), 80–88, January 1985.

Brooks, R. "Studying Programmer Behavior Experimentally: The Problems of Proper Methodology," _Communications of the ACM_, 23(4), 207–213, April 1980.

Celko, J. "Observations about Student Programming Practices," _Structured Programming_, Fall 1989, p. 215.

Celko, J. _SQL for Smarties_ (3rd ed.).
San Francisco: Morgan-Kaufmann, 2005.

Celko, J. _SQL Puzzles & Answers_. San Francisco: Morgan-Kaufmann, 1997.

Celko, J. _Data & Databases_. San Francisco: Morgan-Kaufmann, 1999.

Celko, J. _Trees & Hierarchies in SQL_. San Francisco: Morgan-Kaufmann, 2004.

Codd, E. F. "Extending the Database Relational Model to Capture More Meaning," _ACM Transactions on Database Systems_, 4(4), 397–434, December 1979.

Cooper, D., and M. J. Clancy. _Oh! Pascal!_ New York: W. W. Norton, 1985.

Fairley, R. _Software Engineering Concepts_. Boston: McGraw-Hill, 1985.

Gilmore, D. J., and R. G. Green. "Comprehension and Recall of Miniature Programs," _International Journal of Man-Machine Studies_, 21(1), 31–48, July 1984.

Grogono, P. "On Layout, Identifiers and Semicolons in Pascal Programs," _ACM SIGPLAN Notices_, 14(4), 35–40, April 1979.

Kernighan, B., and P. J. Plauger. _The Elements of Programming Style_. Boston: McGraw-Hill, 1982.

Ledgard, H. _Programming Proverbs_. Rochelle Park, NJ: Hayden Books, 1975.

Ledgard, H., and L. J. Chmura. _Fortran with Style: Programming Proverbs_. Indianapolis, IN: Sams, 1978.

Ledgard, H., and J. Tauer. _Professional Software, Volume 2: Programming Practice_. Boston: Addison-Wesley Longman, 1987.

McCabe, T. "A Complexity Measure," _IEEE Transactions on Software Engineering_, 1976.

McKeithen, K., J. Reitman, H. Rueter, and S. Hirtle. "Knowledge Organization and Skill Differences in Computer Programmers," _Cognitive Psychology_, 13, 307–325, 1981.

Meekings, B. "Style Analysis of Pascal Programs," _ACM SIGPLAN Notices_, 18(9), 45–54, September 1983.

Miller, G. A. "The Magical Number Seven, Plus or Minus Two: Some Limits on Our Capacity for Processing Information," _The Psychological Review_, 1956.

Oman, P., and C. Cook. "A Taxonomy for Programming Style," _Proceedings of the 1990 ACM Annual Conference on Cooperation_, February 20–22, 1990, Washington, DC.

Oman, P., and C. Cook. "A Paradigm for Programming Style Research," _ACM SIGPLAN Notices_, 23(12), 69–78, December 1988.

Oman, P., and C. Cook. "Programming Style Authorship Analysis," _Proceedings of the 17th Annual ACM Conference on Computer Science: Computing Trends in the 1990s_, Louisville, Kentucky, 320–326, February 1989.

Oman, P., and C. Cook. "Typographic Style Is More Than Cosmetic," _Communications of the ACM_, 33(5), 506–520, May 1990.

Pascal, F. "SQL Redundancy and DBMS Performance," _Database Programming & Design_, 1(12), 22–28, December 1988.

Pressman, R. S. _Software Engineering: A Practitioner's Approach_ (2nd ed.). Boston: McGraw-Hill, 1986.

Redish, K., and W. Smyth. "Program Style Analysis: A Natural By-Product of Program Compilation," _Communications of the ACM_, 29(2), 126–133, February 1986.

Rees, M. J. "Automatic Assessment Aids for Pascal Programs," _ACM SIGPLAN Notices_, 17(10), 33–42, October 1982.

Sheil, B. A. "The Psychological Study of Programming," _ACM Computing Surveys_ (CSUR), 13(1), 101–120, March 1981.

Weinberg, G. _The Psychology of Computer Programming: Silver Anniversary Edition_. New York: Dorset House, 1998.

Weissman, L. "Psychological Complexity of Computer Programs: An Experimental Methodology," _ACM SIGPLAN Notices_, 9(6), 25–36, June 1974.
ABOUT THE AUTHOR

**Joe Celko** is a noted consultant and lecturer, and one of the most widely read SQL authors in the world. He is well known for his 10 years of service on the ANSI SQL standards committee, his column in _Intelligent Enterprise_ magazine (which won several Reader's Choice Awards), and the war stories he tells to provide real-world insights into SQL programming. His best-selling books include _Joe Celko's SQL for Smarties: Advanced SQL Programming, second edition; Joe Celko's SQL Puzzles and Answers;_ and _Joe Celko's Trees and Hierarchies in SQL for Smarties._
diff --git a/kag/examples/csqa/builder/data/linux_kernel_networking.txt b/kag/examples/csqa/builder/data/linux_kernel_networking.txt new file mode 100644 index 00000000..cea7b922 --- /dev/null +++ b/kag/examples/csqa/builder/data/linux_kernel_networking.txt @@ -0,0 +1,30606 @@

Linux Kernel Networking

Rami Rosen, _Linux Kernel Networking: Implementation and Theory_, DOI 10.1007/978-1-4302-6197-1, © Apress 2014

Rami Rosen

Linux Kernel Networking: Implementation and Theory

Rami Rosen

ISBN 978-1-4302-6196-4; e-ISBN 978-1-4302-6197-1

© Apress 2014

Linux Kernel Networking: Implementation and Theory

President and Publisher: Paul Manning

Lead Editor: Michelle Lowman

Technical Reviewer: Brendan Horan

Editorial Board: Steve Anglin, Ewan Buckingham, Gary Cornell, Louise Corrigan, James DeWolf, Jonathan Gennick, Jonathan Hassell, Robert Hutchinson, Michelle Lowman, James Markham, Matthew Moodie, Jeff Olson, Jeffrey Pepper, Douglas Pundick, Ben Renow-Clarke, Dominic Shakeshaft, Gwenan Spearing, Matt Wade, Steve Weiss, Tom Welsh

Coordinating Editor: Kevin Shea

Copy Editor: Corbin Collins

Compositor: SPi Global

Indexer: SPi Global

Artist: SPi Global

Cover Designer: Anna Ishchenko

Distributed to the book trade worldwide by Springer Science+Business Media New York, 233 Spring Street, 6th Floor, New York, NY 10013. Phone 1-800-SPRINGER, fax (201) 348-4505, e-mail orders-ny@springer-sbm.com, or visit www.springeronline.com.

For information on translations, please e-mail rights@apress.com, or visit www.apress.com.

Apress and friends of ED books may be purchased in bulk for academic, corporate, or promotional use. eBook versions and licenses are also available for most titles. For more information, reference our Special Bulk Sales–eBook Licensing web page at www.apress.com/bulk-sales.

Any source code or other supplementary materials referenced by the author in this text is available to readers at www.apress.com. For detailed information about how to locate your book's source code, go to www.apress.com/source-code.

This work is subject to copyright. All rights are reserved by the Publisher, whether the whole or part of the material is concerned, specifically the rights of translation, reprinting, reuse of illustrations, recitation, broadcasting, reproduction on microfilms or in any other physical way, and transmission or information storage and retrieval, electronic adaptation, computer software, or by similar or dissimilar methodology now known or hereafter developed. Exempted from this legal reservation are brief excerpts in connection with reviews or scholarly analysis or material supplied specifically for the purpose of being entered and executed on a computer system, for exclusive use by the purchaser of the work. Duplication of this publication or parts thereof is permitted only under the provisions of the Copyright Law of the Publisher's location, in its current version, and permission for use must always be obtained from Springer. Permissions for use may be obtained through RightsLink at the Copyright Clearance Center. Violations are liable to prosecution under the respective Copyright Law.

Trademarked names, logos, and images may appear in this book.
Rather than use a trademark symbol with every occurrence of a trademarked name, logo, or image, we use the names, logos, and images only in an editorial fashion and to the benefit of the trademark owner, with no intention of infringement of the trademark. The use in this publication of trade names, trademarks, service marks, and similar terms, even if they are not identified as such, is not to be taken as an expression of opinion as to whether or not they are subject to proprietary rights. While the advice and information in this book are believed to be true and accurate at the date of publication, neither the authors nor the editors nor the publisher can accept any legal responsibility for any errors or omissions that may be made. The publisher makes no warranty, express or implied, with respect to the material contained herein.

To Dr. Joseph Shapira, Qualcomm Israel Founder and Ex-President, coauthor of "CDMA Radio with Repeaters" (Springer, 2007).

Dr. Ruth Shapira.

Iris & Dr. Shye Shapira, made of the stuff dreams are made of.

—Rami Rosen

Preface

This book takes you on a guided, in-depth tour of the current Linux kernel networking implementation and the theory behind it. For almost a decade, no new book about Linux networking has been written. A decade of dynamic and fast-paced Linux kernel development is quite a long time. There are important kernel networking subsystems that are not described in any other book; for example, IPv6, IPsec, Wireless (IEEE 802.11), IEEE 802.15.4, NFC, InfiniBand, and more. There is also very little information on the Web about the implementation details of these subsystems. For all these reasons, I have written this book.

About ten years ago I took my first steps in kernel programming. I was a developer in a startup taking part in a VoIP project for a Linux-based set-top box (STB). There were crashes in the USB stack with some USB cameras, and we had to delve into the code to try to find a solution, because the vendors of that STB did not want to spend time solving the problem. In fact, it was not that they did not want to; they simply did not know how. In those days, there was almost no documentation about the USB stack. The _Linux Device Drivers_ book from O'Reilly was then only in its second edition (the USB chapter was added only in the third edition). Success in that project was crucial for us as a startup. I learned much about kernel programming in the process of solving the USB crash. Later on we had a project where a NAT traversal solution was needed. The userspace solution was so heavy that the device quickly crashed. When I suggested a kernel solution, my managers were very skeptical, but they did let me try. The kernel solution proved to be very stable and took much less CPU than the userspace solution. Since then I have taken part in many kernel networking projects. This book is the result of my many years of development and research.

## Who This Book Is For

This book is intended for computer professionals, including developers, software architects, designers, project managers, and CTOs, who are working on networking-related projects. These projects can be in a wide range of professional areas, such as communication, data centers, embedded devices, virtualization, security, and more. In addition, students and academic researchers and theorists who deal with networking projects, networking research, or operating systems research will find a lot of help in this book.
## How This Book Is Structured

In Chapter 1 you will find a general overview of the Linux kernel and the Linux network stack. Other topics in this chapter include the implementation of the network device, the socket buffer, and the Rx and Tx paths. Chapter 1 concludes with a section about the Linux Kernel Networking Development Model.

In Chapter 2 you will learn about netlink sockets, which provide a mechanism for bidirectional communication between userspace and the kernel, and which are used by the networking subsystem as well as by other subsystems. You will also find a section in this chapter about generic netlink sockets, which can be perceived as advanced netlink sockets, and which you will encounter in Chapter 12 and while browsing the kernel networking source code.

In Chapter 3 you will learn about the ICMP protocol, which helps to keep the system behaving correctly by sending error and control messages about the network layer (L3). You will learn about the implementation of the ICMP protocol both in IPv4 and in IPv6.

Chapter 4 delves into the IPv4 protocol—the Internet and modern life cannot be described without it. You will learn about the structure of the IPv4 header, about the Rx and Tx paths, about IP options, about fragmentation and defragmentation and why they are needed, and about forwarding packets, which is one of the important tasks of IPv4.

Chapters 5 and 6 are devoted to the IPv4 routing subsystem. In Chapter 5 you will learn how a lookup in the routing subsystem is performed, how the routing tables are organized, which optimizations are used in the IPv4 routing subsystem, and about the removal of the IPv4 routing cache. Chapter 6 discusses advanced routing topics such as Multicast Routing, Policy Routing, and Multipath Routing.

Chapter 7 endeavors to explain the neighbouring subsystem. You will learn about the ARP protocol, which is used in IPv4, about the NDISC protocol used in IPv6, and about some of the differences between the two protocols. You will also learn about the Duplicate Address Detection (DAD) mechanism in IPv6.

Chapter 8 discusses the IPv6 protocol, which seems to be the inevitable solution to the shortage of IPv4 addresses. This chapter describes the implementation of IPv6 and discusses topics such as IPv6 addresses, the IPv6 header and extension headers, autoconfiguration in IPv6, the Rx path, and forwarding. It also describes the MLD protocol.

Chapter 9 deals with the netfilter subsystem. You will learn about netfilter hooks and how they are registered, about Connection Tracking, about IP tables and Network Address Translation (NAT), and about the callbacks used by Connection Tracking and NAT.

Chapter 10 deals with IPsec, one of the most complex networking subsystems. Topics like the IKE protocol (which is implemented in userspace) and the cryptography aspects of IPsec are discussed briefly (full treatment is beyond the scope of the book). You will learn about the XFRM framework, which is the basis of the Linux IPsec subsystem, and about its two most important structures: the XFRM policy and the XFRM state. The ESP protocol is briefly described, as well as the IPsec Rx path and Tx path in transport mode. The chapter concludes with a section about XFRM lookup and a short section about NAT traversal.

Chapter 11 describes four Layer 4 protocols, starting with the most commonly used protocols, UDP and TCP, and concluding with two newer protocols, SCTP and DCCP.

Chapter 12 deals with wireless in Linux (IEEE 802.11).
You will learn about the mac80211 subsystem and its implementation, about various wireless network topologies, about power save mode, and about IEEE 802.11n and packet aggregation. There is also a section devoted to Wireless Mesh networks in this chapter.

Chapter 13 delves into the InfiniBand subsystem, a technology enjoying rising popularity in datacenters. You will learn about the RDMA stack organization, about addressing in InfiniBand, about the organization of InfiniBand packets, and about the RDMA API.

Chapter 14 concludes the book with a discussion of advanced topics such as Linux namespaces (and network namespaces in particular), Busy Poll Sockets, the Bluetooth subsystem, the IEEE 802.15.4 subsystem, the Near Field Communication (NFC) subsystem, the PCI subsystem, and more.

Appendices A, "Linux API," and C, "Glossary," provide complete reference information for many topics discussed in the book. Appendix B, "Network Administration," provides information about various tools that you will need while working with Linux kernel networking.

## Conventions

Throughout the book, I've kept a consistent style. All code snippets, whether inside text paragraphs or on lines of their own, along with library paths, shell commands, URLs, and other code-related elements, are set in monospaced font, like this. New terms are set off in italics, and other emphasis may be given in bold.

About the Author

Rami Rosen is a software engineer and a computer science graduate of the Technion, the Israel Institute of Technology. In the last 17 years he has been a software developer for three innovative startups and a semiconductor company. Rami lives in Israel and has participated in highly advanced Linux kernel projects, in particular those related to networking. He has published several articles and given lectures about Linux kernel networking and virtualization.

About the Technical Reviewer

Brendan Horan is a hardware fanatic, with a full-height rack of all types of machine architectures in his home. He has more than ten years of experience working with large UNIX systems and tuning the underlying hardware for optimal performance and stability. Brendan's love for all forms of hardware has helped him throughout his IT career, from fixing laptops to tuning servers and their hardware to suit the needs of high-availability designs and ultra-low-latency applications. Brendan takes pride in the open source movement and is happy to say that every computer in his house is powered by open source technology. He resides in Hong Kong with his wife, Vikki, who continues daily to teach him more Cantonese.

Acknowledgments

Thanks to my editors for giving me the honor of writing this book; to Michelle Lowman, the lead editor, for believing in this book while it was still just an idea; to Kevin Shea, the coordinating editor, who guided and supported me from the initial stages until the book was fully realized; to Brendan Horan, the technical reviewer, for his helpful comments that greatly improved the book; to Troy Mott, the development editor, for his many suggestions and for his hard work; to Corbin Collins and Roger LeBlanc, the copy editors, for shaping up the text; and to Kumar Dhaneesh from the production team.

I would like to thank the Linux kernel networking maintainer, David Miller, for the great work he has done over all these years and all the developers who continue to participate and contribute to the networking subsystem.
I would also like to thank the Linux kernel networking community and all its members who helped me by reviewing my text: Julian Anastasov, Timo Teras, Steffen Klassert, Gerrit Renker, Javier Cardona, Gao feng, Vlad Yasevich, Cong Wang, Florian Westphal, Reuben Hawkins, Pekka Savola, Andreas Steffen, Daniel Borkmann, Joachim Nilsson, David Hauweele, Maxime Ripard, Alexandre Belloni, Benjamin Zores, and too many others to mention. Thanks to Donald Wood and Eliezer Tamir from Intel for their help with the "Busy Poll Sockets" section, and to Samuel Ortiz from Intel for his advice in preparing the NFC section. Thanks to Dotan Barak, an InfiniBand expert, for contributing Chapter 13, "InfiniBand."

—Rami Rosen

Contents

Chapter 1: Introduction
The Linux Network Stack
The Network Device
New API (NAPI) in Network Devices
Receiving and Transmitting Packets
The Socket Buffer
The Linux Kernel Networking Development Model
Summary

Chapter 2: Netlink Sockets
The Netlink Family
Netlink Sockets Libraries
The sockaddr_nl Structure
Userspace Packages for Controlling TCP/IP Networking
Kernel Netlink Sockets
The Netlink Message Header
NETLINK_ROUTE Messages
Adding and Deleting a Routing Entry in a Routing Table
Generic Netlink Protocol
Creating and Sending Generic Netlink Messages
Socket Monitoring Interface
Summary

Chapter 3: Internet Control Message Protocol (ICMP)
ICMPv4
ICMPv4 Initialization
ICMPv4 Header
Receiving ICMPv4 Messages
Sending ICMPv4 Messages: "Destination Unreachable"
ICMPv6
ICMPv6 Initialization
ICMPv6 Header
Receiving ICMPv6 Messages
Sending ICMPv6 Messages
ICMP Sockets ("Ping sockets")
Summary
Quick Reference
Methods
Tables
procfs entries
Creating "Destination Unreachable" Messages with iptables

Chapter 4: IPv4
IPv4 Header
IPv4 Initialization
Receiving IPv4 Packets
Receiving IPv4 Multicast Packets
IP Options
Timestamp Option
Record Route Option
IP Options and Fragmentation
Building IP Options
Sending IPv4 Packets
Fragmentation
Fast Path
Slow Path
Defragmentation
Forwarding
Summary
Quick Reference
Methods
Macros

Chapter 5: The IPv4 Routing Subsystem
Forwarding and the FIB
Performing a Lookup in the Routing Subsystem
FIB Tables
FIB Info
Caching
Nexthop (fib_nh)
Policy Routing
FIB Alias (fib_alias)
ICMPv4 Redirect Message
Generating an ICMPv4 Redirect Message
Receiving an ICMPv4 Redirect Message
IPv4 Routing Cache
Summary
Quick Reference
Methods
Macros
Tables
Route Flags

Chapter 6: Advanced Routing
Multicast Routing
The IGMP Protocol
The Multicast Routing Table
The Multicast Forwarding Cache (MFC)
Multicast Router
The Vif Device
IPv4 Multicast Rx Path
The ip_mr_forward() Method
The ipmr_queue_xmit() Method
The ipmr_forward_finish() Method
The TTL in Multicast Traffic
Policy Routing
Policy Routing Management
Policy Routing Implementation
Multipath Routing
Summary
Quick Reference
Methods
Macros
Procfs Multicast Entries
Table
Chapter 7: Linux Neighbouring Subsystem
The Neighbouring Subsystem Core
Creating and Freeing a Neighbour
Interaction Between Userspace and the Neighbouring Subsystem
Handling Network Events
The ARP protocol (IPv4)
ARP: Sending Solicitation Requests
ARP: Receiving Solicitation Requests and Replies
The NDISC Protocol (IPv6)
Duplicate Address Detection (DAD)
NDISC: Sending Solicitation Requests
NDISC: Receiving Neighbour Solicitations and Advertisements
Summary
Quick Reference
Methods
Macros
The neigh_statistics Structure
Table

Chapter 8: IPv6
IPv6 – Short Introduction
IPv6 Addresses
Special Addresses
Multicast Addresses
IPv6 Header
Extension Headers
IPv6 Initialization
Autoconfiguration
Receiving IPv6 Packets
Local Delivery
Forwarding
Receiving IPv6 Multicast Packets
Multicast Listener Discovery (MLD)
Joining and Leaving a Multicast Group
MLDv2 Multicast Listener Report
Multicast Source Filtering (MSF)
Sending IPv6 Packets
IPv6 Routing
Summary
Quick Reference
Methods
Macros
Tables
Special Addresses
Routing Tables Management in IPv6

Chapter 9: Netfilter
Netfilter Frameworks
Netfilter Hooks
Registration of Netfilter Hooks
Connection Tracking
Connection Tracking Initialization
Connection Tracking Entries
Connection Tracking Helpers and Expectations
IPTables
Delivery to the Local Host
Forwarding the Packet
Network Address Translation (NAT)
NAT Hook Callbacks and Connection Tracking Hook Callbacks
NAT Hook Callbacks
Connection Tracking Extensions
Summary
Quick Reference
Methods
Macros
Tables

Chapter 10: IPsec
General
IKE (Internet Key Exchange)
IPsec and Cryptography
The XFRM Framework
XFRM Initialization
XFRM Policies
XFRM States (Security Associations)
ESP Implementation (IPv4)
IPv4 ESP Initialization
Receiving an IPsec Packet (Transport Mode)
Sending an IPsec Packet (Transport Mode)
XFRM Lookup
NAT Traversal in IPsec
NAT-T Mode of Operation
Summary
Quick Reference
Methods
Table

Chapter 11: Layer 4 Protocols
Sockets
Creating Sockets
UDP (User Datagram Protocol)
UDP Initialization
Sending Packets with UDP
Receiving Packets from the Network Layer (L3) with UDP
TCP (Transmission Control Protocol)
TCP Header
TCP Initialization
TCP Timers
TCP Socket Initialization
TCP Connection Setup
Receiving Packets from the Network Layer (L3) with TCP
Sending Packets with TCP
SCTP (Stream Control Transmission Protocol)
SCTP Packets and Chunks
SCTP Chunk Header
SCTP Chunk
SCTP Associations
Setting Up an SCTP Association
Receiving Packets with SCTP
Sending Packets with SCTP
SCTP HEARTBEAT
SCTP Multistreaming
SCTP Multihoming
DCCP: The Datagram Congestion Control Protocol
DCCP Header
DCCP Initialization
DCCP Socket Initialization
Receiving Packets from the Network Layer (L3) with DCCP
Sending Packets with DCCP
DCCP and NAT
Summary
Quick Reference
Methods
Macros
Tables

Chapter 12: Wireless in Linux
Mac80211 Subsystem
The 802.11 MAC Header
The Frame Control
The Other 802.11 MAC Header Members
Network Topologies
Infrastructure BSS
IBSS, or Ad Hoc Mode
Power Save Mode
Entering Power Save Mode
Exiting Power Save Mode
Handling the Multicast/Broadcast Buffer
The Management Layer (MLME)
Scanning
Authentication
Association
Reassociation
Mac80211 Implementation
Rx Path
Tx Path
Fragmentation
Mac80211 debugfs
Wireless Modes
High Throughput (ieee802.11n)
Packet Aggregation
Mesh Networking (802.11s)
HWMP Protocol
Setting Up a Mesh Network
Linux Wireless Development Process
Summary
Quick Reference
Methods
Table

Chapter 13: InfiniBand
RDMA and InfiniBand—General
The RDMA Stack Organization
RDMA Technology Advantages
InfiniBand Hardware Components
Addressing in InfiniBand
InfiniBand Features
InfiniBand Packets
Management Entities
RDMA Resources
RDMA Device
Protection Domain (PD)
Address Handle (AH)
Memory Region (MR)
Fast Memory Region (FMR) Pool
Memory Window (MW)
Completion Queue (CQ)
eXtended Reliable Connected (XRC) Domain
Shared Receive Queue (SRQ)
Queue Pair (QP)
Work Request Processing
Supported Operations in the RDMA Architecture
Multicast Groups
Difference Between the Userspace and the Kernel-Level RDMA API
Summary
Quick Reference
Methods

Chapter 14: Advanced Topics
Network Namespaces
Namespaces Implementation
UTS Namespaces Implementation
Network Namespaces Implementation
Network Namespaces Management
Cgroups
Cgroups Implementation
Cgroup Devices Controller: A Simple Example
Cgroup Memory Controller: A Simple Example
The net_prio Module
The cls_cgroup Classifier
Mounting cgroup Subsystems
Busy Poll Sockets
Enabling Globally
Enabling Per Socket
Tuning and Configuration
Performance
The Linux Bluetooth Subsystem
HCI Layer
HCI Connection
L2CAP
BNEP
Receiving Bluetooth Packets: Diagram
L2CAP Extended Features
Bluetooth Tools
IEEE 802.15.4 and 6LoWPAN
Neighbor Discovery Optimization
Linux Kernel 6LoWPAN
Near Field Communication (NFC)
NFC Tags
NFC Devices
Communication and Operation Modes
Host-Controller Interfaces
Linux NFC support
Userspace Architecture
NFC on Android
Notification Chains
The PCI Subsystem
Wake-On-LAN (WOL)
Teaming Network Device
The PPPoE Protocol
PPPoE Header
PPPoE Initialization
Sending and Receiving Packets with PPPoE
Android
Android Networking
Android internals: Resources
Summary
Quick Reference
Methods
Macros

Appendix A: Linux API
The sk_buff Structure
struct skb_shared_info
The net_device structure
RDMA (Remote DMA)
RDMA Device
The ib_register_client() Method
The ib_unregister_client() Method
The ib_get_client_data() Method
The ib_set_client_data() Method
The INIT_IB_EVENT_HANDLER Macro
The ib_register_event_handler() Method
The ib_event_handler Struct
The ib_event Struct
The ib_unregister_event_handler() Method
The ib_query_device() Method
The ib_query_port() Method
The rdma_port_get_link_layer() Method
The ib_query_gid() Method
The ib_query_pkey() Method
The ib_modify_device() Method
The ib_modify_port() Method
The ib_find_gid() Method
The ib_find_pkey() Method
The rdma_node_get_transport() Method
The ib_mtu_to_int() Method
The ib_width_enum_to_int() Method
The ib_rate_to_mult() Method
The ib_rate_to_mbps() Method
Protection Domain (PD)
The ib_alloc_pd() Method
The ib_dealloc_pd() Method
eXtended Reliable Connected (XRC)
The ib_alloc_xrcd() Method
The ib_dealloc_xrcd() Method
Shared Receive Queue (SRQ)
The ib_create_srq() Method
The ib_modify_srq() Method
The ib_query_srq() Method
The ib_destroy_srq() Method
The ib_post_srq_recv() Method
Address Handle (AH)
The ib_create_ah() Method
The ib_init_ah_from_wc() Method
The ib_create_ah_from_wc() Method
The ib_modify_ah() Method
The ib_query_ah() Method
The ib_destroy_ah() Method
Multicast Groups
The ib_attach_mcast() Method
The ib_detach_mcast() Method
Completion Queue (CQ)
The ib_create_cq() Method
The ib_resize_cq() Method
The ib_modify_cq() Method
The ib_peek_cq() Method
The ib_req_notify_cq() Method
The ib_req_ncomp_notif() Method
The ib_poll_cq() Method
The ib_destroy_cq() Method
Queue Pair (QP)
The ib_qp_cap Struct
The ib_create_qp() Method
The ib_modify_qp() Method
The ib_query_qp() Method
The ib_open_qp() Method
The ib_close_qp() Method
The ib_post_recv() Method
The ib_post_send() Method
Memory Windows (MW)
The ib_alloc_mw() Method
The ib_bind_mw() Method
The ib_dealloc_mw() Method
Memory Region (MR)
The ib_get_dma_mr() Method
The ib_dma_mapping_error() Method
The ib_dma_map_single() Method
The ib_dma_unmap_single() Method
The ib_dma_map_single_attrs() Method
The ib_dma_unmap_single_attrs() Method
The ib_dma_map_page() Method
The ib_dma_unmap_page() Method
The ib_dma_map_sg() Method
The ib_dma_unmap_sg() Method
The ib_dma_map_sg_attrs() Method
The ib_dma_unmap_sg_attrs() Method
The ib_sg_dma_address() Method
The ib_sg_dma_len() Method
The ib_dma_sync_single_for_cpu() Method
The ib_dma_sync_single_for_device() Method
The ib_dma_alloc_coherent() Method
The ib_dma_free_coherent() Method
The ib_reg_phys_mr() Method
The ib_rereg_phys_mr() Method
The ib_query_mr() Method
The ib_dereg_mr() Method

Appendix B: Network Administration
arp
arping
arptables
arpwatch
ApacheBench (ab)
brctl
conntrack-tools
crtools
ebtables
ether-wake
ethtool
git
hciconfig
hcidump
hcitool
ifconfig
ifenslave
iperf
Using iperf
iproute2
iptables and ip6tables
ipvsadm
iw
iwconfig
libreswan Project
l2ping
lowpan-tools
lshw
lscpu
lspci
mrouted
nc
ngrep
netperf
netsniff-ng
netstat
nmap (Network Mapper)
openswan
OpenVPN
packeth
ping
pimd
poptop
ppp
pktgen
radvd
route
RP-PPPoE
sar
smcroute
snort
suricata
strongSwan
sysctl
taskset
tcpdump
top
tracepath
traceroute
tshark
tunctl
udevadm
unshare
vconfig
wpa_supplicant
wireshark
XORP

Appendix C: Glossary
Index

Contents at a Glance

Chapter 1: Introduction
Chapter 2: Netlink Sockets
Chapter 3: Internet Control Message Protocol (ICMP)
Chapter 4: IPv4
Chapter 5: The IPv4 Routing Subsystem
Chapter 6: Advanced Routing
Chapter 7: Linux Neighbouring Subsystem
Chapter 8: IPv6
Chapter 9: Netfilter
Chapter 10: IPsec
Chapter 11: Layer 4 Protocols
Chapter 12: Wireless in Linux
Chapter 13: InfiniBand
Chapter 14: Advanced Topics
Appendix A: Linux API
Appendix B: Network Administration
Appendix C: Glossary
Index

© Rami Rosen 2014

# 1. Introduction

This book deals with the implementation of the Linux Kernel Networking stack and the theory behind it. You will find in the following pages an in-depth and detailed analysis of the networking subsystem and its architecture. I will not burden you with topics not directly related to networking, which you may encounter while reading kernel networking code (for example, locking and synchronization, SMP, atomic operations, and so on). There are plenty of resources about such topics. On the other hand, there are very few up-to-date resources that focus on kernel networking proper. By this I mean primarily describing the traversal of the packet in the Linux Kernel Networking stack and its interaction with various networking layers and subsystems—and how various networking protocols are implemented.
This book is also not a cumbersome, line-by-line code walkthrough. I focus on the essence of the implementation of each network layer and on the theoretical guidelines and principles that led to this implementation. The Linux operating system has proved itself in recent years as a successful, reliable, stable, and popular operating system, and its popularity seems to be growing steadily, in a wide variety of flavors: from mainframes, data centers, core routers, and web servers to embedded devices like wireless routers, set-top boxes, medical instruments, navigation equipment (like GPS devices), and consumer electronics devices. Many semiconductor vendors use Linux as the basis for their Board Support Packages (BSPs). The Linux operating system, started back in 1991 as a project of a Finnish student named Linus Torvalds and based on the UNIX operating system, has proved to be a serious and reliable operating system and a rival to veteran proprietary operating systems.

Linux began as an Intel x86-based operating system but has been ported to a very wide range of processors, including ARM, PowerPC, MIPS, SPARC, and more. The Android operating system, based upon the Linux kernel, is common today in tablets and smartphones and seems likely to gain popularity in the future in smart TVs. Apart from Android, Google has also contributed some kernel networking features that were merged into the mainline kernel.

Linux is an open source project, and as such it has an advantage over proprietary operating systems: its source code is freely available under the General Public License (GPL). Other open source operating systems, like the different types of BSD, enjoy much less popularity. I should also mention in this context the OpenSolaris project, based on the Common Development and Distribution License (CDDL). This project, started by Sun Microsystems, has not achieved the popularity that Linux has. Among the large community of active Linux developers, some contribute code on behalf of the companies they work for, and some contribute code voluntarily. The entire kernel development process is accessible via the kernel mailing lists. There is one central mailing list, the Linux Kernel Mailing List (LKML), and many subsystems have their own mailing lists. Contributing code is done by sending patches to the appropriate kernel mailing lists and to the maintainers, and these patches are discussed on the mailing lists.

The Linux Kernel Networking stack is a very important subsystem of the Linux kernel. It is quite difficult to find a Linux-based system, whether a desktop, a server, a mobile device, or any other embedded device, that does not use any kind of networking. Even in the rare case when a machine doesn't have any hardware network devices, you will still be using networking (maybe unconsciously) when you use X-Windows, as X-Windows itself is based upon client-server networking. A wide range of projects are related to the Linux Networking stack, from core routers to small embedded devices. Some of these projects deal with adding vendor-specific features. For example, some hardware vendors implement Generic Segmentation Offload (GSO) in some network devices. GSO is a networking feature of the kernel network stack that divides a large packet into smaller ones in the Tx path. Many hardware vendors implement checksumming in hardware in their network devices.
Checksum is a mechanism to verify that a packet was not damaged in transit, by calculating some hash from the packet and attaching it to the packet. Many projects provide security enhancements for Linux. Sometimes these enhancements require some changes in the networking subsystem, as you will see, for example, in Chapter 3, when discussing the Openwall GNU/*/Linux project. In the embedded device arena there are, for example, many wireless routers that are Linux based; one example is the Linksys WRT54GL router, which runs Linux. There is also an open source, Linux-based operating system that can run on this device (and on some other devices), named OpenWrt, with a large and active community of developers (see https://openwrt.org/ ). Learning how the various protocols are implemented by the Linux Kernel Networking stack and becoming familiar with its main data structures and the main paths of a packet in it are essential to understanding it better.

## The Linux Network Stack

There are seven logical networking layers according to the Open Systems Interconnection (OSI) model. The lowest layer is the physical layer, which is the hardware, and the highest layer is the application layer, where userspace software processes are running. Let's describe these seven layers:

1. The physical layer: Handles electrical signals and the low-level details.
2. The data link layer: Handles data transfer between endpoints. The most common data link layer is Ethernet. The Linux Ethernet network device drivers reside in this layer.
3. The network layer: Handles packet forwarding and host addressing. In this book I discuss the most common network layers of the Linux Kernel Networking subsystem: IPv4 and IPv6. There are other, less common network layers that Linux implements, like DECnet, but they are not discussed.
4. The protocol layer/transport layer: Handles data sending between nodes. The TCP and UDP protocols are the best-known protocols.
5. The session layer: Handles sessions between endpoints.
6. The presentation layer: Handles delivery and formatting.
7. The application layer: Provides network services to end-user applications.

Figure 1-1 shows the seven layers according to the OSI model.

Figure 1-1. The OSI seven-layer model

Figure 1-2 shows the three layers that the Linux Kernel Networking stack handles. The L2, L3, and L4 layers in this figure correspond to the data link layer, the network layer, and the transport layer in the seven-layer model, respectively. The essence of the Linux kernel stack is passing incoming packets from L2 (the network device drivers) to L3 (the network layer, usually IPv4 or IPv6) and then to L4 (the transport layer, where you have, for example, TCP or UDP listening sockets) if they are for local delivery, or back to L2 for transmission when the packets should be forwarded. Outgoing packets that were locally generated are passed from L4 to L3 and then to L2 for actual transmission by the network device driver. Along this way there are many stages, and many things can happen. For example:

* The packet can be changed due to protocol rules (for example, due to an IPsec rule or to a NAT rule).
* The packet can be discarded.
* The packet can cause an error message to be sent.
* The packet can be fragmented.
* The packet can be defragmented.
* A checksum should be calculated for the packet.
The kernel does not handle any layer above L4; those layers (the session, presentation, and application layers) are handled solely by userspace applications. The physical layer (L1) is also not handled by the Linux kernel.

If you feel overwhelmed, don't worry. You will learn a lot more about everything described here, in much more depth, in the following chapters.

Figure 1-2. The Linux Kernel Networking layers

## The Network Device

The lower layer, Layer 2 (L2), as seen in Figure 1-2, is the link layer. The network device drivers reside in this layer. This book is not about network device driver development, because it focuses on the Linux kernel networking stack. I will briefly describe here the net_device structure, which represents a network device, and some of the concepts related to it. You should have a basic familiarity with the network device structure in order to better understand the network stack. Parameters of the device, like the MTU size, which is typically 1,500 bytes for Ethernet devices, determine whether a packet should be fragmented. The net_device is a very large structure, consisting of device parameters like these:

* The IRQ number of the device.
* The MTU of the device.
* The MAC address of the device.
* The name of the device (like eth0 or eth1).
* The flags of the device (for example, whether it is up or down).
* A list of multicast addresses associated with the device.
* The promiscuity counter (discussed later in this section).
* The features that the device supports (like GSO or GRO offloading).
* An object of network device callbacks (net_device_ops object), which consists of function pointers, such as for opening and stopping a device, starting to transmit, changing the MTU of the network device, and more.
* An object of ethtool callbacks, which supports getting information about the device by running the command-line ethtool utility.
* The number of Tx and Rx queues, when the device supports multiqueues.
* The timestamp of the last transmit of a packet on this device.
* The timestamp of the last reception of a packet on this device.

The following is the definition of some of the members of the net_device structure, to give you a first impression:

```c
struct net_device {
        unsigned int            irq;    /* device IRQ number */
        ...
        const struct net_device_ops *netdev_ops;
        ...
        unsigned int            mtu;
        ...
        unsigned int            promiscuity;
        ...
        unsigned char           *dev_addr;
        ...
};
```

(include/linux/netdevice.h)

Appendix A includes a very detailed description of the net_device structure and most of its members. In that appendix you can see the irq, mtu, and other members mentioned earlier in this chapter.

When the promiscuity counter is larger than 0, the network stack does not discard packets that are not destined to the local host. This is used, for example, by packet analyzers ("sniffers") like tcpdump and wireshark, which open raw sockets in userspace and want to receive this type of traffic as well. It is a counter and not a Boolean in order to enable opening several sniffers concurrently: opening each such sniffer increments the counter by 1. When a sniffer is closed, the promiscuity counter is decremented by 1; if it reaches 0, there are no more sniffers running, and the device exits promiscuous mode.
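As a brief, hypothetical illustration of the promiscuity counter (my own sketch, not code from the book), kernel code can increment and decrement it with the dev_set_promiscuity() method; the helper names sniff_start()/sniff_stop() and the "eth0" device name below are assumptions for the example:

```c
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Hypothetical helpers; dev_set_promiscuity() should be called
 * with the RTNL lock held. */
static int sniff_start(struct net *net)
{
        struct net_device *dev = dev_get_by_name(net, "eth0");

        if (!dev)
                return -ENODEV;
        rtnl_lock();
        dev_set_promiscuity(dev, 1);  /* counter 0 -> 1: enter promiscuous mode */
        rtnl_unlock();
        dev_put(dev);
        return 0;
}

static void sniff_stop(struct net *net)
{
        struct net_device *dev = dev_get_by_name(net, "eth0");

        if (!dev)
                return;
        rtnl_lock();
        dev_set_promiscuity(dev, -1); /* counter 1 -> 0: exit promiscuous mode */
        rtnl_unlock();
        dev_put(dev);
}
```

Because the method takes an increment rather than a Boolean flag, several such "sniffers" can coexist; the device leaves promiscuous mode only when the last of them has decremented the counter back to 0.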
When browsing the kernel networking core source code, in various places you will probably encounter the term NAPI (New API), a feature that most network device drivers implement nowadays. You should know what it is and why network device drivers use it.

### New API (NAPI) in Network Devices

The old network device drivers worked in interrupt-driven mode, which means that for every received packet there was an interrupt. This proved to be inefficient in terms of performance under high traffic load. A new software technique was developed, called New API (NAPI), which is now supported by almost all Linux network device drivers. NAPI was first introduced in the 2.5/2.6 kernel and was backported to the 2.4.20 kernel. With NAPI, under high load, the network device driver works in polling mode and not in interrupt-driven mode. This means that each received packet does not trigger an interrupt. Instead the packets are buffered in the driver, and the kernel polls the driver from time to time to fetch the packets. Using NAPI improves performance under high load. For socket applications that need the lowest possible latency and are willing to pay the cost of higher CPU utilization, Linux added a capability for Busy Polling on Sockets in kernel 3.11. This technology is discussed in Chapter 14, in the "Busy Poll Sockets" section.

With your new knowledge about network devices under your belt, it is time to learn about the traversal of a packet inside the Linux Kernel Networking stack.

### Receiving and Transmitting Packets

The main tasks of the network device driver are these:

* To receive packets destined to the local host and to pass them to the network layer (L3), and from there to the transport layer (L4)
* To transmit outgoing packets generated on the local host and sent outside, or to forward packets that were received on the local host

For each packet, incoming or outgoing, a lookup in the routing subsystem is performed. The decision about whether a packet should be forwarded and on which interface it should be sent is based on the result of that lookup, which I describe in depth in Chapters 5 and 6. The lookup in the routing subsystem is not the only factor that determines the traversal of a packet in the network stack. For example, there are five points in the network stack where callbacks of the netfilter subsystem (often referred to as netfilter hooks) can be registered. The first netfilter hook point of a received packet is NF_INET_PRE_ROUTING, before a routing lookup is performed. When a packet is handled by such a callback, which is invoked by a macro named NF_HOOK(), it will continue its traversal in the networking stack according to the result of this callback (also called the verdict). For example, if the verdict is NF_DROP, the packet will be discarded, and if the verdict is NF_ACCEPT, the packet will continue its traversal as usual. Netfilter hook callbacks are registered by the nf_register_hook() method or by the nf_register_hooks() method, and you will encounter these invocations, for example, in various netfilter kernel modules; a minimal registration sketch appears below. The kernel netfilter subsystem is the infrastructure for the well-known iptables userspace package. Chapter 9 describes the netfilter subsystem and the netfilter hooks, along with the connection tracking layer of netfilter.
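To make the hook mechanism concrete, here is a minimal, hypothetical sketch of registering an NF_INET_PRE_ROUTING callback with the nf_register_hook() method, roughly as the API looks in kernel 3.9 (the hook prototype has changed in later kernel versions); the names my_pre_routing_hook and my_ops are mine, not from the kernel:

```c
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/skbuff.h>

/* The verdict returned by the hook decides the packet's fate. */
static unsigned int my_pre_routing_hook(unsigned int hooknum,
                                        struct sk_buff *skb,
                                        const struct net_device *in,
                                        const struct net_device *out,
                                        int (*okfn)(struct sk_buff *))
{
        /* Returning NF_DROP here would silently discard the packet. */
        return NF_ACCEPT; /* let the packet continue its traversal */
}

static struct nf_hook_ops my_ops = {
        .hook     = my_pre_routing_hook,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_PRE_ROUTING, /* before the routing lookup */
        .priority = NF_IP_PRI_FIRST,
};

/* In the module init method:  nf_register_hook(&my_ops);   */
/* In the module exit method:  nf_unregister_hook(&my_ops); */
```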
Besides the netfilter hooks, the packet traversal can be influenced by the IPsec subsystem, for example, when the packet matches a configured IPsec policy. IPsec provides a network layer security solution, and it uses the ESP and the AH protocols. IPsec is mandatory according to the IPv6 specification and optional in IPv4, though most operating systems, including Linux, also implement IPsec in IPv4. IPsec has two modes of operation: transport mode and tunnel mode. It is used as a basis for many virtual private network (VPN) solutions, though there are also non-IPsec VPN solutions. You learn about the IPsec subsystem and about IPsec policies in Chapter 10, which also discusses the problems that occur when working with IPsec through NAT, and the IPsec NAT traversal solution.

Still other factors can influence the traversal of the packet—for example, the value of the ttl field in the IPv4 header of a packet being forwarded. This ttl is decremented by 1 in each forwarding device. When it reaches 0, the packet is discarded, and an ICMPv4 "Time Exceeded" message with the "TTL Count Exceeded" code is sent back. This is done to avoid an endless journey of a forwarded packet because of some error. Moreover, each time a packet is forwarded successfully and the ttl is decremented by 1, the checksum of the IPv4 header should be recalculated, because its value depends on the IPv4 header, and the ttl is one of the IPv4 header members. Chapter 4, which deals with the IPv4 subsystem, talks more about this. In IPv6 there is something similar, but the hop counter in the IPv6 header is named hop_limit and not ttl. You will learn about this in Chapter 8, which deals with the IPv6 subsystem. You will also learn about ICMP in IPv4 and in IPv6 in Chapter 3, which deals with ICMP.

A large part of the book discusses the traversal of a packet in the networking stack, whether in the receive path (Rx path, also known as ingress traffic) or the transmit path (Tx path, also known as egress traffic). This traversal is complex and has many variations: large packets could be fragmented before they are sent; on the other hand, fragmented packets should be reassembled (discussed in Chapter 4). Packets of different types are handled differently. For example, multicast packets are packets that can be processed by a group of hosts (as opposed to unicast packets, which are destined to a specified host). Multicast can be used, for example, in streaming media applications in order to consume fewer network resources. Handling IPv4 multicast traffic is discussed in Chapter 4. You will also learn how a host joins and leaves a multicast group; in IPv4, the Internet Group Management Protocol (IGMP) handles multicast membership. Yet there are cases when the host is configured as a multicast router, and multicast traffic should be forwarded and not delivered to the local host. These cases are more complex, as they should be handled in conjunction with a userspace multicast routing daemon, like the pimd daemon or the mrouted daemon. These cases, which are called multicast routing, are discussed in Chapter 6.

To better understand the packet traversal, you must learn how a packet is represented in the Linux kernel. The sk_buff structure represents an incoming or outgoing packet, including its headers (include/linux/skbuff.h). I refer to an sk_buff object as SKB in many places throughout this book, as this is the common way to denote sk_buff objects (SKB stands for socket buffer). The socket buffer (sk_buff) structure is a large structure; I will only discuss a few of its members in this chapter.
### The Socket Buffer

The sk_buff structure is described in depth in Appendix A. I recommend referring to that appendix when you need to know more about one of the SKB members or how to use the SKB API. Note that when working with SKBs, you must adhere to the SKB API. Thus, for example, when you want to advance the skb->data pointer, you do not do it directly, but with the skb_pull_inline() method or the skb_pull() method (you will see an example of this later in this section). And if you want to fetch the L4 header (transport header) from an SKB, you do it by calling the skb_transport_header() method. Likewise, if you want to fetch the L3 header (network header), you do it by calling the skb_network_header() method, and if you want to fetch the L2 header (MAC header), you do it by calling the skb_mac_header() method. These three methods get an SKB as a single parameter.

Here is the (partial) definition of the sk_buff structure:

```c
struct sk_buff {
        ...
        struct sock             *sk;
        struct net_device       *dev;
        ...
        __u8                    pkt_type:3,
        ...
        __be16                  protocol;
        ...
        sk_buff_data_t          tail;
        sk_buff_data_t          end;
        unsigned char           *head,
                                *data;
        sk_buff_data_t          transport_header;
        sk_buff_data_t          network_header;
        sk_buff_data_t          mac_header;
        ...
};
```

(include/linux/skbuff.h)

When a packet is received on the wire, an SKB is allocated by the network device driver, typically by calling the netdev_alloc_skb() method (or the dev_alloc_skb() method, which is a legacy method that calls the netdev_alloc_skb() method with NULL as the first parameter). There are cases along the packet traversal where a packet can be discarded, which is done by calling kfree_skb() or dev_kfree_skb(); both get a pointer to an SKB as a single parameter. Some members of the SKB are determined in the link layer (L2). For example, the pkt_type is determined by the eth_type_trans() method, according to the destination Ethernet address. If this address is a multicast address, pkt_type will be set to PACKET_MULTICAST; if this address is a broadcast address, pkt_type will be set to PACKET_BROADCAST; and if this address is the address of the local host, pkt_type will be set to PACKET_HOST. Most Ethernet network drivers call the eth_type_trans() method in their Rx path. The eth_type_trans() method also sets the protocol field of the SKB according to the ethertype of the Ethernet header, and it advances the data pointer of the SKB by 14 bytes (ETH_HLEN), the size of an Ethernet header, by calling the skb_pull_inline() method. The reason is that skb->data should point to the header of the layer in which the packet currently resides. When the packet was in L2, in the network device driver Rx path, skb->data pointed to the L2 (Ethernet) header; now that the packet is about to move to Layer 3, immediately after the call to the eth_type_trans() method, skb->data should point to the network (L3) header, which starts immediately after the Ethernet header (see Figure 1-3).
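To make this handoff concrete, here is a short, hypothetical driver Rx fragment (my own sketch, not code from the book); the function name my_driver_rx() is an assumption for illustration:

```c
#include <linux/etherdevice.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical fragment of a driver Rx path: eth_type_trans() sets
 * skb->pkt_type and returns the ethertype, advancing skb->data past
 * the 14-byte Ethernet header. */
static void my_driver_rx(struct sk_buff *skb, struct net_device *dev)
{
        skb->protocol = eth_type_trans(skb, dev);

        if (skb->protocol == htons(ETH_P_IP)) {
                /* skb->data now points to the L3 (IPv4) header: */
                const struct iphdr *iph = (const struct iphdr *)skb->data;

                pr_debug("received IPv4 packet, ttl=%u\n", iph->ttl);
        }
        /* Hand the packet to the networking core: */
        netif_receive_skb(skb);
}
```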
The SKB includes the packet headers (L2, L3, and L4 headers) and the packet payload. In the packet traversal in the network stack, a header can be added or removed. For example, for an IPv4 packet generated locally by a socket and transmitted outside, the network layer (IPv4) adds an IPv4 header to the SKB. The IPv4 header size is 20 bytes at a minimum; when adding IP options, it can be up to 60 bytes. IP options are described in Chapter 4, which discusses the IPv4 protocol implementation. Figure 1-3 shows an example of an IPv4 packet with L2, L3, and L4 headers. The example in Figure 1-3 is a UDPv4 packet. First is the Ethernet header (L2), of 14 bytes. Then comes the IPv4 header (L3), of a minimal size of 20 bytes up to 60 bytes, and after it the UDPv4 header (L4), of 8 bytes. Then comes the payload of the packet.

Figure 1-3. An IPv4 packet

Each SKB has a dev member, which is an instance of the net_device structure. For incoming packets it is the incoming network device, and for outgoing packets it is the outgoing network device. The network device attached to the SKB is sometimes needed to fetch information that might influence the traversal of the SKB in the Linux Kernel Networking stack. For example, the MTU of the network device may require fragmentation, as mentioned earlier. Each transmitted SKB has a sock object associated with it (sk). If the packet is a forwarded packet, sk is NULL, because the packet was not generated on the local host.

Each received packet should be handled by a matching network layer protocol handler. For example, an IPv4 packet should be handled by the ip_rcv() method, and an IPv6 packet should be handled by the ipv6_rcv() method. You will learn about the registration of the IPv4 protocol handler with the dev_add_pack() method in Chapter 4, and about the registration of the IPv6 protocol handler, also with the dev_add_pack() method, in Chapter 8. Moreover, I will follow the traversal of incoming and outgoing packets both in IPv4 and in IPv6. For example, in the ip_rcv() method, mostly sanity checks are performed, and if everything is fine the packet proceeds to an NF_INET_PRE_ROUTING hook callback, if such a callback is registered; the next step, if the packet was not discarded by such a hook, is the ip_rcv_finish() method, where a lookup in the routing subsystem is performed. A lookup in the routing subsystem builds a destination cache entry (a dst_entry object). You will learn about the dst_entry and about the input and output callback methods associated with it in Chapters 5 and 6, which describe the IPv4 routing subsystem.

In IPv4 there is a problem of limited address space, as an IPv4 address is only 32 bits. Organizations use NAT (discussed in Chapter 9) to provide local addresses to their hosts, but the IPv4 address space has still diminished over the years. One of the main reasons for developing the IPv6 protocol was that its address space is huge compared to the IPv4 address space, because the IPv6 address length is 128 bits. But the IPv6 protocol is not only about a larger address space. The IPv6 protocol includes many changes and additions as a result of the experience gained over the years with the IPv4 protocol. For example, the IPv6 header has a fixed length of 40 bytes, as opposed to the IPv4 header, which is variable in length (from a minimum of 20 bytes to 60 bytes) due to IP options, which can expand it. Processing IP options in IPv4 is complex and quite heavy in terms of performance. In IPv6, on the other hand, you cannot expand the IPv6 header at all (it is fixed in length, as mentioned). Instead there is a mechanism of extension headers, which is much more efficient than the IP options in IPv4 in terms of performance. Another notable change is with the ICMP protocol; in IPv4 it was used only for error reporting and for informative messages.
In IPv6, the ICMP protocol is used for many other purposes as well: for Neighbour Discovery (ND), for Multicast Listener Discovery (MLD), and more. Chapter 3 is dedicated to ICMP (both in IPv4 and in IPv6). The IPv6 Neighbour Discovery protocol is described in Chapter 7, and the MLD protocol is discussed in Chapter 8, which deals with the IPv6 subsystem.

As mentioned earlier, received packets are passed by the network device driver to the network layer, which is IPv4 or IPv6. If the packets are for local delivery, they will be delivered to the transport layer (L4) for handling by listening sockets. The most common transport protocols are UDP and TCP, discussed in Chapter 11, which covers Layer 4, the transport layer. That chapter also covers two newer transport protocols, the Stream Control Transmission Protocol (SCTP) and the Datagram Congestion Control Protocol (DCCP). Both SCTP and DCCP adopted some TCP features and some UDP features, as you will find out. The SCTP protocol is known to be used in conjunction with the Long Term Evolution (LTE) protocol; the DCCP protocol has not been tested so far in large-scale Internet setups.

Packets generated by the local host are created by Layer 4 sockets, for example, by TCP sockets or by UDP sockets. They are created by a userspace application with the Sockets API. There are two main types of sockets: datagram sockets and stream sockets. These two types of sockets and the POSIX-based socket API are also discussed in Chapter 11, where you will also learn about the kernel implementation of sockets (struct socket, which provides an interface to userspace, and struct sock, which provides an interface to Layer 3). The locally generated packets are passed to the network layer, L3 (described in Chapter 4, in the section "Sending IPv4 Packets"), and then are passed to the network device driver (L2) for actual transmission. There are cases when fragmentation takes place in Layer 3, the network layer, and this is also discussed in Chapter 4.

Every Layer 2 network interface has an L2 address that identifies it. In the case of Ethernet, this is a 48-bit address, the MAC address, which is assigned to each Ethernet network interface by its manufacturer and said to be unique (though you should consider that the MAC address of most network interfaces can be changed by userspace commands like ifconfig or ip). Each Ethernet packet starts with an Ethernet header, which is 14 bytes long: the destination MAC address (6 bytes), the source MAC address (6 bytes), and the Ethernet type (2 bytes). The Ethernet type value is, for example, 0x0800 for IPv4 or 0x86DD for IPv6. For each outgoing packet, an Ethernet header should be built. When a userspace socket sends a packet, it specifies the destination address (an IPv4 or an IPv6 address). This is not enough to build the packet, as the destination MAC address should also be known. Finding the MAC address of a host based on its IP address is the task of the neighbouring subsystem, discussed in Chapter 7. Neighbour Discovery is handled by the ARP protocol in IPv4 and by the NDISC protocol in IPv6. These protocols are different: the ARP protocol relies on sending broadcast requests, whereas the NDISC protocol relies on sending ICMPv6 requests, which are in fact multicast packets. Both the ARP protocol and the NDISC protocol are discussed in Chapter 7.
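For reference, the 14-byte Ethernet header just described corresponds to the ethhdr structure defined in include/uapi/linux/if_ether.h; the following is essentially that definition, with the comments abridged:

```c
#define ETH_ALEN    6       /* octets in one Ethernet address */
#define ETH_HLEN    14      /* total octets in the header     */

#define ETH_P_IP    0x0800  /* IPv4 ethertype */
#define ETH_P_IPV6  0x86DD  /* IPv6 ethertype */

struct ethhdr {
        unsigned char h_dest[ETH_ALEN];   /* destination MAC address */
        unsigned char h_source[ETH_ALEN]; /* source MAC address      */
        __be16        h_proto;            /* ethertype, big-endian   */
} __attribute__((packed));
```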
The network stack should communicate with userspace for tasks such as adding or deleting routes, configuring neighbouring tables, setting IPsec policies and states, and more. The communication between userspace and the kernel is done with netlink sockets, described in Chapter 2. The iproute2 userspace package, which is based on netlink sockets, is also discussed in Chapter 2, as are generic netlink sockets and their advantages.

The wireless subsystem is discussed in Chapter 12. This subsystem is maintained separately, as mentioned earlier; it has a git tree of its own and a mailing list of its own. There are some unique features in the wireless stack that do not exist in the ordinary network stack, such as power save mode (when a station or an access point enters a sleep state). The Linux wireless subsystem also supports special topologies, like Mesh networks, ad hoc networks, and more. These topologies sometimes require special features. For example, Mesh networking uses a routing protocol called the Hybrid Wireless Mesh Protocol (HWMP), discussed in Chapter 12. This protocol works in Layer 2 and deals with MAC addresses, as opposed to IPv4 routing. Chapter 12 also discusses the mac80211 framework, which is used by wireless device drivers. Another very interesting feature of the wireless subsystem is the block acknowledgment mechanism in IEEE 802.11n, also discussed in Chapter 12.

In recent years InfiniBand technology has gained popularity in enterprise datacenters. InfiniBand is based on a technology called Remote Direct Memory Access (RDMA). The RDMA API was introduced to the Linux kernel in version 2.6.11. In Chapter 13 you will find a good explanation of the Linux InfiniBand implementation, the RDMA API, and its fundamental data structures.

Virtualization solutions are also becoming popular, especially due to projects like Xen and KVM. Hardware improvements, like VT-x for Intel processors and AMD-V for AMD processors, have also made virtualization more efficient. There is another form of virtualization, which may be less known but has its own advantages. This virtualization is based on a different approach: process virtualization. It is implemented in Linux by namespaces. There is currently support for six namespaces in Linux, and there could be more in the future. The namespaces feature is already used by projects like Linux Containers ( http://lxc.sourceforge.net/ ) and Checkpoint/Restore In Userspace (CRIU). In order to support namespaces, two system calls were added to the kernel, unshare() and setns(), and six new flags were added to the CLONE_* flags, one for each type of namespace. I discuss namespaces, and network namespaces in particular, in Chapter 14. Chapter 14 also deals with the Bluetooth subsystem and gives a brief overview of the PCI subsystem, because many network device drivers are PCI devices. I do not delve into the PCI subsystem internals, because that is out of the scope of this book. Another interesting subsystem discussed in Chapter 14 is IEEE 802.15.4, which is for low-power and low-cost devices. These devices are sometimes mentioned in conjunction with the Internet of Things (IoT) concept, which involves connecting IP-enabled embedded devices to IP networks. It turns out that using IPv6 for these devices might be a good idea. This solution is termed IPv6 over Low Power Wireless Personal Area Networks (6LoWPAN).
It has its own challenges, such as expanding the IPv6 Neighbour Discovery protocol to be suitable for such devices, which occasionally enter sleep mode (as opposed to ordinary IPv6 networks). These changes to the IPv6 Neighbour Discovery protocol have not been implemented yet, but it is interesting to consider the theory behind them. Apart from this, Chapter 14 also includes sections about other advanced topics like NFC, cgroups, Android, and more.

To better understand the Linux Kernel Networking stack or participate in its development, you must be familiar with how its development is handled.

## The Linux Kernel Networking Development Model

The kernel networking subsystem is very complex, and its development is quite dynamic. Like any Linux kernel subsystem, development is done with git patches that are sent over a mailing list (sometimes over more than one mailing list) and are eventually accepted or rejected by the maintainer of that subsystem. Learning about the Kernel Networking Development Model is important for many reasons: to better understand the code, to debug and solve problems in Linux Kernel Networking–based projects, to implement performance improvements and optimization patches, or to implement new features. In many cases you need to learn things such as the following:

* How to apply a patch
* How to read and interpret a patch
* How to find which patches could cause a given problem
* How to revert a patch
* How to find which patches are relevant to some feature
* How to adjust a project to an older kernel version (backporting)
* How to adjust a project to a newer kernel version (upgrading)
* How to clone a git tree
* How to rebase a git tree
* How to find out in which kernel version a specified git patch was applied

There are cases when you need to work with new features that were just added, and for this you need to know how to work with the latest, bleeding-edge tree. And there are cases when you encounter some bug or want to add some new feature to the network stack, and you need to prepare a patch and submit it. The Linux Kernel Networking subsystem, like the other parts of the kernel, is managed by git, a source code management (SCM) system developed by Linus Torvalds. If you intend to send patches for the mainline kernel, or if your project is managed by git, you must learn to use the git tool.

Sometimes you may even need to install a git server for development of local projects. Even if you do not intend to send any patches, you can use the git tool to retrieve a lot of information about the code and about the history of its development. There are many available resources on the web about git; I recommend the free online book Pro Git, by Scott Chacon, available at http://git-scm.com/book . If you intend to submit your patches to the mainline, you must adhere to some strict rules for writing, checking, and submitting patches so that your patch will be applied. Your patch should conform to the kernel coding style and should be tested. You also need to be patient, as sometimes even a trivial patch can be applied only after several days. I recommend learning to configure a host for using the git send-email command to submit patches (though submitting patches can be done with other mail clients, even with the popular Gmail webmail client). There are plenty of guides on the web about how to use git to prepare and send kernel patches.
I also recommend reading Documentation/SubmittingPatches and Documentation/CodingStyle in the kernel tree before submitting your first patch.

I also recommend using the following Perl scripts:

* scripts/checkpatch.pl to check the correctness of a patch
* scripts/get_maintainer.pl to find out to which maintainers a patch should be sent

One of the most important resources of information is the Kernel Networking Development mailing list, netdev: netdev@vger.kernel.org, archived at www.spinics.net/lists/netdev . This is a high-volume list. Most of the posts are patches and Requests for Comments (RFCs) for new code, along with comments and discussions about patches. This mailing list handles the Linux Kernel Networking stack and network device drivers, except for cases when dealing with a subsystem that has a specific mailing list and a specific git repository (such as the wireless subsystem, discussed in Chapter 12). Development of the iproute2 and ethtool userspace packages is also handled on the netdev mailing list. It should be mentioned here that not every networking subsystem has a mailing list of its own; for example, the IPsec subsystem (discussed in Chapter 10) does not have a mailing list, nor does the IEEE 802.15.4 subsystem (Chapter 14). Some networking subsystems have their own specific git tree, maintainer, and mailing list, such as the wireless mailing list and the Bluetooth mailing list. From time to time the maintainers of these subsystems send a pull request for their git trees over the netdev mailing list. Another source of information is Documentation/networking in the kernel tree. It has a lot of information in many files about various networking topics, but keep in mind that what you find there is not always up to date.

The Linux Kernel Networking subsystem is maintained in two git repositories. Patches and RFCs are sent to the netdev mailing list for both repositories. Here are the two git trees:

* net: http://git.kernel.org/?p=linux/kernel/git/davem/net.git : fixes to existing code already in the mainline tree
* net-next: http://git.kernel.org/?p=linux/kernel/git/davem/net-next.git : new code for the future kernel release

From time to time the maintainer of the networking subsystem, David Miller, sends Linus pull requests for these git trees over the LKML. You should be aware that there are periods of time, during the merge with mainline, when the net-next git tree is closed and no patches should be sent. Announcements of when this period starts and when it ends are sent over the netdev mailing list.

Note

This book is based on kernel 3.9. All the code snippets are from this version, unless explicitly specified otherwise. The kernel tree is available from www.kernel.org as a tar file. Alternatively, you can download a kernel git tree with git clone (for example, using the URLs of the git net tree or the git net-next tree, mentioned earlier, or other git kernel repositories). There are plenty of guides on the Internet covering how to configure, build, and boot a Linux kernel. You can also browse various kernel versions online at http://lxr.free-electrons.com/ . This website lets you follow where each method and each variable is referenced; moreover, you can navigate easily with a click of a mouse to previous versions of the Linux kernel.
In case you are working with your own version of a Linux kernel tree, where some changes were made locally, you can install and configure a Linux Cross-Referencer (LXR) server on a local Linux machine. See http://lxr.sourceforge.net/en/index.shtml . + +## Summary + +This chapter is a short introduction to the Linux Kernel Networking subsystem. I described the benefits of using Linux, a popular open source project, and the Kernel Networking Development Model. I also described the network device structure (net_device) and the socket buffer structure (sk_buff), which are the two most fundamental structures of the networking subsystem. You should refer to Appendix A for a detailed description of almost all the members of these structures and their uses. This chapter covered other important topics related to the traversal of a packet in the kernel networking stack, such as the lookup in the routing subsystem, fragmentation and defragmentation, protocol handler registration, and more. Some of these protocols are discussed in later chapters, including IPv4, IPv6, ICMPv4 and ICMPv6, ARP, and Neighbour Discovery. Several important subsystems, including the wireless subsystem, the Bluetooth subsystem, and the IEEE 802.15.4 subsystem, are also covered in later chapters. Chapter 2 starts the journey in the kernel network stack with netlink sockets, which provide a way for bidirectional communication between userspace and the kernel, and which are discussed in several other chapters. + +# 2. Netlink Sockets + +Chapter 1 discusses the roles of the Linux kernel networking subsystem and the three layers in which it operates. The netlink socket interface appeared first in the 2.2 Linux kernel as AF_NETLINK socket. It was created as a more flexible alternative to the awkward IOCTL communication method between userspace processes and the kernel. The IOCTL handlers cannot send asynchronous messages to userspace from the kernel, whereas netlink sockets can. In order to use IOCTL, there is another level of complexity: you need to define IOCTL numbers. The operation model of netlink is quite simple: you open and register a netlink socket in userspace using the socket API, and this netlink socket handles bidirectional communication with a kernel netlink socket, usually sending messages to configure various system settings and getting responses back from the kernel. + +This chapter describes the netlink protocol implementation and API and discusses its advantages and drawbacks.
I also talk about the new generic netlink protocol, discuss its implementation and its advantages, and give some illustrative examples using the libnl library. I conclude with a discussion of the socket monitoring interface. + +## The Netlink Family + +The netlink protocol is a socket-based Inter Process Communication (IPC) mechanism, based on RFC 3549, "Linux Netlink as an IP Services Protocol." It provides a bidirectional communication channel between userspace and the kernel or among some parts of the kernel itself. Netlink is an extension of the standard socket implementation. The netlink protocol implementation resides mostly under net/netlink, where you will find the following four files: + + * af_netlink.c + + * af_netlink.h + + * genetlink.c + + * diag.c + +Apart from them, there are a few header files. In fact, the af_netlink module is the most commonly used; it provides the netlink kernel socket API, whereas the genetlink module provides a new generic netlink API that makes it easier to create netlink messages. The diag monitoring interface module (diag.c) provides an API for dumping and retrieving information about netlink sockets. I discuss the diag module later in this chapter in the section "Socket Monitoring Interface." + +I should mention here that theoretically netlink sockets can be used to communicate between two userspace processes, or more (including sending multicast messages), though this is usually not done, and was not the original goal of netlink sockets. UNIX domain sockets provide an API for IPC and are widely used for communication between two userspace processes. + +Netlink has some advantages over other ways of communication between userspace and the kernel. For example, there is no need for polling when working with netlink sockets. A userspace application opens a socket and then calls recvmsg(), and enters a blocking state if no messages are sent from the kernel; see, for example, the rtnl_listen() method of the iproute2 package (lib/libnetlink.c). Another advantage is that the kernel can be the initiator of sending asynchronous messages to userspace, without any need for userspace to trigger any action (for example, by calling some IOCTL or by writing to some sysfs entry). Yet another advantage is that netlink sockets support multicast transmission. + +You create netlink sockets from userspace with the socket() system call. Netlink sockets can be SOCK_RAW sockets or SOCK_DGRAM sockets. + +Netlink sockets can be created in the kernel or in userspace; kernel netlink sockets are created by the netlink_kernel_create() method, and userspace netlink sockets are created by the socket() system call. Creating a netlink socket from userspace or from the kernel creates a netlink_sock object. When the socket is created from userspace, it is handled by the netlink_create() method. When the socket is created in the kernel, it is handled by __netlink_kernel_create(); this method sets the NETLINK_KERNEL_SOCKET flag. Eventually both methods call __netlink_create() to allocate a socket in the common way (by calling the sk_alloc() method) and initialize it. Figure 2-1 shows how a netlink socket is created in the kernel and in userspace. + +Figure 2-1. + +Creating a netlink socket in the kernel and in userspace + +You can create a netlink socket from userspace in a very similar way to ordinary BSD-style sockets, like this, for example: socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE).
Then you should create a sockaddr_nl object (an instance of the netlink socket address structure), initialize it, and use the standard BSD sockets API (such as bind(), sendmsg(), recvmsg(), and so on). The sockaddr_nl structure represents a netlink socket address in userspace or in the kernel. + +Netlink socket libraries provide a convenient API to netlink sockets. I discuss them in the next section. + +### Netlink Sockets Libraries + +I recommend you use the libnl API to develop userspace applications that send or receive data over netlink sockets. The libnl package is a collection of libraries providing APIs to the netlink protocol-based Linux kernel interfaces. (The iproute2 package, as mentioned, has its own netlink helper library, lib/libnetlink.c.) Besides the core library (libnl), the libnl package includes support for the generic netlink family (libnl-genl), routing family (libnl-route), and netfilter family (libnl-nf). The package was developed mostly by Thomas Graf ( www.infradead.org/~tgr/libnl/ ). I should mention here also that there is a library called libmnl, which is a minimalistic userspace library oriented to netlink developers. The libmnl library was mostly written by Pablo Neira Ayuso, with contributions from Jozsef Kadlecsik and Jan Engelhardt. ( http://netfilter.org/projects/libmnl/ ). + +### The sockaddr_nl Structure + +Let's take a look at the sockaddr_nl structure, which represents a netlink socket address: + +struct sockaddr_nl { + +__kernel_sa_family_t nl_family; /* AF_NETLINK */ + +unsigned short nl_pad; /* zero */ + +__u32 nl_pid; /* port ID */ + +__u32 nl_groups; /* multicast groups mask */ + +}; + +(include/uapi/linux/netlink.h) + + * nl_family: Should always be AF_NETLINK. + + * nl_pad: Should always be 0. + + * nl_pid: The unicast address of a netlink socket. For kernel netlink sockets, it should be 0. Userspace applications sometimes set the nl_pid to be their process id (pid). In a userspace application, when you set nl_pid explicitly to 0, or don't set it at all, and afterwards call bind(), the kernel method netlink_autobind() assigns a value to nl_pid. It tries to assign the process id of the current thread. If you're creating two sockets in userspace, you are responsible for ensuring that their nl_pid values are unique in case you don't call bind(). Netlink sockets are not used only for networking; other subsystems, such as SELinux, audit, uevent, and others, use netlink sockets. The rtnetlink sockets are netlink sockets specifically used for networking; they are used for routing messages, neighbouring messages, link messages, and other networking subsystem messages. + + * nl_groups: The multicast group (or multicast group mask). + +The next section discusses the iproute2 and the older net-tools packages. The iproute2 package is based upon netlink sockets, and you'll see an example of using netlink sockets in iproute2 in the section "Adding and Deleting a Routing Entry in a Routing Table," later in this chapter. I mention the net-tools package, which is older and might be deprecated in the future, to emphasize that as an alternative to iproute2, it is less powerful and offers fewer capabilities. + +### Userspace Packages for Controlling TCP/IP Networking + +There are two userspace packages for controlling TCP/IP networking and handling network devices: net-tools and iproute2.
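Before looking at those two packages, here is a minimal userspace sketch that ties together the pieces described above: it creates an AF_NETLINK socket, initializes a sockaddr_nl object (subscribing to the legacy RTMGRP_LINK multicast group bitmask via nl_groups), binds it, and blocks until the kernel multicasts a link event. This is an illustrative sketch, not code from the kernel tree or from iproute2, and error handling is minimal:

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
    char buf[8192];
    struct sockaddr_nl nl;
    struct nlmsghdr *nlh;
    ssize_t n;
    int fd;

    fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
    if (fd < 0)
        return 1;

    memset(&nl, 0, sizeof(nl));
    nl.nl_family = AF_NETLINK;
    nl.nl_pid = 0;               /* let netlink_autobind() assign a port id */
    nl.nl_groups = RTMGRP_LINK;  /* legacy bitmask for link notifications */
    if (bind(fd, (struct sockaddr *)&nl, sizeof(nl)) < 0)
        return 1;

    /* Block until the kernel multicasts a link event (for example,
     * when an interface is brought up or down). */
    n = recv(fd, buf, sizeof(buf), 0);
    if (n >= (ssize_t)sizeof(struct nlmsghdr)) {
        nlh = (struct nlmsghdr *)buf;
        printf("netlink message: type %u, len %u\n",
               nlh->nlmsg_type, nlh->nlmsg_len);
    }
    close(fd);
    return 0;
}
```

Toggling an interface (for example, with ip link set eth1 down) should then wake the recv() call—this is, in essence, what the ip monitor link command shown later in this chapter does.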
The iproute2 package includes commands like the following: + + * ip: For management of network tables and network interfaces + + * tc: For traffic control management + + * ss: For dumping socket statistics + + * lnstat: For dumping Linux network statistics + + * bridge: For management of bridge addresses and devices + +The iproute2 package is based mostly on sending requests to the kernel from userspace and getting replies back over netlink sockets. There are a few exceptions where IOCTLs are used in iproute2. For example, the ip tuntap command uses IOCTLs to add/remove a TUN/TAP device. If you look at the TUN/TAP software driver code, you'll find that it defines some IOCTL handlers, but it does not use the rtnetlink sockets. The net-tools package is based on IOCTLs and includes known commands like these: + + * ifconfig + + * arp + + * route + + * netstat + + * hostname + + * rarp + +Some of the advanced functionalities of the iproute2 package are not available in the net-tools package. + +The next section discusses kernel netlink sockets—the core engine of handling communication between userspace and the kernel by exchanging netlink messages of different types. Learning about kernel netlink sockets is essential for understanding the interface that the netlink layer provides to userspace. + +### Kernel Netlink Sockets + +Several netlink sockets are created in the kernel networking stack, and each kernel socket handles messages of a different type; for example, the netlink socket that handles NETLINK_ROUTE messages is created in rtnetlink_net_init(): + +static int __net_init rtnetlink_net_init(struct net *net) { + +... + +struct netlink_kernel_cfg cfg = { + +.groups = RTNLGRP_MAX, + +.input = rtnetlink_rcv, + +.cb_mutex = &rtnl_mutex, + +.flags = NL_CFG_F_NONROOT_RECV, + +}; + +sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); + +... + +} + +Note that the rtnetlink socket is aware of network namespaces; the network namespace object (struct net) contains a member named rtnl (rtnetlink socket). In the rtnetlink_net_init() method, after the rtnetlink socket is created by calling netlink_kernel_create(), it is assigned to the rtnl pointer of the corresponding network namespace object. + +Let's look at the netlink_kernel_create() prototype: + +struct sock *netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) + + * The first parameter (net) is the network namespace. + + * The second parameter is the netlink protocol (for example, NETLINK_ROUTE for rtnetlink messages, NETLINK_XFRM for IPsec, or NETLINK_AUDIT for the audit subsystem). There are over 20 netlink protocols, but their number is limited to 32 (MAX_LINKS). This is one of the reasons for creating the generic netlink protocol, as you'll see later in this chapter. The full list of netlink protocols is in include/uapi/linux/netlink.h. + + * The third parameter is a reference to netlink_kernel_cfg, which consists of optional parameters for the netlink socket creation: + +struct netlink_kernel_cfg { + +unsigned int groups; + +unsigned int flags; + +void (*input)(struct sk_buff *skb); + +struct mutex *cb_mutex; + +void (*bind)(int group); + +}; + +(include/linux/netlink.h) + +The groups member is for specifying a multicast group (or a mask of multicast groups). It's possible to join a multicast group by setting nl_groups of the sockaddr_nl object (you can also do this with the nl_join_groups() method of libnl). However, in this way you are limited to joining only 32 groups.
Since kernel version 2.6.14, you can use the NETLINK_ADD_MEMBERSHIP/NETLINK_DROP_MEMBERSHIP socket options to join/leave a multicast group, respectively. Using these socket options enables you to join a much higher number of groups. The nl_socket_add_membership()/nl_socket_drop_membership() methods of libnl use this socket option. + +The flags member can be NL_CFG_F_NONROOT_RECV or NL_CFG_F_NONROOT_SEND. + +When NL_CFG_F_NONROOT_RECV is set, a non-superuser can bind to a multicast group; in netlink_bind() there is the following code: + +static int netlink_bind(struct socket *sock, struct sockaddr *addr, + +int addr_len) + +{ + +... + +if (nladdr->nl_groups) { + +if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV)) + +return -EPERM; + +} + +For a non-superuser, if the NL_CFG_F_NONROOT_RECV flag is not set, then when binding to a multicast group the netlink_capable() method will return 0, and you get an -EPERM error. + +When the NL_CFG_F_NONROOT_SEND flag is set, a non-superuser is allowed to send multicasts. + +The input member is for a callback; when the input member in netlink_kernel_cfg is NULL, the kernel socket won't be able to receive data from userspace (sending data from the kernel to userspace is possible, though). For the rtnetlink kernel socket, the rtnetlink_rcv() method was declared to be the input callback; as a result, data sent from userspace over the rtnetlink socket will be handled by the rtnetlink_rcv() callback. + +For uevent kernel events, you need only to send data from the kernel to userspace; so, in lib/kobject_uevent.c, you have an example of a netlink socket where the input callback is undefined: + +static int uevent_net_init(struct net *net) + +{ + +struct uevent_sock *ue_sk; + +struct netlink_kernel_cfg cfg = { + +.groups = 1, + +.flags = NL_CFG_F_NONROOT_RECV, + +}; + +... + +ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, &cfg); + +... + +} + +(lib/kobject_uevent.c) + +The mutex (cb_mutex) in the netlink_kernel_cfg object is optional; when not defining a mutex, you use the default one, cb_def_mutex (an instance of a mutex structure; see net/netlink/af_netlink.c). In fact, most netlink kernel sockets are created without defining a mutex in the netlink_kernel_cfg object; for example, the uevent kernel netlink socket (NETLINK_KOBJECT_UEVENT) mentioned earlier, the audit kernel netlink socket (NETLINK_AUDIT), and other netlink sockets don't define a mutex. The rtnetlink socket is an exception—it uses the rtnl_mutex. Also the generic netlink socket, discussed in the next section, defines a mutex of its own: genl_mutex. + +The netlink_kernel_create() method makes an entry in a table named nl_table by calling the netlink_insert() method. Access to the nl_table is protected by a read-write lock named nl_table_lock; lookup in this table is done by the netlink_lookup() method, specifying the protocol and the port id. Registration of a callback for a specified message type is done by rtnl_register(); there are several places in the networking kernel code where you register such callbacks. For example, in rtnetlink_init() you register callbacks for some messages, like RTM_NEWLINK (creating a new link), RTM_DELLINK (deleting a link), RTM_GETROUTE (dumping the route table), and more. In net/core/neighbour.c, you register callbacks for RTM_NEWNEIGH messages (creating a new neighbour), RTM_DELNEIGH (deleting a neighbour), RTM_GETNEIGHTBL message (dumping the neighbour table), and more. I discuss these actions in depth in Chapters 5 and 7.
You also register callbacks to other types of messages in the FIB code (ip_fib_init()), in the multicast code (ip_mr_init()), in the IPv6 code, and in other places. + +The first step you should take to work with a netlink kernel socket is to register it. Let's take a look at the rtnl_register() method prototype: + +extern void rtnl_register(int protocol, int msgtype, + +rtnl_doit_func, + +rtnl_dumpit_func, + +rtnl_calcit_func); + +The first parameter is the protocol family (when you don't aim at a specific protocol, it is PF_UNSPEC); you'll find a list of all the protocol families in include/linux/socket.h. + +The second parameter is the netlink message type, like RTM_NEWLINK or RTM_NEWNEIGH. These are private netlink message types that the rtnetlink protocol added. The full list of message types is in include/uapi/linux/rtnetlink.h . + +The last three parameters are callbacks: doit, dumpit, and calcit. The callbacks are the actions you want to perform for handling the message, and you usually specify only one callback. + +The doit callback is for actions like addition/deletion/modification; the dumpit callback is for retrieving information, and the calcit callback is for calculation of buffer size. The rtnetlink module has a table named rtnl_msg_handlers. This table is indexed by protocol number. Each entry in the table is a table in itself, indexed by message type. Each element in the table is an instance of rtnl_link, which is a structure that consists of pointers for these three callbacks. When registering a callback with rtnl_register(), you add the specified callback to this table. + +Registering a callback is done like this, for example: rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL) in net/core/rtnetlink.c . This adds rtnl_newlink as the doit callback for RTM_NEWLINK messages in the corresponding rtnl_msg_handlers entry. + +Sending of rtnetlink messages is done with rtmsg_ifinfo(). For example, when a link is brought up in dev_open(), you call rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING). In the rtmsg_ifinfo() method, first the nlmsg_new() method is called to allocate an sk_buff with the proper size. Then two objects are created: the netlink message header (nlmsghdr) and an ifinfomsg object, which is located immediately after the netlink message header. These two objects are initialized by the rtnl_fill_ifinfo() method. Then rtnl_notify() is called to send the packet; sending the packet is actually done by the generic netlink method, nlmsg_notify() (in net/netlink/af_netlink.c). Figure 2-2 shows the stages of sending rtnetlink messages with the rtmsg_ifinfo() method. + +Figure 2-2. + +Sending of rtnetlink messages with the rtmsg_ifinfo() method + +The next section is about netlink messages, which are exchanged between userspace and the kernel. A netlink message always starts with a netlink message header, so your first step in learning about netlink messages will be to study the netlink message header format. + +### The Netlink Message Header + +A netlink message should obey a certain format, specified in RFC 3549, "Linux Netlink as an IP Services Protocol", section 2.2, "Message Format." A netlink message starts with a fixed size netlink header, and after it there is a payload. This section describes the Linux implementation of the netlink message header.
+ +The netlink message header is defined by struct nlmsghdr in include/uapi/linux/netlink.h: + +struct nlmsghdr + +{ + +__u32 nlmsg_len; + +__u16 nlmsg_type; + +__u16 nlmsg_flags; + +__u32 nlmsg_seq; + +__u32 nlmsg_pid; + +}; + +(include/uapi/linux/netlink.h) + +Every netlink packet starts with a netlink message header, which is represented by struct nlmsghdr . The length of nlmsghdr is 16 bytes. It contains five fields: + + * nlmsg_len is the length of the message including the header. + + * nlmsg_type is the message type; there are four basic netlink message header types: + + * NLMSG_NOOP: No operation, message must be discarded. + + * NLMSG_ERROR: Error occurred. + + * NLMSG_DONE: A multipart message is terminated. + + * NLMSG_OVERRUN: Overrun notification: error, data was lost. + +(include/uapi/linux/netlink.h) + +However, families can add netlink message header types of their own. For example, the rtnetlink protocol family adds message header types such as RTM_NEWLINK, RTM_DELLINK, RTM_NEWROUTE, and a lot more (see include/uapi/linux/rtnetlink.h). For a full list of the netlink message header types that were added by the rtnetlink family, with a detailed explanation of each, see: man 7 rtnetlink. Note that message type values smaller than NLMSG_MIN_TYPE (0x10) are reserved for control messages and may not be used. + + * nlmsg_flags field can be as follows: + + * NLM_F_REQUEST: When it's a request message. + + * NLM_F_MULTI: When it's a multipart message. Multipart messages are used for table dumps. Usually the size of messages is limited to a page (PAGE_SIZE). So large messages are divided into smaller ones, and each of them (except the last one) has the NLM_F_MULTI flag set. The last message is of type NLMSG_DONE. + + * NLM_F_ACK: When you want the receiver of the message to reply with an ACK. Netlink ACK messages are sent by the netlink_ack() method (net/netlink/af_netlink.c). + + * NLM_F_DUMP: Retrieve information about a table/entry. + + * NLM_F_ROOT: Specify the tree root. + + * NLM_F_MATCH: Return all matching entries. + + * NLM_F_ATOMIC: This flag is deprecated. + +The following flags are modifiers for creation of an entry: + + * NLM_F_REPLACE: Override existing entry. + + * NLM_F_EXCL: Do not touch entry, if it exists. + + * NLM_F_CREATE: Create entry, if it does not exist. + + * NLM_F_APPEND: Add entry to end of list. + + * NLM_F_ECHO: Echo this request. + +I've shown the most commonly used flags. For a full list, see include/uapi/linux/netlink.h. + + * nlmsg_seq is the sequence number (for message sequences). Unlike some Layer 4 transport protocols, there is no strict enforcement of the sequence number. + + * nlmsg_pid is the sending port id. When a message is sent from the kernel, the nlmsg_pid is 0. When a message is sent from userspace, the nlmsg_pid can be set to be the process id of the userspace application that sent the message. + +Figure 2-3 shows the netlink message header. + +Figure 2-3. + +nlmsg header + +After the header comes the payload. The payload of netlink messages is composed of a set of attributes that are represented in Type-Length-Value (TLV) format. With TLV, the type and length are fixed in size (typically 1–4 bytes), and the value field is of variable size. The TLV representation is also used in other places in the networking code—for example, in IPv6 (see RFC 2460). TLV provides flexibility that makes future extensions easier to implement. Attributes can be nested, which enables complex tree structures of attributes.
+ +Each netlink attribute header is defined by struct nlattr: + +struct nlattr { + +__u16 nla_len; + +__u16 nla_type; + +}; + +(include/uapi/linux/netlink.h) + + * nla_len: The size of the attribute in bytes. + + * nla_type: The attribute type. The value of nla_type can be, for example, NLA_U32 (for a 32-bit unsigned integer), NLA_STRING for a variable length string, NLA_NESTED for a nested attribute, NLA_UNSPEC for arbitrary type and length, and more. You can find the list of available types in include/net/netlink.h. + +Every netlink attribute must be aligned to a 4-byte boundary (NLA_ALIGNTO). + +Each family can define an attribute validation policy, which represents the expectations regarding the received attributes. This validation policy is represented by the nla_policy object. In fact, the nla_policy struct has exactly the same content as struct nlattr: + +struct nla_policy { + +u16 type; + +u16 len; + +}; + +(include/net/netlink.h) + +The attribute validation policy is an array of nla_policy objects; this array is indexed by the attribute number. For each attribute (except the fixed-length attributes), if the value of len in the nla_policy object is 0, no validation should be performed. If the attribute is one of the string types (such as NLA_STRING), len should be the maximum length of the string, without the terminating NULL byte. If the attribute type is NLA_UNSPEC or unknown, len should be set to the exact length of the attribute's payload. If the attribute type is NLA_FLAG, len is unused. (The reason is that the presence of the attribute itself implies a value of true, and the absence of the attribute implies a value of false.) + +Receiving a generic netlink message in the kernel is handled by genl_rcv_msg(). In case it is a dump request (when the NLM_F_DUMP flag is set), you dump the table by calling the netlink_dump_start() method. If it's not a dump request, you parse the payload with the nlmsg_parse() method. The nlmsg_parse() method performs attribute validation by calling validate_nla() (lib/nlattr.c). If there are attributes with a type exceeding maxtype, they will be silently ignored for backwards compatibility. In case validation fails, you don't continue to the next step in genl_rcv_msg() (which is running the doit() callback), and the genl_rcv_msg() method returns an error code. + +The next section describes the NETLINK_ROUTE messages, which are the most commonly used messages in the networking subsystem. + +### NETLINK_ROUTE Messages + +The rtnetlink (NETLINK_ROUTE) messages are not limited to the networking routing subsystem: there are neighbouring subsystem messages as well, interface setup messages, firewalling messages, netlink queuing messages, policy routing messages, and many other types of rtnetlink messages, as you'll see in later chapters. + +The NETLINK_ROUTE messages can be divided into families: + + * LINK (network interfaces) + + * ADDR (network addresses) + + * ROUTE (routing messages) + + * NEIGH (neighbouring subsystem messages) + + * RULE (policy routing rules) + + * QDISC (queueing discipline) + + * TCLASS (traffic classes) + + * ACTION (packet action API, see net/sched/act_api.c) + + * NEIGHTBL (neighbouring table) + + * ADDRLABEL (address labeling) + +Each of these families has three types of messages: for creation, deletion, and retrieving information. So, for routing messages, you have the RTM_NEWROUTE message type for creating a route, the RTM_DELROUTE message type for deleting a route, and the RTM_GETROUTE message type for retrieving a route.
With LINK messages, apart from the three message types for creation, deletion, and information retrieval, there is an additional message type for modifying a link: RTM_SETLINK. + +There are cases in which an error occurs, and you send an error message as a reply. The netlink error message is represented by the nlmsgerr struct: + +struct nlmsgerr { + +int error; + +struct nlmsghdr msg; + +}; + +(include/uapi/linux/netlink.h) + +In fact, as you can see in Figure 2-4, the netlink error message is built from a netlink message header and an error code. When the error code is not 0, the netlink message header of the original request that caused the error is appended after the error code field. + +Figure 2-4. + +Netlink error message + +If you send a message that was constructed erroneously (for example, the nlmsg_type is not valid), then a netlink error message is sent back, and the error code is set according to the error that occurred. For example, when the nlmsg_type is not valid (a negative value, or a value higher than the maximum value permitted) the error code is set to –EOPNOTSUPP. See the rtnetlink_rcv_msg() method in net/core/rtnetlink.c. In error messages, the sequence number is set to be the sequence number of the request that caused the error. + +The sender can request to get an ACK for a netlink message. This is done by setting the NLM_F_ACK flag in the netlink message header flags (nlmsg_flags). When the kernel sends an ACK, it uses an error message (the netlink message header type of this message is set to be NLMSG_ERROR) with an error code of 0. In this case, the original netlink header of the request is not appended to the error message. For implementation details, see the netlink_ack() method implementation in net/netlink/af_netlink.c. + +After learning about NETLINK_ROUTE messages, you're ready to look at an example of adding and deleting a routing entry in a routing table using NETLINK_ROUTE messages. + +### Adding and Deleting a Routing Entry in a Routing Table + +Let's see what happens behind the scenes in the kernel, in the context of the netlink protocol, when adding and deleting a routing entry. You can add a routing entry to the routing table by running, for example, the following: + +ip route add 192.168.2.11 via 192.168.2.20 + +This command sends a netlink message from userspace (RTM_NEWROUTE) over an rtnetlink socket for adding a routing entry. The message is received by the rtnetlink kernel socket and handled by the rtnetlink_rcv() method. Eventually, adding the routing entry is done by invoking inet_rtm_newroute() in net/ipv4/fib_frontend.c. Subsequently, insertion into the Forwarding Information Base (FIB), which is the routing database, is accomplished with the fib_table_insert() method; however, inserting into the routing table is not the only task of fib_table_insert(). You should notify all listeners who registered for RTM_NEWROUTE messages. How? When inserting a new routing entry, you call the rtmsg_fib() method with RTM_NEWROUTE. The rtmsg_fib() method builds a netlink message and sends it by calling rtnl_notify() to notify all listeners who are registered to the RTNLGRP_IPV4_ROUTE group. These RTNLGRP_IPV4_ROUTE listeners can be registered in the kernel as well as in userspace (as is done in iproute2, or in some userspace routing daemons, like xorp). You'll see shortly how the userspace daemons of iproute2 can subscribe to various rtnetlink multicast groups. + +When deleting a routing entry, something quite similar happens.
You can delete the routing entry added earlier by running the following: + +ip route del 192.168.2.11 + +That command sends a netlink message from userspace (RTM_DELROUTE) over an rtnetlink socket for deleting a routing entry. The message is again received by the rtnetlink kernel socket and handled by the rtnetlink_rcv() callback. Eventually, deleting the routing entry is done by invoking the inet_rtm_delroute() callback in net/ipv4/fib_frontend.c. Subsequently, deletion from the FIB is done with fib_table_delete(), which calls rtmsg_fib(), this time with the RTM_DELROUTE message. + +You can monitor networking events with the iproute2 ip command like this: + +ip monitor route + +For example, if you open one terminal and run ip monitor route there, and then open another terminal and run ip route add 192.168.1.10 via 192.168.2.200, on the first terminal you'll see this line: 192.168.1.10 via 192.168.2.200 dev em1. And when you run, on the second terminal, ip route del 192.168.1.10, on the first terminal the following text will appear: Deleted 192.168.1.10 via 192.168.2.200 dev em1. + +Running ip monitor route starts a daemon that opens a netlink socket and subscribes to the RTNLGRP_IPV4_ROUTE multicast group. Now, adding or deleting a route, as done in this example, will result in the message that was sent with rtnl_notify() being received by the daemon and displayed on the terminal. + +You can subscribe to other multicast groups in this way. For example, to subscribe to the RTNLGRP_LINK multicast group, run ip monitor link. This daemon receives netlink messages from the kernel—when adding/deleting a link, for example. So if you open one terminal and run ip monitor link, and then open another terminal and add a VLAN interface by vconfig add eth1 200, on the first terminal you'll see lines like this: + +4: eth1.200@eth1: mtu 1500 qdisc noop state DOWN + +link/ether 00:e0:4c:53:44:58 brd ff:ff:ff:ff:ff:ff + +And if you add a bridge on the second terminal with brctl addbr mybr, on the first terminal you'll see lines like this: + +5: mybr: mtu 1500 qdisc noop state DOWN + +link/ether a2:7c:be:62:b5:b6 brd ff:ff:ff:ff:ff:ff + +You've seen what a netlink message is and how it is created and handled. You've seen how netlink sockets are handled. Next you'll learn why the generic netlink family (introduced in kernel 2.6.15) was created, and you'll learn about its Linux implementation. + +## Generic Netlink Protocol + +One of the drawbacks of the netlink protocol is that the number of protocol families is limited to 32 (MAX_LINKS). This is one of the main reasons that the generic netlink family was created—to provide support for adding a higher number of families. It acts as a netlink multiplexer and works with a single netlink family (NETLINK_GENERIC). The generic netlink protocol is based on the netlink protocol and uses its API. + +To add a netlink protocol family, you should add a protocol family definition in include/uapi/linux/netlink.h. But with the generic netlink protocol, there is no need for that. The generic netlink protocol is also intended to be used in other subsystems besides networking, because it provides a general purpose communication channel. For example, it's also used by the ACPI subsystem (see the definition of acpi_event_genl_family in drivers/acpi/event.c), by the task stats code (see kernel/taskstats.c), by the thermal events code, and more.
+ +The generic netlink kernel socket is created by the netlink_kernel_create() method like this: + +static int __net_init genl_pernet_init(struct net *net) { + +... + +struct netlink_kernel_cfg cfg = { + +.input = genl_rcv, + +.cb_mutex = &genl_mutex, + +.flags = NL_CFG_F_NONROOT_RECV, + +}; + +net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, &cfg); + +... + +} + +(net/netlink/genetlink.c) + +Note that, like the netlink sockets described earlier, the generic netlink socket is also aware of network namespaces; the network namespace object (struct net) contains a member named genl_sock (a generic netlink socket). As you can see, the network namespace genl_sock pointer is assigned in the genl_pernet_init() method. + +The genl_rcv() method is defined to be the input callback of the genl_sock object, which was created earlier by the genl_pernet_init() method. As a result, data sent from userspace over generic netlink sockets is handled in the kernel by the genl_rcv() callback. + +You can create a generic netlink userspace socket with the socket() system call, though it is better to use the libnl-genl API (discussed later in this section). + +Immediately after creating the generic netlink kernel socket, the controller family (genl_ctrl) is registered: + +static struct genl_family genl_ctrl = { + +.id = GENL_ID_CTRL, + +.name = "nlctrl", + +.version = 0x2, + +.maxattr = CTRL_ATTR_MAX, + +.netnsok = true, + +}; + +static int __net_init genl_pernet_init(struct net *net) { + +... + +err = genl_register_family_with_ops(&genl_ctrl, &genl_ctrl_ops, 1); + +... + +The genl_ctrl family has a fixed id of 0x10 (GENL_ID_CTRL); it is in fact the only instance of genl_family that's initialized with a fixed id; all other instances are initialized with GENL_ID_GENERATE as an id, which subsequently is replaced by a dynamically assigned value. + +There is support for registering multicast groups in generic netlink sockets by defining a genl_multicast_group object and calling genl_register_mc_group(); for example, in the Near Field Communication (NFC) subsystem, you have the following: + +static struct genl_multicast_group nfc_genl_event_mcgrp = { + +.name = NFC_GENL_MCAST_EVENT_NAME, + +}; + +int __init nfc_genl_init(void) + +{ + +... + +rc = genl_register_mc_group(&nfc_genl_family, &nfc_genl_event_mcgrp); + +... + +} + +(net/nfc/netlink.c) + +The name of a multicast group should be unique, because it is the primary key for lookups. + +The id of a multicast group is also generated dynamically when registering the group, by calling the find_first_zero_bit() method in genl_register_mc_group(). Only one multicast group, the notify_grp, has a fixed id, GENL_ID_CTRL. + +To work with generic netlink sockets in the kernel, you should do the following: + + * Create a genl_family object and register it by calling genl_register_family(). + + * Create a genl_ops object and register it by calling genl_register_ops(). + +Alternatively, you can call genl_register_family_with_ops() and pass to it a genl_family object, an array of genl_ops, and its size. This method will first call genl_register_family() and then, if successful, will call genl_register_ops() for each genl_ops element of the specified array of genl_ops. + +The genl_register_family() and genl_register_ops() methods, as well as the genl_family and genl_ops structures, are defined in include/net/genetlink.h.
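To make these registration steps concrete, here is a hypothetical sketch of a minimal kernel module that registers a generic netlink family with a single command, using the kernel 3.9 API just described. The family name "demo", the DEMO_* constants, and the demo_echo() handler are all invented for illustration; this is not code from the kernel tree:

```c
#include <linux/module.h>
#include <net/genetlink.h>

/* Invented attribute and command numbers for this illustration. */
enum { DEMO_ATTR_UNSPEC, DEMO_ATTR_MSG, __DEMO_ATTR_MAX };
#define DEMO_ATTR_MAX (__DEMO_ATTR_MAX - 1)
enum { DEMO_CMD_UNSPEC, DEMO_CMD_ECHO };

/* Validation policy: DEMO_ATTR_MSG must be a NUL-terminated string. */
static struct nla_policy demo_policy[DEMO_ATTR_MAX + 1] = {
	[DEMO_ATTR_MSG] = { .type = NLA_NUL_STRING, .len = 64 },
};

static struct genl_family demo_family = {
	.id      = GENL_ID_GENERATE, /* let the controller assign an id */
	.name    = "demo",           /* userspace resolves the id by name */
	.version = 1,
	.maxattr = DEMO_ATTR_MAX,
};

/* The doit callback: runs when userspace sends DEMO_CMD_ECHO. */
static int demo_echo(struct sk_buff *skb, struct genl_info *info)
{
	if (info->attrs[DEMO_ATTR_MSG])
		pr_info("demo: got %s\n",
			(char *)nla_data(info->attrs[DEMO_ATTR_MSG]));
	return 0;
}

static struct genl_ops demo_ops[] = {
	{
		.cmd    = DEMO_CMD_ECHO,
		.policy = demo_policy,
		.doit   = demo_echo,
	},
};

static int __init demo_init(void)
{
	return genl_register_family_with_ops(&demo_family, demo_ops,
					     ARRAY_SIZE(demo_ops));
}

static void __exit demo_exit(void)
{
	genl_unregister_family(&demo_family);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
```

After loading such a module, running genl ctrl list (mentioned later in this chapter) should show the "demo" family along with its dynamically assigned id.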
+ +The wireless subsystem uses generic netlink sockets: + +int nl80211_init(void) + +{ + +int err; + +err = genl_register_family_with_ops(&nl80211_fam, + +nl80211_ops, ARRAY_SIZE(nl80211_ops)); + +... + +} + +(net/wireless/nl80211.c) + +The generic netlink protocol is used by some userspace packages, such as the hostapd package and the iw package. The hostapd package ( http://hostap.epitest.fi ) provides a userspace daemon for wireless access point and authentication servers. The iw package is for manipulating wireless devices and their configuration (see http://wireless.kernel.org/en/users/Documentation/iw ). + +The iw package is based on nl80211 and the libnl library. Chapter 12 discusses nl80211 in more detail. The old userspace wireless package is called wireless-tools and is based on sending IOCTLs. + +Here are the genl_family and genl_ops definitions in nl80211: + +static struct genl_family nl80211_fam = { + +.id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ + +.name = "nl80211", /* have users key off the name instead */ + +.hdrsize = 0, /* no private header */ + +.version = 1, /* no particular meaning now */ + +.maxattr = NL80211_ATTR_MAX, + +.netnsok = true, + +.pre_doit = nl80211_pre_doit, + +.post_doit = nl80211_post_doit, + +}; + + * name: Must be a unique name. + + * id: id is GENL_ID_GENERATE in this case, which is in fact 0. GENL_ID_GENERATE tells the generic netlink controller to assign the channel a unique channel number when you register the family with genl_register_family(). The genl_register_family() assigns an id in the range 16 (GENL_MIN_ID, which is 0x10) to 1023 (GENL_MAX_ID). + + * hdrsize: Size of a private header. + + * maxattr: NL80211_ATTR_MAX, which is the maximum number of attributes supported. The nl80211_policy validation policy array has NL80211_ATTR_MAX elements (each attribute has an entry in the array). + + * netnsok: true, which means the family can handle network namespaces. + + * pre_doit: A hook that's called before the doit() callback. + + * post_doit: A hook that can, for example, undo locking or any required private tasks after the doit() callback. + +You can add a command or several commands with the genl_ops structure. Let's take a look at the definition of the genl_ops struct and then at its usage in nl80211: + +struct genl_ops { + +u8 cmd; + +u8 internal_flags; + +unsigned int flags; + +const struct nla_policy *policy; + +int (*doit)(struct sk_buff *skb, + +struct genl_info *info); + +int (*dumpit)(struct sk_buff *skb, + +struct netlink_callback *cb); + +int (*done)(struct netlink_callback *cb); + +struct list_head ops_list; + +}; + + * cmd: Command identifier (the genl_ops struct defines a single command and its doit/dumpit handlers). + + * internal_flags: Private flags which are defined and used by the family. For example, in nl80211, there are many operations that define internal flags (such as NL80211_FLAG_NEED_NETDEV_UP, NL80211_FLAG_NEED_RTNL, and more). The nl80211 pre_doit() and post_doit() callbacks perform actions according to these flags. See net/wireless/nl80211.c. + + * flags: Operation flags. Values can be the following: + + * GENL_ADMIN_PERM: When this flag is set, it means that the operation requires the CAP_NET_ADMIN privilege; see the genl_rcv_msg() method in net/netlink/genetlink.c. + + * GENL_CMD_CAP_DO: This flag is set if the genl_ops struct implements the doit() callback. + + * GENL_CMD_CAP_DUMP: This flag is set if the genl_ops struct implements the dumpit() callback.
+ + * GENL_CMD_CAP_HASPOL: This flag is set if the genl_ops struct defines an attribute validation policy (an nla_policy array). + + * policy: Attribute validation policy; attribute validation policies were discussed earlier in this chapter, in the section about the netlink message header. + + * doit: Standard command callback. + + * dumpit: Callback for dumping. + + * done: Completion callback for dumps. + + * ops_list: Operations list. + +static struct genl_ops nl80211_ops[] = { + +... + +{ + +.cmd = NL80211_CMD_GET_SCAN, + +.policy = nl80211_policy, + +.dumpit = nl80211_dump_scan, + +}, + +... + +} + +Note that either a doit or a dumpit callback must be specified for every element of genl_ops (nl80211_ops in this case) or the function will fail with -EINVAL. + +This entry in genl_ops adds the nl80211_dump_scan() callback as a handler of the NL80211_CMD_GET_SCAN command. The nl80211_policy is an array of nla_policy objects and defines the expected datatype of the attributes and their length. + +When running a scan command from userspace, for example by iw dev wlan0 scan, you send a generic netlink message whose command is NL80211_CMD_GET_SCAN over a generic netlink socket. Messages are sent by the nl_send_auto_complete() method or by nl_send_auto() in the newer libnl versions. nl_send_auto() fills the missing bits and pieces in the netlink message header. If you don't require any of the automatic message completion functionality, you can use nl_send() directly. + +The message is handled by the nl80211_dump_scan() method, which is the dumpit callback for this command (net/wireless/nl80211.c). There are more than 50 entries in the nl80211_ops object for handling commands, including NL80211_CMD_GET_INTERFACE, NL80211_CMD_SET_INTERFACE, NL80211_CMD_START_AP, and so on. + +To send commands to the kernel, a userspace application should know the family id. The family name is known in userspace, but the family id is unknown there, because it's determined only at runtime in the kernel. To get the family id, the userspace application should send a generic netlink CTRL_CMD_GETFAMILY request to the kernel. This request is handled by the ctrl_getfamily() method. It returns the family id as well as other information, such as the operations the family supports. Then userspace can send commands to the kernel specifying the family id that it got in the reply. I discuss this more in the next section. + +### Creating and Sending Generic Netlink Messages + +A generic netlink message starts with a netlink header, followed by the generic netlink message header, and then there is an optional user specific header. Only after all that do you find the optional payload, as you can see in Figure 2-5. + +Figure 2-5. + +Generic netlink message + +This is the generic netlink message header: + +struct genlmsghdr { + +__u8 cmd; + +__u8 version; + +__u16 reserved; + +}; + +(include/uapi/linux/genetlink.h) + + * cmd is a generic netlink message type; each generic family that you register adds its own commands. For example, for the nl80211_fam family mentioned above, the commands it adds (like NL80211_CMD_GET_INTERFACE) are represented by the nl80211_commands enum. There are more than 60 commands (see include/uapi/linux/nl80211.h). + + * version can be used for versioning support. With nl80211 it is 1, with no particular meaning. The version member allows changing the format of a message without breaking backward compatibility. + + * reserved is for future use.
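To make this layout concrete, here is a hedged userspace sketch that builds such a message by hand: a CTRL_CMD_GETFAMILY request to the "nlctrl" controller, which resolves a family name to its id (essentially what libnl-genl's genl_ctrl_resolve(), shown a bit later, does for you). The helper name send_getfamily() is invented; fd is assumed to be a socket created with socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC), and the family name is assumed to be short enough to fit in the buffer:

```c
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/genetlink.h>

/* Layout on the wire: nlmsghdr | genlmsghdr | CTRL_ATTR_FAMILY_NAME. */
static int send_getfamily(int fd, const char *name)
{
	struct {
		struct nlmsghdr   nlh;
		struct genlmsghdr genl;
		char              attrs[64];
	} req;
	struct nlattr *na;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_type  = GENL_ID_CTRL;    /* the "nlctrl" controller */
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.nlh.nlmsg_seq   = 1;
	req.genl.cmd        = CTRL_CMD_GETFAMILY;
	req.genl.version    = 1;

	/* A single attribute: the family name, as a NUL-terminated string. */
	na = (struct nlattr *)req.attrs;
	na->nla_type = CTRL_ATTR_FAMILY_NAME;
	na->nla_len  = NLA_HDRLEN + strlen(name) + 1;
	strcpy((char *)na + NLA_HDRLEN, name);

	req.nlh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN) + NLA_ALIGN(na->nla_len);
	return send(fd, &req, req.nlh.nlmsg_len, 0) < 0 ? -1 : 0;
}
```

The reply carries, among other attributes, CTRL_ATTR_FAMILY_ID—the dynamically generated id discussed in the previous section.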
+ +Allocating a buffer for a generic netlink message is done by the following method: + +struct sk_buff *genlmsg_new(size_t payload, gfp_t flags) + +This is in fact a wrapper around nlmsg_new(). + +After allocating a buffer with genlmsg_new(), the genlmsg_put() method is called to create the generic netlink header, which is an instance of genlmsghdr. You send a unicast generic netlink message with genlmsg_unicast(), which is in fact a wrapper around nlmsg_unicast(). You can send a multicast generic netlink message in two ways: + + * genlmsg_multicast(): This method sends the message to the default network namespace, init_net. + + * genlmsg_multicast_allns(): This method sends the message to all network namespaces. + +(All prototypes of the methods mentioned in this section are in include/net/genetlink.h.) + +You can create a generic netlink socket from userspace like this: socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); this call is handled in the kernel by the netlink_create() method, like an ordinary, non-generic netlink socket, as you saw in the previous section. You can use the socket API to perform further calls like bind() and sendmsg() or recvmsg(); however, using the libnl library instead is recommended. + +libnl-genl provides a generic netlink API, for management of the controller and for family and command registration. With libnl-genl, you can call genl_connect() to create a local socket file descriptor and bind the socket to the NETLINK_GENERIC netlink protocol. + +Let's take a brief look at what happens in a short typical userspace-kernel session when sending a command to the kernel via generic netlink sockets using the libnl library and the libnl-genl library. + +The iw package uses the libnl-genl library. When you run a command like iw dev wlan0 list, the following sequence occurs (omitting unimportant details): + +state->nl_sock = nl_socket_alloc() + +Allocate a socket. (Note the use here of the libnl core API, and not the generic netlink family API, libnl-genl, yet.) + +genl_connect(state->nl_sock) + +Call socket() with NETLINK_GENERIC and call bind() on this socket; genl_connect() is a method of the libnl-genl library. + +genl_ctrl_resolve(state->nl_sock, "nl80211"); + +This method resolves the generic netlink family name ("nl80211") to the corresponding numeric family identifier. The userspace application must send its subsequent messages to the kernel specifying this id. + +The genl_ctrl_resolve() method calls genl_ctrl_probe_by_name(), which in fact sends a generic netlink message to the kernel with the CTRL_CMD_GETFAMILY command. + +In the kernel, the generic netlink controller ("nlctrl") handles the CTRL_CMD_GETFAMILY command by the ctrl_getfamily() method and returns the family id to userspace. This id was generated when the family was registered. + +Note + +You can get various parameters (such as generated id, header size, max attributes, and more) of all the registered generic netlink families with the userspace tool genl (of iproute2) by running genl ctrl list. + +You're now ready to learn about the socket monitoring interface, which lets you get information about sockets. The socket monitoring interface is used in userspace tools like ss, which displays socket information and statistics for various socket types, and in other projects, as you'll see in the next section. + +### Socket Monitoring Interface + +The sock_diag netlink sockets provide a netlink-based subsystem that can be used to get information about sockets.
This feature was added to the kernel to support checkpoint/restore functionality for Linux in userspace (CRIU). To support this functionality, additional data about sockets was needed. For example, procfs doesn't say which are the peers of a UNIX domain socket (AF_UNIX), and this info is needed for checkpoint/restore support. This additional data is not exported via /proc, and making changes to procfs entries isn't always desirable because it might break userspace applications. The sock_diag netlink sockets give an API which enables access to this additional data. This API is used in the CRIU project as well as in the ss util. Without sock_diag, after checkpointing a process (saving the state of a process to the filesystem), you can't reconstruct its UNIX domain sockets because you don't know who the peers are. + +To support the monitoring interface used by the ss tool, a netlink-based kernel socket is created (NETLINK_SOCK_DIAG). The ss tool, which is part of the iproute2 package, enables you to get socket statistics in a similar way to netstat. It can display more TCP and state information than other tools. + +You create a netlink kernel socket for sock_diag like this: + +static int __net_init diag_net_init(struct net *net) + +{ + +struct netlink_kernel_cfg cfg = { + +.input = sock_diag_rcv, + +}; + +net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg); + +return net->diag_nlsk == NULL ? -ENOMEM : 0; + +} + +(net/core/sock_diag.c) + +The sock_diag module has a table of sock_diag_handler objects named sock_diag_handlers. This table is indexed by the protocol number (for the list of protocol numbers, see include/linux/socket.h). + +The sock_diag_handler struct is very simple: + +struct sock_diag_handler { + +__u8 family; + +int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh); + +}; + +(net/core/sock_diag.c) + +Each protocol that wants to add a socket monitoring interface entry to this table first defines a handler and then calls sock_diag_register(), specifying its handler. For example, for UNIX sockets, there is the following in net/unix/diag.c: + +The first step is definition of the handler: + +static const struct sock_diag_handler unix_diag_handler = { + +.family = AF_UNIX, + +.dump = unix_diag_handler_dump, + +}; + +The second step is registration of the handler: + +static int __init unix_diag_init(void) + +{ + +return sock_diag_register(&unix_diag_handler); + +} + +Now, with ss -x or ss --unix, you can dump the statistics that are gathered by the UNIX diag module. In quite a similar way, there are diag modules for other protocols, such as UDP (net/ipv4/udp_diag.c), TCP (net/ipv4/tcp_diag.c), DCCP (net/dccp/diag.c), and AF_PACKET (net/packet/diag.c). + +There's also a diag module for the netlink sockets themselves. The /proc/net/netlink entry provides information about the netlink socket (netlink_sock object) like the portid, groups, the inode number of the socket, and more. If you want the details, dumping /proc/net/netlink is handled by netlink_seq_show() in net/netlink/af_netlink.c. There are some netlink_sock fields which /proc/net/netlink doesn't provide—for example, dst_group or dst_portid or groups above 32. For this reason, the netlink socket monitoring interface was added (net/netlink/diag.c). You can use the ss tool of iproute2 to read netlink socket information. The netlink diag code can also be built as a kernel module.
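To round out the picture, here is a hedged userspace sketch of the request side of this interface: the kind of dump request that ss --unix sends for UNIX domain sockets. The helper name send_unix_diag_dump() is invented, fd is assumed to be a socket created with socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG), and parsing of the multipart reply (a stream of unix_diag_msg structures terminated by an NLMSG_DONE message) is omitted:

```c
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/unix_diag.h>

/* Ask the sock_diag subsystem to dump all UNIX domain sockets. */
static int send_unix_diag_dump(int fd)
{
	struct {
		struct nlmsghdr      nlh;
		struct unix_diag_req udr;
	} req;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len    = sizeof(req);
	req.nlh.nlmsg_type   = SOCK_DIAG_BY_FAMILY;
	req.nlh.nlmsg_flags  = NLM_F_REQUEST | NLM_F_DUMP;
	req.udr.sdiag_family = AF_UNIX;
	req.udr.udiag_states = (__u32)-1;  /* sockets in any state */
	req.udr.udiag_show   = UDIAG_SHOW_NAME | UDIAG_SHOW_PEER;

	return send(fd, &req, sizeof(req), 0) < 0 ? -1 : 0;
}
```

In the kernel, such a request ends up in the unix_diag_handler_dump() callback registered above; the UDIAG_SHOW_PEER flag is what asks for the peer information that, as noted, /proc does not expose.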
+ +## Summary + +This chapter covered netlink sockets, which provide a mechanism for bidirectional communication between userspace and the kernel and are widely used by the networking subsystem. You've seen some examples of netlink sockets usage. I also discussed netlink messages, how they're created and handled. Another important subject the chapter dealt with is generic netlink sockets, including their advantages and their usage. The next chapter covers the ICMP protocol, including its usage and its implementation in IPv4 and IPv6. + +## Quick Reference + +I conclude this chapter with a short list of important methods of the netlink and generic netlink subsystems. Some of them were mentioned in this chapter: + +int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, + +struct nlmsghdr *)) + +This method handles receiving netlink messages. It's called from the input callback of netlink families (for example, in the rtnetlink_rcv() method for the rtnetlink family, or in the sock_diag_rcv() method for the sock_diag family). The method performs sanity checks, like making sure that the length of the netlink message header does not exceed the permitted max length (NLMSG_HDRLEN). It also avoids invoking the specified callback in case the message is a control message. In case the ACK flag (NLM_F_ACK) is set, it sends an acknowledgment by invoking the netlink_ack() method. + +struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, + +u32 dst_portid, gfp_t gfp_mask) + +This method allocates an SKB with the specified size and gfp_mask; the other parameters (ssk, dst_portid) are used when working with memory mapped netlink IO (NETLINK_MMAP), a feature that is not discussed in this chapter. The method is located here: net/netlink/af_netlink.c. + +struct netlink_sock *nlk_sk(struct sock *sk) + +This method returns the netlink_sock object that contains the specified sk as a member, and is located here: net/netlink/af_netlink.h. + +struct sock *netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) + +This method creates a kernel netlink socket. + +struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) + +This method returns the netlink message header pointed to by skb->data. + +struct nlmsghdr *__nlmsg_put(struct sk_buff *skb, u32 portid, + +u32 seq, int type, int len, int flags) + +This method builds a netlink message header according to the specified parameters and puts it in the skb; it is located here: include/linux/netlink.h. + +struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) + +This method allocates a new netlink message with the specified message payload by calling alloc_skb(). If the specified payload is 0, alloc_skb() is called with NLMSG_HDRLEN (after alignment with the NLMSG_ALIGN macro). + +int nlmsg_msg_size(int payload) + +This method returns the length of a netlink message (message header length and payload), not including padding. + +void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, + +rtnl_calcit_func calcit) + +This method registers the specified rtnetlink message type with the three specified callbacks. + +static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) + +This method processes an rtnetlink message.
+ +static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, + +int type, u32 pid, u32 seq, u32 change, + +unsigned int flags, u32 ext_filter_mask) + +This method creates two objects: a netlink message header (nlmsghdr) and an ifinfomsg object, located immediately after the netlink message header. + +void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, + +struct nlmsghdr *nlh, gfp_t flags) + +This method sends an rtnetlink message. + +int genl_register_mc_group(struct genl_family *family, + +struct genl_multicast_group *grp) + +This method registers the specified multicast group, notifies the userspace, and returns 0 on success or a negative error code. The specified multicast group must have a name. The multicast group id is generated dynamically in this method by the find_first_zero_bit() method for all multicast groups, except for notify_grp, which has a fixed id of 0x10 (GENL_ID_CTRL). + +void genl_unregister_mc_group(struct genl_family *family, + +struct genl_multicast_group *grp) + +This method unregisters the specified multicast group and notifies the userspace about it. All current listeners on the group are removed. It's not necessary to unregister all multicast groups before unregistering the family—unregistering the family causes all assigned multicast groups to be unregistered automatically. + +int genl_register_ops(struct genl_family *family, struct genl_ops *ops) + +This method registers the specified operations and assigns them to the specified family. Either a doit() or a dumpit() callback must be specified or the operation will fail with -EINVAL. Only one operation structure per command identifier may be registered. It returns 0 on success or a negative error code. + +int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops) + +This method unregisters the specified operations and unassigns them from the specified family. The operation blocks until the current message processing has finished and doesn't start again until the unregister process has finished. It's not necessary to unregister all operations before unregistering the family—unregistering the family causes all assigned operations to be unregistered automatically. It returns 0 on success or a negative error code. + +int genl_register_family(struct genl_family *family) + +This method registers the specified family after validating it first. Only one family may be registered with the same family name or identifier. The family id may equal GENL_ID_GENERATE, causing a unique id to be automatically generated and assigned. + +int genl_register_family_with_ops(struct genl_family *family, + +struct genl_ops *ops, size_t n_ops) + +This method registers the specified family and operations. Only one family may be registered with the same family name or identifier. The family id may equal GENL_ID_GENERATE, causing a unique id to be automatically generated and assigned. Either a doit or a dumpit callback must be specified for every registered operation or the function will fail. Only one operation structure per command identifier may be registered. This is equivalent to calling genl_register_family() followed by genl_register_ops() for every operation entry in the table, taking care to unregister the family on the error path. The method returns 0 on success or a negative error code. + +int genl_unregister_family(struct genl_family *family) + +This method unregisters the specified family and returns 0 on success or a negative error code. 
void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, struct genl_family *family, int flags, u8 cmd)

This method adds a generic netlink header to a netlink message.

int genl_register_family(struct genl_family *family)

int genl_unregister_family(struct genl_family *family)

These methods register/unregister a generic netlink family.

int genl_register_ops(struct genl_family *family, struct genl_ops *ops)

int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops)

These methods register/unregister generic netlink operations.

void genl_lock(void)

void genl_unlock(void)

These methods lock/unlock the generic netlink mutex (genl_mutex). They are used, for example, in net/l2tp/l2tp_netlink.c.

# 3. Internet Control Message Protocol (ICMP)

Abstract

Chapter 2 discusses the netlink sockets implementation and how netlink sockets are used as a communication channel between the kernel and userspace. This chapter deals with the ICMP protocol, which is a Layer 4 protocol. Userspace applications can use the ICMP protocol (to send and receive ICMP packets) by using the sockets API (the best-known example is probably the ping utility). This chapter discusses how these ICMP packets are handled in the kernel and gives some examples.

The ICMP protocol is used primarily as a mandatory mechanism for sending error and control messages about the network layer (L3). The protocol enables getting feedback about problems in the communication environment by sending ICMP messages. These messages provide error handling and diagnostics. The ICMP protocol is relatively simple, but it is very important for assuring correct system behavior. The basic definition of ICMPv4 is in RFC 792, "Internet Control Message Protocol." This RFC defines the goals of the ICMPv4 protocol and the format of various ICMPv4 messages. I also mention in this chapter RFC 1122 ("Requirements for Internet Hosts -- Communication Layers"), which defines some requirements for several ICMP messages; RFC 4443, which defines the ICMPv6 protocol; and RFC 1812, which defines requirements for routers. I also describe which types of ICMPv4 and ICMPv6 messages exist, how they are sent, and how they are processed. I cover ICMP sockets, including why they were added and how they are used. Keep in mind that the ICMP protocol is also used for various security attacks; for example, the Smurf Attack is a denial-of-service attack in which large numbers of ICMP packets with the intended victim's spoofed source IP are sent as broadcasts to a computer network using an IP broadcast address.

## ICMPv4

ICMPv4 messages can be classified into two categories: error messages and information messages (they are termed "query messages" in RFC 1812). The ICMPv4 protocol is used in diagnostic tools like ping and traceroute.
The famous ping utility is in fact a userspace application (from the iputils package) which opens a raw socket, sends an ICMP_ECHO message, and should get back an ICMP_ECHOREPLY message as a response. Traceroute is a utility to find the path between a host and a given destination IP address. The traceroute utility is based on setting varying values of the Time To Live (TTL), which is a field in the IP header representing the hop count. The traceroute utility takes advantage of the fact that a forwarding machine sends back an ICMP_TIME_EXCEEDED message when the TTL of the packet reaches 0. The traceroute utility starts by sending messages with a TTL of 1, and with each received "Time Exceeded" (ICMP_TIME_EXCEEDED) reply it increases the TTL by 1 and sends again to the same destination. It uses the returned ICMP "Time Exceeded" messages to build a list of the routers that the packets traverse. Traceroute uses the UDP protocol by default, sending to an unlikely destination port, so when the destination itself is finally reached it returns an ICMP "Port Unreachable" message, which signals that the trace is complete. The ICMPv4 module is net/ipv4/icmp.c. Note that ICMPv4 cannot be built as a kernel module.

### ICMPv4 Initialization

ICMPv4 initialization is done in the inet_init() method, which is invoked during the boot phase. The inet_init() method invokes the icmp_init() method, which in turn calls the icmp_sk_init() method to create a kernel ICMP socket for sending ICMP messages and to initialize some ICMP procfs variables to their default values. (You will encounter some of these procfs variables later in this chapter.)

Registration of the ICMPv4 protocol, like registration of other IPv4 protocols, is done in inet_init():

static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.err_handler = icmp_err,
.no_policy = 1,
.netns_ok = 1,
};

(net/ipv4/af_inet.c)

 * icmp_rcv: The handler callback. This means that for incoming packets whose protocol field in the IP header equals IPPROTO_ICMP (0x1), icmp_rcv() will be invoked.

 * no_policy: This flag is set to 1, which implies that there is no need to perform IPsec policy checks; for example, the xfrm4_policy_check() method is not called in ip_local_deliver_finish() because the no_policy flag is set.

 * netns_ok: This flag is set to 1, which indicates that the protocol is aware of network namespaces. Network namespaces are described in Appendix A, in the net_device section. The inet_add_protocol() method will fail with an error of -EINVAL for protocols whose netns_ok field is 0.

static int __init inet_init(void) {
...
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
...

int __net_init icmp_sk_init(struct net *net)
{
...
for_each_possible_cpu(i) {
struct sock *sk;

err = inet_ctl_sock_create(&sk, PF_INET,
SOCK_RAW, IPPROTO_ICMP, net);
if (err < 0)
goto fail;

net->ipv4.icmp_sk[i] = sk;
...
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
}
...
}

In the icmp_sk_init() method, a raw ICMPv4 socket is created for each CPU and is kept in an array. The current sk can be accessed with the icmp_sk(struct net *net) method. These sockets are used in the icmp_push_reply() method. The ICMPv4 procfs entries are initialized in the icmp_sk_init() method; I mention them in this chapter and summarize them in the "Quick Reference" section at the end of this chapter. Every ICMP packet starts with an ICMPv4 header.
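To make the raw-socket flow concrete, here is a minimal userspace sketch (an illustration, not code from the kernel sources or from iputils) that builds an ICMP_ECHO message over struct icmphdr (described in the next section) and computes the Internet checksum (RFC 1071) the way a simple ping would. It requires CAP_NET_RAW or root, and the destination 127.0.0.1 is just a placeholder:

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_icmp.h>
#include <arpa/inet.h>

/* Internet checksum (RFC 1071): one's complement sum of 16-bit words */
static unsigned short icmp_cksum(const void *data, int len)
{
    const unsigned short *p = data;
    unsigned long sum = 0;

    while (len > 1) {
        sum += *p++;
        len -= 2;
    }
    if (len == 1)
        sum += *(const unsigned char *)p;
    sum = (sum >> 16) + (sum & 0xffff);
    sum += (sum >> 16);
    return (unsigned short)~sum;
}

int main(void)
{
    int fd = socket(PF_INET, SOCK_RAW, IPPROTO_ICMP); /* needs CAP_NET_RAW */
    struct icmphdr icmph = {0};
    struct sockaddr_in dst = { .sin_family = AF_INET };

    if (fd < 0) {
        perror("socket");
        return 1;
    }
    icmph.type = ICMP_ECHO; /* the reply should arrive as ICMP_ECHOREPLY */
    icmph.un.echo.id = htons(getpid() & 0xffff);
    icmph.un.echo.sequence = htons(1);
    icmph.checksum = icmp_cksum(&icmph, sizeof(icmph)); /* checksum field was 0 */

    inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr); /* placeholder destination */
    if (sendto(fd, &icmph, sizeof(icmph), 0,
               (struct sockaddr *)&dst, sizeof(dst)) < 0)
        perror("sendto");
    close(fd);
    return 0;
}

Note that for a raw ICMP socket without IP_HDRINCL, the kernel builds the IPv4 header; the application supplies only the ICMP message itself.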
Before discussing how ICMPv4 messages are received and transmitted, the following section describes the ICMPv4 header, so that you better understand how ICMPv4 messages are built.

### ICMPv4 Header

The ICMPv4 header consists of a type field (8 bits), a code field (8 bits), a checksum (16 bits), and a 32-bit variable part (its content varies based on the ICMPv4 type and code), as you can see in Figure 3-1. After the ICMPv4 header comes the payload, which should include the IPv4 header of the originating packet and a part of its payload. According to RFC 1812, it should contain as much of the original datagram as possible without the length of the ICMPv4 datagram exceeding 576 bytes. This size is in accordance with RFC 791, which specifies that "All hosts must be prepared to accept datagrams of up to 576 octets."

Figure 3-1. The ICMPv4 header

The ICMPv4 header is represented by struct icmphdr:

struct icmphdr {
__u8 type;
__u8 code;
__sum16 checksum;
union {
struct {
__be16 id;
__be16 sequence;
} echo;
__be32 gateway;
struct {
__be16 __unused;
__be16 mtu;
} frag;
} un;
};

(include/uapi/linux/icmp.h)

You'll find the current complete list of assigned ICMPv4 message type numbers and codes at www.iana.org/assignments/icmp-parameters/icmp-parameters.xml .

The ICMPv4 module defines an array of icmp_control objects, named icmp_pointers, which is indexed by ICMPv4 message type. Let's take a look at the icmp_control structure definition and at the icmp_pointers array:

struct icmp_control {
void (*handler)(struct sk_buff *skb);
short error; /* This ICMP is classed as an error message */
};

static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];

NR_ICMP_TYPES is the highest ICMPv4 type, which is 18 (it is defined in include/uapi/linux/icmp.h).

The error field of the icmp_control objects of this array is 1 only for error message types, like the "Destination Unreachable" message (ICMP_DEST_UNREACH), and it is 0 (implicitly) for information messages, like echo (ICMP_ECHO). Some handlers are assigned to more than one type. Next I discuss the handlers and the ICMPv4 message types they manage.

ping_rcv() handles receiving a ping reply (ICMP_ECHOREPLY). The ping_rcv() method is implemented in the ICMP sockets code, net/ipv4/ping.c. In kernels prior to 3.0, in order to send a ping you had to create a raw socket in userspace. When receiving a reply to a ping (an ICMP_ECHOREPLY message), the raw socket that sent the ping processed it. In order to understand how this is implemented, let's take a look at ip_local_deliver_finish(), which is the method that handles incoming IPv4 packets and passes them to the sockets that should process them:

static int ip_local_deliver_finish(struct sk_buff *skb)
{
...
int protocol = ip_hdr(skb)->protocol;
const struct net_protocol *ipprot;
int raw;

resubmit:
raw = raw_local_deliver(skb, protocol);

ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot != NULL) {
int ret;
...
ret = ipprot->handler(skb);
...

(net/ipv4/ip_input.c)

When the ip_local_deliver_finish() method receives an ICMP_ECHOREPLY packet, it first tries to deliver it to a listening raw socket, which will process it. Because a raw socket that was opened in userspace handles the ICMP_ECHOREPLY message, there is no need to do anything further with it.
So when the ip_local_deliver_finish() method receives an ICMP_ECHOREPLY packet, the raw_local_deliver() method is invoked first so that a raw socket will process it, and afterwards ipprot->handler(skb) is invoked (this is the icmp_rcv() callback in the case of an ICMPv4 packet). And because the packet was already processed by a raw socket, there is nothing more to do with it. So the packet is discarded silently by calling the icmp_discard() method, which is the handler for ICMP_ECHOREPLY messages.

When the ICMP sockets ("ping sockets") were integrated into the Linux kernel in kernel 3.0, this was changed. Ping sockets are discussed in the "ICMP Sockets ("Ping Sockets")" section later in this chapter. In this context I should note that with ICMP sockets, the sender of a ping does not have to use a raw socket. For example, you can create a socket like this: socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP), and use it to send ping packets. This socket is not a raw socket. As a result, the echo reply is not delivered to any raw socket, because there is no corresponding raw socket listening. To avoid this problem, the ICMPv4 module handles receiving ICMP_ECHOREPLY messages with the ping_rcv() callback. The ping module is located in the IPv4 layer (net/ipv4/ping.c). Nevertheless, most of the code in net/ipv4/ping.c is dual-stack code (intended for both IPv4 and IPv6). As a result, the ping_rcv() method also handles ICMPV6_ECHO_REPLY messages for IPv6 (see icmpv6_rcv() in net/ipv6/icmp.c). I talk more about ICMP sockets later in this chapter.

icmp_discard() is an empty handler used for nonexistent message types (message types whose numbers have no corresponding declarations in the header file) and for some messages that do not need any handling, for example ICMP_TIMESTAMPREPLY. The ICMP_TIMESTAMP and ICMP_TIMESTAMPREPLY messages are used for time synchronization; the sender sends the originate timestamp in an ICMP_TIMESTAMP request, and the receiver sends an ICMP_TIMESTAMPREPLY with three timestamps: the originate timestamp which was sent by the sender of the timestamp request, as well as a receive timestamp and a transmit timestamp. There are more commonly used protocols for time synchronization than ICMPv4 timestamp messages, like the Network Time Protocol (NTP). I should also mention the Address Mask request (ICMP_ADDRESS), which is normally sent by a host to a router in order to obtain an appropriate subnet mask. Recipients should reply to this message with an address mask reply message. The ICMP_ADDRESS and ICMP_ADDRESSREPLY messages, which were handled in the past by the icmp_address() method and by the icmp_address_reply() method, are now also handled by icmp_discard(). The reason is that there are other ways to get the subnet masks, such as with DHCP.

icmp_unreach() handles ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_PARAMETERPROB, and ICMP_SOURCE_QUENCH message types.

An ICMP_DEST_UNREACH message can be sent under various conditions. Some of these conditions are described in the "Sending ICMPv4 Messages: Destination Unreachable" section in this chapter.

An ICMP_TIME_EXCEEDED message is sent in two cases:

In ip_forward(), each packet's TTL is decremented. According to RFC 1700, the recommended TTL for the IPv4 protocol is 64. If the TTL reaches 0, this is an indication that the packet should be dropped, probably because of a routing loop.
So, if the TTL reaches 0 in ip_forward(), the icmp_send() method is invoked:

icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);

(net/ipv4/ip_forward.c)

In such a case, an ICMP_TIME_EXCEEDED message with code ICMP_EXC_TTL is sent, the SKB is freed, the InHdrErrors SNMP counter (IPSTATS_MIB_INHDRERRORS) is incremented, and the method returns NET_RX_DROP.

In ip_expire(), the following is invoked when a fragment reassembly timeout occurs:

icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);

(net/ipv4/ip_fragment.c)

An ICMP_PARAMETERPROB message is sent when parsing the options of an IPv4 header fails, in the ip_options_compile() method or in the ip_options_rcv_srr() method (net/ipv4/ip_options.c). The options are an optional, variable-length field (up to 40 bytes) of the IPv4 header. IP options are discussed in Chapter 4.

The ICMP_SOURCE_QUENCH message type is in fact deprecated. According to RFC 1812, section 4.3.3.3 (Source Quench): "A router SHOULD NOT originate ICMP Source Quench messages," and also, "A router MAY ignore any ICMP Source Quench messages it receives." The Source Quench message was intended to reduce congestion, but it turned out to be an ineffective solution.

icmp_redirect() handles ICMP_REDIRECT messages. According to RFC 1122, section 3.2.2.2, hosts should not send ICMP redirect messages; redirects are to be sent only by gateways. In the past, icmp_redirect() called ip_rt_redirect(), but an ip_rt_redirect() invocation is not needed anymore, as the protocol handlers now all properly propagate the redirect back into the routing code. In fact, in kernel 3.6 the ip_rt_redirect() method was removed. So the icmp_redirect() method first performs sanity checks and then calls icmp_socket_deliver(), which delivers the packet to the raw sockets and invokes the protocol error handler (in case it exists). Chapter 6 discusses ICMP_REDIRECT messages in more depth.

icmp_echo() handles echo ("ping") requests (ICMP_ECHO) by sending echo replies (ICMP_ECHOREPLY) with icmp_reply(). In case net->ipv4.sysctl_icmp_echo_ignore_all is set, a reply will not be sent. For configuring ICMPv4 procfs entries, see the "Quick Reference" section at the end of this chapter, and also Documentation/networking/ip-sysctl.txt .

icmp_timestamp() handles ICMP Timestamp requests (ICMP_TIMESTAMP) by sending ICMP_TIMESTAMPREPLY with icmp_reply().

Before discussing how ICMP messages are sent by the icmp_reply() method and by the icmp_send() method, I should describe the icmp_bxm ("ICMP build xmit message") structure, which is used in both methods:

struct icmp_bxm {
struct sk_buff *skb;
int offset;
int data_len;

struct {
struct icmphdr icmph;
__be32 times[3];
} data;
int head_len;
struct ip_options_data replyopts;
};

 * skb: For the icmp_reply() method, this skb is the request packet; the icmp_param object (an instance of icmp_bxm) is built from it (in the icmp_echo() method and in the icmp_timestamp() method). For the icmp_send() method, this skb is the one that triggered sending an ICMPv4 message due to some condition; you will see several examples of such messages in this section.

 * offset: Difference (offset) between skb_network_header(skb) and skb->data.

 * data_len: ICMPv4 packet payload size.

 * icmph: The ICMPv4 header.

 * times[3]: An array of three timestamps, filled in icmp_timestamp().
 * head_len: Size of the ICMPv4 header (in the case of icmp_timestamp(), there are an additional 12 bytes for the timestamps).

 * replyopts: An ip_options_data object. IP options are optional fields after the IP header, up to 40 bytes. They enable advanced features like strict/loose routing, record routing, time stamping, and more. They are initialized with the ip_options_echo() method. Chapter 4 discusses IP options.

### Receiving ICMPv4 Messages

The ip_local_deliver_finish() method handles packets for the local machine. When getting an ICMP packet, the method delivers the packet to the raw sockets that registered for the ICMPv4 protocol. In the icmp_rcv() method, first the InMsgs SNMP counter (ICMP_MIB_INMSGS) is incremented. Subsequently, the checksum correctness is verified. If the checksum is not correct, two SNMP counters are incremented, InCsumErrors and InErrors (ICMP_MIB_CSUMERRORS and ICMP_MIB_INERRORS, respectively), the SKB is freed, and the method returns 0. The icmp_rcv() method does not return an error in this case; in fact, the icmp_rcv() method always returns 0. The reason for returning 0 in the case of a checksum error is that nothing special should be done when receiving an erroneous ICMP message except to discard it; when a protocol handler returns a negative error, another attempt to process the packet is performed, which is not needed in this case. For more details, refer to the implementation of the ip_local_deliver_finish() method. Then the ICMP header is examined in order to find its type; the corresponding procfs message type counter is incremented (each ICMP message type has a procfs counter), and a sanity check is performed to verify that the type is not higher than the highest permitted value (NR_ICMP_TYPES). According to section 3.2.2 of RFC 1122, if an ICMP message of unknown type is received, it must be silently discarded. So if the message type is out of range, the InErrors SNMP counter (ICMP_MIB_INERRORS) is incremented, and the SKB is freed.

In case the packet is a broadcast or a multicast, and it is an ICMP_ECHO message or an ICMP_TIMESTAMP message, there is a check whether broadcast/multicast echo requests are permitted by reading the variable net->ipv4.sysctl_icmp_echo_ignore_broadcasts. This variable can be configured via procfs by writing to /proc/sys/net/ipv4/icmp_echo_ignore_broadcasts, and by default its value is 1. If this variable is set, the packet is dropped silently. This is done according to section 3.2.2.6 of RFC 1122: "An ICMP Echo Request destined to an IP broadcast or IP multicast address MAY be silently discarded." And according to section 3.2.2.8 of this RFC, "An ICMP Timestamp Request message to an IP broadcast or IP multicast address MAY be silently discarded." Then a check is performed to detect whether the type is allowed for broadcast/multicast (ICMP_ECHO, ICMP_TIMESTAMP, ICMP_ADDRESS, and ICMP_ADDRESSREPLY). If it is not one of these message types, the packet is dropped and 0 is returned. Then, according to its type, the corresponding entry in the icmp_pointers array is fetched and the appropriate handler is called. Let's take a look at the ICMP_ECHO entry in the icmp_control dispatch table:

static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
...
[ICMP_ECHO] = {
.handler = icmp_echo,
},
...
};

So when receiving a ping (the type of the message is "Echo Request," ICMP_ECHO), it is handled by the icmp_echo() method.
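Stripped of the statistics updates and the broadcast/multicast checks just described, the table-driven dispatch at the tail of icmp_rcv() amounts to something like the following (a simplified sketch of the kernel logic, not the verbatim code; the icmp_dispatch() wrapper name is mine):

/* Simplified sketch: how icmp_rcv() selects a handler by message type.
 * SNMP counters, the checksum check, and broadcast handling are omitted. */
static void icmp_dispatch(struct sk_buff *skb)
{
const struct icmphdr *icmph = icmp_hdr(skb);

if (icmph->type > NR_ICMP_TYPES) {
kfree_skb(skb); /* unknown type: discard silently (RFC 1122, 3.2.2) */
return;
}
icmp_pointers[icmph->type].handler(skb);
}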
The icmp_echo() method changes the type in the ICMP header to ICMP_ECHOREPLY and sends a reply by calling the icmp_reply() method. Apart from ping, the only other ICMP message which requires a response is the timestamp message (ICMP_TIMESTAMP); it is handled by the icmp_timestamp() method, which, much like in the ICMP_ECHO case, changes the type to ICMP_TIMESTAMPREPLY and sends a reply by calling the icmp_reply() method. Sending is done by ip_append_data() and by ip_push_pending_frames(). Receiving a ping reply (ICMP_ECHOREPLY) is handled by the ping_rcv() method.

You can disable replying to pings with the following:

echo 1 > /proc/sys/net/ipv4/icmp_echo_ignore_all

There are some callbacks that handle more than one ICMP type. The icmp_discard() callback, for example, handles ICMPv4 packets whose type is not handled by the Linux ICMPv4 implementation, and messages like ICMP_TIMESTAMPREPLY, ICMP_INFO_REQUEST, ICMP_ADDRESSREPLY, and more.

### Sending ICMPv4 Messages: "Destination Unreachable"

There are two methods for sending an ICMPv4 message: the first is the icmp_reply() method, which sends a response to two types of ICMP requests, ICMP_ECHO and ICMP_TIMESTAMP. The second is the icmp_send() method, with which the local machine initiates sending an ICMPv4 message under certain conditions (described in this section). Both of these methods eventually invoke icmp_push_reply() to actually send the packet. The icmp_reply() method is called as a response to an ICMP_ECHO message from the icmp_echo() method, and as a response to an ICMP_TIMESTAMP message from the icmp_timestamp() method. The icmp_send() method is invoked from many places in the IPv4 network stack; for example, from netfilter, from the forwarding code (ip_forward.c), from tunnels like ipip and ip_gre, and more.

This section looks into some of the cases when a "Destination Unreachable" message is sent (the type is ICMP_DEST_UNREACH).

#### Code 2: ICMP_PROT_UNREACH (Protocol Unreachable)

When the protocol of the IP header (which is an 8-bit field) is a nonexistent protocol, an ICMP_DEST_UNREACH message with ICMP_PROT_UNREACH code is sent back to the sender, because there is no protocol handler for such a protocol (the protocol handler array is indexed by the protocol number, so for nonexistent protocols there will be no handler). By a nonexistent protocol I mean either that, because of some error, the protocol number of the IPv4 header does not appear in the protocol number list (which you can find in include/uapi/linux/in.h for IPv4), or that the kernel was built without support for that protocol, so the protocol is not registered and there is no entry for it in the protocol handlers array. Because such a packet can't be handled, an ICMPv4 "Destination Unreachable" message should be sent back to the sender; the ICMP_PROT_UNREACH code in the ICMPv4 reply signifies the cause of the error: the protocol is unreachable. See the following:

static int ip_local_deliver_finish(struct sk_buff *skb)
{
...
int protocol = ip_hdr(skb)->protocol;
const struct net_protocol *ipprot;
int raw;

resubmit:
raw = raw_local_deliver(skb, protocol);

ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot != NULL) {
...
} else {
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
}
...
}

(net/ipv4/ip_input.c)

In this example, a lookup in the inet_protos array by protocol is performed, and because no entry was found, the protocol is not registered in the kernel.

#### Code 3: ICMP_PORT_UNREACH ("Port Unreachable")

When receiving UDPv4 packets, a matching UDP socket is searched for. If no matching socket is found, the checksum correctness is verified. If it is wrong, the packet is dropped silently. If it is correct, the statistics are updated and a "Destination Unreachable"/"Port Unreachable" ICMP message is sent back:

int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto)
{
struct sock *sk;
...
sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
...
if (sk != NULL) {
...
}

/* No socket. Drop packet silently, if checksum is wrong */
if (udp_lib_checksum_complete(skb))
goto csum_error;

UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
...
}
...
}

(net/ipv4/udp.c)

A lookup is performed by the __udp4_lib_lookup_skb() method, and if there is no socket, the statistics are updated and an ICMP_DEST_UNREACH message with ICMP_PORT_UNREACH code is sent back.

#### Code 4: ICMP_FRAG_NEEDED

When forwarding a packet with a length larger than the MTU of the outgoing link, if the don't fragment (DF) bit in the IPv4 header (IP_DF) is set, the packet is discarded and an ICMP_DEST_UNREACH message with ICMP_FRAG_NEEDED code is sent back to the sender:

int ip_forward(struct sk_buff *skb)
{
...
struct rtable *rt; /* Route we use */
...
if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
(ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(dst_mtu(&rt->dst)));
goto drop;
}
...
}

(net/ipv4/ip_forward.c)

#### Code 5: ICMP_SR_FAILED

When forwarding a packet with the strict routing option and gatewaying set, a "Destination Unreachable" message with ICMP_SR_FAILED code is sent back, and the packet is dropped:

int ip_forward(struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
...
if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
...
sr_failed:
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
goto drop;
}

(net/ipv4/ip_forward.c)

For a full list of all IPv4 "Destination Unreachable" codes, see Table 3-2 in the "Quick Reference" section at the end of this chapter. Note that a user can configure rules with the iptables REJECT target and the --reject-with qualifier, which can send "Destination Unreachable" messages according to the selection; there is more on this in the "Quick Reference" section at the end of this chapter.

Both the icmp_reply() and the icmp_send() methods support rate limiting; they call icmpv4_xrlim_allow(), and if the rate limiting check allows sending the packet (that is, icmpv4_xrlim_allow() returns true), they send the packet. It should be mentioned here that rate limiting is not performed automatically on all types of traffic. Here are the conditions under which the rate limiting check will not be performed:

 * The message type is unknown.

 * The packet is a PMTU discovery packet.

 * The device is a loopback device.

 * The ICMP type is not enabled in the rate mask.
If none of these conditions is met, rate limiting is performed by calling the inet_peer_xrlim_allow() method. You'll find more info about the rate mask in the "Quick Reference" section at the end of this chapter.

Let's look inside the icmp_send() method. First, this is its prototype:

void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)

skb_in is the SKB which caused the invocation of the icmp_send() method; type and code are the ICMPv4 message type and code, respectively. The last parameter, info, is used in the following cases:

 * For the ICMP_PARAMETERPROB message type, it is the offset in the IPv4 header where the parsing problem occurred.

 * For the ICMP_DEST_UNREACH message type with ICMP_FRAG_NEEDED code, it is the MTU.

 * For the ICMP_REDIRECT message type with ICMP_REDIR_HOST code, it is the IP address of the destination address in the IPv4 header of the provoking SKB.

When looking further into the icmp_send() method, first there are some sanity checks. Then multicast/broadcast packets are rejected. A check of whether the packet is a fragment is performed by inspecting the frag_off field of the IPv4 header. If the packet is fragmented, an ICMPv4 message is sent, but only for the first fragment. According to section 4.3.2.7 of RFC 1812, an ICMP error message must not be sent as the result of receiving an ICMP error message. So first a check is performed to find out whether the ICMPv4 message to be sent is an error message; if it is, another check is performed to find out whether the provoking SKB contained an error ICMPv4 message, and if so, the method returns without sending the ICMPv4 message. Also, if the type is an unknown ICMPv4 type (higher than NR_ICMP_TYPES), the method returns without sending the ICMPv4 message, though this isn't specified explicitly by the RFC. Then the source address is determined according to the value of net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr (more details in the "Quick Reference" section at the end of this chapter). Then the ip_options_echo() method is invoked to copy the IP options of the IPv4 header of the invoking SKB. An icmp_bxm object (icmp_param) is allocated and initialized, and a lookup in the routing subsystem is performed with the icmp_route_lookup() method. Then the icmp_push_reply() method is invoked.

Let's take a look at the icmp_push_reply() method, which actually sends the packet. The icmp_push_reply() method first finds the socket on which the packet should be sent by calling:

sk = icmp_sk(dev_net((*rt)->dst.dev));

The dev_net() method returns the network namespace of the outgoing network device. (The dev_net() method and network namespaces are discussed in Chapter 14 and in Appendix A.) Then the icmp_sk() method fetches the socket (because in SMP there is a socket per CPU). Then the ip_append_data() method is called to move the packet to the IP layer. If the ip_append_data() method fails, the statistics are updated by incrementing the ICMP_MIB_OUTERRORS counter, and the ip_flush_pending_frames() method is called to free the SKB. I discuss the ip_append_data() method and the ip_flush_pending_frames() method in Chapter 4.

Now that you know all about ICMPv4, it's time to move on to ICMPv6.

## ICMPv6

ICMPv6 has many similarities to ICMPv4 when it comes to reporting errors in the network layer (L3). There are additional tasks for ICMPv6 which are not performed in ICMPv4.
This section discusses the ICMPv6 protocol, its new features (which are not implemented in ICMPv4), and the features that are similar. ICMPv6 is defined in RFC 4443. If you delve into the ICMPv6 code you will probably encounter, sooner or later, comments that mention RFC 1885. In fact, RFC 1885, "Internet Control Message Protocol (ICMPv6) for the Internet Protocol Version 6 (IPv6)," is the base ICMPv6 RFC. It was obsoleted by RFC 2463, which was in turn obsoleted by RFC 4443. The ICMPv6 implementation is based upon that of ICMPv4, but it is more complicated; the changes and additions are discussed in this section.

The ICMPv6 protocol has a next header value of 58, according to RFC 4443, section 1 (Chapter 8 discusses IPv6 next headers). ICMPv6 is an integral part of IPv6 and must be fully implemented by every IPv6 node. Apart from error handling and diagnostics, ICMPv6 is used for the Neighbour Discovery (ND) protocol in IPv6, which replaces and enhances the functions of ARP in IPv4, and for the Multicast Listener Discovery (MLD) protocol, which is the counterpart of the IGMP protocol in IPv4, as shown in Figure 3-2.

Figure 3-2. ICMP in IPv4 and IPv6. The counterpart of the IGMP protocol in IPv6 is the MLD protocol, and the counterpart of the ARP protocol in IPv6 is the ND protocol

This section covers the ICMPv6 implementation. As you will see, it has many things in common with the ICMPv4 implementation in the way messages are handled and sent. There are even cases when the same methods are called in ICMPv4 and in ICMPv6 (for example, ping_rcv() and inet_peer_xrlim_allow()). There are some differences, and some topics are unique to ICMPv6. The ping6 and traceroute6 utilities are based on ICMPv6 and are the counterparts of the ping and traceroute utilities of IPv4 (mentioned in the ICMPv4 section at the beginning of this chapter). ICMPv6 is implemented in net/ipv6/icmp.c and in net/ipv6/ip6_icmp.c. As with ICMPv4, ICMPv6 cannot be built as a kernel module.

### ICMPv6 Initialization

ICMPv6 initialization is done by the icmpv6_init() method and by the icmpv6_sk_init() method. Registration of the ICMPv6 protocol is done by icmpv6_init() (net/ipv6/icmp.c):

static const struct inet6_protocol icmpv6_protocol = {
.handler = icmpv6_rcv,
.err_handler = icmpv6_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};

The handler callback is icmpv6_rcv(); this means that for incoming packets whose protocol field equals IPPROTO_ICMPV6 (58), icmpv6_rcv() will be invoked.

When the INET6_PROTO_NOPOLICY flag is set, this implies that IPsec policy checks should not be performed; for example, the xfrm6_policy_check() method is not called in ip6_input_finish() because the INET6_PROTO_NOPOLICY flag is set:

int __init icmpv6_init(void)
{
int err;
...
if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0)
goto fail;
return 0;
}

static int __net_init icmpv6_sk_init(struct net *net)
{
struct sock *sk;
...
for_each_possible_cpu(i) {
err = inet_ctl_sock_create(&sk, PF_INET6,
SOCK_RAW, IPPROTO_ICMPV6, net);
...
net->ipv6.icmp_sk[i] = sk;
...
}

As in ICMPv4, a raw ICMPv6 socket is created for each CPU and is kept in an array. The current sk can be accessed by the icmpv6_sk() method.

### ICMPv6 Header

The ICMPv6 header consists of a type field (8 bits), a code field (8 bits), and a checksum (16 bits), as you can see in Figure 3-3.

Figure 3-3. The ICMPv6 header
The ICMPv6 header is represented by struct icmp6hdr:

struct icmp6hdr {
__u8 icmp6_type;
__u8 icmp6_code;
__sum16 icmp6_cksum;
...
}

There is not enough room to show all the fields of struct icmp6hdr because it is too large (it is defined in include/uapi/linux/icmpv6.h). When the high-order bit of the type field is 0 (values in the range from 0 to 127), it indicates an error message; when the high-order bit is 1 (values in the range from 128 to 255), it indicates an information message. Table 3-1 shows the ICMPv6 message types by their number and kernel symbol.

Table 3-1. ICMPv6 Messages

Type | Kernel Symbol | Error/Info | Description
---|---|---|---
1 | ICMPV6_DEST_UNREACH | Error | Destination Unreachable
2 | ICMPV6_PKT_TOOBIG | Error | Packet Too Big
3 | ICMPV6_TIME_EXCEED | Error | Time Exceeded
4 | ICMPV6_PARAMPROB | Error | Parameter Problem
128 | ICMPV6_ECHO_REQUEST | Info | Echo Request
129 | ICMPV6_ECHO_REPLY | Info | Echo Reply
130 | ICMPV6_MGM_QUERY | Info | Multicast group membership management query
131 | ICMPV6_MGM_REPORT | Info | Multicast group membership management report
132 | ICMPV6_MGM_REDUCTION | Info | Multicast group membership management reduction
133 | NDISC_ROUTER_SOLICITATION | Info | Router solicitation
134 | NDISC_ROUTER_ADVERTISEMENT | Info | Router advertisement
135 | NDISC_NEIGHBOUR_SOLICITATION | Info | Neighbour solicitation
136 | NDISC_NEIGHBOUR_ADVERTISEMENT | Info | Neighbour advertisement
137 | NDISC_REDIRECT | Info | Neighbour redirect

The current complete list of assigned ICMPv6 types and codes can be found at www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xml .

ICMPv6 performs some tasks that are not performed by ICMPv4. For example, Neighbour Discovery is done by ICMPv6, whereas in IPv4 it is done by the ARP/RARP protocols. Multicast group memberships are handled by ICMPv6 in conjunction with the MLD (Multicast Listener Discovery) protocol, whereas in IPv4 this is performed by IGMP (Internet Group Management Protocol). Some ICMPv6 messages are similar in meaning to ICMPv4 messages; for example, ICMPv6 has these messages: "Destination Unreachable" (ICMPV6_DEST_UNREACH), "Time Exceeded" (ICMPV6_TIME_EXCEED), "Parameter Problem" (ICMPV6_PARAMPROB), "Echo Request" (ICMPV6_ECHO_REQUEST), and more. On the other hand, some ICMPv6 messages are unique to IPv6, such as the NDISC_NEIGHBOUR_SOLICITATION message.

### Receiving ICMPv6 Messages

When getting an ICMPv6 packet, it is delivered to the icmpv6_rcv() method, which gets only an SKB as a parameter. Figure 3-4 shows the Rx path of a received ICMPv6 message.

Figure 3-4. Receive path of an ICMPv6 message

In the icmpv6_rcv() method, after some sanity checks, the InMsgs SNMP counter (ICMP6_MIB_INMSGS) is incremented. Subsequently, the checksum correctness is verified. If the checksum is not correct, the InErrors SNMP counter (ICMP6_MIB_INERRORS) is incremented, and the SKB is freed. The icmpv6_rcv() method does not return an error in this case (in fact it always returns 0, much like its IPv4 counterpart, icmp_rcv()). Then the ICMPv6 header is read in order to find its type; the corresponding procfs message type counter is incremented by the ICMP6MSGIN_INC_STATS_BH macro (each ICMPv6 message type has a procfs counter).
For example, when receiving ICMPv6 ECHO requests ("pings"), the /proc/net/snmp6/Icmp6InEchos counter is incremented, and when receiving ICMPv6 Neighbour Solicitation requests, the /proc/net/snmp6/Icmp6InNeighborSolicits counter is incremented.

In ICMPv6, there is no dispatch table like the icmp_pointers table in ICMPv4. The handlers are invoked according to the ICMPv6 message type, in a long switch(type) command:

 * "Echo Request" (ICMPV6_ECHO_REQUEST) is handled by the icmpv6_echo_reply() method.

 * "Echo Reply" (ICMPV6_ECHO_REPLY) is handled by the ping_rcv() method. The ping_rcv() method is in the IPv4 ping module (net/ipv4/ping.c); this method is a dual-stack method (it handles both IPv4 and IPv6), as discussed at the beginning of this chapter.

 * "Packet Too Big" (ICMPV6_PKT_TOOBIG) is handled as follows:

 * First a check is done to verify that the data block area (pointed to by skb->data) contains a block of data whose size is at least as big as an ICMP header. This is done by the pskb_may_pull() method. If this condition is not met, the packet is dropped.

 * Then the icmpv6_notify() method is invoked. This method eventually calls the raw6_icmp_error() method so that the registered raw sockets will handle the ICMP messages.

 * "Destination Unreachable," "Time Exceeded," and "Parameter Problem" (ICMPV6_DEST_UNREACH, ICMPV6_TIME_EXCEED, and ICMPV6_PARAMPROB, respectively) are also handled by icmpv6_notify().

 * Neighbour Discovery (ND) messages:

 * NDISC_ROUTER_SOLICITATION: Messages which are usually sent to the all-routers multicast address of FF02::2, and which are answered by router advertisements. (Special IPv6 multicast addresses are discussed in Chapter 8.)

 * NDISC_ROUTER_ADVERTISEMENT: Messages which are sent periodically by routers or as an immediate response to router solicitation requests. Router advertisements contain prefixes that are used for on-link determination and/or address configuration, a suggested hop limit value, and so on.

 * NDISC_NEIGHBOUR_SOLICITATION: The counterpart of the ARP request in IPv4.

 * NDISC_NEIGHBOUR_ADVERTISEMENT: The counterpart of the ARP reply in IPv4.

 * NDISC_REDIRECT: Used by routers to inform hosts of a better first hop for a destination.

 * All the Neighbour Discovery (ND) messages are handled by the neighbour discovery method, ndisc_rcv() (net/ipv6/ndisc.c). The ndisc_rcv() method is discussed in Chapter 7.

 * ICMPV6_MGM_QUERY (Multicast Listener Query) is handled by igmp6_event_query().

 * ICMPV6_MGM_REPORT (Multicast Listener Report) is handled by igmp6_event_report(). Note: Both ICMPV6_MGM_QUERY and ICMPV6_MGM_REPORT are discussed in more detail in Chapter 8.

 * Messages of unknown type, and the following messages, are all handled by the icmpv6_notify() method:

 * ICMPV6_MGM_REDUCTION: When a host leaves a multicast group, it sends an ICMPV6_MGM_REDUCTION message; see the igmp6_leave_group() method in net/ipv6/mcast.c.

 * ICMPV6_MLD2_REPORT: MLDv2 Multicast Listener Report packet; usually sent with a destination address of the all-MLDv2-capable-routers multicast group address (FF02::16).

 * ICMPV6_NI_QUERY: ICMP Node Information Query.

 * ICMPV6_NI_REPLY: ICMP Node Information Response.

 * ICMPV6_DHAAD_REQUEST: ICMP Home Agent Address Discovery Request message; see section 6.5 of RFC 6275, "Mobility Support in IPv6."

 * ICMPV6_DHAAD_REPLY: ICMP Home Agent Address Discovery Reply message; see section 6.6 of RFC 6275.
 * ICMPV6_MOBILE_PREFIX_SOL: ICMP Mobile Prefix Solicitation message; see section 6.7 of RFC 6275.

 * ICMPV6_MOBILE_PREFIX_ADV: ICMP Mobile Prefix Advertisement message; see section 6.8 of RFC 6275.

Notice that the switch(type) command ends like this:

default:
LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n");

/* informational */
if (type & ICMPV6_INFOMSG_MASK)
break;

/*
* error of unknown type.
* must pass to upper level
*/
icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu);
}

Informational messages fulfill the condition (type & ICMPV6_INFOMSG_MASK), so they are discarded, whereas the other messages, which do not fulfill this condition (and therefore should be error messages), are passed to the upper layer. This is done in accordance with section 2.4 ("Message Processing Rules") of RFC 4443.

### Sending ICMPv6 Messages

The main method for sending ICMPv6 messages is the icmpv6_send() method. It is called when the local machine initiates sending an ICMPv6 message under the conditions described in this section. There is also the icmpv6_echo_reply() method, which is called only as a response to an ICMPV6_ECHO_REQUEST ("ping") message. The icmpv6_send() method is invoked from many places in the IPv6 network stack. This section looks at several examples.

#### Example: Sending "Hop Limit Time Exceeded" ICMPv6 Messages

When forwarding a packet, every machine decrements the Hop Limit counter by 1. The Hop Limit counter is a member of the IPv6 header; it is the IPv6 counterpart of Time To Live in IPv4. When the value of the Hop Limit header field reaches 0, an ICMPV6_TIME_EXCEED message with code ICMPV6_EXC_HOPLIMIT is sent by calling the icmpv6_send() method; then the statistics are updated and the packet is dropped:

int ip6_forward(struct sk_buff *skb)
{
...
if (hdr->hop_limit <= 1) {
/* Force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
IP6_INC_STATS_BH(net,
ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

kfree_skb(skb);
return -ETIMEDOUT;
}
...
}

(net/ipv6/ip6_output.c)

#### Example: Sending "Fragment Reassembly Time Exceeded" ICMPv6 Messages

When a fragment reassembly timeout occurs, an ICMPV6_TIME_EXCEED message with code ICMPV6_EXC_FRAGTIME is sent back by calling the icmpv6_send() method:

void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
struct inet_frags *frags)
{
...
icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
...
}

(net/ipv6/reassembly.c)

#### Example: Sending "Destination Unreachable"/"Port Unreachable" ICMPv6 Messages

When receiving UDPv6 packets, a matching UDPv6 socket is searched for. If no matching socket is found, the checksum correctness is verified. If it is wrong, the packet is dropped silently. If it is correct, the statistics (the UDP_MIB_NOPORTS MIB counter, which is exported to procfs by /proc/net/snmp6/Udp6NoPorts) are updated and a "Destination Unreachable"/"Port Unreachable" ICMPv6 message is sent back with icmpv6_send():

int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto)
{
...
sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
if (sk != NULL) {
...
}
...
if (udp_lib_checksum_complete(skb))
goto discard;

UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
...
}

(net/ipv6/udp.c)

This case is very similar to the UDPv4 example given earlier in this chapter.

#### Example: Sending "Fragmentation Needed" ICMPv6 Messages

When forwarding a packet, if its size is larger than the MTU of the outgoing link and the local_df bit in the SKB is not set, the packet is discarded and an ICMPV6_PKT_TOOBIG message is sent back to the sender. The information in this message is used as part of the Path MTU (PMTU) discovery process.

Note that, as opposed to the parallel case in IPv4, where an ICMP_DEST_UNREACH message with ICMP_FRAG_NEEDED code is sent, in this case an ICMPV6_PKT_TOOBIG message is sent back, and not a "Destination Unreachable" (ICMPV6_DEST_UNREACH) message. The ICMPV6_PKT_TOOBIG message has a message type number of its own in ICMPv6:

int ip6_forward(struct sk_buff *skb)
{
...
if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
(IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
/* Again, force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
IP6_INC_STATS_BH(net,
ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
IP6_INC_STATS_BH(net,
ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
return -EMSGSIZE;
}
...
}

(net/ipv6/ip6_output.c)

#### Example: Sending "Parameter Problem" ICMPv6 Messages

When encountering a problem in parsing extension headers, an ICMPV6_PARAMPROB message with ICMPV6_UNK_OPTION code is sent back:

static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff)
{
switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) {
...
case 2: /* send ICMP PARM PROB regardless and drop packet */
icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff);
return false;
}

(net/ipv6/exthdrs.c)

The icmpv6_send() method supports rate limiting by calling icmpv6_xrlim_allow(). I should mention here that, as in ICMPv4, rate limiting is not performed automatically in ICMPv6 on all types of traffic. Here are the conditions under which the rate limiting check will not be performed:

 * The message is an informational message.

 * The packet is a PMTU discovery packet.

 * The device is a loopback device.

If none of these conditions is met, rate limiting is performed by calling the inet_peer_xrlim_allow() method, which is shared between ICMPv4 and ICMPv6. Note that unlike in IPv4, you can't set a rate mask in IPv6. It is not forbidden by the ICMPv6 spec, RFC 4443, but it was never implemented.

Let's look inside the icmp6_send() method. First, this is its prototype:

static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)

The parameters are similar to those of the icmp_send() method of IPv4, so I won't repeat the explanation here. When looking further into the icmp6_send() code, you find some sanity checks. Checking whether the provoking message is an ICMPv6 error message is done by calling the is_ineligible() method; if it is, the icmp6_send() method terminates. The length of the message should not exceed 1280 bytes, which is the IPv6 minimum MTU (IPV6_MIN_MTU, defined in include/linux/ipv6.h).
This is done in accordance with RFC 4443, section 2.4 (c), which says that every ICMPv6 error message must include as much of the offending (invoking) IPv6 packet (the packet that caused the error) as possible without making the error message packet exceed the minimum IPv6 MTU. Then the message is passed to the IPv6 layer by the ip6_append_data() method and by the icmpv6_push_pending_frames() method.

Now I'll turn to the icmpv6_echo_reply() method; as a reminder, this method is called as a response to an ICMPV6_ECHO_REQUEST message. The icmpv6_echo_reply() method gets only one parameter, the SKB. It builds an icmpv6_msg object and sets its type to ICMPV6_ECHO_REPLY. Then it passes the message to the IPv6 layer, by the ip6_append_data() method and by the icmpv6_push_pending_frames() method. If the ip6_append_data() method fails, an SNMP counter (ICMP6_MIB_OUTERRORS) is incremented, and ip6_flush_pending_frames() is invoked to free the SKB.

Chapters 7 and 8 also discuss ICMPv6. The next section introduces ICMP sockets and the purpose they serve.

## ICMP Sockets ("Ping Sockets")

A new type of socket (IPPROTO_ICMP) was added by a patch from the Openwall GNU/*/Linux distribution (Owl), which provides security enhancements over other distributions. The ICMP sockets enable a setuid-less "ping". For Openwall GNU/*/Linux, it was the last step on the road to a setuid-less distribution. With this patch, a new ICMPv4 ping socket (which is not a raw socket) is created with:

socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP);

instead of with:

socket(PF_INET, SOCK_RAW, IPPROTO_ICMP);

There is also support for IPPROTO_ICMPV6 sockets, which was added later, in net/ipv6/icmp.c. A new ICMPv6 ping socket is created with:

socket(PF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6);

instead of with:

socket(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6);

Similar functionality (non-privileged ICMP) is implemented in Mac OS X; see www.manpagez.com/man/4/icmp/ .

Most of the code for ICMP sockets is in net/ipv4/ping.c; in fact, large parts of the code in net/ipv4/ping.c are dual-stack (IPv4 and IPv6). In net/ipv6/ping.c there are only a few IPv6-specific bits. Using ICMP sockets is disabled by default. You can enable ICMP sockets by setting the following procfs entry: /proc/sys/net/ipv4/ping_group_range. It is "1 0" by default, meaning that nobody (not even root) may create ping sockets. So, if you want to allow a user with uid and gid of 1000 to use ICMP sockets, you should run this from the command line (with root privileges): echo 1000 1000 > /proc/sys/net/ipv4/ping_group_range, and then you can ping from this user account using ICMP sockets. If you want to grant this privilege to all the groups in the system, you should run from the command line echo 0 2147483647 > /proc/sys/net/ipv4/ping_group_range. (2147483647 is the value of GID_T_MAX; see include/net/ping.h.) There are no separate security settings for IPv4 and IPv6; everything is controlled by /proc/sys/net/ipv4/ping_group_range. The ICMP sockets support only ICMP_ECHO for IPv4 or ICMPV6_ECHO_REQUEST for IPv6, and the code of the ICMP message must be 0 in both cases.
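As an illustration (a minimal userspace sketch, not code from the kernel sources), once ping_group_range permits it, an unprivileged process can send a ping like this; for ping sockets the kernel takes care of the ICMP identifier and the checksum, so the application supplies only an ICMP_ECHO header with code 0:

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_icmp.h>
#include <arpa/inet.h>

int main(void)
{
    /* A ping socket: SOCK_DGRAM rather than SOCK_RAW, so no CAP_NET_RAW
     * is needed, provided ping_group_range covers this process's group. */
    int fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP);
    struct icmphdr icmph = {0};
    struct sockaddr_in dst = { .sin_family = AF_INET };
    char buf[512];
    ssize_t n;

    if (fd < 0) {
        perror("socket"); /* EACCES here usually means ping_group_range forbids it */
        return 1;
    }
    icmph.type = ICMP_ECHO; /* only ICMP_ECHO with code 0 is accepted */
    icmph.un.echo.sequence = htons(1); /* the kernel overwrites the id and fills the checksum */

    inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr); /* placeholder destination */
    if (sendto(fd, &icmph, sizeof(icmph), 0,
               (struct sockaddr *)&dst, sizeof(dst)) < 0)
        perror("sendto");

    n = recv(fd, buf, sizeof(buf), 0); /* the echo reply, starting at its ICMP header */
    if (n >= 0)
        printf("received %zd bytes\n", n);
    close(fd);
    return 0;
}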
The ping_supported() helper method checks whether the parameters for building the ICMP message (both for IPv4 and IPv6) are valid. It is invoked from ping_sendmsg():

static inline int ping_supported(int family, int type, int code)
{
return (family == AF_INET && type == ICMP_ECHO && code == 0) ||
(family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0);
}

(net/ipv4/ping.c)

ICMP sockets export the following entries to procfs: /proc/net/icmp for IPv4 and /proc/net/icmp6 for IPv6.

For more info about ICMP sockets see http://openwall.info/wiki/people/segoon/ping and http://lwn.net/Articles/420799/ .

## Summary

This chapter covered the implementation of ICMPv4 and ICMPv6. You learned about the ICMP header format of both protocols and about receiving and sending messages with both protocols. The new features of ICMPv6, which you will encounter in upcoming chapters, were also discussed. The Neighbouring Discovery protocol, which uses ICMPv6 messages, is discussed in Chapter 7, and the MLD protocol, which also uses ICMPv6 messages, is covered in Chapter 8. The next chapter, Chapter 4, talks about the implementation of the IPv4 network layer.

In the "Quick Reference" section that follows, I cover the top methods related to the topics discussed in this chapter, ordered by their context. Then the tables mentioned in the chapter, some important relevant procfs entries, and a short section about the usage of ICMP messages in iptables reject rules are all covered.

## Quick Reference

I conclude this chapter with a short list of important methods of ICMPv4 and ICMPv6, six tables, a section about procfs entries, and a short section about using the reject target in iptables and ip6tables to create ICMP "Destination Unreachable" messages.

### Methods

The following methods were covered in this chapter.

#### int icmp_rcv(struct sk_buff *skb);

This method is the main handler for processing incoming ICMPv4 packets.

#### extern void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info);

This method sends an ICMPv4 message. The parameters are the provoking SKB, the ICMPv4 message type, the ICMPv4 message code, and info (which depends on the type).

#### struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb);

This method returns the ICMPv6 header that the specified skb contains.

#### void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info);

This method sends an ICMPv6 message. The parameters are the provoking SKB, the ICMPv6 message type, the ICMPv6 message code, and info (which depends on the type).

#### void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos);

This method is a convenience wrapper around the icmp6_send() method: all it does is call icmp6_send() with ICMPV6_PARAMPROB as the type and with the other specified parameters (skb, code, and pos), and free the SKB afterwards.

### Tables

The following tables were covered in this chapter.

Table 3-2. ICMPv4 "Destination Unreachable" (ICMP_DEST_UNREACH) Codes

Code | Kernel Symbol | Description
---|---|---
0 | ICMP_NET_UNREACH | Network Unreachable
1 | ICMP_HOST_UNREACH | Host Unreachable
2 | ICMP_PROT_UNREACH | Protocol Unreachable
3 | ICMP_PORT_UNREACH | Port Unreachable
4 | ICMP_FRAG_NEEDED | Fragmentation needed, but the DF flag is set
5 | ICMP_SR_FAILED | Source route failed
6 | ICMP_NET_UNKNOWN | Destination network unknown
7 | ICMP_HOST_UNKNOWN | Destination host unknown
8 | ICMP_HOST_ISOLATED | Source host isolated
9 | ICMP_NET_ANO | The destination network is administratively prohibited
10 | ICMP_HOST_ANO | The destination host is administratively prohibited
11 | ICMP_NET_UNR_TOS | The network is unreachable for this Type of Service
12 | ICMP_HOST_UNR_TOS | The host is unreachable for this Type of Service
13 | ICMP_PKT_FILTERED | Packet filtered
14 | ICMP_PREC_VIOLATION | Precedence violation
15 | ICMP_PREC_CUTOFF | Precedence cut off
16 | NR_ICMP_UNREACH | Number of unreachable codes

Table 3-3. ICMPv4 Redirect (ICMP_REDIRECT) Codes

Code | Kernel Symbol | Description
---|---|---
0 | ICMP_REDIR_NET | Redirect Net
1 | ICMP_REDIR_HOST | Redirect Host
2 | ICMP_REDIR_NETTOS | Redirect Net for TOS
3 | ICMP_REDIR_HOSTTOS | Redirect Host for TOS

Table 3-4. ICMPv4 Time Exceeded (ICMP_TIME_EXCEEDED) Codes

Code | Kernel Symbol | Description
---|---|---
0 | ICMP_EXC_TTL | TTL count exceeded
1 | ICMP_EXC_FRAGTIME | Fragment reassembly time exceeded

Table 3-5. ICMPv6 "Destination Unreachable" (ICMPV6_DEST_UNREACH) Codes

Code | Kernel Symbol | Description
---|---|---
0 | ICMPV6_NOROUTE | No route to destination
1 | ICMPV6_ADM_PROHIBITED | Communication with destination administratively prohibited
2 | ICMPV6_NOT_NEIGHBOUR | Beyond scope of source address
3 | ICMPV6_ADDR_UNREACH | Address unreachable
4 | ICMPV6_PORT_UNREACH | Port unreachable

Note that ICMPV6_PKT_TOOBIG, which is the counterpart of the IPv4 ICMP_DEST_UNREACH/ICMP_FRAG_NEEDED, is not a code of ICMPV6_DEST_UNREACH, but an ICMPv6 type in itself.

Table 3-6. ICMPv6 Time Exceeded (ICMPV6_TIME_EXCEED) Codes

Code | Kernel Symbol | Description
---|---|---
0 | ICMPV6_EXC_HOPLIMIT | Hop limit exceeded in transit
1 | ICMPV6_EXC_FRAGTIME | Fragment reassembly time exceeded

Table 3-7. ICMPv6 Parameter Problem (ICMPV6_PARAMPROB) Codes

Code | Kernel Symbol | Description
---|---|---
0 | ICMPV6_HDR_FIELD | Erroneous header field encountered
1 | ICMPV6_UNK_NEXTHDR | Unknown Next Header type encountered
2 | ICMPV6_UNK_OPTION | Unknown IPv6 option encountered

### procfs entries

The kernel provides a way of configuring various settings for various subsystems from userspace by writing values to entries under /proc. These entries are referred to as procfs entries. All of the ICMPv4 procfs entries are represented by variables in the netns_ipv4 structure (include/net/netns/ipv4.h), which is an object in the network namespace (struct net). Network namespaces and their implementation are discussed in Chapter 14. The following are the names of the sysctl variables that correspond to the ICMPv4 netns_ipv4 elements, explanations of their usage, and the default values to which they are initialized, specifying also in which method the initialization takes place.

#### sysctl_icmp_echo_ignore_all

When icmp_echo_ignore_all is set, echo requests (ICMP_ECHO) will not be replied to.

procfs entry: /proc/sys/net/ipv4/icmp_echo_ignore_all

Initialized to 0 in icmp_sk_init()

#### sysctl_icmp_echo_ignore_broadcasts

When receiving a broadcast or multicast echo (ICMP_ECHO) message or timestamp (ICMP_TIMESTAMP) message, the kernel checks whether broadcast/multicast requests are permitted by reading sysctl_icmp_echo_ignore_broadcasts. If this variable is set, the packet is dropped and 0 is returned.

procfs entry: /proc/sys/net/ipv4/icmp_echo_ignore_broadcasts

Initialized to 1 in icmp_sk_init()

#### sysctl_icmp_ignore_bogus_error_responses

Some routers violate RFC 1122 by sending bogus responses to broadcast frames.
In the icmp_unreach() method, you check this flag. If this flag is set, the kernel will not log these warnings ("sent an invalid ICMP type...").

procfs entry: /proc/sys/net/ipv4/icmp_ignore_bogus_error_responses

Initialized to 1 in icmp_sk_init()

#### sysctl_icmp_ratelimit

Limits the maximum rate for sending ICMP packets whose type matches the ICMP rate mask (icmp_ratemask, see next) to specific targets.

A value of 0 disables rate limiting; otherwise it is the minimal space between responses, in milliseconds.

procfs entry: /proc/sys/net/ipv4/icmp_ratelimit

Initialized to 1 * HZ in icmp_sk_init()

#### sysctl_icmp_ratemask

A mask of the ICMP types for which the rate is limited. Each bit is an ICMPv4 type; the default value of 0x1818 covers ICMP_DEST_UNREACH (3), ICMP_SOURCE_QUENCH (4), ICMP_TIME_EXCEEDED (11), and ICMP_PARAMETERPROB (12).

procfs entry: /proc/sys/net/ipv4/icmp_ratemask

Initialized to 0x1818 in icmp_sk_init()

#### sysctl_icmp_errors_use_inbound_ifaddr

The value of this variable is checked in icmp_send(). When it is not set, ICMP error messages are sent with the primary address of the interface on which the packet will be sent. When it is set, the ICMP message is sent with the primary address of the interface that received the packet that caused the ICMP error.

procfs entry: /proc/sys/net/ipv4/icmp_errors_use_inbound_ifaddr

Initialized to 0 in icmp_sk_init()

Note

For more about the ICMP sysctl variables, their types, and their default values, see Documentation/networking/ip-sysctl.txt.

### Creating "Destination Unreachable" Messages with iptables

The iptables userspace tool enables us to set rules that dictate what the kernel should do with traffic matching the filters these rules define. Handling iptables rules is done in the netfilter subsystem, and is discussed in Chapter 9. One of the iptables targets is the REJECT target, which discards the packet without further processing it and sends an error message back to its originator. When setting an iptables REJECT target, the user can set a rule to send a "Destination Unreachable" ICMPv4 message with various codes, using the -j REJECT and --reject-with qualifiers. For example, the following iptables rule will discard any packet from any source while sending back an "ICMP Host Prohibited" message:

iptables -A INPUT -j REJECT --reject-with icmp-host-prohibited

These are the possible values for the --reject-with qualifier for setting the ICMPv4 message that will be sent in reply to the sending host:

icmp-net-unreachable - ICMP_NET_UNREACH

icmp-host-unreachable - ICMP_HOST_UNREACH

icmp-port-unreachable - ICMP_PORT_UNREACH

icmp-proto-unreachable - ICMP_PROT_UNREACH

icmp-net-prohibited - ICMP_NET_ANO

icmp-host-prohibited - ICMP_HOST_ANO

icmp-admin-prohibited - ICMP_PKT_FILTERED

You can also use --reject-with tcp-reset, which will send a TCP RST packet in reply to the sending host.

(net/ipv4/netfilter/ipt_REJECT.c)

With ip6tables in IPv6, there is also a REJECT target. For example:

ip6tables -A INPUT -s 2001::/64 -p ICMPv6 -j REJECT --reject-with icmp6-adm-prohibited

These are the possible values for the --reject-with qualifier for setting the ICMPv6 message that will be sent in reply to the sending host:

no-route, icmp6-no-route - ICMPV6_NOROUTE

adm-prohibited, icmp6-adm-prohibited - ICMPV6_ADM_PROHIBITED

port-unreach, icmp6-port-unreachable - ICMPV6_PORT_UNREACH

addr-unreach, icmp6-addr-unreachable - ICMPV6_ADDR_UNREACH

(net/ipv6/netfilter/ip6t_REJECT.c)
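Because these variables are exposed as plain procfs files, they can be inspected from userspace with ordinary file I/O. The following minimal sketch (not from the kernel sources; the print_sysctl helper is a name invented for this example) prints the current values of some of the ICMPv4 entries listed above:

#include <stdio.h>

/* Read a single ICMP sysctl value from procfs and print it.
 * Returns 0 on success, -1 if the entry could not be opened. */
static int print_sysctl(const char *path)
{
        FILE *f = fopen(path, "r");
        long val;

        if (!f)
                return -1;
        if (fscanf(f, "%ld", &val) == 1)
                printf("%s = %ld\n", path, val);
        fclose(f);
        return 0;
}

int main(void)
{
        print_sysctl("/proc/sys/net/ipv4/icmp_echo_ignore_all");
        print_sysctl("/proc/sys/net/ipv4/icmp_echo_ignore_broadcasts");
        print_sysctl("/proc/sys/net/ipv4/icmp_ratelimit");
        print_sysctl("/proc/sys/net/ipv4/icmp_ratemask");
        return 0;
}

Writing to the same paths (as root) changes the corresponding netns_ipv4 variables at runtime.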
# 4. IPv4

Abstract

Chapter 3 deals with the implementation of the ICMP protocol in IPv4 and in IPv6. This chapter, which deals with the IPv4 protocol, shows how ICMP messages are used for reporting Internet protocol errors under certain circumstances. The IPv4 protocol (Internet Protocol version 4) is one of the core protocols of today's standards-based Internet and routes most of the traffic on the Internet. The base definition is in RFC 791, "Internet Protocol," from 1981. The IPv4 protocol provides an end-to-end connectivity between any two hosts. Another important function of the IP layer is forwarding packets (also called routing) and managing tables that store routing information. Chapters 5 and 6 discuss IPv4 routing. This chapter describes the IPv4 Linux implementation: receiving and sending IPv4 packets, including multicast packets, IPv4 forwarding, and handling IPv4 options. There are cases when the packet to be sent is bigger than the MTU of the outgoing interface; in such cases the packet should be fragmented into smaller fragments. When fragmented packets are received, they should be assembled into one big packet, which should be identical to the packet that was sent before it was fragmented. These are also important tasks of the IPv4 protocol discussed in this chapter.

Every IPv4 packet starts with an IP header, which is at least 20 bytes long. If IP options are used, the IPv4 header can be up to 60 bytes. After the IP header, there is the transport header (TCP header or UDP header, for example), and after it is the payload data. To understand the IPv4 protocol, you must first learn how the IPv4 header is built. In Figure 4-1 you can see the IPv4 header, which consists of two parts: the first part of 20 bytes (until the beginning of the options field in the IPv4 header) is the basic IPv4 header, and after it there is the IP options part, which can be from 0 to 40 bytes in length.
## IPv4 Header

The IPv4 header consists of information that defines how a packet should be handled by the kernel network stack: the protocol being used, the source and destination addresses, the checksum, the identification (id) of the packet that is needed for fragmentation, the ttl that helps avoid packets being forwarded endlessly because of some error, and more. This information is stored in 13 members of the IPv4 header (the 14th member, IP Options, which is an extension to the IPv4 header, is optional). The various members of the IPv4 header and the various IP options are described next. The IPv4 header is represented by the iphdr structure. Its members, which appear in Figure 4-1, are described in the next section. The IP options and their use are described in the "IP Options" section later in this chapter.

Figure 4-1. IPv4 header

Figure 4-1 shows the IPv4 header. All members always exist—except for the last one, the IP options, which is optional. The content of the IPv4 header members determines how the packet will be handled in the IPv4 network stack: the packet is discarded when there is some problem (for example, if the version, which is the first member, is not 4, or if the checksum is incorrect). Each IPv4 packet starts with an IPv4 header, and after it there is the payload:

struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
        __u8    ihl:4,
                version:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
        __u8    version:4,
                ihl:4;
#else
#error  "Please fix <asm/byteorder.h>"
#endif
        __u8    tos;
        __be16  tot_len;
        __be16  id;
        __be16  frag_off;
        __u8    ttl;
        __u8    protocol;
        __sum16 check;
        __be32  saddr;
        __be32  daddr;
        /*The options start here. */
};

(include/uapi/linux/ip.h)

The following is a description of the IPv4 header members:

  * ihl: This stands for Internet Header Length: the length of the IPv4 header, measured in multiples of 4 bytes. The length of the IPv4 header is not fixed, as opposed to the header of IPv6, where the length is fixed (40 bytes). The reason is that the IPv4 header can include optional, varying-length options. The minimum size of the IPv4 header is 20 bytes, when there are no options, and the maximum size is 60 bytes. The corresponding ihl values are 5 for the minimum IPv4 header size and 15 for the maximum size. The IPv4 header must be aligned to a 4-byte boundary.

  * version: Should be 4.

  * tos: The tos field of the IPv4 header was originally intended for Quality of Service (QoS) services; tos stands for Type of Service. Over the years this field took on a different meaning, as follows: RFC 2474 defines the Differentiated Services Field (DS Field) in the IPv4 and IPv6 headers, which is bits 0–5 of the tos. It is also named Differentiated Services Code Point (DSCP). RFC 3168 from 2001 defines the Explicit Congestion Notification (ECN) of the IP header; it is bits 6 and 7 of the tos field.

  * tot_len: The total length, including the header, measured in bytes. Because tot_len is a 16-bit field, it can be up to 65,535 bytes (64KB). According to RFC 791, every host must be able to accept datagrams of up to 576 bytes.

  * id: Identification of the IPv4 header. The id field is important for fragmentation: when fragmenting an SKB, the id value of all the fragments of that SKB should be the same. Reassembling fragmented packets is done according to the id of the fragments.

  * frag_off: The fragment offset, a 16-bit field. The lower 13 bits are the offset of the fragment. In the first fragment, the offset is 0. The offset is measured in units of 8 bytes. The higher 3 bits are the flags:

    * 001 is MF (More Fragments). It is set for all fragments, except the last one.

    * 010 is DF (Don't Fragment).

    * 100 is CE (Congestion).

    See the IP_MF, IP_DF, and IP_CE flags declaration in include/net/ip.h.

  * ttl: Time To Live: this is a hop counter. Each forwarding node decreases the ttl by 1. When it reaches 0, the packet is discarded, and an ICMPv4 Time Exceeded message is sent back; this prevents packets from being forwarded endlessly, for this reason or another.

  * protocol: The L4 protocol of the packet—for example, IPPROTO_TCP for TCP traffic or IPPROTO_UDP for UDP traffic (for a list of all available protocols see include/linux/in.h).

  * check: The checksum (16-bit field). The checksum is calculated only over the IPv4 header bytes.

  * saddr: Source IPv4 address, 32 bits.

  * daddr: Destination IPv4 address, 32 bits.
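To make the layout above concrete, here is a short userspace-style sketch (not kernel code, and the function name is hypothetical) that performs the same basic validation of the fixed part of the header that ip_rcv() performs: checking the version, the ihl, and the buffer length:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Validate the fixed part of an IPv4 header found at buf.
 * ihl is expressed in 4-byte words, so the header occupies ihl * 4 bytes. */
static bool ipv4_header_looks_valid(const uint8_t *buf, size_t len)
{
        uint8_t version, ihl;

        if (len < 20)                   /* must hold at least the fixed header */
                return false;
        version = buf[0] >> 4;          /* high nibble of the first byte */
        ihl = buf[0] & 0x0f;            /* low nibble, in 4-byte words */
        if (version != 4 || ihl < 5)    /* the same test ip_rcv() performs */
                return false;
        return len >= (size_t)ihl * 4;  /* options, if any, must fit too */
}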
In this section you have learned about the various IPv4 header members and their purposes. The initialization of the IPv4 protocol, which sets the callback to be invoked when an IPv4 packet is received, is discussed in the next section.

## IPv4 Initialization

IPv4 packets are packets with Ethernet type 0x0800 (the Ethernet type is stored in the last two bytes of the 14-byte Ethernet header). Each protocol should define a protocol handler, and each protocol should be initialized so that the network stack can handle packets that belong to it. So that you understand what causes received IPv4 packets to be handled by IPv4 methods, this section describes the registration of the IPv4 protocol handler:

static struct packet_type ip_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),
        .func = ip_rcv,
};

static int __init inet_init(void)
{
        ...
        dev_add_pack(&ip_packet_type);
        ...
}

(net/ipv4/af_inet.c)

The dev_add_pack() method adds the ip_rcv() method as a protocol handler for IPv4 packets. These are packets with Ethernet type 0x0800 (ETH_P_IP, defined in include/uapi/linux/if_ether.h). The inet_init() method performs various IPv4 initializations and is called during the boot phase.
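The same registration pattern can be used for any Ethernet type. Purely as an illustration (my_proto_rcv, my_packet_type, and MY_ETH_TYPE are hypothetical names, and 0x88b5 is an EtherType reserved for local experiments), a minimal kernel module that registers and unregisters its own protocol handler might look like this:

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

#define MY_ETH_TYPE 0x88b5      /* IEEE 802 local experimental EtherType */

static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
                        struct packet_type *pt, struct net_device *orig_dev)
{
        pr_info("my_proto: received %u bytes on %s\n", skb->len, dev->name);
        kfree_skb(skb);         /* consume the packet */
        return NET_RX_SUCCESS;
}

static struct packet_type my_packet_type __read_mostly = {
        .type = cpu_to_be16(MY_ETH_TYPE),
        .func = my_proto_rcv,
};

static int __init my_proto_init(void)
{
        dev_add_pack(&my_packet_type);
        return 0;
}

static void __exit my_proto_exit(void)
{
        dev_remove_pack(&my_packet_type);
}

module_init(my_proto_init);
module_exit(my_proto_exit);
MODULE_LICENSE("GPL");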
The main functionality of the IPv4 protocol is divided into the Rx (receive) path and the Tx (transmit) path. Now that you have learned about the registration of the IPv4 protocol handler, you know which protocol handler manages IPv4 packets (the ip_rcv callback) and how this protocol handler is registered. You are now ready to learn about the IPv4 Rx path and how received IPv4 packets are handled. The Tx path is described in a later section, "Sending IPv4 Packets."

## Receiving IPv4 Packets

The main IPv4 receive method is the ip_rcv() method, which is the handler for all IPv4 packets (including multicasts and broadcasts). In fact, this method consists mostly of sanity checks. The real work is done in the ip_rcv_finish() method it invokes. Between the ip_rcv() method and the ip_rcv_finish() method is the NF_INET_PRE_ROUTING netfilter hook, invoked by calling the NF_HOOK macro (see the code snippet later in this section). In this chapter, you will encounter many invocations of the NF_HOOK macro—these are the netfilter hooks. The netfilter subsystem allows you to register callbacks at five points along the journey of a packet in the network stack. These points will be mentioned by their names shortly. The reason for adding the netfilter hooks is to enable loading the netfilter kernel modules at runtime. The NF_HOOK macro invokes the callbacks of a specified point, if such callbacks were registered. You might also encounter a variation of the NF_HOOK macro called NF_HOOK_COND. In some places in the network stack, the NF_HOOK_COND macro includes a Boolean parameter (the last parameter), which must be true for the hook to be executed (Chapter 9 discusses netfilter hooks). Note that the netfilter hooks can discard the packet, and in such a case it will not continue on its ordinary path. Figure 4-2 shows the receiving path (Rx) of a packet received by the network driver. This packet can either be delivered to the local machine or be forwarded to another host. It is the lookup in the routing table that determines which of these two options will take place.

Figure 4-2. Receiving IPv4 packets. For simplicity, the diagram does not include the fragmentation/defragmentation/options/IPsec methods

Figure 4-2 shows the paths for a received IPv4 packet. The packet is received by the IPv4 protocol handler, the ip_rcv() method (see the upper left side of the figure). First of all, a lookup in the routing subsystem is performed, immediately after the ip_rcv_finish() method is invoked. The result of the routing lookup determines whether the packet is for local delivery to the local host or is to be forwarded (routing lookup is explained in Chapter 5). If the packet is destined for the local host, it will first reach the ip_local_deliver() method, and subsequently it will reach the ip_local_deliver_finish() method. When the packet is to be forwarded, it will be handled by the ip_forward() method. Some netfilter hooks appear in the figure, like NF_INET_PRE_ROUTING and NF_INET_LOCAL_IN. Note that multicast traffic is handled by the ip_mr_input() method, discussed in the "Receiving IPv4 Multicast Packets" section later in this chapter. NF_INET_PRE_ROUTING, NF_INET_LOCAL_IN, NF_INET_FORWARD, and NF_INET_POST_ROUTING are four of the five entry points of the netfilter hooks. The fifth one, NF_INET_LOCAL_OUT, is mentioned in the "Sending IPv4 Packets" section later in this chapter. These five entry points are defined in include/uapi/linux/netfilter.h. Note that the same enum for these five hooks is also used in IPv6; for example, in the ipv6_rcv() method, a hook is registered on NF_INET_PRE_ROUTING (net/ipv6/ip6_input.c). Let's take a look at the ip_rcv() method:

int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{

First some sanity checks are performed, and I mention some of them in this section. The length of the IPv4 header (ihl) is measured in multiples of 4 bytes. The IPv4 header must be at least 20 bytes in size, which means that the ihl value must be at least 5. The version should be 4 (for IPv4). If one of these conditions is not met, the packet is dropped and the statistics (IPSTATS_MIB_INHDRERRORS) are updated.

if (iph->ihl < 5 || iph->version != 4)
        goto inhdr_error;

According to section 3.2.1.2 of RFC 1122, a host must verify the IPv4 header checksum on every received datagram and silently discard every datagram that has a bad checksum. This is done by calling the ip_fast_csum() method, which should return 0 on success. The IPv4 header checksum is calculated only over the IPv4 header bytes:

if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
        goto inhdr_error;
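The ip_fast_csum() method is a hand-optimized (often assembly) implementation of the standard Internet checksum (RFC 1071): a 16-bit one's-complement sum over the header, which yields 0 when the header, including its check field, is intact. A plain, unoptimized C sketch of the same computation (the function name is hypothetical):

#include <stddef.h>
#include <stdint.h>

/* RFC 1071 Internet checksum over an IPv4 header.
 * ihl is the header length in 4-byte words; the result is 0
 * for a header whose check field is valid. */
static uint16_t ipv4_checksum(const uint8_t *hdr, unsigned int ihl)
{
        uint32_t sum = 0;
        size_t i, len = ihl * 4;

        for (i = 0; i + 1 < len; i += 2)
                sum += (uint32_t)(hdr[i] << 8 | hdr[i + 1]);
        while (sum >> 16)               /* fold the carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}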
Then the NF_HOOK macro is invoked:

return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
               ip_rcv_finish);

When the registered netfilter hook method returns NF_DROP, it means that the packet should be dropped, and the packet traversal does not continue. When the registered netfilter hook returns NF_STOLEN, it means that the packet was taken over by the netfilter subsystem, and the packet traversal does not continue. When the registered netfilter hook returns NF_ACCEPT, the packet continues its traversal. There are other return values (also termed verdicts) from netfilter hooks, like NF_QUEUE, NF_REPEAT, and NF_STOP, which are not discussed in this chapter. (As mentioned earlier, netfilter hooks are discussed in Chapter 9.) Let's assume for a moment that there are no netfilter callbacks registered at the NF_INET_PRE_ROUTING entry point, so the NF_HOOK macro will not invoke any netfilter callbacks and the ip_rcv_finish() method will be invoked. Let's take a look at the ip_rcv_finish() method:

static int ip_rcv_finish(struct sk_buff *skb)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct rtable *rt;

The skb_dst() method checks whether there is a dst object attached to the SKB; dst is an instance of dst_entry (include/net/dst.h) and represents the result of a lookup in the routing subsystem. The lookup is done according to the routing tables and the packet headers. The lookup in the routing subsystem also sets the input and/or the output callbacks of the dst. For example, if the packet is to be forwarded, the lookup in the routing subsystem will set the input callback to be ip_forward(). When the packet is destined to the local machine, the lookup in the routing subsystem will set the input callback to be ip_local_deliver(). For a multicast packet it can be ip_mr_input() under some conditions (I discuss multicast packets in the next section). The contents of the dst object determine how the packet will proceed in its journey; for example, when forwarding a packet, the decision about which input callback should be called when invoking dst_input(), or on which interface it should be transmitted, is made according to the dst. (I discuss the routing subsystem in depth in the next chapter.)

If there is no dst attached to the SKB, a lookup in the routing subsystem is performed by the ip_route_input_noref() method. If the lookup fails, the packet is dropped. Note that handling multicast packets is different from handling unicast packets (discussed in the section "Receiving IPv4 Multicast Packets" later in this chapter).

...

if (!skb_dst(skb)) {

Perform a lookup in the routing subsystem:

        int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                       iph->tos, skb->dev);
        if (unlikely(err)) {
                if (err == -EXDEV)
                        NET_INC_STATS_BH(dev_net(skb->dev),
                                         LINUX_MIB_IPRPFILTER);
                goto drop;
        }
}

Note

The -EXDEV ("Cross-device link") error is returned by the __fib_validate_source() method under certain circumstances when the Reverse Path Filter (RPF) is set. The RPF can be set via an entry in the procfs (/proc/sys/net/ipv4/conf/all/rp_filter and /proc/sys/net/ipv4/conf/<deviceName>/rp_filter). In such cases the packet is dropped, the statistics (LINUX_MIB_IPRPFILTER) are updated, and the method returns NET_RX_DROP. Note that you can display the LINUX_MIB_IPRPFILTER counter by looking at the IPReversePathFilter column in the output of cat /proc/net/netstat.
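For reference, the dst_input() method itself is a one-line dispatcher; in the kernel sources of this era (include/net/dst.h) it simply jumps through the input callback that the routing lookup installed:

static inline int dst_input(struct sk_buff *skb)
{
        /* Invokes ip_local_deliver(), ip_forward(), ip_mr_input(), and so on,
         * depending on what the routing lookup stored in the dst. */
        return skb_dst(skb)->input(skb);
}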
Now a check is performed to see whether the IPv4 header includes options. Because the length of the IPv4 header (ihl) is measured in multiples of 4 bytes, if it is greater than 5, this means that the header includes options, so the ip_rcv_options() method should be invoked to handle them. Handling IP options is discussed in depth in the "IP Options" section later in this chapter. Note that the ip_rcv_options() method can fail, as you will shortly see. If it is a multicast entry or a broadcast entry, the IPSTATS_MIB_INMCAST statistics or the IPSTATS_MIB_INBCAST statistics is updated, respectively. Then the dst_input() method is invoked; as shown earlier, it simply invokes the input callback method by calling skb_dst(skb)->input(skb):

if (iph->ihl > 5 && ip_rcv_options(skb))
        goto drop;

rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
        IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
                           skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
        IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
                           skb->len);

return dst_input(skb);

In this section you learned about the various stages in the reception of IPv4 packets: the sanity checks that are performed, the lookup in the routing subsystem, and the ip_rcv_finish() method, which performs the actual work. You also learned which method is called when the packet should be forwarded and which method is called when the packet is for local delivery. IPv4 multicasting is a special case. Handling the reception of IPv4 multicast packets is discussed in the next section.

## Receiving IPv4 Multicast Packets

The ip_rcv() method is also a handler for multicast packets. As mentioned earlier, after some sanity checks, it invokes the ip_rcv_finish() method, which performs a lookup in the routing subsystem by calling ip_route_input_noref(). In the ip_route_input_noref() method, first a check is performed to see whether the local machine belongs to a multicast group of the destination multicast address, by calling the ip_check_mc_rcu() method. If it does, or if the local machine is a multicast router (CONFIG_IP_MROUTE is set), the ip_route_input_mc() method is invoked; let's take a look at the code:

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                         u8 tos, struct net_device *dev)
{
        int res;

        rcu_read_lock();
        ...
        if (ipv4_is_multicast(daddr)) {
                struct in_device *in_dev = __in_dev_get_rcu(dev);

                if (in_dev) {
                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
                                                  ip_hdr(skb)->protocol);
                        if (our
#ifdef CONFIG_IP_MROUTE
                            ||
                            (!ipv4_is_local_multicast(daddr) &&
                             IN_DEV_MFORWARD(in_dev))
#endif
                           ) {
                                int res = ip_route_input_mc(skb, daddr, saddr,
                                                            tos, dev, our);
                                rcu_read_unlock();
                                return res;
                        }
                }
                ...
        }
        ...

Let's look further into the ip_route_input_mc() method. If the local machine belongs to a multicast group of the destination multicast address (the value of the variable our is 1), then the input callback of the dst is set to be ip_local_deliver. If the local host is a multicast router and IN_DEV_MFORWARD(in_dev) is set, then the input callback of the dst is set to be ip_mr_input. The ip_rcv_finish() method, which calls dst_input(skb), thus invokes either the ip_local_deliver() method or the ip_mr_input() method, according to the input callback of the dst.
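Whether the local machine belongs to a multicast group (the our check above) is ultimately driven by userspace: an application joins a group with the standard IP_ADD_MEMBERSHIP socket option. The following is a minimal userspace sketch; the group address 239.1.2.3 is an arbitrary, administratively scoped example:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct ip_mreq mreq;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;
        memset(&mreq, 0, sizeof(mreq));
        inet_pton(AF_INET, "239.1.2.3", &mreq.imr_multiaddr);
        mreq.imr_interface.s_addr = htonl(INADDR_ANY);
        if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
                       &mreq, sizeof(mreq)) < 0)
                perror("IP_ADD_MEMBERSHIP");
        /* ... receive group traffic here ... */
        close(fd);
        return 0;
}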
The IN_DEV_MFORWARD macro checks the procfs multicast forwarding entry. Note that the procfs multicast forwarding entry, /proc/sys/net/ipv4/conf/all/mc_forwarding, is a read-only entry (as opposed to the IPv4 unicast procfs forwarding entry), so you cannot set it simply by running from the command line: echo 1 > /proc/sys/net/ipv4/conf/all/mc_forwarding. Starting the pimd daemon, for example, sets it to 1, and stopping the daemon sets it to 0. pimd is a lightweight, standalone PIM-SM v2 multicast routing daemon. If you are interested in learning about multicast routing daemon implementation, you might want to look into the pimd source code at https://github.com/troglobit/pimd/ :

static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                             u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        ...
        if (our) {
                rth->dst.input= ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        ...

The multicast layer holds a data structure called the Multicast Forwarding Cache (MFC). I don't discuss the details of the MFC or of the ip_mr_input() method here (I discuss them in Chapter 6). What is important in this context is that if a valid entry is found in the MFC, the ip_mr_forward() method is called. The ip_mr_forward() method performs some checks and eventually calls the ipmr_queue_xmit() method. In the ipmr_queue_xmit() method, the ttl is decreased and the checksum is updated by calling the ip_decrease_ttl() method (the same is done in the ip_forward() method, as you will see later in this chapter). Then the ipmr_forward_finish() method is invoked by calling the NF_INET_FORWARD NF_HOOK macro (let's assume that there are no registered IPv4 netfilter hooks on NF_INET_FORWARD):

static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
                            struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
        ...
        ip_decrease_ttl(ip_hdr(skb));
        ...
        NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;
}

The ipmr_forward_finish() method is very short and is shown here in its entirety. All it does is update the statistics, call the ip_forward_options() method if there are options in the IPv4 header (IP options are described in the next section), and call the dst_output() method:

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);

        IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
        IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);

        if (unlikely(opt->optlen))
                ip_forward_options(skb);

        return dst_output(skb);
}

This section discussed how receiving IPv4 multicast packets is handled. pimd was mentioned as an example of a multicast routing daemon, which interacts with the kernel in multicast packet forwarding. The next section describes the various IP options, which enable using special features of the network stack, such as tracking the route of a packet, tracking timestamps of packets, and specifying network nodes that a packet should traverse. I also discuss how these IP options are handled in the network stack.

## IP Options

The IP options field of the IPv4 header is optional and is not often used, for security reasons and because of processing overhead. Which options might be helpful?
Suppose, for example, that your packets are being dropped by a certain firewall. You may be able to specify a different route with the Strict or Loose Source Routing options. Or if you want to find out the packets' path to some destination address, you can use the Record Route option.

The IPv4 header may contain zero, one, or more options. The IPv4 header size is 20 bytes when there are no options. The length of the IP options field can be 40 bytes at most. The reason the maximum IPv4 header length is 60 bytes is that the IPv4 header length is a 4-bit field, which expresses the length in multiples of 4 bytes; hence the maximum value of the field is 15, which gives an IPv4 maximum header length of 60 bytes. When more than one option is used, the options are simply concatenated one after the other. The IPv4 header must be aligned to a 4-byte boundary, so sometimes padding is needed. The following RFCs discuss IP options: 781 (Timestamp Option), 791, 1063, 1108, 1393 (Traceroute Using an IP Option), and 2113 (IP Router Alert Option). There are two forms of IP options:

  * Single byte option (option type): The "End of Option List" and "No Operation" options are the only single byte options.

  * Multibyte option: When a multibyte option is used, after the option type byte come the following three fields:

    * Length (1 byte): Length of the option in bytes.

    * Pointer (1 byte): Offset from option start.

    * Option data: This is a space where intermediate hosts can store data, for example, timestamps or IP addresses.

The option type byte is shown in Figure 4-3.

Figure 4-3. Option type

When set, the copied flag means that the option should be copied into all fragments. When it is not set, the option should be copied only into the first fragment. The IPOPT_COPIED macro checks whether the copied flag of a specified IP option is set. It is used in the ip_options_fragment() method for detecting options that may not be copied and for inserting IPOPT_NOOP instead. The ip_options_fragment() method is discussed later in this section.

The option class can be one of the following 4 values:

  * 00: control class (IPOPT_CONTROL)

  * 01: reserved1 (IPOPT_RESERVED1)

  * 10: debugging and measurement (IPOPT_MEASUREMENT)

  * 11: reserved2 (IPOPT_RESERVED2)

In the Linux network stack, only the IPOPT_TIMESTAMP option belongs to the debugging and measurement class. All the other options are of the control class.

The Option Number specifies an option by a unique number; possible values are 0–31, but not all are used by the Linux kernel.

Table 4-1 shows all the options according to their Linux symbol, option number, class, and copied flag.

Table 4-1. Options Table

Linux Symbol | Option Number | Class | Copied Flag | Description
---|---|---|---|---
IPOPT_END | 0 | 0 | 0 | End of Option List
IPOPT_NOOP | 1 | 0 | 0 | No Operation
IPOPT_SEC | 2 | 0 | 1 | Security
IPOPT_LSRR | 3 | 0 | 1 | Loose Source Record Route
IPOPT_TIMESTAMP | 4 | 2 | 0 | Timestamp
IPOPT_CIPSO | 6 | 0 | 1 | Commercial Internet Protocol Security Option
IPOPT_RR | 7 | 0 | 0 | Record Route
IPOPT_SID | 8 | 0 | 1 | Stream ID
IPOPT_SSRR | 9 | 0 | 1 | Strict Source Record Route
IPOPT_RA | 20 | 0 | 1 | Router Alert

The option name (IPOPT_*) declarations are in include/uapi/linux/ip.h.

The Linux network stack does not include all the IP options. For a full list, see www.iana.org/assignments/ip-parameters/ip-parameters.xml .
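The option type byte is decoded with a few helper macros defined in include/uapi/linux/ip.h. The following standalone snippet reuses the same bit layout to take an option type byte apart; the value 148 is IPOPT_RA (copied flag set, class 0, option number 20):

#include <stdint.h>
#include <stdio.h>

/* The same bit layout the kernel uses in include/uapi/linux/ip.h:
 * bit 7 is the copied flag, bits 5-6 the class, bits 0-4 the number. */
#define IPOPT_COPY        0x80
#define IPOPT_CLASS_MASK  0x60
#define IPOPT_NUMBER_MASK 0x1f

#define IPOPT_COPIED(o) ((o) & IPOPT_COPY)
#define IPOPT_CLASS(o)  ((o) & IPOPT_CLASS_MASK)
#define IPOPT_NUMBER(o) ((o) & IPOPT_NUMBER_MASK)

int main(void)
{
        uint8_t ra = 148;       /* IPOPT_RA */

        printf("copied=%d class=%d number=%d\n",
               IPOPT_COPIED(ra) ? 1 : 0, IPOPT_CLASS(ra) >> 5,
               IPOPT_NUMBER(ra));
        return 0;
}

Running it prints copied=1 class=0 number=20, matching the IPOPT_RA row of Table 4-1.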
I will briefly describe the following five options, and then describe the Timestamp option and the Record Route option in depth:

  * End of Option List (IPOPT_END): A 1-byte option used to indicate the end of the options field. This is a single zero byte option (all its bits are '0'). There can be no IP options after it.

  * No Operation (IPOPT_NOOP): A 1-byte option used for internal padding, for alignment.

  * Security (IPOPT_SEC): This option provides a way for hosts to send security, handling restrictions, and TCC (closed user group) parameters. See RFC 791 and RFC 1108. It was initially intended to be used by military applications.

  * Loose Source Record Route (IPOPT_LSRR): This option specifies a list of routers that the packet should traverse. Between each two adjacent nodes in the list there can be intermediate routers that do not appear in the list, but the order should be kept.

  * Commercial Internet Protocol Security Option (IPOPT_CIPSO): CIPSO is an IETF draft that has been adopted by several vendors. It deals with a network labeling standard. CIPSO labeling of a socket means adding the CIPSO IP option to all packets leaving the system through that socket. This option is validated upon reception of the packet. For more info about the CIPSO option, see Documentation/netlabel/draft-ietf-cipso-ipsecurity-01.txt and Documentation/netlabel/cipso_ipv4.txt.

### Timestamp Option

Timestamp (IPOPT_TIMESTAMP): The Timestamp option is specified in RFC 781, "A Specification of the Internet Protocol (IP) Timestamp Option." This option stores timestamps of hosts along the packet route. The stored timestamp is a 32-bit timestamp in milliseconds since midnight UTC of the current day. In addition, it can also store the addresses of all hosts along the packet route, or timestamps of only selected hosts along the route. The maximum Timestamp option length is 40 bytes. The Timestamp option is not copied for fragments; it is carried only in the first fragment. The Timestamp option begins with three bytes of option type, length, and pointer (offset). The higher 4 bits of the fourth byte are the overflow counter, which is incremented in each hop where there is no available space to store the required data. When the overflow counter exceeds 15, an ICMP message of Parameter Problem is sent back. The lower 4 bits are the flag. The value of the flag can be one of the following:

  * 0: Timestamp only (IPOPT_TS_TSONLY)

  * 1: Timestamps and addresses (IPOPT_TS_TSANDADDR)

  * 3: Timestamps of specified hops only (IPOPT_TS_PRESPEC)

Note

You can use the command-line ping utility with the Timestamp option and with the three subtypes mentioned earlier:

ping -T tsonly (IPOPT_TS_TSONLY)

ping -T tsandaddr (IPOPT_TS_TSANDADDR)

ping -T tsprespec (IPOPT_TS_PRESPEC)

Figure 4-4 shows the Timestamp option with timestamp only (the IPOPT_TS_TSONLY flag is set). Each router on the path adds its timestamp. When there is no more space, the overflow counter is incremented.

Figure 4-4. Timestamp option (with timestamp only, flag = 0)

Figure 4-5 shows the Timestamp option with timestamps and addresses (the IPOPT_TS_TSANDADDR flag is set). Each router on the path adds its IPv4 address and its timestamp. Again, when there is no more space, the overflow counter is incremented.

Figure 4-5. Timestamp option (with timestamps and addresses, flag = 1)

Figure 4-6 shows the Timestamp option with timestamps of prespecified hops only (the IPOPT_TS_PRESPEC flag is set). Each router on the path adds its timestamp only if it is in the prespecified list. Again, when there is no more space, the overflow counter is incremented.

Figure 4-6. Timestamp option (with timestamps of specified hops only, flag = 3)

### Record Route Option

Record Route (IPOPT_RR): The route of a packet is recorded. Each router on the way adds its address (see Figure 4-7). The length is set by the sending device. The command-line utility ping -R uses the Record Route IP option. Note that the IPv4 header is only large enough for nine such routes (or even fewer, if more options are used). When the header is full and there is no room to insert an additional address, the datagram is forwarded without inserting the address into the IP options. See section 3.1 of RFC 791.

Figure 4-7. Record Route option

Though ping -R uses the Record Route IP option, in many cases, if you try it, you will not get the expected result of recording all the network nodes along the way, because for security reasons many network nodes ignore this IP option. The manpage of ping mentions this explicitly. From man ping:

...

-R

Includes the RECORD_ROUTE option in the ECHO_REQUEST packet and displays the route buffer on returned packets.

...

Many hosts ignore or discard this option.

...

  * Stream ID (IPOPT_SID): This option provides a way for the 16-bit SATNET stream identifier to be carried through networks that do not support the stream concept.

  * Strict Source Record Route (IPOPT_SSRR): This option specifies a list of routers that the packet should traverse. The order should be kept, and no changes in traversal are permitted. Many routers block the Loose Source Record Route (LSRR) and Strict Source Record Route (SSRR) options for security reasons.

  * Router Alert (IPOPT_RA): The IP Router Alert option can be used to notify transit routers to more closely examine the contents of an IP packet. This is useful, for example, for new protocols, but requires relatively complex processing in routers along the path. Specified in RFC 2113, "IP Router Alert Option."

IP options are represented in Linux by the ip_options structure:

struct ip_options {
        __be32          faddr;
        __be32          nexthop;
        unsigned char   optlen;
        unsigned char   srr;
        unsigned char   rr;
        unsigned char   ts;
        unsigned char   is_strictroute:1,
                        srr_is_hit:1,
                        is_changed:1,
                        rr_needaddr:1,
                        ts_needtime:1,
                        ts_needaddr:1;
        unsigned char   router_alert;
        unsigned char   cipso;
        unsigned char   __pad2;
        unsigned char   __data[0];
};

(include/net/inet_sock.h)

Here are short descriptions of the members of the ip_options structure:

  * faddr: Saved first hop address. Set in ip_options_compile() when handling loose and strict routing, when the method was not invoked from the Rx path (the SKB is NULL).

  * nexthop: Saved nexthop address in LSRR and SSRR.

  * optlen: The option length, in bytes. Cannot exceed 40 bytes.

  * is_strictroute: A flag specifying usage of strict source routing. The flag is set in the ip_options_compile() method when parsing the strict route option type (IPOPT_SSRR); note that it is not set for the loose route option (IPOPT_LSRR).

  * srr_is_hit: A flag specifying that the packet destination address was the local host. The srr_is_hit flag is set in ip_options_rcv_srr().

  * is_changed: The IP checksum is not valid anymore (the flag is set when one of the IP options is changed).
  * rr_needaddr: Need to record the IPv4 address of the outgoing device. The flag is set for the Record Route option (IPOPT_RR).

  * ts_needtime: Need to record a timestamp. The flag is set for these flags of the Timestamp IP option: IPOPT_TS_TSONLY, IPOPT_TS_TSANDADDR, and IPOPT_TS_PRESPEC (see the detailed explanation about the difference between these flags later in this section).

  * ts_needaddr: Need to record the IPv4 address of the outgoing device. This flag is set only when the IPOPT_TS_TSANDADDR flag is set, and it indicates that the IPv4 address of each node along the route of the packet should be added.

  * router_alert: Set in the ip_options_compile() method when parsing a Router Alert option (IPOPT_RA).

  * __data[0]: A buffer to store options that are received from userspace by setsockopt(). See ip_options_get_from_user() and ip_options_get_finish() (net/ipv4/ip_options.c).

Let's take a look at the ip_rcv_options() method:

static inline bool ip_rcv_options(struct sk_buff *skb)
{
        struct ip_options *opt;
        const struct iphdr *iph;
        struct net_device *dev = skb->dev;
        ...

Fetch the IPv4 header from the SKB:

        iph = ip_hdr(skb);

Fetch the ip_options object from the inet_skb_parm object which is associated with the SKB:

        opt = &(IPCB(skb)->opt);

Calculate the expected options length:

        opt->optlen = iph->ihl*4 - sizeof(struct iphdr);

Call the ip_options_compile() method to build an ip_options object out of the SKB:

        if (ip_options_compile(dev_net(dev), opt, skb)) {
                IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
                goto drop;
        }

When the ip_options_compile() method is called in the Rx path (from the ip_rcv_options() method), it parses the IPv4 header of the specified SKB and builds an ip_options object out of it, according to the IPv4 header content, after verifying the validity of the options. The ip_options_compile() method can also be invoked from the ip_options_get_finish() method when getting options from userspace via the setsockopt() system call with IPPROTO_IP and IP_OPTIONS. In this case, the data is copied from userspace into opt->__data, and the third parameter of ip_options_compile(), the SKB, is NULL; the ip_options_compile() method builds the ip_options object in such a case from opt->__data. If some error is found while parsing the options, and it is in the Rx path (the ip_options_compile() method was invoked from ip_rcv_options()), a "Parameter Problem" ICMPv4 message (ICMP_PARAMETERPROB) is sent back. In case of error, -EINVAL is returned, regardless of how the method was invoked. Naturally, it is more convenient to work with the ip_options object than with the raw IPv4 header, because access to the IP options fields is much simpler this way. In the Rx path, the ip_options object that the ip_options_compile() method builds is stored in the control buffer (cb) of the SKB; this is done by setting the opt object to &(IPCB(skb)->opt). The IPCB(skb) macro is defined like this:

#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))

And the inet_skb_parm structure (which includes an ip_options object) is defined like this:

struct inet_skb_parm {
        struct ip_options opt;  /* Compiled IP options */
        unsigned char     flags;
        u16               frag_max_size;
};

(include/net/ip.h)

So &(IPCB(skb)->opt) points to the ip_options object inside the inet_skb_parm object. I will not delve into all the small, tedious technical details of parsing the IPv4 header in the ip_options_compile() method in this book, because there is an abundance of such details and they are self-explanatory.
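As mentioned above, options can also be handed to the kernel from userspace via setsockopt() with IPPROTO_IP and IP_OPTIONS. Purely as an illustration, this sketch attaches a Record Route option (type 7, length 39: three header bytes plus room for nine IPv4 addresses) to a UDP socket; every packet subsequently sent on the socket carries the option:

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        /* Record Route option: type 7, length 39 (3 header bytes +
         * room for 9 IPv4 addresses), pointer 4 (first free slot). */
        unsigned char opts[39];
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;
        memset(opts, 0, sizeof(opts));
        opts[0] = 7;    /* IPOPT_RR */
        opts[1] = 39;   /* option length */
        opts[2] = 4;    /* offset of the first address slot */
        if (setsockopt(fd, IPPROTO_IP, IP_OPTIONS, opts, sizeof(opts)) < 0)
                perror("setsockopt(IP_OPTIONS)");
        close(fd);
        return 0;
}

Behind the scenes, this is exactly the path where ip_options_get_from_user() copies the buffer into opt->__data and ip_options_compile() is invoked with a NULL SKB.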
I will discuss briefly how the ip_options_compile() method parses some single byte options, like IPOPT_END and IPOPT_NOOP, and some more complex options, like IPOPT_RR and IPOPT_TIMESTAMP, in the Rx path, and show in the following code snippet some examples of which checks are done in this method and how it is implemented:

int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb)
{
        ...
        unsigned char *pp_ptr = NULL;
        struct rtable *rt = NULL;
        unsigned char *optptr;
        unsigned char *iph;
        int optlen, l;

To start the parsing process, the optptr pointer should point to the start of the IP options, and all the options are iterated over in a loop. In the Rx path (when the ip_options_compile() method is invoked from the ip_rcv_options() method), the SKB that was received in the ip_rcv() method is passed as a parameter to ip_options_compile() and, needless to say, cannot be NULL. In such a case, the IP options start immediately after the initial fixed part (20 bytes) of the IPv4 header. When ip_options_compile() is invoked from ip_options_get_finish(), the optptr pointer is set to opt->__data, because the ip_options_get_from_user() method copied the options that were sent from userspace into opt->__data. To be accurate, I should mention that if alignment is needed, the ip_options_get_finish() method also writes into opt->__data (it writes IPOPT_END in the proper place).

if (skb != NULL) {
        rt = skb_rtable(skb);
        optptr = (unsigned char *)&(ip_hdr(skb)[1]);
} else
        optptr = opt->__data;

In this case, iph = ip_hdr(skb) cannot be used instead, because the case when the SKB is NULL should be considered. The following assignment is correct also for the non-Rx path:

iph = optptr - sizeof(struct iphdr);

The variable l is initialized to the options length (it can be 40 bytes at most). It is decremented by the length of the current option in each iteration of the following for loop:

for (l = opt->optlen; l > 0; ) {
        switch (*optptr) {

If an IPOPT_END option is encountered, it indicates that this is the end of the options list—there must be no other option after it. In such a case, IPOPT_END is written over each byte that differs from IPOPT_END, until the end of the options list. The is_changed Boolean flag should also be set, because it indicates that the IPv4 header was changed (and as a result, recalculation of the checksum is pending—there is no justification for calculating the checksum right now or inside the for loop, because there might be other changes in the IPv4 header during the loop):

        case IPOPT_END:
                for (optptr++, l--; l>0; optptr++, l--) {
                        if (*optptr != IPOPT_END) {
                                *optptr = IPOPT_END;
                                opt->is_changed = 1;
                        }
                }
                goto eol;

If an option type of No Operation (IPOPT_NOOP), which is a single byte option, is encountered, simply decrement l by 1, increment optptr by 1, and move forward to the next option type:

        case IPOPT_NOOP:
                l--;
                optptr++;
                continue;
        }

optlen is set to the length of the option being read (as optptr[1] holds the option length):

        optlen = optptr[1];

The No Operation (IPOPT_NOOP) option and the End of Option List (IPOPT_END) option are the only single byte options. All other options are multibyte options and must have at least two bytes (option type and option length). Now a check is made that there are at least two option bytes and that the option list length is not exceeded.
If some error is found, the pp_ptr pointer is set to point to the source of the problem, and the loop is exited. In the Rx path, an ICMPv4 "Parameter Problem" message is sent back, passing as a parameter the offset where the problem occurred, so that the other side can analyze the problem:

        if (optlen < 2 || optlen > l) {
                pp_ptr = optptr;
                goto error;
        }

        switch (*optptr) {
        case IPOPT_SSRR:
        case IPOPT_LSRR:
                ...
        case IPOPT_RR:

The option length of the Record Route option must be at least 3 bytes: option type, option length, and pointer (offset):

                if (optlen < 3) {
                        pp_ptr = optptr + 1;
                        goto error;
                }

The pointer (offset) of the Record Route option must be at least 4, because the space reserved for the address list must start after the three initial bytes (option type, option length, and pointer):

                if (optptr[2] < 4) {
                        pp_ptr = optptr + 2;
                        goto error;
                }

                if (optptr[2] <= optlen) {

If the offset (optptr[2]) plus 3 exceeds the option length, there is no room for another 4-byte address, and this is an error:

                        if (optptr[2]+3 > optlen) {
                                pp_ptr = optptr + 2;
                                goto error;
                        }
                        if (rt) {
                                spec_dst_fill(&spec_dst, skb);

Copy the IPv4 address into the Record Route buffer:

                                memcpy(&optptr[optptr[2]-1], &spec_dst, 4);

Set the is_changed Boolean flag, which indicates that the IPv4 header was changed (recalculation of the checksum is pending):

                                opt->is_changed = 1;
                        }

Increment the pointer (offset) by 4 to point to the next slot in the Record Route buffer (each IPv4 address is 4 bytes):

                        optptr[2] += 4;

Set the rr_needaddr flag (this flag is checked in the ip_forward_options() method):

                        opt->rr_needaddr = 1;
                }
                opt->rr = optptr - iph;
                break;

        case IPOPT_TIMESTAMP:
                ...

The option length of the Timestamp option must be at least 4 bytes: option type, option length, and pointer (offset); the fourth byte is divided into two fields: the higher 4 bits are the overflow counter, which is incremented in each hop where there is no available space to store the required data, and the lower 4 bits are the flag (timestamp only, timestamps and addresses, or timestamps of specified hops only):

                if (optlen < 4) {
                        pp_ptr = optptr + 1;
                        goto error;
                }

optptr[2] is the pointer (offset). Because, as stated earlier, each Timestamp option starts with 4 fixed bytes, the pointer (offset) must be at least 5:

                if (optptr[2] < 5) {
                        pp_ptr = optptr + 2;
                        goto error;
                }

                if (optptr[2] <= optlen) {
                        unsigned char *timeptr = NULL;

                        if (optptr[2]+3 > optptr[1]) {
                                pp_ptr = optptr + 2;
                                goto error;
                        }

In the following switch command, the value of optptr[3]&0xF is checked.
It is the flag (the 4 lower bits of the fourth byte) of the Timestamp option:

                        switch (optptr[3]&0xF) {
                        case IPOPT_TS_TSONLY:
                                if (skb)
                                        timeptr = &optptr[optptr[2]-1];
                                opt->ts_needtime = 1;

For the Timestamp option with the timestamp only flag (IPOPT_TS_TSONLY), 4 bytes are needed, so the pointer (offset) is incremented by 4:

                                optptr[2] += 4;
                                break;
                        case IPOPT_TS_TSANDADDR:
                                if (optptr[2]+7 > optptr[1]) {
                                        pp_ptr = optptr + 2;
                                        goto error;
                                }
                                if (rt) {
                                        spec_dst_fill(&spec_dst, skb);
                                        memcpy(&optptr[optptr[2]-1],
                                               &spec_dst, 4);
                                        timeptr = &optptr[optptr[2]+3];
                                }
                                opt->ts_needaddr = 1;
                                opt->ts_needtime = 1;

For the Timestamp option with the timestamps and addresses flag (IPOPT_TS_TSANDADDR), 8 bytes are needed, so the pointer (offset) is incremented by 8:

                                optptr[2] += 8;
                                break;
                        case IPOPT_TS_PRESPEC:
                                if (optptr[2]+7 > optptr[1]) {
                                        pp_ptr = optptr + 2;
                                        goto error;
                                }
                                {
                                        __be32 addr;

                                        memcpy(&addr, &optptr[optptr[2]-1], 4);
                                        if (inet_addr_type(net, addr) == RTN_UNICAST)
                                                break;
                                        if (skb)
                                                timeptr = &optptr[optptr[2]+3];
                                }
                                opt->ts_needtime = 1;

For the Timestamp option with the timestamps of prespecified hops flag (IPOPT_TS_PRESPEC), 8 bytes are needed, so the pointer (offset) is incremented by 8:

                                optptr[2] += 8;
                                break;
                        default:
                                ...
                        }
                ...

After the ip_options_compile() method has built the ip_options object, strict routing is handled. First, a check is performed to see whether the device supports source routing. This means that /proc/sys/net/ipv4/conf/all/accept_source_route is set and that /proc/sys/net/ipv4/conf/<deviceName>/accept_source_route is set. If these conditions are not met, the packet is dropped:

...
if (unlikely(opt->srr)) {
        struct in_device *in_dev = __in_dev_get_rcu(dev);

        if (in_dev) {
                if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
                        ...
                        goto drop;
                }
        }
        if (ip_options_rcv_srr(skb))
                goto drop;
}

Let's take a look at the ip_options_rcv_srr() method (again, I will focus on the important points, not the little details). The list of source route addresses is iterated over. During the parsing process, some sanity checks are made in the loop to see whether there are errors. When the first nonlocal address is encountered, the loop is exited, and the following actions take place:

  * The srr_is_hit flag of the IP options object is set (opt->srr_is_hit = 1).

  * opt->nexthop is set to the nexthop address that was found.

  * The opt->is_changed flag is set to 1.

The packet should be forwarded. When the ip_forward_finish() method is reached, the ip_forward_options() method is called. In this method, if the srr_is_hit flag of the IP options object is set, the daddr of the IPv4 header is changed to opt->nexthop, the offset is incremented by 4 (to point to the next address in the source route addresses list), and—because the IPv4 header was changed—the checksum is recalculated by calling the ip_send_check() method.

### IP Options and Fragmentation

When describing the option type at the beginning of this section, I mentioned a copied flag in the option type byte, which indicates whether or not the option should be copied into all fragments. Handling IP options in fragmentation is done by the ip_options_fragment() method, which is invoked from the method that prepares fragments, ip_fragment(). It is called only for the first fragment.
Let's take a look at the ip_options_fragment() method, which is very simple:

void ip_options_fragment(struct sk_buff *skb)
{
        unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
        struct ip_options *opt = &(IPCB(skb)->opt);
        int l = opt->optlen;
        int optlen;

The while loop simply iterates over the options, reading each option type. optptr is a pointer to the options list (which starts at the end of the first 20 bytes of the IPv4 header). l is the remaining size of the options list, which is decremented in each loop iteration:

        while (l > 0) {
                switch (*optptr) {

When the option type is IPOPT_END, which terminates the options list, reading the options is finished:

                case IPOPT_END:
                        return;
                case IPOPT_NOOP:

When the option type is IPOPT_NOOP, used for padding between options, the optptr pointer is incremented by 1, l is decremented, and the next option is processed:

                        l--;
                        optptr++;
                        continue;
                }

Perform a sanity check on the option length:

                optlen = optptr[1];
                if (optlen < 2 || optlen > l)
                        return;

Check whether the option should be copied; if not, simply put one or several IPOPT_NOOP options in its place with the memset() function. The number of IPOPT_NOOP bytes that memset() writes is the size of the option that was read, namely optlen:

                if (!IPOPT_COPIED(*optptr))
                        memset(optptr, IPOPT_NOOP, optlen);

Now move to the next option:

                l -= optlen;
                optptr += optlen;
        }

IPOPT_TIMESTAMP and IPOPT_RR are options for which the copied flag is 0 (see Table 4-1). They are replaced by IPOPT_NOOP in the loop you saw earlier, and their relevant fields in the ip_options object are reset to 0:

        opt->ts = 0;
        opt->rr = 0;
        opt->rr_needaddr = 0;
        opt->ts_needaddr = 0;
        opt->ts_needtime = 0;
}

(net/ipv4/ip_options.c)

In this section you have learned how the ip_rcv_options() method handles the reception of packets with IP options and how IP options are parsed by the ip_options_compile() method. Fragmentation with IP options was also discussed. The next section covers the process of building IP options, which involves setting the IP options of an IPv4 header based on a specified ip_options object.

### Building IP Options

The ip_options_build() method can be thought of as the reverse of the ip_options_compile() method you saw earlier in this chapter. It takes an ip_options object as an argument and writes its content to the IPv4 header.
Let's take a look at it:

void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
                      __be32 daddr, struct rtable *rt, int is_frag)
{
        unsigned char *iph = skb_network_header(skb);

        memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
        memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
        opt = &(IPCB(skb)->opt);

        if (opt->srr)
                memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);

        if (!is_frag) {
                if (opt->rr_needaddr)
                        ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
                if (opt->ts_needaddr)
                        ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
                if (opt->ts_needtime) {
                        struct timespec tv;
                        __be32 midtime;

                        getnstimeofday(&tv);
                        midtime = htonl((tv.tv_sec % 86400) *
                                        MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
                        memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
                }
                return;
        }
        if (opt->rr) {
                memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
                opt->rr = 0;
                opt->rr_needaddr = 0;
        }
        if (opt->ts) {
                memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
                opt->ts = 0;
                opt->ts_needaddr = opt->ts_needtime = 0;
        }
}

The ip_forward_options() method (net/ipv4/ip_options.c) handles the IP options of forwarded packets. In this method the Record Route and Strict Source Record Route options are handled, and the ip_send_check() method is invoked to calculate the checksum for packets whose IPv4 header was changed (the opt->is_changed flag is set) and to reset the opt->is_changed flag to 0.

My discussion of the Rx path is now finished. The next section covers the Tx path: what happens when IPv4 packets are sent.

## Sending IPv4 Packets

The IPv4 layer provides the means for the layer above it, the transport layer (L4), to send packets by passing these packets to the link layer (L2). I discuss how that is implemented in this section, and you'll see some differences between handling the transmission of TCPv4 packets and the transmission of UDPv4 packets in IPv4. There are two main methods for sending IPv4 packets from Layer 4, the transport layer. The first one is the ip_queue_xmit() method, used by transport protocols that handle fragmentation by themselves, like TCPv4. The ip_queue_xmit() method is not the only transmission method used by TCPv4, which also uses the ip_build_and_send_pkt() method, for example, to send SYN ACK messages (see the tcp_v4_send_synack() method implementation in net/ipv4/tcp_ipv4.c). The second method is the ip_append_data() method, used by transport protocols that do not handle fragmentation, like the UDPv4 protocol or the ICMPv4 protocol. The ip_append_data() method does not send any packet—it only prepares the packet. The ip_push_pending_frames() method actually starts the transmission process, and it is used by ICMPv4 or raw sockets, for example. Calling ip_push_pending_frames() starts the transmission by calling the ip_send_skb() method, which eventually calls the ip_local_out() method. The ip_push_pending_frames() method was used for carrying out the transmission in UDPv4 prior to kernel 2.6.39; with the new ip_finish_skb API in 2.6.39, the ip_send_skb() method is used instead. Both methods are implemented in net/ipv4/ip_output.c.
There are cases where the dst_output() method is called directly, without using the ip_queue_xmit() method or the ip_append_data() method; for example, when sending with a raw socket that uses the IP_HDRINCL socket option, the kernel does not need to prepare an IPv4 header, because the application builds the header on its own. Userspace applications that build the IPv4 header on their own use the IP_HDRINCL socket option. For example, the well-known ping of iputils and nping of nmap both enable the user to set the ttl of the IPv4 header, like this:

```
ping -t ttl ipDestAddress
```

or:

```
nping --ttl ttl ipDestAddress
```

Sending packets from raw sockets whose IP_HDRINCL socket option is set is done like this:

```c
static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
			   void *from, size_t length,
			   struct rtable **rtp,
			   unsigned int flags)
{
	...
	err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		      rt->dst.dev, dst_output);
	...
}
```

Figure 4-8 shows the paths for sending IPv4 packets from the transport layer.

Figure 4-8. Sending IPv4 packets

In Figure 4-8 you can see the different paths for transmitted packets that come from the transport layer (L4); these packets are handled by the ip_queue_xmit() method or by the ip_append_data() method.

Let's start with the ip_queue_xmit() method, which is the simpler of the two:

```c
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
	...
	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
```

The rtable object is the result of a lookup in the routing subsystem. First I discuss the case where the rtable instance is NULL and a lookup in the routing subsystem must be performed. If the strict routing option flag is set, the destination address is set to be the first address of the IP options:

```c
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;
```

Now a lookup in the routing subsystem is performed with the ip_route_output_ports() method; if the lookup fails, the packet is dropped, and an error of -EHOSTUNREACH is returned:

```c
		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);
	...
```

If the lookup succeeds, but both the is_strictroute flag in the options and the rt_uses_gateway flag in the routing entry are set, the packet is dropped, and an error of -EHOSTUNREACH is returned:

```c
	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto no_route;
```

Now the IPv4 header is built. You should remember that the packet arrived from Layer 4, where skb->data pointed to the transport header. The skb->data pointer is moved back by the skb_push() method; the offset needed to move it back is the size of the IPv4 header plus the size of the IP options list (optlen), if IP options are used:
```c
	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
```

Set the L3 header (skb->network_header) to point to skb->data:

```c
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);
```

The options length (optlen) is divided by 4, and the result is added to the IPv4 header length (iph->ihl), because the IPv4 header length is measured in multiples of 4 bytes. Then the ip_options_build() method is invoked to build the options in the IPv4 header based on the content of the specified IP options. The last parameter of the ip_options_build() method, is_frag, is 0, indicating that this is not a fragment. The ip_options_build() method was discussed in the "Building IP Options" section earlier in this chapter.

```c
	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}
```

Set the id in the IPv4 header:

```c
	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
```

Send the packet:

```c
	res = ip_local_out(skb);
```

Before discussing the ip_append_data() method, I want to mention a callback that is a parameter of the ip_append_data() method: the getfrag() callback. The getfrag() method is a callback to copy the actual data from userspace into the SKB. In UDPv4, the getfrag() callback is set to be the generic method ip_generic_getfrag(); in ICMPv4, it is set to a protocol-specific method, icmp_glue_bits(). Another issue I should mention here is the UDPv4 corking feature. The UDP_CORK socket option was added in kernel 2.5.44; when this option is enabled, all data output on the socket is accumulated into a single datagram that is transmitted when the option is disabled. You can enable and disable this socket option with the setsockopt() system call; see man 7 udp. In kernel 2.6.39, a lockless transmit fast path was added to the UDPv4 implementation. With this addition, when the corking feature is not used, the socket lock is not used either. So when the UDP_CORK socket option is set (with the setsockopt() system call), or the MSG_MORE flag is set, the ip_append_data() method is invoked. When the UDP_CORK socket option is not set, another path in the udp_sendmsg() method is used, which does not hold the socket lock and is faster as a result, and the ip_make_skb() method is invoked. Calling the ip_make_skb() method is like calling the ip_append_data() and the ip_push_pending_frames() methods rolled into one, except that it does not send the SKB produced. Sending the SKB is carried out by the ip_send_skb() method.
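The effect of corking is easy to observe from userspace. In the following minimal sketch (the loopback destination and the port are arbitrary choices for illustration), two sendto() calls that would normally produce two datagrams are merged into a single 11-byte datagram that is transmitted only when the cork is removed:

```c
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/udp.h>	/* UDP_CORK */
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst;
	int on = 1, off = 0;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9999);
	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);

	/* While corked, ip_append_data() accumulates the data on
	 * sk->sk_write_queue instead of transmitting it. */
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
	sendto(fd, "hello ", 6, 0, (struct sockaddr *)&dst, sizeof(dst));
	sendto(fd, "world", 5, 0, (struct sockaddr *)&dst, sizeof(dst));

	/* Removing the cork flushes the queue: one 11-byte datagram goes out. */
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
	close(fd);
	return 0;
}
```

Passing the MSG_MORE flag to sendto() has the same accumulating effect on a per-call basis, without toggling the socket option.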
Let's take a look now at the ip_append_data() method:

```c
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;
```

If the MSG_PROBE flag is used, it means that the caller is interested only in some information (usually the MTU, for PMTU discovery), so there is no need to actually send the packet, and the method returns 0:

```c
	if (flags & MSG_PROBE)
		return 0;
```

The value of transhdrlen is used to indicate whether this is a first fragment or not. The ip_setup_cork() method creates a cork IP options object if it does not exist and copies the IP options of the specified ipc (ipcm_cookie object) to the cork IP options:

```c
	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}
```

The real work is done by the __ip_append_data() method; this is a long and complex method, and I can't delve into all its details. I will mention that there are two different ways to handle fragments in this method, according to whether the network device supports Scatter/Gather (NETIF_F_SG) or not. When the NETIF_F_SG flag is set, skb_shinfo(skb)->frags is used; when it is not set, skb_shinfo(skb)->frag_list is used. The memory allocation is also different when the MSG_MORE flag is set. The MSG_MORE flag indicates that another packet will be sent soon. Since Linux 2.6, this flag is also supported for UDP sockets.

```c
	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
				sk_page_frag(sk), getfrag,
				from, length, transhdrlen, flags);
}
```

In this section you have learned about the Tx path: how sending IPv4 packets is implemented. When the packet length is higher than the network device MTU, the packet can't be sent as it is. The next section covers fragmentation in the Tx path and how it is handled.

## Fragmentation

The network interface has a limit on the size of a packet. Usually in 10/100/1000 Mb/s Ethernet networks it is 1500 bytes, though there are network interfaces that allow using an MTU of up to 9K (called jumbo frames). When sending a packet that is larger than the MTU of the outgoing network interface, the packet must be broken into smaller pieces. This is done in the ip_fragment() method (net/ipv4/ip_output.c). Received fragmented packets should be reassembled into one packet. This is done by the ip_defrag() method (net/ipv4/ip_fragment.c), discussed in the next section, "Defragmentation."

Let's take a look first at the ip_fragment() method. Here's its prototype:

```c
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
```

The output callback is the method of transmission to be used. When the ip_fragment() method is invoked from ip_finish_output(), the output callback is the ip_finish_output2() method. There are two paths in the ip_fragment() method: the fast path and the slow path. The fast path is for packets where the frag_list of the SKB is not NULL, and the slow path is for packets that do not meet this condition.
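Both paths rely on the same arithmetic: every fragment except the last carries a payload that is a multiple of 8 bytes, and the offset written into the lower 13 bits of frag_off is measured in 8-byte units, with the IP_MF flag set on all fragments but the last. The following standalone userspace sketch makes that arithmetic concrete (the MTU and payload size are arbitrary, and the two constants are defined locally so the sketch is self-contained):

```c
#include <stdio.h>

#define IP_MF     0x2000	/* "More Fragments" flag */
#define IP_OFFSET 0x1FFF	/* mask for the 13-bit fragment offset */

int main(void)
{
	unsigned int mtu = 1500, hlen = 20;        /* header without options */
	unsigned int payload = 4000;               /* L4 bytes to send */
	unsigned int max_frag = (mtu - hlen) & ~7; /* payload per fragment, 8-byte aligned */
	unsigned int offset = 0;

	while (payload > 0) {
		unsigned int len = payload > max_frag ? max_frag : payload;
		int last = (len == payload);
		/* frag_off as it would appear in the header (host order here) */
		unsigned short frag_off = (offset >> 3) & IP_OFFSET;

		if (!last)
			frag_off |= IP_MF;
		printf("fragment: offset=%u bytes, len=%u, frag_off=0x%04x%s\n",
		       offset, len, frag_off, last ? " (last)" : "");
		offset += len;
		payload -= len;
	}
	return 0;
}
```

For a 4000-byte payload and a 1500-byte MTU this prints three fragments: two of 1480 bytes with IP_MF set, and a final one of 1040 bytes without it. This mirrors, in simplified form, the len &= ~7 and iph->frag_off = htons(offset >> 3) logic you will see in the code below.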
First a check is performed to see whether fragmentation is permitted; if not, a "Destination Unreachable" ICMPv4 message with "Fragmentation Needed" code is sent back to the sender, the statistics (IPSTATS_MIB_FRAGFAILS) are updated, the packet is dropped, and an error code of -EMSGSIZE is returned:

```c
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	unsigned int mtu, hlen, left, len, ll_rs;
	...
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;
	...
	iph = ip_hdr(skb);

	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
		     (IPCB(skb)->frag_max_size &&
		      IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}
	...
```

The next section discusses the fast path in fragmentation and its implementation.

### Fast Path

Now let's look into the fast path. First a check is performed to see whether the packet should be handled in the fast path, by calling the skb_has_frag_list() method, which simply checks that skb_shinfo(skb)->frag_list is not NULL. If there is a frag_list, some sanity checks are made on it, and if something is not valid, the fallback to the slow path mechanism is activated (simply by calling goto slow_path). Then an IPv4 header is built for the first fragment. The frag_off of this IPv4 header is set to htons(IP_MF), which indicates that more fragments follow. The frag_off field of the IPv4 header is a 16-bit field; the lower 13 bits are the fragment offset, and the higher 3 bits are the flags. For the first fragment, the offset should be 0, and the IP_MF (More Fragments) flag should be set. For all other fragments except the last one, the IP_MF flag should be set, and the lower 13 bits should hold the fragment offset (measured in units of 8 bytes). For the last fragment, the IP_MF flag should not be set, but the lower 13 bits still hold the fragment offset.

hlen is set to the IPv4 header size in bytes:

```c
	hlen = iph->ihl * 4;
	...
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);
		...
		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
```

Set skb_shinfo(skb)->frag_list to NULL by calling skb_frag_list_init():

```c
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
```

Set the IP_MF (More Fragments) flag for the first fragment:

```c
		iph->frag_off = htons(IP_MF);
```

Because the values of some IPv4 header fields have changed, the checksum needs to be recalculated:

```c
		ip_send_check(iph);
```

Now take a look at the loop that traverses frag_list and builds the fragments:

```c
		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
```

The ip_fragment() method was invoked from the transport layer (L4), so skb->data points to the transport header.
The skb->data pointer should be moved back by hlen bytes so that it points to the IPv4 header (hlen is the size of the IPv4 header in bytes):

```c
				__skb_push(frag, hlen);
```

Set the L3 header (skb->network_header) to point to skb->data:

```c
				skb_reset_network_header(frag);
```

Copy the IPv4 header that was created into the L3 network header; in the first iteration of this for loop, it is the header that was created outside the loop for the first fragment:

```c
				memcpy(skb_network_header(frag), iph, hlen);
```

Now the IPv4 header and the tot_len of the next frag are initialized:

```c
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
```

Copy various SKB fields (like pkt_type, priority, protocol) from skb into frag:

```c
				ip_copy_metadata(frag, skb);
```

Only for the first fragment (where the offset is 0) should the ip_options_fragment() method be called:

```c
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
```

The frag_off field of the IPv4 header is measured in multiples of 8 bytes, so divide the offset by 8:

```c
				iph->frag_off = htons(offset >> 3);
```

Each fragment, except the last one, should have the IP_MF flag set:

```c
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
```

The values of some IPv4 header fields have changed, so the checksum should be recalculated:

```c
				/* Ready, complete checksum */
				ip_send_check(iph);
			}
```

Now send the fragment with the output callback. If sending succeeded, increment IPSTATS_MIB_FRAGCREATES. If there was an error, exit the loop:

```c
			err = output(skb);
			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;
```

Fetch the next SKB:

```c
			skb = frag;
			frag = skb->next;
			skb->next = NULL;
```

The following closing bracket is the end of the for loop:

```c
		}
```

The for loop has terminated, and the return value of the last call to output(skb) should be checked. If it is successful, the statistics (IPSTATS_MIB_FRAGOKS) are updated, and the method returns 0:

```c
		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}
```

If the last call to output(skb) failed in one of the loop iterations, including the last one, the SKBs are freed, the statistics (IPSTATS_MIB_FRAGFAILS) are updated, and the error code (err) is returned:

```c
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;
```

You should now have a good understanding of the fast path in fragmentation and how it is implemented.

### Slow Path

Let's now take a look at how the slow path in fragmentation is implemented:

```c
	...
	iph = ip_hdr(skb);
	left = skb->len - hlen;	/* Space per frame */
	...
```
```c
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
```

Each fragment (except the last one) should be aligned on an 8-byte boundary:

```c
		if (len < left) {
			len &= ~7;
		}
```

Allocate an SKB:

```c
		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */
```

Copy various SKB fields (like pkt_type, priority, protocol) from skb into skb2:

```c
		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
```

frag_off is measured in multiples of 8 bytes, so divide the offset by 8:

```c
		iph->frag_off = htons((offset >> 3));
		...
```

Handle options only once, for the first fragment:

```c
		if (offset == 0)
			ip_options_fragment(skb);
```

The IP_MF (More Fragments) flag should be set on every fragment but the last:

```c
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);
```

Because the values of some IPv4 header fields have changed, the checksum should be recalculated:

```c
		ip_send_check(iph);
```

Now send the fragment with the output callback. If sending succeeded, increment IPSTATS_MIB_FRAGCREATES. If there was an error, the packet is freed, the statistics (IPSTATS_MIB_FRAGFAILS) are updated, and the error code is returned:

```c
		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
```

After the while (left > 0) loop has terminated, the consume_skb() method is invoked to free the original SKB, the statistics (IPSTATS_MIB_FRAGOKS) are updated, and the value of err is returned:

```c
	consume_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;
```

This section dealt with the implementation of the slow path in fragmentation, and it ends the discussion of fragmentation in the Tx path. Remember that fragmented packets that are received on a host should be reconstructed again so that applications can handle the original packet. The next section discusses defragmentation, the opposite of fragmentation.

## Defragmentation

Defragmentation is the process of reassembling all the fragments of a packet, which all have the same id in the IPv4 header, into one buffer. The main method that handles defragmentation in the Rx path is ip_defrag() (net/ipv4/ip_fragment.c), which is called from ip_local_deliver(). There are other places where defragmentation might be needed, such as in firewalls, where the content of the packet must be known in order to inspect it. In the ip_local_deliver() method, the ip_is_fragment() method is invoked to check whether the packet is fragmented; if it is, the ip_defrag() method is invoked.
The ip_defrag() method has two arguments: the first is the SKB, and the second is a 32-bit field that indicates the point from which the method was invoked. Its value can be one of the following:

* IP_DEFRAG_LOCAL_DELIVER when it was called from ip_local_deliver().

* IP_DEFRAG_CALL_RA_CHAIN when it was called from ip_call_ra_chain().

* IP_DEFRAG_VS_IN, IP_DEFRAG_VS_FWD, or IP_DEFRAG_VS_OUT when it was called from IPVS.

For a full list of possible values for the second argument of ip_defrag(), look in the ip_defrag_users enum definition in include/net/ip.h.

Let's look at the ip_defrag() invocation in ip_local_deliver():

```c
int ip_local_deliver(struct sk_buff *skb)
{
	/*
	 *	Reassemble IP fragments.
	 */

	if (ip_is_fragment(ip_hdr(skb))) {
		if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
			return 0;
	}

	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
```

(net/ipv4/ip_input.c)

The ip_is_fragment() method is a simple helper that takes the IPv4 header as its sole argument and returns true when the packet is a fragment, like this:

```c
static inline bool ip_is_fragment(const struct iphdr *iph)
{
	return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0;
}
```

(include/net/ip.h)

The ip_is_fragment() method returns true in either of two cases (or both):

* The IP_MF flag is set.

* The fragment offset is not 0.

Thus it returns true for all fragments:

* For the first fragment, where frag_off is 0 but the IP_MF flag is set.

* For the last fragment, where frag_off is not 0 but the IP_MF flag is not set.

* For all other fragments, where frag_off is not 0 and the IP_MF flag is set.

The implementation of defragmentation is based on a hash table of ipq objects. The hash function (ipqhashfn) has four arguments: the fragment id, the source address, the destination address, and the protocol:

```c
struct ipq {
	struct inet_frag_queue q;

	u32		user;
	__be32		saddr;
	__be32		daddr;
	__be16		id;
	u8		protocol;
	u8		ecn;	/* RFC3168 support */
	int		iif;
	unsigned int	rid;
	struct inet_peer *peer;
};
```

Note that the logic of IPv4 defragmentation is shared with its IPv6 counterpart. So, for example, the inet_frag_queue structure and methods like the inet_frag_find() method and the inet_frag_evictor() method are not specific to IPv4; they are also used in IPv6 (see net/ipv6/reassembly.c and net/ipv6/nf_conntrack_reasm.c).

The ip_defrag() method is quite short. First it makes sure there is enough memory, by calling the ip_evictor() method. Then it tries to find an ipq for the SKB by calling the ip_find() method; if it does not find one, it creates an ipq object. The ipq object that the ip_find() method returns is assigned to a variable named qp (a pointer to an ipq object). Then the ip_frag_queue() method is called to add the fragment to a linked list of fragments (qp->q.fragments). The addition to the list is done according to the fragment offset, because the list is sorted by fragment offset. After all the fragments of a packet have been added, the ip_frag_queue() method calls the ip_frag_reasm() method to build a new packet from all its fragments. The ip_frag_reasm() method also stops the timer (of ip_expire()) by calling the ipq_kill() method. If there was an error, and the size of the new packet exceeds the highest permitted size (which is 65535), the ip_frag_reasm() method updates the statistics (IPSTATS_MIB_REASMFAILS) and returns -E2BIG. If the call to the skb_clone() method in ip_frag_reasm() fails, it returns -ENOMEM.
The IPSTATS_MIB_REASMFAILS statistic is updated in this case as well. Constructing a packet from all its fragments must be completed within a specified time interval. If it's not completed within that interval, the ip_expire() method sends an ICMPv4 "Time Exceeded" message with "Fragment Reassembly Time Exceeded" code. The defragmentation time interval can be set by the following procfs entry: /proc/sys/net/ipv4/ipfrag_time. It is 30 seconds by default.

Let's take a look at the ip_defrag() method:

```c
int ip_defrag(struct sk_buff *skb, u32 user)
{
	struct ipq *qp;
	struct net *net;

	net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);

	/* Start by cleaning up the memory. */
	ip_evictor(net);

	/* Lookup (or create) queue header */
	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
		int ret;

		spin_lock(&qp->q.lock);

		ret = ip_frag_queue(qp, skb);

		spin_unlock(&qp->q.lock);
		ipq_put(qp);
		return ret;
	}

	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
	kfree_skb(skb);
	return -ENOMEM;
}
```

Before looking at the ip_frag_queue() method, consider the following macro, which simply returns the ipfrag_skb_cb object that is associated with the specified SKB:

```c
#define FRAG_CB(skb)	((struct ipfrag_skb_cb *)((skb)->cb))
```

Now let's look at the ip_frag_queue() method. I will not describe all its details, because the method is very complicated and takes into account problems that might arise from overlapping fragments (overlapping fragments may occur due to retransmissions). In the following snippet, qp->q.len is set to the total length of the packet, including all its fragments; when the IP_MF flag is not set, this is the last fragment:

```c
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
	struct sk_buff *prev, *next;
	...
	/* Determine the position of this fragment. */
	end = offset + skb->len - ihl;
	err = -EINVAL;

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
		 * or have different end, the segment is corrupted.
		 */
		if (end < qp->q.len ||
		    ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
			goto err;
		qp->q.last_in |= INET_FRAG_LAST_IN;
		qp->q.len = end;
	} else {
		...
	}
```

Now the location for adding the fragment is found, by looking for the first fragment whose offset is at or after the new fragment's offset (the linked list of fragments is ordered by offset):

```c
	...
	prev = NULL;
	for (next = qp->q.fragments; next != NULL; next = next->next) {
		if (FRAG_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}
```

Now prev, if it is not NULL, points to the fragment after which the new fragment should be added. Skipping the overlap handling and some other checks, let's continue to the insertion of the fragment into the list:

```c
	FRAG_CB(skb)->offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
	if (!next)
		qp->q.fragments_tail = skb;
	if (prev)
		prev->next = skb;
	else
		qp->q.fragments = skb;
	...
	qp->q.meat += skb->len;
```

Note that qp->q.meat is incremented by skb->len for each fragment. As mentioned earlier, qp->q.len is the total length of all the fragments; when it is equal to qp->q.meat, all the fragments have been added, and they should be reassembled into one packet with the ip_frag_reasm() method.
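The offset-sorted insertion that ip_frag_queue() performs is easy to lose among the surrounding overlap checks, so here is the same idea reduced to a standalone userspace sketch; struct frag is a made-up stand-in for the SKB with its FRAG_CB(skb)->offset, and the overlap handling is deliberately omitted:

```c
#include <stdio.h>
#include <stdlib.h>

struct frag {
	unsigned int offset;	/* stand-in for FRAG_CB(skb)->offset */
	struct frag *next;
};

/* Insert f into a list kept sorted by ascending offset, mirroring
 * the prev/next walk in ip_frag_queue(). */
static void frag_insert(struct frag **head, struct frag *f)
{
	struct frag *prev = NULL, *next;

	for (next = *head; next != NULL; next = next->next) {
		if (next->offset >= f->offset)
			break;	/* first fragment at or past the new one */
		prev = next;
	}
	f->next = next;
	if (prev)
		prev->next = f;
	else
		*head = f;
}

int main(void)
{
	unsigned int offsets[] = { 2960, 0, 1480 };	/* arrival order */
	struct frag *head = NULL, *f;
	int i;

	for (i = 0; i < 3; i++) {
		f = malloc(sizeof(*f));
		f->offset = offsets[i];
		frag_insert(&head, f);
	}
	for (f = head; f; f = f->next)
		printf("offset %u\n", f->offset);	/* 0, 1480, 2960 */
	while (head) {
		f = head;
		head = head->next;
		free(f);
	}
	return 0;
}
```

Whatever the arrival order, the prev/next walk keeps the list sorted by ascending offset, which is what allows ip_frag_reasm() to walk qp->q.fragments front to back when building the reassembled packet.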
Now you can see how and where reassembly takes place (reassembly is done by calling the ip_frag_reasm() method):

```c
	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;

		skb->_skb_refdst = 0UL;
		err = ip_frag_reasm(qp, prev, dev);
		skb->_skb_refdst = orefdst;
		return err;
	}
```

Let's take a look at the ip_frag_reasm() method:

```c
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev)
{
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
	struct iphdr *iph;
	struct sk_buff *fp, *head = qp->q.fragments;
	int len;
	...
	/* Allocate a new buffer for the datagram. */
	ihlen = ip_hdrlen(head);
	len = ihlen + qp->q.len;

	err = -E2BIG;
	if (len > 65535)
		goto out_oversize;
	...
	skb_push(head, head->data - skb_network_header(head));
```

## Forwarding

The main handler for forwarding a packet is the ip_forward() method:

```c
int ip_forward(struct sk_buff *skb)
{
	struct iphdr *iph;	/* Our header */
	struct rtable *rt;	/* Route we use */
	struct ip_options *opt	= &(IPCB(skb)->opt);
```

I should describe why Large Receive Offload (LRO) packets are dropped in forwarding. LRO is a performance-optimization technique that merges packets together into one large SKB before they are passed to higher network layers. This reduces CPU overhead and thus improves performance. Forwarding a large SKB that was built by LRO is not acceptable, because it would be larger than the outgoing MTU. Therefore, when LRO is enabled, the SKB is freed and the method returns NET_RX_DROP. The Generic Receive Offload (GRO) design included forwarding ability, but LRO did not:

```c
	if (skb_warn_if_lro(skb))
		goto drop;
```

If the router_alert option is set, the ip_call_ra_chain() method should be invoked to handle the packet. When calling setsockopt() with IP_ROUTER_ALERT on a raw socket, the socket is added to a global list named ip_ra_chain (see include/net/ip.h). The ip_call_ra_chain() method delivers the packet to all raw sockets. You might wonder why the packet is delivered to all raw sockets and not to a single raw socket. The reason is that in raw sockets there are no ports on which the sockets listen, as opposed to TCP or UDP.

In addition, if the pkt_type (which was determined by the eth_type_trans() method, which should be called from the network driver, and which is discussed in Appendix A) is not PACKET_HOST, the packet is discarded:

```c
	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
		return NET_RX_SUCCESS;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;
```

The ttl (Time To Live) field of the IPv4 header is a counter that is decreased by 1 by each forwarding device. If the ttl reaches 0, the packet should be dropped, and a corresponding "Time Exceeded" ICMPv4 message with "TTL Count Exceeded" code should be sent:

```c
	if (ip_hdr(skb)->ttl <= 1)
		goto too_many_hops;
	...
too_many_hops:
	/* Tell the sender its packet died... */
	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
	icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
	...
```
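The ttl decrement itself is performed further down in the function, by the ip_decrease_ttl() method, as you will see shortly. An interesting detail is that ip_decrease_ttl() does not recompute the header checksum from scratch: because ttl occupies the high byte of one 16-bit header word, decrementing it lowers that word by 0x0100, so adding 0x0100 to the stored checksum (with an end-around carry) compensates exactly. Here is a standalone userspace sketch of that trick, verified against a full recomputation; the sample header bytes are invented for illustration:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>	/* htons */

/* Classic 16-bit one's-complement checksum over a buffer. */
static uint16_t ip_checksum(const void *data, size_t len)
{
	const uint16_t *p = data;
	uint32_t sum = 0;

	while (len > 1) {
		sum += *p++;
		len -= 2;
	}
	if (len)
		sum += *(const uint8_t *)p;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t hdr[20] = {
		0x45, 0x00, 0x00, 0x54,	/* version/ihl, tos, tot_len */
		0x12, 0x34, 0x40, 0x00,	/* id, frag_off (DF set) */
		0x40, 0x06, 0x00, 0x00,	/* ttl=64, proto=TCP, checksum=0 */
		0xc0, 0xa8, 0x01, 0x01,	/* saddr 192.168.1.1 */
		0xc0, 0xa8, 0x02, 0x01,	/* daddr 192.168.2.1 */
	};
	uint16_t check = ip_checksum(hdr, sizeof(hdr));
	uint8_t tmp[20];
	uint32_t c;

	memcpy(&hdr[10], &check, 2);

	/* Forward the packet: ttl--, then patch the checksum the way
	 * ip_decrease_ttl() does, instead of recomputing it. */
	hdr[8]--;
	memcpy(&check, &hdr[10], 2);
	c = check + htons(0x0100);
	check = (uint16_t)(c + (c >= 0xffff));	/* end-around carry */
	memcpy(&hdr[10], &check, 2);

	/* Verify against a full recomputation (checksum field zeroed). */
	memcpy(tmp, hdr, 20);
	tmp[10] = tmp[11] = 0;
	assert(check == ip_checksum(tmp, 20));
	printf("incremental checksum update: OK\n");
	return 0;
}
```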
Now a check is performed to see whether both the strict route flag (is_strictroute) and the rt_uses_gateway flag are set; in such a case, strict routing cannot be applied, and a "Destination Unreachable" ICMPv4 message with "Strict Routing Failed" code is sent back:

```c
	rt = skb_rtable(skb);

	if (opt->is_strictroute && rt->rt_uses_gateway)
		goto sr_failed;
	...
sr_failed:
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
	goto drop;
	...
```

Now a check is performed to see whether the length of the packet is larger than the MTU of the outgoing device. If it is, the packet is not permitted to be sent as it is. Another check is performed to see whether the DF (Don't Fragment) flag in the IPv4 header is set and whether the local_df flag in the SKB is not set. If these conditions are met, it means that when the packet reaches the ip_output() method, it will not be fragmented with the ip_fragment() method. The packet cannot be sent as is, and it also cannot be fragmented; so a "Destination Unreachable" ICMPv4 message with "Fragmentation Needed" code is sent back, the packet is dropped, and the statistics (IPSTATS_MIB_FRAGFAILS) are updated:

```c
	if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
		IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(dst_mtu(&rt->dst)));
		goto drop;
	}
```

Because the ttl and checksum of the IPv4 header are going to be changed, a private copy of the SKB should be kept:

```c
	/* We are about to mangle packet. Copy it! */
	if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
		goto drop;
	iph = ip_hdr(skb);
```

As mentioned earlier, each node that forwards the packet should decrease the ttl. As a result of the ttl change, the checksum is also updated accordingly in the ip_decrease_ttl() method:

```c
	/* Decrease ttl after skb cow done */
	ip_decrease_ttl(iph);
```

Next, an ICMPv4 Redirect message may be sent back: if the RTCF_DOREDIRECT flag of the routing entry is set, a Redirect message with "Redirect To Host" code is sent (I discuss ICMPv4 Redirect messages in Chapter 5):

```c
	/*
	 *	We now generate an ICMP HOST REDIRECT giving the route
	 *	we calculated.
	 */
	if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
		ip_rt_send_redirect(skb);
```

The skb->priority in the Tx path is set to be the socket priority (sk->sk_priority); see, for example, the ip_queue_xmit() method. The socket priority, in turn, can be set by calling the setsockopt() system call with SOL_SOCKET and SO_PRIORITY. However, when forwarding a packet, there is no socket attached to the SKB. So, in the ip_forward() method, skb->priority is set according to a special table called ip_tos2prio. This table has 16 entries (see include/net/route.h).

```c
	skb->priority = rt_tos2priority(iph->tos);
```

Now, assuming that there are no netfilter NF_INET_FORWARD hooks, the ip_forward_finish() method is invoked:

```c
	return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
		       rt->dst.dev, ip_forward_finish);
```

In ip_forward_finish(), the statistics are updated, and a check is made as to whether the IPv4 header includes IP options. If it does, the ip_forward_options() method is invoked to handle the options; in any case, the dst_output() method is then called.
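Backing up for a moment to the priority assignment mentioned above: for locally generated traffic there is a socket, and an application can set sk->sk_priority itself. A minimal hedged sketch (the priority value 6 is an arbitrary example; values 0 through 6 are allowed without CAP_NET_ADMIN):

```c
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int prio = 6;	/* 0..6 allowed without CAP_NET_ADMIN */

	if (setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio)) < 0)
		perror("setsockopt(SO_PRIORITY)");
	/* SKBs sent on fd now get skb->priority == 6 in the Tx path. */
	return 0;
}
```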
All that the dst_output() method does is invoke skb_dst(skb)->output(skb). Here is ip_forward_finish():

```c
static int ip_forward_finish(struct sk_buff *skb)
{
	struct ip_options *opt	= &(IPCB(skb)->opt);

	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);

	if (unlikely(opt->optlen))
		ip_forward_options(skb);

	return dst_output(skb);
}
```

In this section you learned about the methods for forwarding packets (ip_forward() and ip_forward_finish()), about cases when a packet is discarded in forwarding, about cases when an ICMP Redirect is sent, and more.

## Summary

This chapter dealt with the IPv4 protocol: how an IPv4 packet is built, the IPv4 header structure and IP options, and how they are handled. You learned how the IPv4 protocol handler is registered. You also learned about the Rx path (how the reception of IPv4 packets is handled) and about the Tx path in IPv4 (how the transmission of IPv4 packets is handled). There are cases when packets are larger than the network interface MTU, and as a result they can't be sent without being fragmented on the sender side and later defragmented on the receiver side. You learned about the implementation of fragmentation in IPv4 (including how the slow path and the fast path are implemented and when they are used) and the implementation of defragmentation in IPv4. The chapter also covered IPv4 forwarding: sending an incoming packet out on a different network interface without passing it to the upper layer. And you saw some examples of when a packet is discarded in the forwarding process and when an ICMP Redirect is sent. The next chapter discusses the IPv4 routing subsystem. The "Quick Reference" section that follows covers the top methods that are related to the topics discussed in this chapter, ordered by their context.

## Quick Reference

I conclude this chapter with a short list of important methods and macros of the IPv4 subsystem that were mentioned in this chapter.

### Methods

The following is a short list of important methods of the IPv4 layer that were mentioned in this chapter.

#### int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl);

This method moves packets from L4 (the transport layer) to L3 (the network layer); it is invoked, for example, from TCPv4.

#### int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags);

This method moves packets from L4 (the transport layer) to L3 (the network layer); it is invoked, for example, from UDPv4 when working with corked UDP sockets and from ICMPv4.

#### struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags);

This method was added in kernel 2.6.39 to enable the lockless transmit fast path in the UDPv4 implementation; it is called when the UDP_CORK socket option is not used.

#### int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb);

This method is a generic method for copying data from userspace into the specified SKB.

#### static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb);

This method is the ICMPv4 getfrag callback.
The ICMPv4 module calls the ip_append_data() method with icmp_glue_bits() as the getfrag callback.

#### int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb);

This method builds an ip_options object by parsing IP options.

#### void ip_options_fragment(struct sk_buff *skb);

This method fills the options whose copied flag is not set with NOOPs and resets the corresponding fields of these IP options. It is invoked only for the first fragment.

#### void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt, int is_frag);

This method takes the specified ip_options object and writes its content to the IPv4 header. The last parameter, is_frag, is in practice 0 in all invocations of the ip_options_build() method.

#### void ip_forward_options(struct sk_buff *skb);

This method handles IP options forwarding.

#### int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);

This method is the main Rx handler for IPv4 packets.

#### ip_rcv_options(struct sk_buff *skb);

This method is the main method for handling the reception of a packet with options.

#### int ip_options_rcv_srr(struct sk_buff *skb);

This method handles receiving a packet with the strict route option.

#### int ip_forward(struct sk_buff *skb);

This method is the main handler for forwarding IPv4 packets.

#### static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, int vifi);

This method is the multicast transmission method.

#### static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, void *from, size_t length, struct rtable **rtp, unsigned int flags);

This method is used by raw sockets for transmission when the IP_HDRINCL socket option is set. It calls the dst_output() method directly.

#### int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

This method is the main fragmentation method.

#### int ip_defrag(struct sk_buff *skb, u32 user);

This method is the main defragmentation method. It processes an incoming IP fragment. The second parameter, user, indicates where this method was invoked from. For a full list of possible values for the second parameter, look in the ip_defrag_users enum definition in include/net/ip.h.

#### bool skb_has_frag_list(const struct sk_buff *skb);

This method returns true if skb_shinfo(skb)->frag_list is not NULL. The method skb_has_frag_list() was named skb_has_frags() in the past and was renamed skb_has_frag_list() in kernel 2.6.37, because the old name was confusing. SKBs can be fragmented in two ways: via a page array (called skb_shinfo(skb)->frags[]) and via a list of SKBs (called skb_shinfo(skb)->frag_list). Because skb_has_frags() tests the latter, its name was confusing, as it sounds more like it's testing the former.

#### int ip_local_deliver(struct sk_buff *skb);

This method handles delivering packets to Layer 4.

#### int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp, unsigned char __user *data, int optlen);

This method handles setting options from userspace by the setsockopt() system call with IP_OPTIONS.

#### bool ip_is_fragment(const struct iphdr *iph);

This method returns true if the packet is a fragment.
#### int ip_decrease_ttl(struct iphdr *iph);

This method decrements the ttl of the specified IPv4 header by 1 and, because one of the IPv4 header fields has changed (ttl), recalculates the IPv4 header checksum.

#### int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt);

This method is used by TCPv4 to send SYN ACK. See the tcp_v4_send_synack() method in net/ipv4/tcp_ipv4.c.

#### int ip_mr_input(struct sk_buff *skb);

This method handles incoming multicast packets.

#### int ip_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *cache, int local);

This method forwards multicast packets.

#### bool ip_call_ra_chain(struct sk_buff *skb);

This method handles the Router Alert IP option.

### Macros

This section mentions some macros from this chapter that deal with mechanisms encountered in the IPv4 stack, such as fragmentation, netfilter hooks, and IP options.

#### IPCB(skb)

This macro returns the inet_skb_parm object that skb->cb points to. It is used to access the ip_options object stored in the inet_skb_parm object (include/net/ip.h).

#### FRAG_CB(skb)

This macro returns the ipfrag_skb_cb object that skb->cb points to (net/ipv4/ip_fragment.c).

#### int NF_HOOK(uint8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct sk_buff *))

This macro is the netfilter hook. The first parameter, pf, is the protocol family; for IPv4 it is NFPROTO_IPV4, and for IPv6 it is NFPROTO_IPV6. The second parameter is one of the five netfilter hook points in the network stack; these five points are defined in include/uapi/linux/netfilter.h and can be used both by IPv4 and by IPv6. The okfn callback is called if there is no hook registered or if the registered netfilter hook does not discard or reject the packet.

#### int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct sk_buff *), bool cond)

This macro is the same as the NF_HOOK() macro, but with an additional Boolean parameter, cond, which must be true for the netfilter hook to be called.

#### IPOPT_COPIED()

This macro returns the copied flag of the option type.

# 5. The IPv4 Routing Subsystem

Chapter 4 discussed the IPv4 subsystem. In this chapter and the next I discuss one of the most important Linux subsystems, the routing subsystem, and its implementation in Linux. The Linux routing subsystem is used in a wide range of routers, from home and small office routers to enterprise routers (which connect organizations or ISPs) and core high-speed routers on the Internet backbone. It is impossible to imagine the modern world without these devices. The discussion in these two chapters is limited to the IPv4 routing subsystem, which is very similar to the IPv6 implementation. This chapter is mainly an introduction and presents the main data structures that are used by the IPv4 routing subsystem, like the routing tables, the Forwarding Information Base (FIB) info and the FIB alias, the FIB TRIE, and more. (TRIE is not an acronym, by the way; it is derived from the word retrieval.) The TRIE is a data structure, a special tree that replaced the FIB hash table.
You will learn how a lookup in the routing subsystem is performed, how and when ICMP Redirect messages are generated, and about the removal of the routing cache code. Note that the discussion and the code examples in this chapter relate to kernel 3.9, except for two sections where a different kernel version is explicitly mentioned.

## Forwarding and the FIB

One of the important goals of the Linux networking stack is to forward traffic. This is especially relevant when discussing core routers, which operate on the Internet backbone. The Linux IP stack layer responsible for forwarding packets and maintaining the forwarding database is called the routing subsystem. For small networks, management of the FIB can be done by a system administrator, because most of the network topology is static. When discussing core routers, the situation is a bit different, as the topology is dynamic and there is a vast amount of ever-changing information. In this case, management of the FIB is usually done by userspace routing daemons, sometimes in conjunction with special hardware enhancements. These userspace daemons usually maintain routing tables of their own, which sometimes interact with the kernel routing tables.

Let's start with the basics: what is routing? Take a look at a very simple forwarding example: you have two Ethernet Local Area Networks, LAN1 and LAN2. On LAN1 you have a subnet of 192.168.1.0/24, and on LAN2 you have a subnet of 192.168.2.0/24. There is a machine between these two LANs, which will be called a "forwarding router." There are two Ethernet network interface cards (NICs) in the forwarding router. The network interface connected to LAN1 is eth0 and has an IP address of 192.168.1.200, and the network interface connected to LAN2 is eth1 and has an IP address of 192.168.2.200, as you can see in Figure 5-1. For the sake of simplicity, let's assume that no firewall daemon runs on the forwarding router. You start sending traffic from LAN1 that is destined to LAN2. The process of forwarding incoming packets, which are sent from LAN1 and destined to LAN2 (or vice versa), according to data structures that are called routing tables, is called routing.
I discuss this process and the routing table data structures in this chapter and in the next as well.

In Figure 5-1, packets that arrive on eth0 from LAN1 and are destined to LAN2 are forwarded via eth1 as the outgoing device. In this process, the incoming packets move from Layer 2 (the link layer) in the kernel networking stack to Layer 3, the network layer, in the forwarding router machine. As opposed to the case where the traffic is destined to the forwarding router machine ("Traffic to me"), however, there is no need to move the packets to Layer 4 (the transport layer), because this traffic is not intended to be handled by any Layer 4 transport socket. This traffic should be forwarded. Moving to Layer 4 has a performance cost, which is better avoided whenever possible. This traffic is handled in Layer 3, and, according to the routing tables configured on the forwarding router machine, packets are forwarded on eth1 as the outgoing interface (or rejected).

Figure 5-1. Forwarding packets between two LANs

Figure 5-2 shows the three network layers handled by the kernel that were mentioned earlier.

Figure 5-2. The three layers that are handled by the networking kernel stack

Two additional terms that I should mention here, which are commonly used in routing, are default gateway and default route. When you define a default gateway entry in a routing table, every packet that is not handled by the other routing entries (if there are such entries) must be forwarded to it, regardless of the destination address in the IP header of the packet. The default route is designated as 0.0.0.0/0 in Classless Inter-Domain Routing (CIDR) notation. As a simple example, you can add a machine with an IPv4 address of 192.168.2.1 as a default gateway as follows:

```
ip route add default via 192.168.2.1
```

Or, when using the route command, like this:

```
route add default gateway 192.168.2.1
```

In this section you learned what forwarding is and saw a simple example illustrating how packets are forwarded between two LANs. You also learned what a default gateway and a default route are, and how to add them. Now that you know the basic terminology and what forwarding is, let's move on and see how a lookup in the routing subsystem is performed.

## Performing a Lookup in the Routing Subsystem

A lookup in the routing subsystem is done for each packet, both in the Rx path and in the Tx path. In kernels prior to 3.6, each lookup, both in the Rx path and in the Tx path, consisted of two phases: a lookup in the routing cache and, in case of a cache miss, a lookup in the routing tables (I discuss the routing cache at the end of this chapter, in the "IPv4 Routing Cache" section). A lookup is done by the fib_lookup() method. When the fib_lookup() method finds a proper entry in the routing subsystem, it builds a fib_result object, which consists of various routing parameters, and it returns 0. I discuss the fib_result object in this section and in other sections of this chapter. Here is the fib_lookup() prototype:

```c
int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res)
```

The flowi4 object consists of fields that are important to the IPv4 routing lookup process, including the destination address, source address, Type of Service (TOS), and more. In fact, the flowi4 object defines the key for the lookup in the routing tables and should be initialized prior to performing a lookup with the fib_lookup() method.
For IPv6 there is a parallel object named flowi6; both are defined in include/net/flow.h. The fib_result object is built in the IPv4 lookup process. The fib_lookup() method first searches the local FIB table. If the lookup fails, it performs a lookup in the main FIB table (I describe these two tables in the next section, "FIB Tables"). After a lookup is successfully done, either in the Rx path or the Tx path, a dst object is built (an instance of the dst_entry structure, the destination cache, defined in include/net/dst.h). The dst object is embedded in a structure called rtable, as you will soon see. The rtable object, in fact, represents a routing entry, which can be associated with an SKB. The most important members of the dst_entry object are two callbacks named input and output. In the routing lookup process, these callbacks are assigned the proper handlers according to the routing lookup result. These two callbacks get only an SKB as a parameter:

```c
struct dst_entry {
	...
	int			(*input)(struct sk_buff *);
	int			(*output)(struct sk_buff *);
	...
}
```

The following is the rtable structure; as you can see, the dst object is the first member of this structure:

```c
struct rtable {
	struct dst_entry	dst;

	int			rt_genid;
	unsigned int		rt_flags;
	__u16			rt_type;
	__u8			rt_is_input;
	__u8			rt_uses_gateway;

	int			rt_iif;

	/* Info on neighbour */
	__be32			rt_gateway;

	/* Miscellaneous cached information */
	u32			rt_pmtu;

	struct list_head	rt_uncached;
};
```

(include/net/route.h)

The following is a description of the members of the rtable structure:

* rt_flags: The rtable object flags; some of the important flags are mentioned here:

  * RTCF_BROADCAST: When set, the destination address is a broadcast address. This flag is set in the __mkroute_output() method and in the ip_route_input_slow() method.

  * RTCF_MULTICAST: When set, the destination address is a multicast address. This flag is set in the ip_route_input_mc() method and in the __mkroute_output() method.

  * RTCF_DOREDIRECT: When set, an ICMPv4 Redirect message should be sent as a response to an incoming packet. Several conditions must be fulfilled for this flag to be set, including that the input device and the output device are the same and that the corresponding procfs send_redirects entry is set. There are more conditions, as you will see later in this chapter. This flag is set in the __mkroute_input() method.

  * RTCF_LOCAL: When set, the destination address is local. This flag is set in the following methods: ip_route_input_slow(), __mkroute_output(), ip_route_input_mc(), and __ip_route_output_key(). Some of the RTCF_XXX flags can be set simultaneously; for example, RTCF_LOCAL can be set when RTCF_BROADCAST or RTCF_MULTICAST is set. For the complete list of RTCF_XXX flags, look in include/uapi/linux/in_route.h. Note that some of them are unused.

* rt_is_input: A flag that is set to 1 when this is an input route.

* rt_uses_gateway: Gets a value according to the following: when the nexthop is a gateway, rt_uses_gateway is 1; when the nexthop is a direct route, rt_uses_gateway is 0.

* rt_iif: The ifindex of the incoming interface. (Note that the rt_oif member was removed from the rtable structure in kernel 3.6; it was set to the oif of the specified flow key but was in fact used in only one method.)

* rt_pmtu: The Path MTU (the smallest MTU along the route).
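Before continuing with these data structures, note that the whole lookup machinery can be exercised from userspace with an RTM_GETROUTE netlink request, which is what the ip route get command issues. The following is a hedged, minimal sketch (the queried destination 8.8.8.8 is an arbitrary example, and error handling is mostly omitted):

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <linux/rtnetlink.h>
#include <sys/socket.h>

int main(void)
{
	struct {
		struct nlmsghdr nh;
		struct rtmsg	rt;
		char		attrs[64];
	} req;
	char buf[8192];
	struct nlmsghdr *ans;
	struct rtattr *rta;
	ssize_t n;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nh.nlmsg_type = RTM_GETROUTE;
	req.nh.nlmsg_flags = NLM_F_REQUEST;
	req.rt.rtm_family = AF_INET;
	req.rt.rtm_dst_len = 32;

	/* RTA_DST plays the role of the daddr field of the flowi4 key. */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(4);
	inet_pton(AF_INET, "8.8.8.8", RTA_DATA(rta));
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_LENGTH(4);

	send(fd, &req, req.nh.nlmsg_len, 0);
	n = recv(fd, buf, sizeof(buf), 0);
	ans = (struct nlmsghdr *)buf;
	if (n > 0 && ans->nlmsg_type == RTM_NEWROUTE) {
		struct rtmsg *rtm = NLMSG_DATA(ans);
		int len = RTM_PAYLOAD(ans);
		char addr[INET_ADDRSTRLEN];

		for (rta = RTM_RTA(rtm); RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
			if (rta->rta_type == RTA_GATEWAY) {
				inet_ntop(AF_INET, RTA_DATA(rta), addr, sizeof(addr));
				printf("gateway: %s\n", addr);
			} else if (rta->rta_type == RTA_OIF) {
				printf("oif: %d\n", *(int *)RTA_DATA(rta));
			}
		}
	}
	close(fd);
	return 0;
}
```

The attributes of the RTM_NEWROUTE reply (RTA_GATEWAY, RTA_OIF, and so on) correspond to routing parameters that the kernel collected into the fib_result and rtable objects described in this section.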
Note that in kernel 3.6, the fib_compute_spec_dst() method was added, which gets an SKB as a parameter. This method made the rt_spec_dst member of the rtable structure unneeded, and rt_spec_dst was removed from the rtable structure as a result. The fib_compute_spec_dst() method is needed in special cases, such as in the icmp_reply() method, when replying to the sender using its source address as the destination for the reply.

For incoming unicast packets destined to the local host, the input callback of the dst object is set to ip_local_deliver(), and for incoming unicast packets that should be forwarded, this input callback is set to ip_forward(). For a packet generated on the local machine and sent away, the output callback is set to ip_output(). For a multicast packet, the input callback can be set to ip_mr_input() (under some conditions not detailed in this chapter). There are cases when the input callback is set to ip_error(), as you will see later in the PROHIBIT rule example in this chapter. Let's take a look at the fib_result object:

```c
struct fib_result {
	unsigned char	prefixlen;
	unsigned char	nh_sel;
	unsigned char	type;
	unsigned char	scope;
	u32		tclassid;
	struct fib_info *fi;
	struct fib_table *table;
	struct list_head *fa_head;
};
```

(include/net/ip_fib.h)

* prefixlen: The prefix length, which represents the netmask. Its values are in the range 0 to 32. It is 0 when using the default route. When adding, for example, a routing entry by ip route add 192.168.2.0/24 dev eth0, the prefixlen is 24, according to the netmask that was specified when adding the entry. The prefixlen is set in the check_leaf() method (net/ipv4/fib_trie.c).

* nh_sel: The nexthop number. When working with one nexthop only, it is 0. When working with Multipath Routing, there can be more than one nexthop. The nexthop objects are stored in an array in the routing entry (inside the fib_info object), as discussed in the next section.

* type: The type of the fib_result object is the most important field, because it determines how to handle the packet: whether to forward it to a different machine, deliver it locally, discard it silently, discard it while replying with an ICMPv4 message, and so on. The type of the fib_result object is determined according to the packet content (most notably the destination address) and according to routing rules set by the administrator, routing daemons, or a Redirect message. You will see how the type of the fib_result object is determined in the lookup process later in this chapter and in the next. The two most common types of fib_result objects are the RTN_UNICAST type, which is set when the packet is to be forwarded via a gateway or a direct route, and the RTN_LOCAL type, which is set when the packet is for the local host. Other types you will encounter in this book are the RTN_BROADCAST type, for packets that should be accepted locally as broadcasts; the RTN_MULTICAST type, for multicast routes; the RTN_UNREACHABLE type, for packets which trigger sending back an ICMPv4 "Destination Unreachable" message; and more. There are 12 route types in all. For a complete list of all available route types, see include/uapi/linux/rtnetlink.h.

* fi: A pointer to a fib_info object, which represents a routing entry. The fib_info object holds a reference to the nexthop (fib_nh). I discuss the FIB info structure in the section "FIB Info" later in this chapter.

* table: A pointer to the FIB table on which the lookup is done.
In this section you learned how a lookup in the routing subsystem is performed. You also found out about important data structures that relate to the routing lookup process, like fib_result and rtable. The next section discusses how the FIB tables are organized.

## FIB Tables

The main data structure of the routing subsystem is the routing table, which is represented by the fib_table structure. A routing table can be described, in a somewhat simplified way, as a table of entries where each entry determines which nexthop should be chosen for traffic destined to a subnet (or to a specific IPv4 destination address). Each entry has other parameters, of course, discussed later in this chapter. Each routing entry contains a fib_info object (include/net/ip_fib.h), which stores the most important routing entry parameters (but not all, as you will see later in this chapter). The fib_info object is created by the fib_create_info() method (net/ipv4/fib_semantics.c) and is stored in a hash table named fib_info_hash. When the route uses prefsrc, the fib_info object is also added to a hash table named fib_info_laddrhash.

There is a global counter of fib_info objects named fib_info_cnt, which is incremented when creating a fib_info object, by the fib_create_info() method, and decremented when freeing a fib_info object, by the free_fib_info() method. The hash table is dynamically resized when it grows over some threshold. A lookup in the fib_info_hash hash table is done by the fib_find_info() method (it returns NULL when no entry is found). Serializing access to the fib_info members is done by a spinlock named fib_info_lock. Here's the fib_table structure:

```c
struct fib_table {
    struct hlist_node tb_hlist;
    u32 tb_id;
    int tb_default;
    int tb_num_default;
    unsigned long tb_data[0];
};
```

(include/net/ip_fib.h)

 * tb_id: The table identifier. For the main table, tb_id is 254 (RT_TABLE_MAIN), and for the local table, tb_id is 255 (RT_TABLE_LOCAL). I talk about the main table and the local table soon—for now, just note that when working without Policy Routing, only these two FIB tables, the main table and the local table, are created at boot.

 * tb_num_default: The number of default routes in the table. The fib_trie_table() method, which creates a table, initializes tb_num_default to 0. Adding a default route increments tb_num_default by 1, in the fib_table_insert() method. Deleting a default route decrements tb_num_default by 1, in the fib_table_delete() method.

 * tb_data[0]: A placeholder for a routing entry (trie) object.

This section covered how a FIB table is implemented. Next you will learn about the FIB info, which represents a single routing entry.

### FIB Info

A routing entry is represented by a fib_info structure.
It consists of important routing entry parameters, such as the outgoing network device (fib_dev), the priority (fib_priority), the routing protocol identifier of this route (fib_protocol), and more. Let's take a look at the fib_info structure:

```c
struct fib_info {
    struct hlist_node fib_hash;
    struct hlist_node fib_lhash;
    struct net *fib_net;
    int fib_treeref;
    atomic_t fib_clntref;
    unsigned int fib_flags;
    unsigned char fib_dead;
    unsigned char fib_protocol;
    unsigned char fib_scope;
    unsigned char fib_type;
    __be32 fib_prefsrc;
    u32 fib_priority;
    u32 *fib_metrics;
#define fib_mtu fib_metrics[RTAX_MTU-1]
#define fib_window fib_metrics[RTAX_WINDOW-1]
#define fib_rtt fib_metrics[RTAX_RTT-1]
#define fib_advmss fib_metrics[RTAX_ADVMSS-1]
    int fib_nhs;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
    int fib_power;
#endif
    struct rcu_head rcu;
    struct fib_nh fib_nh[0];
#define fib_dev fib_nh[0].nh_dev
};
```

(include/net/ip_fib.h)

 * fib_net: The network namespace the fib_info object belongs to.

 * fib_treeref: A reference counter that represents the number of fib_alias objects which hold a reference to this fib_info object. This reference counter is incremented in the fib_create_info() method and decremented in the fib_release_info() method. Both methods are in net/ipv4/fib_semantics.c.

 * fib_clntref: A reference counter that is incremented by the fib_create_info() method (net/ipv4/fib_semantics.c) and decremented by the fib_info_put() method (include/net/ip_fib.h). If, after being decremented by 1 in the fib_info_put() method, it reaches zero, then the associated fib_info object is freed by the free_fib_info() method.

 * fib_dead: A flag that indicates whether it is permitted to free the fib_info object with the free_fib_info() method; fib_dead must be set to 1 before calling the free_fib_info() method. If the fib_dead flag is not set (its value is 0), the object is considered alive, and trying to free it with the free_fib_info() method will fail.

 * fib_protocol: The routing protocol identifier of this route. When adding a routing rule from userspace without specifying the routing protocol ID, fib_protocol is assigned RTPROT_BOOT. The administrator may add a route with the "proto static" modifier, which indicates that the route was added by an administrator; this can be done, for example, like this: ip route add proto static 192.168.5.3 via 192.168.2.1. fib_protocol can be assigned one of these flags:

   * RTPROT_UNSPEC: An error value.

   * RTPROT_REDIRECT: When set, the routing entry was created as a result of receiving an ICMP Redirect message. The RTPROT_REDIRECT protocol identifier is used only in IPv6.

   * RTPROT_KERNEL: When set, the routing entry was created by the kernel (for example, when creating the local IPv4 routing table, explained shortly).

   * RTPROT_BOOT: When set, the admin added a route without specifying the "proto static" modifier.

   * RTPROT_STATIC: A route installed by the system administrator.

   * RTPROT_RA: Don't misread this—this protocol identifier is not for Router Alert; it is for RDISC/ND Router Advertisements, and it is used in the kernel by the IPv6 subsystem only; see net/ipv6/route.c. I discuss it in Chapter 8.

A routing entry could also be added by userspace routing daemons, like ZEBRA, XORP, MROUTED, and more. It will then be assigned the corresponding value from a list of protocol identifiers (see the RTPROT_XXX definitions in include/uapi/linux/rtnetlink.h).
For example, for the XORP daemon it will be RTPROT_XORP. Note that these flags (like RTPROT_KERNEL or RTPROT_STATIC) are also used by IPv6, for the parallel field (the rt6i_protocol field in the rt6_info structure; the rt6_info object is the IPv6 parallel to the rtable object).

 * fib_scope: The scope of the destination address. In short, scopes are assigned to addresses and routes, and a scope indicates the distance of the host from other nodes. The ip address show command shows the scopes of all configured IP addresses on a host, and the ip route show command displays the scopes of all the route entries of the main table. A scope can be one of these:

   * host (RT_SCOPE_HOST): The node cannot communicate with the other network nodes. The loopback address has scope host.

   * global (RT_SCOPE_UNIVERSE): The address can be used anywhere. This is the most common case.

   * link (RT_SCOPE_LINK): This address can be accessed only from directly attached hosts.

   * site (RT_SCOPE_SITE): This is used in IPv6 only (I discuss it in Chapter 8).

   * nowhere (RT_SCOPE_NOWHERE): The destination doesn't exist.

When a route is added by an administrator without specifying a scope, the fib_scope field is assigned a value according to these rules:

   * global scope (RT_SCOPE_UNIVERSE): For all gatewayed unicast routes.

   * scope link (RT_SCOPE_LINK): For direct unicast and broadcast routes.

   * scope host (RT_SCOPE_HOST): For local routes.

 * fib_type: The type of the route. The fib_type field was added to the fib_info structure as a key to make sure there is differentiation among fib_info objects by their type. The fib_type field was added to the fib_info structure in kernel 3.7; originally this type was stored only in the fa_type field of the FIB alias object (fib_alias). You can add a rule to block traffic according to a specified category, for example, by ip route add prohibit 192.168.1.17 from 192.168.2.103:

   * The fib_type of the generated fib_info object is RTN_PROHIBIT.

   * Sending traffic from 192.168.2.103 to 192.168.1.17 results in an ICMPv4 message of "Packet Filtered" (ICMP_PKT_FILTERED).

 * fib_prefsrc: There are cases when you want to provide a specific source address to the lookup key. This is done by setting fib_prefsrc.

 * fib_priority: The priority of the route. By default it is 0, which is the highest priority; the higher the value, the lower the priority. A route with priority 3, for example, has a lower priority than a route with priority 0. You can configure it with the ip command in one of the following ways:

   * ip route add 192.168.1.10 via 192.168.2.1 metric 5

   * ip route add 192.168.1.10 via 192.168.2.1 priority 5

   * ip route add 192.168.1.10 via 192.168.2.1 preference 5

Each of these three commands sets fib_priority to 5; there is no difference at all between them. Moreover, the metric parameter of the ip route command is not related in any way to the fib_metrics field of the fib_info structure.

 * fib_mtu, fib_window, fib_rtt, and fib_advmss simply give more convenient names to commonly used elements of the fib_metrics array.

fib_metrics is an array of 15 (RTAX_MAX) elements consisting of various metrics. It is initialized to dst_default_metrics in net/core/dst.c. Many metrics are related to the TCP protocol, such as the Initial Congestion Window (initcwnd) metric. Table 5-1, at the end of the chapter, shows all the available metrics and indicates whether each is a TCP-related metric.
From userspace, the TCPv4 initcwnd metric can be set, for example, like this:

```
ip route add 192.168.1.0/24 initcwnd 35
```

There are metrics which are not TCP specific—for example, the mtu metric, which can be set from userspace like this:

```
ip route add 192.168.1.0/24 mtu 800
```

or like this:

```
ip route add 192.168.1.0/24 mtu lock 800
```

The difference between the two commands is that when the lock modifier is specified, no Path MTU discovery will be tried; without it, the MTU may be updated by the kernel due to Path MTU discovery. For more about how this is implemented, see the __ip_rt_update_pmtu() method in net/ipv4/route.c. Avoiding the Path MTU update when the mtu lock modifier was specified is achieved by calling the dst_metric_locked() method:

```c
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
    ...
    if (dst_metric_locked(dst, RTAX_MTU))
        return;
    ...
}
```

 * fib_nhs: The number of nexthops. When Multipath Routing (CONFIG_IP_ROUTE_MULTIPATH) is not set, it cannot be more than 1. The Multipath Routing feature sets multiple alternative paths for a route, possibly assigning different weights to these paths. This feature provides benefits such as fault tolerance, increased bandwidth, or improved security (I discuss it in Chapter 6).

 * fib_dev: The network device that will transmit the packet to the nexthop.

 * fib_nh[0]: The fib_nh[0] member represents the nexthop. When working with Multipath Routing, you can define more than one nexthop in a route, and in this case there is an array of nexthops. Defining two nexthop nodes can be done like this, for example: ip route add default scope global nexthop dev eth0 nexthop dev eth1.

As mentioned, when the fib_type is RTN_PROHIBIT, an ICMPv4 message of "Packet Filtered" (ICMP_PKT_FILTERED) is sent. How is this implemented? An array named fib_props, consisting of 12 (RTN_MAX + 1) elements, is defined in net/ipv4/fib_semantics.c. The index into this array is the route type. The available route types, such as RTN_PROHIBIT or RTN_UNICAST, can be found in include/uapi/linux/rtnetlink.h. Each element in the array is an instance of struct fib_prop, which is a very simple structure:

```c
struct fib_prop {
    int error;
    u8 scope;
};
```

(net/ipv4/fib_lookup.h)

For every route type, the corresponding fib_prop object contains the error and the scope for that route. For example, for the RTN_UNICAST route type (gateway or direct route), which is a very common route type, the error value is 0, which means that there is no error, and the scope is RT_SCOPE_UNIVERSE. For the RTN_PROHIBIT route type (a rule which a system administrator configures in order to block traffic), the error is -EACCES, and the scope is RT_SCOPE_UNIVERSE:

```c
const struct fib_prop fib_props[RTN_MAX + 1] = {
    ...
    [RTN_PROHIBIT] = {
        .error = -EACCES,
        .scope = RT_SCOPE_UNIVERSE,
    },
    ...
```

Table 5-2 at the end of this chapter shows all available route types, their error codes, and their scopes.

When you configure a rule like the one mentioned earlier, by ip route add prohibit 192.168.1.17 from 192.168.2.103, and a packet is sent from 192.168.2.103 to 192.168.1.17, the following happens: a lookup in the routing tables is performed in the Rx path. When a corresponding entry, which is in fact a leaf in the FIB TRIE, is found, the check_leaf() method is invoked. This method accesses the fib_props array with the route type of the packet as an index (fa->fa_type):

```c
static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
                      t_key key, const struct flowi4 *flp,
                      struct fib_result *res, int fib_flags)
{
    ...
    fib_alias_accessed(fa);
    err = fib_props[fa->fa_type].error;
    if (err) {
        ...
        return err;
    }
    ...
```

Eventually, the fib_lookup() method, which initiated the lookup in the IPv4 routing subsystem, returns an error of -EACCES (in our case). It propagates all the way back from check_leaf(), via fib_table_lookup() and so on, until it returns to the method which triggered this chain, namely the fib_lookup() method. When the fib_lookup() method returns an error in the Rx path, it is handled by the ip_error() method. According to the error, an action is taken; in the case of -EACCES, an ICMPv4 destination unreachable message with code Packet Filtered (ICMP_PKT_FILTERED) is sent back, and the packet is dropped.
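The error/scope pairs in fib_props come straight from Table 5-2 at the end of this chapter. Here is a standalone userspace model of that table-driven dispatch; the fib_prop structure is redeclared locally and only a few route types are filled in, so this is an illustration rather than the kernel array:

```c
/* Userspace model of the fib_props table described above:
 * route type -> { error, scope }, with values taken from Table 5-2. */
#include <stdio.h>
#include <errno.h>
#include <linux/rtnetlink.h>    /* RTN_*, RT_SCOPE_* */

struct fib_prop {
    int error;
    unsigned char scope;
};

static const struct fib_prop fib_props_model[RTN_MAX + 1] = {
    [RTN_UNICAST]     = { .error = 0,             .scope = RT_SCOPE_UNIVERSE },
    [RTN_LOCAL]       = { .error = 0,             .scope = RT_SCOPE_HOST },
    [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE },
    [RTN_PROHIBIT]    = { .error = -EACCES,       .scope = RT_SCOPE_UNIVERSE },
};

int main(void)
{
    int err = fib_props_model[RTN_PROHIBIT].error;

    if (err)    /* as in check_leaf(): a nonzero error aborts the lookup */
        printf("RTN_PROHIBIT -> error %d (-EACCES)\n", err);
    return 0;
}
```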
This section covered the FIB info, which represents a single routing entry. The next section discusses caching in the IPv4 routing subsystem (not to be confused with the IPv4 routing cache, which was removed from the network stack and is discussed in the "IPv4 Routing Cache" section at the end of this chapter).

### Caching

Caching the results of a routing lookup is an optimization technique that improves the performance of the routing subsystem. The results of a routing lookup are usually cached in the nexthop (fib_nh) object; when the packet is not a unicast packet, or realms are used (the packet itag is not 0), the results are not cached in the nexthop. The reason is that if all types of packets were cached, the same nexthop could be used by different kinds of routes, which should be avoided. There are some minor exceptions to this which I do not discuss in this chapter. Caching in the Rx and Tx paths is performed as follows:

 * In the Rx path, caching the fib_result object in the nexthop (fib_nh) object is done by setting the nh_rth_input field of the nexthop (fib_nh) object.

 * In the Tx path, caching the fib_result object in the nexthop (fib_nh) object is done by setting the nh_pcpu_rth_output field of the nexthop (fib_nh) object.

 * Both nh_rth_input and nh_pcpu_rth_output are instances of the rtable structure.

 * Caching the fib_result is done by the rt_cache_route() method in both the Rx and Tx paths (net/ipv4/route.c).

 * Caching of Path MTU and ICMPv4 redirects is done with FIB exceptions.

For performance, nh_pcpu_rth_output is a per-CPU variable, meaning there is a copy of the output dst entry for each CPU. Caching is almost always used; the few exceptions are when an ICMPv4 Redirect message is sent, when itag (tclassid) is set, or when there is not enough memory.
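The design choice behind the per-CPU nh_pcpu_rth_output can be sketched in a few lines: giving each CPU its own cached output route means a CPU never writes to another CPU's slot, so no lock is needed on this fast path. The following is a standalone userspace model with a fixed CPU count and hypothetical *_model names, not the kernel's per-CPU infrastructure:

```c
/* Userspace model of the per-CPU output route cache idea described above:
 * one cached entry per CPU, so CPUs never contend for the same slot. */
#include <stdio.h>

#define NR_CPUS_MODEL 4

struct rtable_model { unsigned int dst; };

/* models fib_nh->nh_pcpu_rth_output */
static struct rtable_model *nh_pcpu_rth_output_model[NR_CPUS_MODEL];

static void cache_output_route(int cpu, struct rtable_model *rt)
{
    /* each CPU touches only its own slot: no locking required */
    nh_pcpu_rth_output_model[cpu] = rt;
}

int main(void)
{
    static struct rtable_model rt = { .dst = 0xc0a80207 }; /* 192.168.2.7 */

    cache_output_route(0, &rt); /* as if running on CPU 0 */
    printf("CPU0 cached dst 0x%x\n", nh_pcpu_rth_output_model[0]->dst);
    return 0;
}
```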
In this section you have learned how caching is done using the nexthop object. The next section discusses the fib_nh structure, which represents the nexthop, and the FIB nexthop exceptions.

### Nexthop (fib_nh)

The fib_nh structure represents the nexthop. It consists of information such as the outgoing nexthop network device (nh_dev), the outgoing nexthop interface index (nh_oif), the scope (nh_scope), and more. Let's take a look:

```c
struct fib_nh {
    struct net_device *nh_dev;
    struct hlist_node nh_hash;
    struct fib_info *nh_parent;
    unsigned int nh_flags;
    unsigned char nh_scope;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
    int nh_weight;
    int nh_power;
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
    __u32 nh_tclassid;
#endif
    int nh_oif;
    __be32 nh_gw;
    __be32 nh_saddr;
    int nh_saddr_genid;
    struct rtable __rcu * __percpu *nh_pcpu_rth_output;
    struct rtable __rcu *nh_rth_input;
    struct fnhe_hash_bucket *nh_exceptions;
};
```

(include/net/ip_fib.h)

The nh_dev field represents the network device (net_device object) on which traffic to the nexthop will be transmitted. When a network device associated with one or more routes is disabled, a NETDEV_DOWN notification is sent. The FIB callback for handling this event is the fib_netdev_event() method; it is the callback of the fib_netdev_notifier notifier object, which is registered in the ip_fib_init() method by calling the register_netdevice_notifier() method (notification chains are discussed in Chapter 14). The fib_netdev_event() method calls the fib_disable_ip() method upon receiving a NETDEV_DOWN notification. In the fib_disable_ip() method, the following steps are performed:

 * First, the fib_sync_down_dev() method is called (net/ipv4/fib_semantics.c). In the fib_sync_down_dev() method, the RTNH_F_DEAD flag is set in the nexthop flags (nh_flags) and in the FIB info flags (fib_flags).

 * The routes are flushed by the fib_flush() method.

 * The rt_cache_flush() method and the arp_ifdown() method are invoked. The arp_ifdown() method is not on any notifier chain.

#### FIB Nexthop Exceptions

FIB nexthop exceptions were added in kernel 3.6 to handle cases when a routing entry is changed not as a result of a userspace action, but as a result of an ICMPv4 Redirect message or as a result of Path MTU discovery. The hash key is the destination address. The FIB nexthop exceptions are based on a 2048-entry hash table; reclaiming (freeing hash entries) starts at a chain depth of 5. Each nexthop object (fib_nh) has a FIB nexthop exceptions hash table, nh_exceptions (an instance of the fnhe_hash_bucket structure). Let's take a look at the fib_nh_exception structure:

```c
struct fib_nh_exception {
    struct fib_nh_exception __rcu *fnhe_next;
    __be32 fnhe_daddr;
    u32 fnhe_pmtu;
    __be32 fnhe_gw;
    unsigned long fnhe_expires;
    struct rtable __rcu *fnhe_rth;
    unsigned long fnhe_stamp;
};
```

(include/net/ip_fib.h)

The fib_nh_exception objects are created by the update_or_create_fnhe() method (net/ipv4/route.c). Where are FIB nexthop exceptions generated? The first case is when receiving an ICMPv4 Redirect message ("Redirect to Host") in the __ip_do_redirect() method. The "Redirect to Host" message includes a new gateway. The fnhe_gw field of the fib_nh_exception is set to the new gateway when creating the FIB nexthop exception object (in the update_or_create_fnhe() method):

```c
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
    ...
    __be32 new_gw = icmp_hdr(skb)->un.gateway;
    ...
    update_or_create_fnhe(nh, fl4->daddr, new_gw, 0, 0);
    ...
}
```

The second case of generating FIB nexthop exceptions is when the Path MTU has changed, in the __ip_rt_update_pmtu() method.
In such a case, the fnhe_pmtu field of the fib_nh_exception object is set to the new MTU when creating the FIB nexthop exception object (in the update_or_create_fnhe() method). The PMTU value expires if it is not updated within 10 minutes (ip_rt_mtu_expires). This period is checked on every dst_mtu() call, via the ipv4_mtu() method, which is a dst->ops->mtu handler. ip_rt_mtu_expires, which is 600 seconds by default, can be configured via the procfs entry /proc/sys/net/ipv4/route/mtu_expires:

```c
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
    ...
    if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
        struct fib_nh *nh = &FIB_RES_NH(res);

        update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                              jiffies + ip_rt_mtu_expires);
    }
    ...
}
```

Note

FIB nexthop exceptions are used in the Tx path. Starting with Linux 3.11, they are also used in the Rx path. As a result, instead of fnhe_rth, there are fnhe_rth_input and fnhe_rth_output.

Since kernel 2.4, Policy Routing is supported. With Policy Routing, the routing of a packet depends not only on the destination address, but also on other factors, such as the source address or the TOS. The system administrator can add up to 255 routing tables.

### Policy Routing

When working without Policy Routing (CONFIG_IP_MULTIPLE_TABLES is not set), two routing tables are created: the local table and the main table. The main table id is 254 (RT_TABLE_MAIN), and the local table id is 255 (RT_TABLE_LOCAL). The local table contains routing entries of local addresses; these routing entries can be added to the local table only by the kernel. Adding routing entries to the main table (RT_TABLE_MAIN) is done by a system administrator (via ip route add, for example). These tables are created by the fib4_rules_init() method of net/ipv4/fib_frontend.c. The tables were called ip_fib_local_table and ip_fib_main_table in kernels prior to 2.6.25, but those variables were removed in favor of unified access to the routing tables with the fib_get_table() method with the appropriate argument. By unified access, I mean that access to the routing tables is done in the same way, with the fib_get_table() method, whether Policy Routing support is enabled or disabled. The fib_get_table() method gets only two arguments: the network namespace and the table id. Note that there is a different method with the same name, fib4_rules_init(), for the Policy Routing case, in net/ipv4/fib_rules.c, which is invoked when working with Policy Routing support. When working with Policy Routing support (CONFIG_IP_MULTIPLE_TABLES is set), there are three initial tables (local, main, and default), and there can be up to 255 routing tables. I talk more about Policy Routing in Chapter 6. Access to the main routing table can be done as follows:

 * By a system administrator command (using ip route or route):

   * Adding a route by ip route add is implemented by sending an RTM_NEWROUTE message from userspace, which is handled by the inet_rtm_newroute() method. Note that a route is not necessarily a rule that permits traffic. You can also add a route that blocks traffic, for example, by ip route add prohibit 192.168.1.17 from 192.168.2.103. As a result of applying this rule, all packets sent from 192.168.2.103 to 192.168.1.17 will be blocked.

   * Deleting a route by ip route del is implemented by sending an RTM_DELROUTE message from userspace, which is handled by the inet_rtm_delroute() method.

   * Dumping a routing table by ip route show is implemented by sending an RTM_GETROUTE message from userspace, which is handled by the inet_dump_fib() method (see the sketch following this list). Note that ip route show displays the main table; for displaying the local table, you should run ip route show table local.

   * Adding a route by route add is implemented by sending a SIOCADDRT IOCTL, which is handled by the ip_rt_ioctl() method (net/ipv4/fib_frontend.c).

   * Deleting a route by route del is implemented by sending a SIOCDELRT IOCTL, which is handled by the ip_rt_ioctl() method (net/ipv4/fib_frontend.c).

 * By userspace routing daemons which implement routing protocols like BGP (Border Gateway Protocol), EGP (Exterior Gateway Protocol), or OSPF (Open Shortest Path First). These routing daemons run on core routers, which operate in the Internet backbone, and can handle hundreds of thousands of routes.
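To make the RTM_GETROUTE path above concrete, here is a minimal userspace sketch that dumps the IPv4 routing tables over a NETLINK_ROUTE socket, much as ip route show does. Error handling is trimmed and only a single recv() batch is processed, so this is an illustration rather than a robust client:

```c
/* Minimal rtnetlink dump: sends RTM_GETROUTE with NLM_F_DUMP and prints
 * the table id and destination prefix length of each returned route. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
    struct {
        struct nlmsghdr nlh;
        struct rtmsg rtm;
    } req = {
        .nlh = {
            .nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
            .nlmsg_type = RTM_GETROUTE,
            .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
        },
        .rtm = { .rtm_family = AF_INET },
    };
    char buf[16384];
    int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

    send(fd, &req, req.nlh.nlmsg_len, 0);

    int len = recv(fd, buf, sizeof(buf), 0);  /* one batch is enough here */
    for (struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
         NLMSG_OK(nlh, len) && nlh->nlmsg_type != NLMSG_DONE;
         nlh = NLMSG_NEXT(nlh, len)) {
        struct rtmsg *r = NLMSG_DATA(nlh);
        printf("table %u dst prefixlen %u\n", r->rtm_table, r->rtm_dst_len);
    }
    close(fd);
    return 0;
}
```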
I should mention here that routes that were changed as a result of an ICMPv4 Redirect message or as a result of Path MTU discovery are cached in the nexthop exception table, discussed earlier in this chapter. The next section describes the FIB alias, which helps in routing optimizations.

### FIB Alias (fib_alias)

There are cases when several routing entries to the same destination address or to the same subnet are created; these routing entries differ only in the value of their TOS. Instead of creating a fib_info for each such route, a fib_alias object is created. A fib_alias object is smaller, which reduces memory consumption. Here is a simple example of creating three fib_alias objects:

```
ip route add 192.168.1.10 via 192.168.2.1 tos 0x2
ip route add 192.168.1.10 via 192.168.2.1 tos 0x4
ip route add 192.168.1.10 via 192.168.2.1 tos 0x6
```

Let's take a look at the fib_alias structure definition:

```c
struct fib_alias {
    struct list_head fa_list;
    struct fib_info *fa_info;
    u8 fa_tos;
    u8 fa_type;
    u8 fa_state;
    struct rcu_head rcu;
};
```

(net/ipv4/fib_lookup.h)

Note that there was also a scope field in the fib_alias structure (fa_scope), but it was moved to the fib_info structure in kernel 2.6.39.

The fib_alias object stores routes to the same subnet but with different parameters. You can have one fib_info object which is shared by many fib_alias objects; in this case, the fa_info pointer in all these fib_alias objects points to the same shared fib_info object. In Figure 5-3, you can see one fib_info object which is shared by three fib_alias objects, each with a different fa_tos. Note that the reference counter value of the fib_info object is 3 (fib_treeref).

Figure 5-3. A fib_info which is shared by three fib_alias objects. Each fib_alias object has a different fa_tos value

Let's take a look at what happens when you try to add a key for which a fib_node was already added before (as in the earlier example with the three TOS values 0x2, 0x4, and 0x6); suppose you had created the first rule with a TOS of 0x2, and now you create the second rule, with a TOS of 0x4.

A fib_alias object is created by the fib_table_insert() method, which is the method that handles adding a routing entry:

```c
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
    struct trie *t = (struct trie *)tb->tb_data;
    struct fib_alias *fa, *new_fa;
    struct list_head *fa_head = NULL;
    struct fib_info *fi;
    ...
```

First, a fib_info object is created.
Note that in the fib_create_info() method, after allocating and creating a fib_info object, a lookup is performed to check whether a similar object already exists, by calling the fib_find_info() method. If such an object exists, the new object will be freed, and the reference counter of the object that was found (ofi in the code snippet you will shortly see) will be incremented by 1:

```c
    fi = fib_create_info(cfg);
```

Let's take a look at the code snippet in the fib_create_info() method mentioned earlier; when creating the second TOS rule, the fib_info object of the first rule and the fib_info object of the second rule are identical. You should remember that the TOS field exists in the fib_alias object but not in the fib_info object:

```c
struct fib_info *fib_create_info(struct fib_config *cfg)
{
    struct fib_info *fi = NULL;
    struct fib_info *ofi;
    ...
    fi = kzalloc(sizeof(*fi) + nhs * sizeof(struct fib_nh), GFP_KERNEL);
    if (fi == NULL)
        goto failure;
    ...
link_it:
    ofi = fib_find_info(fi);
```

If a similar object is found, free the newly created fib_info object and increment the fib_treeref reference counter of the object that was found:

```c
    if (ofi) {
        fi->fib_dead = 1;
        free_fib_info(fi);
        ofi->fib_treeref++;
        return ofi;
    }
    ...
}
```

Now a check is performed to find out whether there is an alias to the fib_info object; in this case there will be no alias, because the TOS of the second rule is different from the TOS of the first rule:

```c
    l = fib_find_node(t, key);
    fa = NULL;
    if (l) {
        fa_head = get_fa_head(l, plen);
        fa = fib_find_alias(fa_head, tos, fi->fib_priority);
    }
    if (fa && fa->fa_tos == tos &&
        fa->fa_info->fib_priority == fi->fib_priority) {
        ...
    }
```

Now a fib_alias is created, and its fa_info pointer is assigned to point to the fib_info of the first rule that was created:

```c
    new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
    if (new_fa == NULL)
        goto out;

    new_fa->fa_info = fi;
    ...
```

Now that I have covered the FIB alias, you are ready to look at the ICMPv4 Redirect message, which is sent when there is a suboptimal route.

## ICMPv4 Redirect Message

There are cases when a routing entry is suboptimal. In such cases, an ICMPv4 Redirect message is sent. The main criterion for a suboptimal entry is that the input device and the output device are the same, but more conditions must also be fulfilled for an ICMPv4 Redirect message to be sent, as you will see in this section. There are four codes of ICMPv4 Redirect messages:

 * ICMP_REDIR_NET: Redirect Net

 * ICMP_REDIR_HOST: Redirect Host

 * ICMP_REDIR_NETTOS: Redirect Net for TOS

 * ICMP_REDIR_HOSTTOS: Redirect Host for TOS

Figure 5-4 shows a setup where there is a suboptimal route. There are three machines in this setup, all on the same subnet (192.168.2.0/24) and all connected via a gateway (192.168.2.1). The AMD server (192.168.2.200) added the Windows server (192.168.2.10) as a gateway for accessing 192.168.2.7 (the laptop) by ip route add 192.168.2.7 via 192.168.2.10. The AMD server sends traffic to the laptop, for example, by ping 192.168.2.7. Because the gateway for that destination is 192.168.2.10, the traffic is sent to 192.168.2.10. The Windows server detects that this is a suboptimal route, because the AMD server could send the traffic directly to 192.168.2.7, and sends back to the AMD server an ICMPv4 Redirect message with the ICMP_REDIR_HOST code.

Figure 5-4. Redirect to Host (ICMP_REDIR_HOST), a simple setup
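Such a redirect, as received by the AMD server in this setup, can be observed from userspace with a raw ICMPv4 socket. The following sketch requires root and performs only minimal length checks, so it is a diagnostic toy rather than production code; it prints the advised gateway from the same un.gateway field that the kernel reads in the "Receiving an ICMPv4 Redirect Message" section below:

```c
/* Raw-socket observer for ICMPv4 Redirect (type 5) messages; prints the
 * advised new gateway. Illustration only; run as root. */
#include <stdio.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ip.h>
#include <linux/icmp.h>

int main(void)
{
    char buf[1500];
    int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);

    for (;;) {
        ssize_t n = recv(fd, buf, sizeof(buf), 0);
        if (n < (ssize_t)(sizeof(struct iphdr) + sizeof(struct icmphdr)))
            continue;
        struct iphdr *iph = (struct iphdr *)buf;
        struct icmphdr *icmph = (struct icmphdr *)(buf + iph->ihl * 4);

        if (icmph->type == ICMP_REDIRECT) {
            struct in_addr gw = { .s_addr = icmph->un.gateway };
            printf("redirect code %u, new gateway %s\n",
                   icmph->code, inet_ntoa(gw));
        }
    }
}
```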
Now that you have a better understanding of redirects, let's look at how an ICMPv4 Redirect message is generated.

### Generating an ICMPv4 Redirect Message

An ICMPv4 Redirect message is sent when there is a suboptimal route. The most notable condition for a suboptimal route is that the input device and the output device are the same, but there are some more conditions which must be met. Generating an ICMPv4 Redirect message is done in two phases:

 * In the __mkroute_input() method: Here the RTCF_DOREDIRECT flag is set if needed.

 * In the ip_forward() method: Here the ICMPv4 Redirect message is actually sent, by calling the ip_rt_send_redirect() method.

```c
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
    struct rtable *rth;
    int err;
    struct in_device *out_dev;
    unsigned int flags = 0;
    bool do_cache;
```

All of the following conditions must be met for the RTCF_DOREDIRECT flag to be set:

 * The input device and the output device are the same.

 * The procfs entry /proc/sys/net/ipv4/conf/<device>/send_redirects is set.

 * Either the outgoing device is a shared media, or the source address (saddr) and the nexthop gateway address (nh_gw) are on the same subnet:

```c
    if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
        (IN_DEV_SHARED_MEDIA(out_dev) ||
         inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
        flags |= RTCF_DOREDIRECT;
        do_cache = false;
    }
    ...
```

Setting the rtable object flags is done by:

```c
    rth->rt_flags = flags;
    ...
}
```

Sending the ICMPv4 Redirect message is done in the second phase, by the ip_forward() method:

```c
int ip_forward(struct sk_buff *skb)
{
    struct iphdr *iph;      /* Our header */
    struct rtable *rt;      /* Route we use */
    struct ip_options *opt = &(IPCB(skb)->opt);
```

Next a check is performed to see whether the RTCF_DOREDIRECT flag is set, whether an IP option of strict route does not exist (see Chapter 4), and whether it is not an IPsec packet. (With IPsec tunnels, the input device of the tunneled packet can be the same as the outgoing device of the decapsulated packet; see http://lists.openwall.net/netdev/2007/08/24/29 ):

```c
    if (rt->rt_flags & RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
        ip_rt_send_redirect(skb);
```

In the ip_rt_send_redirect() method, the ICMPv4 Redirect message is actually sent. The third parameter is the IP address of the advised new gateway, which will be 192.168.2.7 in this case (the address of the laptop):

```c
void ip_rt_send_redirect(struct sk_buff *skb)
{
    ...
    icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
              rt_nexthop(rt, ip_hdr(skb)->daddr));
    ...
}
```

(net/ipv4/route.c)

### Receiving an ICMPv4 Redirect Message

For an ICMPv4 Redirect message to be processed, it should pass some sanity checks. Handling an ICMPv4 Redirect message is done by the __ip_do_redirect() method:

```c
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
    __be32 new_gw = icmp_hdr(skb)->un.gateway;
    __be32 old_gw = ip_hdr(skb)->saddr;
    struct net_device *dev = skb->dev;
    struct in_device *in_dev;
    struct fib_result res;
    struct neighbour *n;
    struct net *net;
    ...
```

Various checks are performed, such as that the network device is set to accept redirects.
The redirect is rejected if necessary:

```c
    if (rt->rt_gateway != old_gw)
        return;

    in_dev = __in_dev_get_rcu(dev);
    if (!in_dev)
        return;

    net = dev_net(dev);
    if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
        ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
        ipv4_is_zeronet(new_gw))
        goto reject_redirect;

    if (!IN_DEV_SHARED_MEDIA(in_dev)) {
        if (!inet_addr_onlink(in_dev, new_gw, old_gw))
            goto reject_redirect;
        if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
            goto reject_redirect;
    } else {
        if (inet_addr_type(net, new_gw) != RTN_UNICAST)
            goto reject_redirect;
    }
```

A lookup in the neighbouring subsystem is performed; the key to the lookup is the address of the advised gateway, new_gw, which was extracted from the ICMPv4 message at the beginning of this method:

```c
    n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
    if (n) {
        if (!(n->nud_state & NUD_VALID)) {
            neigh_event_send(n, NULL);
        } else {
            if (fib_lookup(net, fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);
```

Create or update a FIB nexthop exception, specifying the IP address of the advised gateway (new_gw):

```c
                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                      0, 0);
            }
            if (kill_route)
                rt->dst.obsolete = DST_OBSOLETE_KILL;
            call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
        }
        neigh_release(n);
    }
    return;

reject_redirect:
    ...
```

(net/ipv4/route.c)

Now that we've covered how a received ICMPv4 Redirect message is handled, we can tackle the IPv4 routing cache and the reasons for its removal.

### IPv4 Routing Cache

In kernels prior to 3.6, there was an IPv4 routing cache with a garbage collector. The IPv4 routing cache was removed in kernel 3.6 (around July 2012). For years, FIB TRIE and FIB hash were both available in the kernel, but FIB TRIE was not the default. Having the FIB TRIE made it possible to remove the IPv4 routing cache, which had Denial of Service (DoS) issues. FIB TRIE (also known as LC-trie) is a longest prefix match lookup algorithm that performs better than FIB hash for large routing tables. It consumes more memory and is more complex, but because it performs better, it made the removal of the routing cache feasible. The FIB TRIE code existed for a long time before it was merged as the default. The main reason for the removal of the IPv4 routing cache was that launching DoS attacks against it was easy, because the IPv4 routing cache created a cache entry for each unique flow; that meant that by sending packets to random destinations, you could generate an unlimited number of routing cache entries.

Merging the FIB TRIE entailed removing the routing cache, the cumbersome FIB hash tables, and the routing cache garbage collector methods. This chapter discusses the routing cache only briefly. In case you wonder why it is discussed at all, note that in the Linux-based software industry, in commercial distributions like Red Hat Enterprise Linux, the kernels are fully maintained and fully supported for a very long period of time (Red Hat, for example, gives support for its distributions for up to seven years). So it is very likely that some readers will be involved in projects based on kernels prior to 3.6, where you will find the routing cache and the FIB hash-based routing tables. Delving into the theory and implementation details of the FIB TRIE data structure is beyond the scope of this book. To learn more, I recommend the article "TRASH—A dynamic LC-trie and hash data structure," by Robert Olsson and Stefan Nilsson, www.nada.kth.se/~snilsson/publications/TRASH/trash.pdf .
Note that with the IPv4 routing cache implementation, there is a single cache, regardless of how many routing tables are used (there can be up to 255 routing tables when using Policy Routing). Note also that there was support for an IPv4 Multipath Routing cache, but it was removed in kernel 2.6.23, in 2007; in fact, it never worked very well and never got out of the experimental state.

For kernels prior to 3.6, where the FIB TRIE is not yet merged, the lookup in the IPv4 routing subsystem was different: access to the routing tables was preceded by access to the routing cache, the tables were organized differently, and there was a routing cache garbage collector, which was both asynchronous (a periodic timer) and synchronous (activated under specific conditions, for example, when the number of cache entries exceeded some threshold). The cache was basically a big hash keyed by the IP flow source address, destination address, and TOS, associated with all flow-specific information like the neighbour entry, PMTU, redirect, TCP MSS info, and so on. The benefit here was that cached entries were fast to look up and contained all the information needed by higher layers.

Note

The following two sections ("Rx Path" and "Tx Path") refer to the 2.6.38 kernel.

#### Rx Path

In the Rx path, first the ip_route_input_common() method is invoked. This method performs a lookup in the IPv4 routing cache, which is much quicker than a lookup in the IPv4 routing tables. Lookup in these routing tables is based on the Longest Prefix Match (LPM) search algorithm: of all the table entries that cover the destination address, the most specific one—the one with the longest subnet mask—wins. In case the lookup in the routing cache fails (a "cache miss"), a lookup in the routing tables is performed by calling the ip_route_input_slow() method. This method calls the fib_lookup() method to perform the actual lookup. Upon success, it calls the ip_mkroute_input() method, which (among other actions) inserts the routing entry into the routing cache by calling the rt_intern_hash() method.
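The Longest Prefix Match rule itself can be shown with a standalone sketch; the following uses a linear scan over a tiny table, whereas the kernel of course uses the LC-trie (or, here, the FIB hash), not this loop:

```c
/* Longest Prefix Match illustration: among all entries whose prefix covers
 * the destination, pick the one with the longest prefix. */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

struct entry {
    const char *prefix;
    int plen;
    const char *via;
};

static const struct entry table[] = {
    { "0.0.0.0",      0, "default gateway" },
    { "192.168.0.0", 16, "gw A" },
    { "192.168.2.0", 24, "gw B" },
};

int main(void)
{
    struct in_addr dst;
    inet_pton(AF_INET, "192.168.2.7", &dst);

    const struct entry *best = NULL;
    for (unsigned i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        struct in_addr net;
        inet_pton(AF_INET, table[i].prefix, &net);
        uint32_t mask = table[i].plen ?
                htonl(~0u << (32 - table[i].plen)) : 0;
        if ((dst.s_addr & mask) == net.s_addr &&
            (!best || table[i].plen > best->plen))
            best = &table[i];   /* more specific entry wins */
    }
    if (best)
        printf("192.168.2.7 -> %s (/%d)\n", best->via, best->plen);
    return 0;
}
```

Running it prints `192.168.2.7 -> gw B (/24)`: both 192.168.0.0/16 and 192.168.2.0/24 cover the destination, and the /24 entry wins.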
#### Tx Path

In the Tx path, first the ip_route_output_key() method is invoked. This method performs a lookup in the IPv4 routing cache. In case of a cache miss, it calls the ip_route_output_slow() method, which calls the fib_lookup() method to perform a lookup in the routing subsystem. Subsequently, upon success, it calls the ip_mkroute_output() method, which (among other actions) inserts the routing entry into the routing cache by calling the rt_intern_hash() method.

## Summary

This chapter covered various topics of the IPv4 routing subsystem. The routing subsystem is essential for handling both incoming and outgoing packets. You learned about various topics like forwarding, lookup in the routing subsystem, the organization of the FIB tables, Policy Routing, and the ICMPv4 Redirect message. You also learned about the optimization gained with the FIB alias, and about the removal of the routing cache and the reasons for it. The next chapter covers advanced topics of the IPv4 routing subsystem.

## Quick Reference

I conclude this chapter with a short list of important methods, macros, and tables of the IPv4 routing subsystem, along with a short explanation about routing flags.

Note

The IPv4 routing subsystem is implemented in these modules under net/ipv4: fib_frontend.c, fib_trie.c, fib_semantics.c, and route.c.

The fib_rules.c module implements Policy Routing; it is compiled only when CONFIG_IP_MULTIPLE_TABLES is set. Among the most important header files are net/ipv4/fib_lookup.h, include/net/ip_fib.h, and include/net/route.h.

The destination cache (dst) implementation is in net/core/dst.c and include/net/dst.h.

CONFIG_IP_ROUTE_MULTIPATH should be set for Multipath Routing support.

### Methods

This section lists the methods that were mentioned in this chapter.

#### int fib_table_insert(struct fib_table *tb, struct fib_config *cfg);

This method inserts an IPv4 routing entry into the specified FIB table (fib_table object), based on the specified fib_config object.

#### int fib_table_delete(struct fib_table *tb, struct fib_config *cfg);

This method deletes an IPv4 routing entry from the specified FIB table (fib_table object), based on the specified fib_config object.

#### struct fib_info *fib_create_info(struct fib_config *cfg);

This method creates a fib_info object derived from the specified fib_config object.

#### void free_fib_info(struct fib_info *fi);

This method frees a fib_info object, on condition that it is not alive (the fib_dead flag is not 0), and decrements the global fib_info objects counter (fib_info_cnt).

#### void fib_alias_accessed(struct fib_alias *fa);

This method sets the fa_state flag of the specified fib_alias to FA_S_ACCESSED. Note that FA_S_ACCESSED is the only fa_state flag.

#### void ip_rt_send_redirect(struct sk_buff *skb);

This method sends an ICMPv4 Redirect message, as a response to a suboptimal path.

#### void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, bool kill_route);

This method handles receiving an ICMPv4 Redirect message.

#### void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, u32 pmtu, unsigned long expires);

This method creates a FIB nexthop exception table (fib_nh_exception) in the specified nexthop object (fib_nh), if it does not already exist, and initializes it. It is invoked when a route should be updated due to an ICMPv4 Redirect or due to PMTU discovery.

#### u32 dst_metric(const struct dst_entry *dst, int metric);

This method returns the specified metric of the specified dst object.

#### struct fib_table *fib_trie_table(u32 id);

This method allocates and initializes a FIB TRIE table.

#### struct leaf *fib_find_node(struct trie *t, u32 key);

This method performs a TRIE lookup with the specified key. It returns a leaf object upon success, or NULL in case of failure.

### Macros

This section lists macros of the IPv4 routing subsystem, some of which were mentioned in this chapter.

#### FIB_RES_GW()

This macro returns the nh_gw field (nexthop gateway address) associated with the specified fib_result object.

#### FIB_RES_DEV()

This macro returns the nh_dev field (nexthop net_device object) associated with the specified fib_result object.

#### FIB_RES_OIF()

This macro returns the nh_oif field (nexthop output interface index) associated with the specified fib_result object.

#### FIB_RES_NH()

This macro returns the nexthop (fib_nh object) of the fib_info of the specified fib_result object.
When Multipath Routing is set, there can be multiple nexthops; in that case, the value of the nh_sel field of the specified fib_result object is taken into account as an index into the array of nexthops embedded in the fib_info object.

(include/net/ip_fib.h)

#### IN_DEV_FORWARD()

This macro checks whether the specified network device (in_device object) supports IPv4 forwarding.

#### IN_DEV_RX_REDIRECTS()

This macro checks whether the specified network device (in_device object) supports accepting ICMPv4 Redirects.

#### IN_DEV_TX_REDIRECTS()

This macro checks whether the specified network device (in_device object) supports sending ICMPv4 Redirects.

#### IS_LEAF()

This macro checks whether the specified tree node is a leaf.

#### IS_TNODE()

This macro checks whether the specified tree node is an internal node (trie node, or tnode).

#### change_nexthops()

This macro iterates over the nexthops of the specified fib_info object (net/ipv4/fib_semantics.c).

### Tables

There are 15 (RTAX_MAX) metrics for routes. Some of them are TCP related, and some are general. Table 5-1 shows which of these metrics are TCP related.

Table 5-1. Route Metrics

| Linux Symbol | TCP Metric (Y/N) |
|---|---|
| RTAX_UNSPEC | N |
| RTAX_LOCK | N |
| RTAX_MTU | N |
| RTAX_WINDOW | Y |
| RTAX_RTT | Y |
| RTAX_RTTVAR | Y |
| RTAX_SSTHRESH | Y |
| RTAX_CWND | Y |
| RTAX_ADVMSS | Y |
| RTAX_REORDERING | Y |
| RTAX_HOPLIMIT | N |
| RTAX_INITCWND | Y |
| RTAX_FEATURES | N |
| RTAX_RTO_MIN | Y |
| RTAX_INITRWND | Y |

(include/uapi/linux/rtnetlink.h)

Table 5-2 shows the error value and the scope of all the route types.

Table 5-2. Route Types

| Linux Symbol | Error | Scope |
|---|---|---|
| RTN_UNSPEC | 0 | RT_SCOPE_NOWHERE |
| RTN_UNICAST | 0 | RT_SCOPE_UNIVERSE |
| RTN_LOCAL | 0 | RT_SCOPE_HOST |
| RTN_BROADCAST | 0 | RT_SCOPE_LINK |
| RTN_ANYCAST | 0 | RT_SCOPE_LINK |
| RTN_MULTICAST | 0 | RT_SCOPE_UNIVERSE |
| RTN_BLACKHOLE | -EINVAL | RT_SCOPE_UNIVERSE |
| RTN_UNREACHABLE | -EHOSTUNREACH | RT_SCOPE_UNIVERSE |
| RTN_PROHIBIT | -EACCES | RT_SCOPE_UNIVERSE |
| RTN_THROW | -EAGAIN | RT_SCOPE_UNIVERSE |
| RTN_NAT | -EINVAL | RT_SCOPE_NOWHERE |
| RTN_XRESOLVE | -EINVAL | RT_SCOPE_NOWHERE |

### Route Flags

When running the route -n command, you get output that shows the route flags. Here are the flag values, followed by a short example of route -n output:

 * U (Route is up)

 * H (Target is a host)

 * G (Use gateway)

 * R (Reinstate route for dynamic routing)

 * D (Dynamically installed by daemon or redirect)

 * M (Modified from routing daemon or redirect)

 * A (Installed by addrconf)

 * ! (Reject route)

Table 5-3 shows an example of the output of running route -n (the results are organized into table form):

Table 5-3. Kernel IP Routing Table

| Destination | Gateway | Genmask | Flags | Metric | Ref | Use | Iface |
|---|---|---|---|---|---|---|---|
| 169.254.0.0 | 0.0.0.0 | 255.255.0.0 | U | 1002 | 0 | 0 | eth0 |
| 192.168.3.0 | 192.168.2.1 | 255.255.255.0 | UG | 0 | 0 | 0 | eth1 |

# 6. Advanced Routing
Chapter 5 dealt with the IPv4 routing subsystem. This chapter continues with the routing subsystem and discusses advanced IPv4 routing topics such as Multicast Routing, Multipath Routing, Policy Routing, and more. This book deals with the Linux Kernel Networking implementation—it does not delve into the internals of userspace Multicast Routing daemon implementations, which are quite complex and beyond the scope of the book. I do, however, discuss to some extent the interaction between a userspace multicast routing daemon and the multicast layer in the kernel. I also briefly discuss the Internet Group Management Protocol (IGMP), which is the basis of multicast group membership management; adding and deleting multicast group members is done by the IGMP protocol. Some basic knowledge of IGMP is needed to understand the interaction between a multicast host and a multicast router.

Multipath Routing is the ability to add more than one nexthop to a route. Policy Routing enables configuring routing policies that are not based solely on the destination address. I start by describing Multicast Routing.

## Multicast Routing

Chapter 4 briefly mentions Multicast Routing, in the "Receiving IPv4 Multicast Packets" section. I will now discuss it in more depth. Sending multicast traffic means sending the same packet to multiple recipients. This feature can be useful in streaming media, audio/video conferencing, and more, and it has a clear advantage over unicast traffic in terms of saving network bandwidth. Multicast addresses are defined as Class D addresses; the Classless Inter-Domain Routing (CIDR) prefix of this group is 224.0.0.0/4, and the range of IPv4 multicast addresses is from 224.0.0.0 to 239.255.255.255. Handling Multicast Routing must be done in conjunction with a userspace routing daemon which interacts with the kernel; in the Linux implementation, as opposed to Unicast Routing, Multicast Routing cannot be handled by the kernel code alone, without this userspace routing daemon. There are various multicast daemons: for example, mrouted, which is based on an implementation of the Distance Vector Multicast Routing Protocol (DVMRP), or pimd, which is based on the Protocol-Independent Multicast (PIM) protocol. The DVMRP protocol is defined in RFC 1075, and it was the first multicast routing protocol; it is based on the Routing Information Protocol (RIP).

The PIM protocol has two versions, and the kernel supports both of them (CONFIG_IP_PIMSM_V1 and CONFIG_IP_PIMSM_V2). PIM has four different modes: PIM-SM (PIM Sparse Mode), PIM-DM (PIM Dense Mode), PIM Source-Specific Multicast (PIM-SSM), and Bidirectional PIM. The protocol is called protocol-independent because it does not depend on any particular routing protocol for topology discovery.
This section discusses the interaction between the userspace daemon and the kernel multicast routing layer. Delving into the internals of the PIM protocol or the DVMRP protocol (or any other Multicast Routing protocol) is beyond the scope of this book. Normally, the Multicast Routing lookup is based on the source and destination addresses. There is a "Multicast Policy Routing" kernel feature, which parallels the unicast Policy Routing kernel feature mentioned in Chapter 5 and also discussed in the course of this chapter. Multicast Policy Routing is implemented using the Policy Routing API (for example, it calls the fib_rules_lookup() method to perform a lookup, creates a fib_rules_ops object and registers it with the fib_rules_register() method, and so on). With Multicast Policy Routing, the routing can be based on additional criteria, like the ingress network interface. Moreover, you can work with more than one multicast routing table. In order to work with Multicast Policy Routing, IP_MROUTE_MULTIPLE_TABLES must be set.

Figure 6-1 shows a simple IPv4 Multicast Routing setup. The topology is very simple: the laptop, on the left, joins a multicast group (224.225.0.1) by sending an IGMP packet (IP_ADD_MEMBERSHIP). The IGMP protocol is discussed in the next section, "The IGMP Protocol." The AMD server, in the middle, is configured as a multicast router, and a userspace multicast routing daemon (like pimd or mrouted) runs on it. The Windows server, on the right, which has an IP address of 192.168.2.10, sends multicast traffic to 224.225.0.1; this traffic is forwarded to the laptop via the multicast router. Note that the Windows server itself did not join the 224.225.0.1 multicast group. Running ip route add 224.0.0.0/4 dev, followed by a network device name, tells the kernel to send all multicast traffic via that network device.

Figure 6-1. Simple Multicast Routing setup

The next section discusses the IGMP protocol, which is used for the management of multicast group membership.

### The IGMP Protocol

The IGMP protocol is an integral part of IPv4 multicast. It must be implemented on each node that supports IPv4 multicast. In IPv6, multicast management is handled by the MLD (Multicast Listener Discovery) protocol, which uses ICMPv6 messages, discussed in Chapter 8. With the IGMP protocol, multicast group memberships are established and managed. There are three versions of IGMP:

1. IGMPv1 (RFC 1112): Has two types of messages—host membership report and host membership query. When a host wants to join a multicast group, it sends a membership report message. Multicast routers send membership queries to discover which host multicast groups have members on their attached local networks. Queries are addressed to the all-hosts group address (224.0.0.1, IGMP_ALL_HOSTS) and carry a TTL of 1, so that the membership query does not travel outside of the LAN.

2. IGMPv2 (RFC 2236): This is an extension of IGMPv1. The IGMPv2 protocol adds three new messages:

   a. Membership Query (0x11): There are two sub-types of Membership Query messages: General Query, used to learn which groups have members on an attached network, and Group-Specific Query, used to learn whether a particular group has any members on an attached network.

   b. Version 2 Membership Report (0x16).

   c. Leave Group (0x17).

Note

IGMPv2 also supports the Version 1 Membership Report message, for backward compatibility with IGMPv1. See RFC 2236, section 2.1.
3. IGMPv3 (RFC 3376, updated by RFC 4604): This major revision of the protocol adds a feature called source filtering. This means that when a host joins a multicast group, it can specify a set of source addresses from which it will receive multicast traffic. The source filters can also exclude source addresses. To support the source filtering feature, the socket API was extended; see RFC 3678, "Socket Interface Extensions for Multicast Source Filters."

I should also mention that the multicast router periodically (about every two minutes) sends a membership query to 224.0.0.1, the all-hosts multicast group address. A host that receives a membership query responds with a membership report. This is implemented in the kernel by the igmp_rcv() method: getting an IGMP_HOST_MEMBERSHIP_QUERY message is handled by the igmp_heard_query() method.

Note

The kernel implementation of IPv4 IGMP is in net/ipv4/igmp.c, include/linux/igmp.h, and include/uapi/linux/igmp.h.
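Joining a group, as the laptop in Figure 6-1 does, is plain socket code. The following minimal userspace sketch uses the group address from the figure and lets the kernel pick the interface; issuing the IP_ADD_MEMBERSHIP option is what triggers the IGMP membership report described above:

```c
/* Join the multicast group from Figure 6-1; the kernel emits the IGMP
 * membership report on our behalf. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    struct ip_mreq mreq;

    memset(&mreq, 0, sizeof(mreq));
    inet_pton(AF_INET, "224.225.0.1", &mreq.imr_multiaddr);
    mreq.imr_interface.s_addr = htonl(INADDR_ANY);  /* let the kernel pick */

    if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
                   &mreq, sizeof(mreq)) < 0) {
        perror("IP_ADD_MEMBERSHIP");
        return 1;
    }
    printf("joined 224.225.0.1; press Enter to leave\n");
    getchar();  /* closing the socket also drops the membership */
    close(fd);
    return 0;
}
```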
Next I will look at the MFC, which is embedded in the multicast routing table object and plays an important role in Multicast Routing.

### The Multicast Forwarding Cache (MFC)

The most important data structure in the multicast routing table is the MFC, which is in fact an array of cache entries (mfc_cache objects). This array, named mfc_cache_array, is embedded in the multicast routing table (mr_table) object and has 64 (MFC_LINES) elements. The array index is a hash value (the hash function takes two parameters: the multicast group address and the source IP address; see the description of the MFC_HASH macro in the "Quick Reference" section at the end of this chapter).

Usually there is only one multicast routing table, which is an instance of the mr_table structure, and a reference to it is kept in the IPv4 network namespace (net->ipv4.mrt). The table is created by the ipmr_rules_init() method, which also assigns net->ipv4.mrt to point to the multicast routing table that was created. When working with the Multicast Policy Routing feature mentioned earlier, there can be multiple multicast policy routing tables. In both cases, you get the routing table using the same method, ipmr_fib_lookup(). The ipmr_fib_lookup() method takes three parameters: the network namespace, the flow, and a pointer to an mr_table pointer, which it should set. Normally, it simply sets the specified mr_table pointer to net->ipv4.mrt; when working with multiple tables (CONFIG_IP_MROUTE_MULTIPLE_TABLES is set), the implementation is more complex. Let's take a look at the mfc_cache structure:

```c
struct mfc_cache {
	struct list_head list;
	__be32 mfc_mcastgrp;
	__be32 mfc_origin;
	vifi_t mfc_parent;
	int mfc_flags;

	union {
		struct {
			unsigned long expires;
			struct sk_buff_head unresolved; /* Unresolved buffers */
		} unres;
		struct {
			unsigned long last_assert;
			int minvif;
			int maxvif;
			unsigned long bytes;
			unsigned long pkt;
			unsigned long wrong_if;
			unsigned char ttls[MAXVIFS]; /* TTL thresholds */
		} res;
	} mfc_un;
	struct rcu_head rcu;
};
```

(include/linux/mroute.h)

The following is a description of some members of the mfc_cache structure:

  * mfc_mcastgrp: The address of the multicast group that the entry belongs to.

  * mfc_origin: The source address of the route.

  * mfc_parent: The source interface.

  * mfc_flags: The flags of the entry. Can have one of these values:

    * MFC_STATIC: When the route was added statically and not by a multicast routing daemon.

    * MFC_NOTIFY: When the RTM_F_NOTIFY flag of the routing entry was set. See the rt_fill_info() method and the ipmr_get_route() method for more details.

  * The mfc_un union consists of two elements:

    * unres: Unresolved cache entries.

    * res: Resolved cache entries.

The first time an SKB of a certain flow reaches the kernel, it is added to the queue of unresolved entries (mfc_un.unres.unresolved), where only a few SKBs can be saved. If there are already more than three SKBs in the queue, the packet is not appended to the queue but is freed, and the ipmr_cache_unresolved() method returns -ENOBUFS ("No buffer space available"):

```c
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
				 struct sk_buff *skb)
{
	...
	if (c->mfc_un.unres.unresolved.qlen > 3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
	...
}
```

(net/ipv4/ipmr.c)

This section described the MFC and its important members, including the queue of resolved entries and the queue of unresolved entries. The next section briefly describes what a multicast router is and how it is configured in Linux.

### Multicast Router

In order to configure a machine as a multicast router, you should set the CONFIG_IP_MROUTE kernel configuration option. You should also run some routing daemon, such as pimd or mrouted, as mentioned earlier. These routing daemons create a socket to communicate with the kernel. In pimd, for example, you create a raw IGMP socket by calling socket(AF_INET, SOCK_RAW, IPPROTO_IGMP). Calling setsockopt() on this socket triggers sending commands to the kernel, which are handled by the ip_mroute_setsockopt() method. When calling setsockopt() on this socket from the routing daemon with MRT_INIT, the kernel keeps a reference to the userspace socket in the mroute_sk field of the mr_table object that is used, and the mc_forwarding procfs entry (/proc/sys/net/ipv4/conf/all/mc_forwarding) is set by calling IPV4_DEVCONF_ALL(net, MC_FORWARDING)++. Note that the mc_forwarding procfs entry is a read-only entry and can't be set from userspace. You can't create another instance of a multicast routing daemon: when handling the MRT_INIT option, the ip_mroute_setsockopt() method checks whether the mroute_sk field of the mr_table object is initialized and returns -EADDRINUSE if so. Adding a network interface is done by calling setsockopt() on this socket with MRT_ADD_VIF, and deleting a network interface is done by calling setsockopt() on this socket with MRT_DEL_VIF. You pass the parameters of the network interface to these setsockopt() calls by passing a vifctl object as the optval parameter of the setsockopt() system call. Let's take a look at the vifctl structure:

```c
struct vifctl {
	vifi_t	vifc_vifi;		/* Index of VIF			*/
	unsigned char vifc_flags;	/* VIFF_ flags			*/
	unsigned char vifc_threshold;	/* ttl limit			*/
	unsigned int vifc_rate_limit;	/* Rate limiter values (NI)	*/
	union {
		struct in_addr vifc_lcl_addr;    /* Local interface address */
		int            vifc_lcl_ifindex; /* Local interface index   */
	};
	struct in_addr vifc_rmt_addr;	/* IPIP tunnel addr		*/
};
```

(include/uapi/linux/mroute.h)

The following is a description of some members of the vifctl structure:

  * vifc_flags can be:

    * VIFF_TUNNEL: When you want to use an IPIP tunnel.

    * VIFF_REGISTER: When you want to register the interface.

    * VIFF_USE_IFINDEX: When you want to use the local interface index and not the local interface IP address; in such a case, you will set vifc_lcl_ifindex to be the local interface index. The VIFF_USE_IFINDEX flag is available for the 2.6.33 kernel and above.

  * vifc_lcl_addr: The local interface IP address. (This is the default; no flag should be set for using it.)

  * vifc_lcl_ifindex: The local interface index. It should be set when the VIFF_USE_IFINDEX flag is set in vifc_flags.

  * vifc_rmt_addr: The address of the remote node of a tunnel.

When the multicast routing daemon is closed, the setsockopt() method is called with the MRT_DONE option. This triggers calling the mrtsock_destruct() method to nullify the mroute_sk field of the mr_table object that is used and to perform various cleanups.

This section covered what a multicast router is and how it is configured in Linux. I also examined the vifctl structure; a short userspace sketch of this daemon-side interaction follows.
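To make this userspace/kernel interaction concrete, the following is a minimal sketch of the daemon-side control socket. It is not code from pimd or mrouted; it only strings together the MRT_INIT, MRT_ADD_VIF, and MRT_DONE calls just described, with the local interface address chosen purely for illustration. It must run as root (CAP_NET_ADMIN) on a kernel built with CONFIG_IP_MROUTE, and header interactions between <netinet/in.h> and <linux/mroute.h> can vary between libc versions:

```c
/* A minimal sketch of a multicast routing daemon's control socket
 * (illustrative only; error handling is reduced to perror()).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/mroute.h>

int main(void)
{
	int opt = 1;
	struct vifctl vc;

	/* The raw IGMP socket that the kernel will keep in mroute_sk. */
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* MRT_INIT: the kernel stores this socket in mrt->mroute_sk and
	 * increments mc_forwarding; a second daemon gets -EADDRINUSE.
	 */
	if (setsockopt(fd, IPPROTO_IP, MRT_INIT, &opt, sizeof(opt)) < 0)
		perror("MRT_INIT");

	/* MRT_ADD_VIF: register one multicast virtual interface; this is
	 * handled in the kernel by the vif_add() method.
	 */
	memset(&vc, 0, sizeof(vc));
	vc.vifc_vifi = 0;					/* index in vif_table	*/
	vc.vifc_threshold = 1;					/* TTL threshold	*/
	vc.vifc_lcl_addr.s_addr = inet_addr("192.168.1.1");	/* illustrative address */
	if (setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc)) < 0)
		perror("MRT_ADD_VIF");

	/* ... a real daemon now reads IGMPMSG_* notifications from fd
	 * and installs MFC entries with MRT_ADD_MFC ...
	 */

	/* MRT_DONE: handled by mrtsock_destruct(), which nullifies mroute_sk. */
	setsockopt(fd, IPPROTO_IP, MRT_DONE, NULL, 0);
	close(fd);
	return 0;
}
```

A real daemon keeps this socket open for its whole lifetime: the IGMPMSG_NOCACHE and IGMPMSG_WRONGVIF notifications discussed later in this chapter arrive on it.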
Next, I look at the Vif device, which represents a multicast network interface.

### The Vif Device

Multicast Routing supports two modes: direct multicast, and multicast encapsulated in a unicast packet over a tunnel. In both cases, the same object is used (an instance of the vif_device structure) to represent the network interface. When working over a tunnel, the VIFF_TUNNEL flag will be set. Adding and deleting a multicast interface is done by the vif_add() method and the vif_delete() method, respectively. The vif_add() method also sets the device to support multicast by calling the dev_set_allmulti(dev, 1) method, which increments the allmulti counter of the specified network device (net_device object). The vif_delete() method calls dev_set_allmulti(dev, -1) to decrement the allmulti counter of the specified network device. For more details about the dev_set_allmulti() method, see Appendix A. Let's take a look at the vif_device structure; its members are quite self-explanatory:

```c
struct vif_device {
	struct net_device *dev;			/* Device we are using		*/
	unsigned long	bytes_in, bytes_out;
	unsigned long	pkt_in, pkt_out;	/* Statistics			*/
	unsigned long	rate_limit;		/* Traffic shaping (NI)		*/
	unsigned char	threshold;		/* TTL threshold		*/
	unsigned short	flags;			/* Control flags		*/
	__be32		local, remote;		/* Addresses (remote for tunnels) */
	int		link;			/* Physical interface index	*/
};
```

(include/linux/mroute.h)

In order to receive multicast traffic, a host must join a multicast group. This is done by creating a socket in userspace and calling setsockopt() with IPPROTO_IP and the IP_ADD_MEMBERSHIP socket option. The userspace application also creates an ip_mreq object, in which it initializes the request parameters, like the desired multicast group address and the source IP address of the host (see the netinet/in.h userspace header). The setsockopt() call is handled in the kernel by the ip_mc_join_group() method, in net/ipv4/igmp.c. Eventually, the multicast address is added by the ip_mc_join_group() method to a list of multicast addresses (mc_list), which is a member of the in_device object. A host can leave a multicast group by calling setsockopt() with IPPROTO_IP and the IP_DROP_MEMBERSHIP socket option. This is handled in the kernel by the ip_mc_leave_group() method, in net/ipv4/igmp.c. A single socket can join up to 20 multicast groups (sysctl_igmp_max_memberships); trying to join more groups with the same socket will fail with the -ENOBUFS error ("No buffer space available"). See the ip_mc_join_group() method implementation in net/ipv4/igmp.c.

### IPv4 Multicast Rx Path

Chapter 4's "Receiving IPv4 Multicast Packets" section briefly discusses how multicast packets are handled. I will now describe this in more depth. My discussion assumes that our machine is configured as a multicast router; this means, as mentioned earlier, that CONFIG_IP_MROUTE is set and a routing daemon like pimd or mrouted runs on this host. Multicast packets are handled by the ip_route_input_mc() method, in which a routing table entry (an rtable object) is allocated and initialized, and in which, when CONFIG_IP_MROUTE is set, the input callback of the dst object is set to be ip_mr_input().
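Before walking through the kernel Rx path, it may help to see the receiver side that triggers it. The following is a minimal sketch of a host joining the group from Figure 6-1 with IP_ADD_MEMBERSHIP, using the standard POSIX sockets API; the UDP port is an arbitrary choice for the example:

```c
/* Receiver-side sketch: joining 224.225.0.1, which is handled in the
 * kernel by ip_mc_join_group() (illustrative only).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	struct sockaddr_in addr;
	struct ip_mreq mreq;
	char buf[1500];

	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(5000);		/* illustrative port */
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("bind");

	/* IP_ADD_MEMBERSHIP adds 224.225.0.1 to the mc_list of the chosen
	 * interface; this is what triggers the IGMP membership report.
	 */
	mreq.imr_multiaddr.s_addr = inet_addr("224.225.0.1");
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);
	if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
		perror("IP_ADD_MEMBERSHIP");

	/* Block until multicast traffic forwarded by the router arrives. */
	if (recv(fd, buf, sizeof(buf), 0) < 0)
		perror("recv");

	/* Leaving the group is handled by ip_mc_leave_group() in the kernel. */
	setsockopt(fd, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq));
	close(fd);
	return 0;
}
```

On the multicast router, traffic sent to this group then enters the Rx path examined next.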
Let's take a look at the ip_mr_input() method: + +int ip_mr_input(struct sk_buff *skb) + +{ + +struct mfc_cache *cache; + +struct net *net = dev_net(skb->dev); + +First the local flag is set to true if the packet is intended for local delivery, as the ip_mr_input() method also handles local multicast packets. + +int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; + +struct mr_table *mrt; + +/* Packet is looped back after forward, it should not be + +* forwarded second time, but still can be delivered locally. + +*/ + +if (IPCB(skb)->flags & IPSKB_FORWARDED) + +goto dont_forward; + +Normally, when working with a single multicast routing table, the ipmr_rt_fib_lookup() method simply returns the net->ipv4.mrt object: + +mrt = ipmr_rt_fib_lookup(net, skb); + +if (IS_ERR(mrt)) { + +kfree_skb(skb); + +return PTR_ERR(mrt); + +} + +if (!local) { + +IGMPv3 and some IGMPv2 implementations set the router alert option (IPOPT_RA) in the IPv4 header when sending JOIN or LEAVE packets. See the igmpv3_newpack() method in net/ipv4/igmp.c: + +if (IPCB(skb)->opt.router_alert) { + +The ip_call_ra_chain() method (net/ipv4/ip_input.c) calls the raw_rcv() method to pass the packet to the userspace raw socket, which listens. The ip_ra_chain object contains a reference to the multicast routing socket, which is passed as a parameter to the raw_rcv() method. For more details, look at the ip_call_ra_chain() method implementation, in net/ipv4/ip_input.c: + +if (ip_call_ra_chain(skb)) + +return 0; + +There are implementations where the router alert option is not set, as explained in the following comment; these cases must be handled as well, by calling the raw_rcv() method directly: + +} else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) { + +/* IGMPv1 (and broken IGMPv2 implementations sort of + +* Cisco IOS <= 11.2(8)) do not put router alert + +* option to IGMP packets destined to routable + +* groups. It is very bad, because it means + +* that we can forward NO IGMP messages. + +*/ + +struct sock *mroute_sk; + +The mrt->mroute_sk socket is a copy in the kernel of the socket that the multicast routing userspace application created: + +mroute_sk = rcu_dereference(mrt->mroute_sk); + +if (mroute_sk) { + +nf_reset(skb); + +raw_rcv(mroute_sk, skb); + +return 0; + +} + +} + +} + +First a lookup in the multicast routing cache, mfc_cache_array, is performed by calling the ipmr_cache_find() method. 
The hash key is the destination multicast group address and the source IP address of the packet, taken from the IPv4 header: + +cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + +if (cache == NULL) { + +A lookup in the virtual devices array (vif_table) is performed to see whether there is a corresponding entry which matches the incoming network device (skb->dev): + +int vif = ipmr_find_vif(mrt, skb->dev); + +The ipmr_cache_find_any() method handles the advanced feature of multicast proxy support (which is not discussed in this book): + +if (vif >= 0) + +cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, + +vif); + +} + +/* + +* No usable cache entry + +*/ + +if (cache == NULL) { + +int vif; + +If the packet is destined to the local host, deliver it: + +if (local) { + +struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + +ip_local_deliver(skb); + +if (skb2 == NULL) + +return -ENOBUFS; + +skb = skb2; + +} + +read_lock(&mrt_lock); + +vif = ipmr_find_vif(mrt, skb->dev); + +if (vif >= 0) { + +The ipmr_cache_unresolved() method creates a multicast routing entry (mfc_cache object) by calling the ipmr_cache_alloc_unres() method. This method creates a cache entry (mfc_cache object) and initializes its expiration time interval (by setting mfc_un.unres.expires). Let's take a look at this very short method, ipmr_cache_alloc_unres(): + +static struct mfc_cache *ipmr_cache_alloc_unres(void) + +{ + +struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); + +if (c) { + +skb_queue_head_init(&c->mfc_un.unres.unresolved); + +Setting the expiration time interval: + +c->mfc_un.unres.expires = jiffies + 10*HZ; + +} + +return c; + +} + +If the routing daemon does not resolve the routing entry within its expiration interval, the entry is removed from the queue of the unresolved entries. When creating a multicast routing table (by the ipmr_new_table() method), its timer (ipmr_expire_timer) is set. This timer invokes the ipmr_expire_process() method periodically. The ipmr_expire_process() method iterates over all the unresolved cache entries in the queue of unresolved entries (mfc_unres_queue of the mrtable object) and removes the expired unresolved cache entries. + +After creating the unresolved cache entry, the ipmr_cache_unresolved() method adds it to the queue of unresolved entries (mfc_unres_queue of the multicast table, mrtable) and increments by 1 the unresolved queue length (cache_resolve_queue_len of the multicast table, mrtable). It also calls the ipmr_cache_report() method, which builds an IGMP message (IGMPMSG_NOCACHE) and delivers it to the userspace multicast routing daemon by calling eventually the sock_queue_rcv_skb() method. + +I mentioned that the userspace routing daemon should resolve the routing within some time interval. I will not delve into how this is implemented in userspace. 
Note, however, that once the routing daemon decides it should resolve an unresolved entry, it builds the cache entry parameters (in an mfcctl object) and calls setsockopt() with the MRT_ADD_MFC socket option, then it passes the mfcctl object embedded in the optval parameter of the setsockopt() system call; this is handled in the kernel by the ipmr_mfc_add() method: + +int err2 = ipmr_cache_unresolved(mrt, vif, skb); + +read_unlock(&mrt_lock); + +return err2; + +} + +read_unlock(&mrt_lock); + +kfree_skb(skb); + +return -ENODEV; + +} + +read_lock(&mrt_lock); + +If a cache entry was found in the MFC, call the ip_mr_forward() method to continue the packet traversal: + +ip_mr_forward(net, mrt, skb, cache, local); + +read_unlock(&mrt_lock); + +if (local) + +return ip_local_deliver(skb); + +return 0; + +dont_forward: + +if (local) + +return ip_local_deliver(skb); + +kfree_skb(skb); + +return 0; + +} + +This section detailed the IPv4 Multicast Rx path and the interaction with the routing daemon in this path. The next section describes the multicast routing forwarding method, ip_mr_forward(). + +### The ip_mr_forward() Method + +Let's take a look at the ip_mr_forward() method: + +static int ip_mr_forward(struct net *net, struct mr_table *mrt, + +struct sk_buff *skb, struct mfc_cache *cache, + +int local) + +{ + +int psend = -1; + +int vif, ct; + +int true_vifi = ipmr_find_vif(mrt, skb->dev); + +vif = cache->mfc_parent; + +Here you can see update statistics of the resolved cache object (mfc_un.res): + +cache->mfc_un.res.pkt++; + +cache->mfc_un.res.bytes += skb->len; + +if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { + +struct mfc_cache *cache_proxy; + +The expression (*, G) means traffic from any source sending to the group G: + +/* For an (*,G) entry, we only check that the incomming + +* interface is part of the static tree. + +*/ + +cache_proxy = ipmr_cache_find_any_parent(mrt, vif); + +if (cache_proxy && + +cache_proxy->mfc_un.res.ttls[true_vifi] < 255) + +goto forward; + +} + +/* + +* Wrong interface: drop packet and (maybe) send PIM assert. + +*/ + +if (mrt->vif_table[vif].dev != skb->dev) { + +if (rt_is_output_route(skb_rtable(skb))) { + +/* It is our own packet, looped back. + +* Very complicated situation... + +* + +* The best workaround until routing daemons will be + +* fixed is not to redistribute packet, if it was + +* send through wrong interface. It means, that + +* multicast applications WILL NOT work for + +* (S,G), which have default multicast route pointing + +* to wrong oif. In any case, it is not a good + +* idea to use multicasting applications on router. + +*/ + +goto dont_forward; + +} + +cache->mfc_un.res.wrong_if++; + +if (true_vifi >= 0 && mrt->mroute_do_assert && + +/* pimsm uses asserts, when switching from RPT to SPT, + +* so that we cannot check that packet arrived on an oif. + +* It is bad, but otherwise we would need to move pretty + +* large chunk of pimd to kernel. Ough... 
--ANK + +*/ + +(mrt->mroute_do_pim || + +cache->mfc_un.res.ttls[true_vifi] < 255) && + +time_after(jiffies, + +cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { + +cache->mfc_un.res.last_assert = jiffies; + +Call the ipmr_cache_report() method to build an IGMP message (IGMPMSG_WRONGVIF) and to deliver it to the userspace multicast routing daemon by calling the sock_queue_rcv_skb() method: + +ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); + +} + +goto dont_forward; + +} + +The frame is now ready to be forwarded: + +forward: + +mrt->vif_table[vif].pkt_in++; + +mrt->vif_table[vif].bytes_in += skb->len; + +/* + +* Forward the frame + +*/ + +if (cache->mfc_origin == htonl(INADDR_ANY) && + +cache->mfc_mcastgrp == htonl(INADDR_ANY)) { + +if (true_vifi >= 0 && + +true_vifi != cache->mfc_parent && + +ip_hdr(skb)->ttl > + +cache->mfc_un.res.ttls[cache->mfc_parent]) { + +/* It's an (*,*) entry and the packet is not coming from + +* the upstream: forward the packet to the upstream + +* only. + +*/ + +psend = cache->mfc_parent; + +goto last_forward; + +} + +goto dont_forward; + +} + +for (ct = cache->mfc_un.res.maxvif - 1; + +ct >= cache->mfc_un.res.minvif; ct--) { + +/* For (*,G) entry, don't forward to the incoming interface */ + +if ((cache->mfc_origin != htonl(INADDR_ANY) || + +ct != true_vifi) && + +ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { + +if (psend != -1) { + +struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + +Call the ipmr_queue_xmit() method to continue with the packet forwarding: + +if (skb2) + +ipmr_queue_xmit(net, mrt, skb2, cache, + +psend); + +} + +psend = ct; + +} + +} + +last_forward: + +if (psend != -1) { + +if (local) { + +struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + +if (skb2) + +ipmr_queue_xmit(net, mrt, skb2, cache, psend); + +} else { + +ipmr_queue_xmit(net, mrt, skb, cache, psend); + +return 0; + +} + +} + +dont_forward: + +if (!local) + +kfree_skb(skb); + +return 0; + +} + +Now that I have covered the multicast routing forwarding method, ip_mr_forward(), it is time to examine the ipmr_queue_xmit() method. + +### The ipmr_queue_xmit() Method + +Let's take a look at the ipmr_queue_xmit() method: + +static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, + +struct sk_buff *skb, struct mfc_cache *c, int vifi) + +{ + +const struct iphdr *iph = ip_hdr(skb); + +struct vif_device *vif = &mrt->vif_table[vifi]; + +struct net_device *dev; + +struct rtable *rt; + +struct flowi4 fl4; + +The encap field is used when working with a tunnel: + +int encap = 0; + +if (vif->dev == NULL) + +goto out_free; + +#ifdef CONFIG_IP_PIMSM + +if (vif->flags & VIFF_REGISTER) { + +vif->pkt_out++; + +vif->bytes_out += skb->len; + +vif->dev->stats.tx_bytes += skb->len; + +vif->dev->stats.tx_packets++; + +ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT); + +goto out_free; + +} + +#endif + +When working with a tunnel, a routing lookup is performed with the vif->remote and vif->local, which represent the destination and local addresses, respectively. These addresses are the end points of the tunnel. 
When working with a vif_device object that represents a physical device, a routing lookup is performed with the destination address of the IPv4 header and 0 as the source address:

if (vif->flags & VIFF_TUNNEL) {

rt = ip_route_output_ports(net, &fl4, NULL,

vif->remote, vif->local,

0, 0,

IPPROTO_IPIP,

RT_TOS(iph->tos), vif->link);

if (IS_ERR(rt))

goto out_free;

encap = sizeof(struct iphdr);

} else {

rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,

0, 0,

IPPROTO_IPIP,

RT_TOS(iph->tos), vif->link);

if (IS_ERR(rt))

goto out_free;

}

dev = rt->dst.dev;

Note that if the packet size is larger than the MTU, an ICMPv4 message is not sent (as would be done in such a case in unicast forwarding); only the statistics are updated, and the packet is discarded:

if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {

/* Do not fragment multicasts. Alas, IPv4 does not

* allow to send ICMP, so that packets will disappear

* to blackhole.

*/

IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);

ip_rt_put(rt);

goto out_free;

}

encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;

if (skb_cow(skb, encap)) {

ip_rt_put(rt);

goto out_free;

}

vif->pkt_out++;

vif->bytes_out += skb->len;

skb_dst_drop(skb);

skb_dst_set(skb, &rt->dst);

The TTL is decreased, and the IPv4 header checksum is recalculated (because the TTL is one of the IPv4 header fields) when forwarding the packet; the same is done in the ip_forward() method for unicast packets:

ip_decrease_ttl(ip_hdr(skb));

/* FIXME: forward and output firewalls used to be called here.

* What do we do with netfilter? -- RR

*/

if (vif->flags & VIFF_TUNNEL) {

ip_encap(skb, vif->local, vif->remote);

/* FIXME: extra output firewall step used to be here. --RR */

vif->dev->stats.tx_packets++;

vif->dev->stats.tx_bytes += skb->len;

}

IPCB(skb)->flags |= IPSKB_FORWARDED;

/*

* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally

* not only before forwarding, but after forwarding on all output

* interfaces. It is clear, if mrouter runs a multicasting

* program, it should receive packets not depending to what interface

* program is joined.

* If we will not make it, the program will have to join on all

* interfaces. On the other hand, multihoming host (or router, but

* not mrouter) cannot join to more than one interface - it will

* result in receiving multiple packets.

*/

Invoke the NF_INET_FORWARD hook:

NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,

ipmr_forward_finish);

return;

out_free:

kfree_skb(skb);

}

### The ipmr_forward_finish() Method

Let's take a look at the ipmr_forward_finish() method, which is a very short method; it is in fact almost identical to the ip_forward_finish() method used for unicast packets:

static inline int ipmr_forward_finish(struct sk_buff *skb)

{

struct ip_options *opt = &(IPCB(skb)->opt);

IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);

IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);

Handle IPv4 options, if set (see Chapter 4):

if (unlikely(opt->optlen))

ip_forward_options(skb);

return dst_output(skb);

}

Eventually, dst_output() sends the packet via the ip_mc_output() method, which calls the ip_finish_output() method (both methods are in net/ipv4/ip_output.c).
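Having followed a forwarded packet down to dst_output(), it is worth seeing where the TTL that ipmr_queue_xmit() decrements comes from. A sender controls it with the IP_MULTICAST_TTL socket option; here is a minimal sketch (standard POSIX sockets API; the group, port, and TTL value are illustrative):

```c
/* Sender-side sketch: setting the multicast TTL before sending to the
 * group from Figure 6-1 (illustrative only).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	struct sockaddr_in dst;
	/* TTL 32: restricted to the same site, per the scope table in the
	 * next section.
	 */
	unsigned char ttl = 32;
	const char msg[] = "hello, group";

	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)) < 0)
		perror("IP_MULTICAST_TTL");

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_addr.s_addr = inet_addr("224.225.0.1");
	dst.sin_port = htons(5000);

	if (sendto(fd, msg, sizeof(msg), 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}
```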
Now that I have covered these multicast methods, let's get a better understanding of how the value of the TTL field is used in multicast traffic.

### The TTL in Multicast Traffic

The TTL field of the IPv4 header has a double meaning in multicast traffic. The first is the same as in unicast IPv4 traffic: the TTL represents a hop counter, decremented by 1 by every device that forwards the packet. When it reaches 0, the packet is discarded. This is done to avoid the endless travelling of packets due to some error. The second meaning of the TTL, which is unique to multicast traffic, is a threshold. The TTL values are divided into scopes. Routers have a TTL threshold assigned to each of their interfaces, and only packets with a TTL greater than the interface's threshold are forwarded. Here are the values of these thresholds:

  * 0: Restricted to the same host (cannot be sent out by any interface)

  * 1: Restricted to the same subnet (will not be forwarded by a router)

  * 32: Restricted to the same site

  * 64: Restricted to the same region

  * 128: Restricted to the same continent

  * 255: Unrestricted in scope (global)

See "IP Multicast Extensions for 4.3BSD UNIX and related systems," by Steve Deering, available at www.kohala.com/start/mcast.api.txt.

Note

IPv4 Multicast Routing is implemented in net/ipv4/ipmr.c, include/linux/mroute.h, and include/uapi/linux/mroute.h.

This completes my discussion of Multicast Routing. The chapter now moves on to Policy Routing, which enables you to configure routing policies that are not based solely on the destination address.

## Policy Routing

With Policy Routing, a system administrator can define up to 255 routing tables. This section discusses IPv4 Policy Routing; IPv6 Policy Routing is discussed in Chapter 8. In this section, I use the terms policy or rule for entries created by Policy Routing, in order to avoid confusing them with the ordinary routing entries discussed in Chapter 5.

### Policy Routing Management

Policy Routing management is done with the ip rule command of the iproute2 package (there is no parallel for Policy Routing management with the route command). Let's see how to add, delete, and dump all Policy Routing rules:

  * You add a rule with the ip rule add command; for example: ip rule add tos 0x04 table 252. After this rule is inserted, every packet whose IPv4 TOS field matches 0x04 will be handled according to the routing rules of table 252. You can add routing entries to this table by specifying the table number when adding a route; for example: ip route add default via 192.168.2.10 table 252. The ip rule add command is handled in the kernel by the fib_nl_newrule() method, in net/core/fib_rules.c (a sketch of the corresponding netlink message appears at the end of this section). The tos modifier in the ip rule command earlier is one of the available SELECTOR modifiers of the ip rule command; see man 8 ip rule, and also Table 6-1 in the "Quick Reference" section at the end of this chapter.

  * You delete a rule with the ip rule del command; for example: ip rule del tos 0x04 table 252. This command is handled in the kernel by the fib_nl_delrule() method in net/core/fib_rules.c.

  * You dump all the rules with the ip rule list command or the ip rule show command. Both of these commands are handled in the kernel by the fib_nl_dumprule() method in net/core/fib_rules.c.

You now have a good idea about the basics of Policy Routing management; after a short look at the netlink message behind ip rule add, let's examine the Linux implementation of Policy Routing.
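To illustrate what fib_nl_newrule() receives, here is a hedged sketch that adds the equivalent of ip rule add tos 0x04 table 252 by sending an RTM_NEWRULE message over an rtnetlink socket. The priority value is arbitrary, no acknowledgment is read back, and CAP_NET_ADMIN is required; treat it as an illustration of the message layout rather than production code:

```c
/* Sketch: RTM_NEWRULE over rtnetlink, roughly equivalent to
 * "ip rule add tos 0x04 table 252" (illustrative only).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/fib_rules.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct fib_rule_hdr frh;
		char attrs[64];
	} req;
	struct sockaddr_nl sa;
	struct rtattr *rta;
	__u32 prio = 10000;	/* arbitrary rule priority */

	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.frh));
	req.nlh.nlmsg_type = RTM_NEWRULE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;

	req.frh.family = AF_INET;
	req.frh.tos = 0x04;		/* the selector */
	req.frh.table = 252;		/* target routing table */
	req.frh.action = FR_ACT_TO_TBL;

	/* FRA_PRIORITY attribute: where the rule sits in the rule list. */
	rta = (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = FRA_PRIORITY;
	rta->rta_len = RTA_LENGTH(sizeof(prio));
	memcpy(RTA_DATA(rta), &prio, sizeof(prio));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;
	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&sa, sizeof(sa)) < 0)
		perror("RTM_NEWRULE");

	close(fd);
	return 0;
}
```

The iproute2 ip rule command builds essentially this message for you; the sketch only shows that a rule boils down to a fib_rule_hdr plus attributes such as FRA_PRIORITY.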
### Policy Routing Implementation

The core infrastructure of Policy Routing is the fib_rules module, net/core/fib_rules.c. It is used by three protocols of the kernel networking stack: IPv4 (including the multicast module, which has a Multicast Policy Routing feature, as mentioned in the "Multicast Routing" section earlier in this chapter), IPv6, and DECnet. The IPv4 Policy Routing is also implemented in a file named fib_rules.c; don't be confused by the identical name (net/ipv4/fib_rules.c). In IPv6, Policy Routing is implemented in net/ipv6/fib6_rules.c. The header file include/net/fib_rules.h contains the data structures and methods of the Policy Routing core. Here is the definition of the fib4_rule structure, which is the basis for IPv4 Policy Routing:

```c
struct fib4_rule {
	struct fib_rule common;
	u8		dst_len;
	u8		src_len;
	u8		tos;
	__be32		src;
	__be32		srcmask;
	__be32		dst;
	__be32		dstmask;
#ifdef CONFIG_IP_ROUTE_CLASSID
	u32		tclassid;
#endif
};
```

(net/ipv4/fib_rules.c)

Three policies are created by default at boot time, by calling the fib_default_rules_init() method: the local (RT_TABLE_LOCAL) table, the main (RT_TABLE_MAIN) table, and the default (RT_TABLE_DEFAULT) table. Lookup is done by the fib_lookup() method. Note that there are two different implementations of the fib_lookup() method in include/net/ip_fib.h. The first one, which is wrapped in the #ifndef CONFIG_IP_MULTIPLE_TABLES block, is for non-Policy Routing, and the second is for Policy Routing. When working with Policy Routing, the lookup is performed like this: if there were no changes to the initial Policy Routing rules (net->ipv4.fib_has_custom_rules is not set), the rule must be in one of the three initial routing tables. So first a lookup is done in the local table, then in the main table, and then in the default table. If there is no corresponding entry, a network unreachable (-ENETUNREACH) error is returned. If there was some change to the initial Policy Routing rules (net->ipv4.fib_has_custom_rules is set), the __fib_lookup() method is invoked, which is a heavier method, because it iterates over the list of rules and calls fib_rule_match() for each rule in order to decide whether it matches or not. See the implementation of the fib_rules_lookup() method in net/core/fib_rules.c. (The fib_rules_lookup() method is invoked from the __fib_lookup() method.) I should mention here that the net->ipv4.fib_has_custom_rules variable is set to false in the initialization phase, by the fib4_rules_init() method, and to true in the fib4_rule_configure() method and the fib4_rule_delete() method. Note that CONFIG_IP_MULTIPLE_TABLES should be set for working with Policy Routing.

This concludes my Policy Routing discussion. The next section talks about Multipath Routing, which is the ability to add more than one nexthop to a route.

## Multipath Routing

Multipath Routing provides the ability to add more than one nexthop to a route. Defining two nexthop nodes can be done like this, for example: ip route add default scope global nexthop dev eth0 nexthop dev eth1. A system administrator can also assign weights to each nexthop, like this, for example: ip route add 192.168.1.10 nexthop via 192.168.2.1 weight 3 nexthop via 192.168.2.10 weight 5.
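The weighted selection can be pictured with the toy userspace model below. This is illustrative only and is not the kernel's fib_select_multipath() code (which, as noted shortly, mixes in jiffies and per-nexthop state); it just demonstrates the weighted-slice idea with the weights from the example above:

```c
/* Toy model of weighted nexthop selection (illustrative only; the real
 * logic lives in fib_select_multipath()).
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct toy_nh {
	const char *via;
	int weight;	/* corresponds to nh_weight in struct fib_nh */
};

static int select_nexthop(const struct toy_nh *nhs, int n)
{
	int total = 0, point, i;

	for (i = 0; i < n; i++)
		total += nhs[i].weight;

	/* A random point in [0, total): heavier nexthops own larger slices. */
	point = rand() % total;
	for (i = 0; i < n; i++) {
		point -= nhs[i].weight;
		if (point < 0)
			return i;
	}
	return n - 1;	/* not reached */
}

int main(void)
{
	/* The example route from the text: weights 3 and 5. */
	struct toy_nh nhs[] = {
		{ "192.168.2.1", 3 },
		{ "192.168.2.10", 5 },
	};
	int hits[2] = { 0, 0 }, i;

	srand((unsigned)time(NULL));
	for (i = 0; i < 8000; i++)
		hits[select_nexthop(nhs, 2)]++;

	/* Expect roughly 3000 vs. 5000 picks. */
	printf("via %s: %d\nvia %s: %d\n",
	       nhs[0].via, hits[0], nhs[1].via, hits[1]);
	return 0;
}
```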
The fib_info structure represents an IPv4 routing entry that can have more than one FIB nexthop. The fib_nhs member of the fib_info object represents the number of FIB nexthop objects; the fib_info object contains an array of FIB nexthop objects named fib_nh. So in this case, a single fib_info object is created, with an array of two FIB nexthop objects. The kernel keeps the weight of each nexthop in the nh_weight field of the FIB nexthop object (fib_nh). If a weight was not specified when adding a multipath route, it is set to 1 by default, in the fib_create_info() method. The fib_select_multipath() method is called to determine the nexthop when working with Multipath Routing. This method is invoked from two places: from the __ip_route_output_key() method, in the Tx path, and from the ip_mkroute_input() method, in the Rx path. Note that when the output device is set in the flow, the fib_select_multipath() method is not invoked, because the output device is known:

```c
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
{
	...
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	...
}
```

In the Rx path there is no need to check whether fl4->flowi4_oif is 0, because it is set to 0 at the beginning of that method. I won't delve into the details of the fib_select_multipath() method. I will only mention that there is an element of randomness in the method, using jiffies, to help create a fair weighted route distribution, and that the weight of each nexthop is taken into account. The FIB nexthop to use is assigned by setting the FIB nexthop selector (nh_sel) of the specified fib_result object. In contrast to Multicast Routing, which is handled by a dedicated module (net/ipv4/ipmr.c), the code of Multipath Routing appears scattered in the existing routing code, enclosed in #ifdef CONFIG_IP_ROUTE_MULTIPATH conditionals; no separate module was added to the source code to support it. As mentioned in Chapter 5, there was support for an IPv4 multipath routing cache, but it was removed in 2007, in kernel 2.6.23; in fact, it never worked very well and never got out of the experimental state. Do not confuse the removal of the multipath routing cache with the removal of the routing cache; these are two different caches. The removal of the routing cache took place five years later, in kernel 3.6 (2012).

Note

CONFIG_IP_ROUTE_MULTIPATH should be set for Multipath Routing support.

## Summary

This chapter covered advanced IPv4 routing topics, like Multicast Routing, the IGMP protocol, Policy Routing, and Multipath Routing. You learned about the fundamental structures of Multicast Routing, such as the multicast routing table (mr_table), the Multicast Forwarding Cache (MFC), the Vif device, and more. You also learned what should be done to set up a host as a multicast router, and about the use of the TTL field in Multicast Routing. Chapter 7 deals with the Linux neighbouring subsystem. The "Quick Reference" section that follows covers the top methods related to the topics discussed in this chapter, ordered by their context.

## Quick Reference

I conclude this chapter with a short list of important routing subsystem methods (some of which were mentioned in this chapter), a list of macros, and procfs multicast entries and tables.

### Methods

Let's start with the methods:

#### int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen);

This method handles setsockopt() calls from the multicast routing daemon.
The supported socket options are: MRT_INIT, MRT_DONE, MRT_ADD_VIF, MRT_DEL_VIF, MRT_ADD_MFC, MRT_DEL_MFC, MRT_ADD_MFC_PROXY, MRT_DEL_MFC_PROXY, MRT_ASSERT, MRT_PIM (when PIM support is set), and MRT_TABLE (when Multicast Policy Routing is set). + +#### int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen); + +This method handles getsockopt() calls from the multicast routing daemon. The supported socket options are MRT_VERSION, MRT_ASSERT and MRT_PIM. + +#### struct mr_table *ipmr_new_table(struct net *net, u32 id); + +This method creates a new multicast routing table. The id of the table will be the specified id. + +#### void ipmr_free_table(struct mr_table *mrt); + +This method frees the specified multicast routing table and the resources attached to it. + +#### int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr); + +This method is for joining a multicast group. The address of the multicast group to be joined is specified in the given ip_mreqn object. The method returns 0 on success. + +#### static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, __be32 origin, __be32 mcastgrp); + +This method performs a lookup in the IPv4 multicast routing cache. It returns NULL when no entry is found. + +#### bool ipv4_is_multicast(__be32 addr); + +This method returns true if the address is a multicast address. + +#### int ip_mr_input(struct sk_buff *skb); + +This method is the main IPv4 multicast Rx method (net/ipv4/ipmr.c). + +#### struct mfc_cache *ipmr_cache_alloc(void); + +This method allocates a multicast forwarding cache (mfc_cache) entry. + +#### static struct mfc_cache *ipmr_cache_alloc_unres(void); + +This method allocates a multicast routing cache (mfc_cache) entry for the unresolved cache and sets the expires field of the queue of unresolved entries. + +#### void fib_select_multipath(struct fib_result *res); + +This method is called to determine the nexthop when working with Multipath Routing. + +#### int dev_set_allmulti(struct net_device *dev, int inc); + +This method increments/decrements the allmulti counter of the specified network device according to the specified increment (the increment can be a positive number or a negative number). + +#### int igmp_rcv(struct sk_buff *skb); + +This method is the receive handler for IGMP packets. + +#### static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, struct mfcctl *mfc, int mrtsock, int parent); + +This method adds a multicast cache entry; it is invoked by calling setsockopt() from userspace with MRT_ADD_MFC. + +#### static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent); + +This method deletes a multicast cache entry; it is invoked by calling setsockopt() from userspace with MRT_DEL_MFC. + +#### static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock); + +This method adds a multicast virtual interface; it is invoked by calling setsockopt() from userspace with MRT_ADD_VIF. + +#### static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head); + +This method deletes a multicast virtual interface; it is invoked by calling setsockopt() from userspace with MRT_DEL_VIF. + +#### static void ipmr_expire_process(unsigned long arg); + +This method removes expired entries from the queue of unresolved entries. 
#### static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert);

This method builds an IGMP packet, setting the type in the IGMP header to be the specified assert value and the code to be 0. This IGMP packet is delivered to the userspace multicast routing daemon by calling the sock_queue_rcv_skb() method. The assert parameter can be assigned one of these values: IGMPMSG_NOCACHE, used when an unresolved cache entry is added to the queue of unresolved entries and the userspace routing daemon should be notified that it needs to resolve it; IGMPMSG_WRONGVIF; and IGMPMSG_WHOLEPKT.

#### static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr);

This method is a notifier callback, registered by the register_netdevice_notifier() method. When a network device is unregistered, a NETDEV_UNREGISTER event is generated; this callback receives the event and deletes the vif_device objects in the vif_table whose device is the one that was unregistered.

#### static void mrtsock_destruct(struct sock *sk);

This method is called when the userspace routing daemon calls setsockopt() with MRT_DONE. This method nullifies the multicast routing socket (mroute_sk of the multicast routing table), decrements the mc_forwarding procfs entry, and calls the mroute_clean_tables() method to free resources.

### Macros

This section describes our macros.

#### MFC_HASH(a, b)

This macro calculates the hash value for adding entries to the MFC cache. It takes the group multicast address and the source IPv4 address as parameters.

#### VIF_EXISTS(_mrt, _idx)

This macro checks the existence of an entry in the vif_table; it returns true if the array of multicast virtual devices (vif_table) of the specified multicast routing table (mrt) has an entry with the specified index (_idx).

### Procfs Multicast Entries

The following is a description of two important procfs multicast entries:

#### /proc/net/ip_mr_vif

Lists all the multicast virtual interfaces; it displays all the vif_device objects in the multicast virtual device table (vif_table). Displaying the /proc/net/ip_mr_vif entry is handled by the ipmr_vif_seq_show() method.

#### /proc/net/ip_mr_cache

The state of the Multicast Forwarding Cache (MFC). This entry shows the following fields of all the cache entries: group multicast address (mfc_mcastgrp), source IP address (mfc_origin), input interface index (mfc_parent), forwarded packets (mfc_un.res.pkt), forwarded bytes (mfc_un.res.bytes), wrong interface index (mfc_un.res.wrong_if), the index of the forwarding interface (an index in the vif_table), and the entry in the mfc_un.res.ttls array corresponding to this index. Displaying the /proc/net/ip_mr_cache entry is handled by the ipmr_mfc_seq_show() method.

### Table

And finally, here in Table 6-1, is the table of rule selectors.

Table 6-1. IP Rule Selectors

| Linux Symbol | Selector | Member | Structure |
|---|---|---|---|
| FRA_SRC | from | src | fib4_rule |
| FRA_DST | to | dst | fib4_rule |
| FRA_IIFNAME | iif | iifname | fib_rule |
| FRA_OIFNAME | oif | oifname | fib_rule |
| FRA_FWMARK | fwmark | mark | fib_rule |
| FRA_FWMASK | fwmark/fwmask | mark_mask | fib_rule |
| FRA_PRIORITY | preference, order, priority | pref | fib_rule |
| - | tos, dsfield | tos | fib4_rule |
# 7. Linux Neighbouring Subsystem

Abstract

This chapter discusses the Linux neighbouring subsystem and its implementation in Linux. The neighbouring subsystem is responsible for the discovery of the presence of nodes on the same link and for translation of L3 (network layer) addresses to L2 (link layer) addresses. L2 addresses are needed to build the L2 header for outgoing packets, as described in the next section. The protocol that implements this translation is called the Address Resolution Protocol (ARP) in IPv4 and Neighbour Discovery protocol (NDISC or ND) in IPv6. The neighbouring subsystem provides a protocol-independent infrastructure for performing L3-to-L2 mappings. The discussion in this chapter, however, is restricted to the most common cases—namely, the neighbouring subsystem usage in IPv4 and in IPv6. Keep in mind that the ARP protocol, like the ICMP protocol discussed in Chapter 3, is subject to security threats—such as ARP poisoning attacks and ARP spoofing attacks (security aspects of the ARP protocol are beyond the scope of this book).

I first discuss the common neighbouring data structures in this chapter and some important API methods, which are used both in IPv4 and in IPv6. Then I discuss the particular implementations of the ARP protocol and NDISC protocol. You will see how a neighbour is created and how it is freed, and you will learn about the interaction between userspace and the neighbouring subsystem. You will also learn about ARP requests and ARP replies, about NDISC neighbour solicitation and NDISC neighbour advertisements, and about a mechanism called Duplicate Address Detection (DAD), which is used by the NDISC protocol to avoid duplicate IPv6 addresses.

## The Neighbouring Subsystem Core

What is the neighbouring subsystem needed for? When a packet is sent over the L2 layer, the L2 destination address is needed to build an L2 header. Using the neighbouring subsystem solicitation requests and solicitation replies, the L2 address of a host can be found out given its L3 address (or the fact that such an L3 address does not exist). In Ethernet, which is the most commonly used link layer (L2), the L2 address of a host is its MAC address. In IPv4, ARP is the neighbouring protocol, and solicitation requests and solicitation replies are called ARP requests and ARP replies, respectively.
In IPv6, the neighbouring protocol is NDISC, and solicitation requests and solicitation replies are called neighbour solicitations and neighbour advertisements, respectively.

There are cases where the destination address can be found without any help from the neighbouring subsystem—for example, when a broadcast is sent. In this case, the destination L2 address is fixed (for example, it is FF:FF:FF:FF:FF:FF in Ethernet). Or when the destination address is a multicast address, there is a fixed mapping between an L3 multicast address and its L2 address. I discuss such cases in the course of this chapter.

The basic data structure of the Linux neighbouring subsystem is the neighbour. A neighbour represents a network node that is attached to the same link (L2). It is represented by the neighbour structure. This representation is not unique to a particular protocol; as mentioned, however, the discussion of the neighbour structure will be restricted to its use in the IPv4 and IPv6 protocols. Let's take a look at the neighbour structure:

```c
struct neighbour {
	struct neighbour __rcu	*next;
	struct neigh_table	*tbl;
	struct neigh_parms	*parms;
	unsigned long		confirmed;
	unsigned long		updated;
	rwlock_t		lock;
	atomic_t		refcnt;
	struct sk_buff_head	arp_queue;
	unsigned int		arp_queue_len_bytes;
	struct timer_list	timer;
	unsigned long		used;
	atomic_t		probes;
	__u8			flags;
	__u8			nud_state;
	__u8			type;
	__u8			dead;
	seqlock_t		ha_lock;
	unsigned char		ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
	struct hh_cache		hh;
	int			(*output)(struct neighbour *, struct sk_buff *);
	const struct neigh_ops	*ops;
	struct rcu_head		rcu;
	struct net_device	*dev;
	u8			primary_key[0];
};
```

(include/net/neighbour.h)

The following is a description of some of the important members of the neighbour structure:

  * next: A pointer to the next neighbour in the same bucket of the hash table.

  * tbl: The neighbouring table associated with this neighbour.

  * parms: The neigh_parms object associated with this neighbour. It is initialized by the constructor method of the associated neighbouring table. For example, in IPv4 the arp_constructor() method initializes parms to be the arp_parms of the associated network device. Do not confuse it with the neigh_parms object of the neighbouring table.

  * confirmed: Confirmation timestamp (discussed later in this chapter).

  * refcnt: Reference counter. Incremented by the neigh_hold() macro and decremented by the neigh_release() method. The neigh_release() method frees the neighbour object by calling the neigh_destroy() method, but only if, after decrementing the reference counter, its value is 0.

  * arp_queue: A queue of unresolved SKBs. Despite the name, this member is not unique to ARP; it is used by other protocols as well, such as the NDISC protocol.

  * timer: Every neighbour object has a timer; the timer callback is the neigh_timer_handler() method. The neigh_timer_handler() method can change the Network Unreachability Detection (NUD) state of the neighbour. When solicitation requests are sent while the state of the neighbour is NUD_INCOMPLETE or NUD_PROBE, and the number of solicitation probes is greater than or equal to neigh_max_probes(), the state of the neighbour is set to NUD_FAILED, and the neigh_invalidate() method is invoked.

  * ha_lock: Provides access protection to the neighbour hardware address (ha).
  * ha: The hardware address of the neighbour object; in the case of Ethernet, it is the MAC address of the neighbour.

  * hh: A hardware header cache of the L2 header (an hh_cache object).

  * output: A pointer to a transmit method, like the neigh_resolve_output() method or the neigh_direct_output() method. It is dependent on the NUD state and, as a result, can be assigned to different methods during the lifetime of a neighbour. When initializing the neighbour object in the neigh_alloc() method, it is set to be the neigh_blackhole() method, which discards the packet and returns -ENETDOWN.

And here are the helper methods (methods which set the output callback):

  * void neigh_connect(struct neighbour *neigh)

    Sets the output() method of the specified neighbour to be neigh->ops->connected_output.

  * void neigh_suspect(struct neighbour *neigh)

    Sets the output() method of the specified neighbour to be neigh->ops->output.

  * nud_state: The NUD state of the neighbour. The nud_state value can change dynamically during the lifetime of a neighbour object. Table 7-1 in the "Quick Reference" section at the end of this chapter describes the basic NUD states and their Linux symbols. The NUD state machine is very complex; I do not delve into all of its nuances in this book.

  * dead: A flag that marks the neighbour object as dead (no longer usable). It is initialized to 0 when creating a neighbour object, at the end of the __neigh_create() method. The neigh_destroy() method will fail for neighbour objects whose dead flag is not set. The neigh_flush_dev() method sets the dead flag to 1 but does not yet remove the neighbour entry. The removal of neighbours marked as dead (their dead flag is set) is done later, by the garbage collectors.

  * primary_key: The IP address (L3) of the neighbour. A lookup in the neighbouring tables is done with the primary_key. The primary_key length depends on the protocol used: for IPv4, for example, it should be 4 bytes; for IPv6 it should be sizeof(struct in6_addr), as the in6_addr structure represents an IPv6 address. Therefore, the primary_key is defined as an array of 0 bytes, and when allocating a neighbour, the protocol in use should be taken into account. See the explanation about entry_size and key_len later in this chapter, in the description of the neigh_table structure members.

To avoid sending solicitation requests for each new packet that is transmitted, the kernel keeps the mapping between L3 addresses and L2 addresses in a data structure called a neighbouring table. In the case of IPv4, it is the ARP table (sometimes also called the ARP cache; the two names refer to the same entity)—in contrast to what you saw in the IPv4 routing subsystem in Chapter 5, where the routing cache, before it was removed, and the routing table were two different entities, represented by two different data structures. In the case of IPv6, the neighbouring table is the NDISC table (also known as the NDISC cache). Both the ARP table (arp_tbl) and the NDISC table (nd_tbl) are instances of the neigh_table structure.
Let's take a look at the neigh_table structure:

```c
struct neigh_table {
	struct neigh_table	*next;
	int			family;
	int			entry_size;
	int			key_len;
	__u32			(*hash)(const void *pkey,
					const struct net_device *dev,
					__u32 *hash_rnd);
	int			(*constructor)(struct neighbour *);
	int			(*pconstructor)(struct pneigh_entry *);
	void			(*pdestructor)(struct pneigh_entry *);
	void			(*proxy_redo)(struct sk_buff *skb);
	char			*id;
	struct neigh_parms	parms;
	/* HACK. gc_* should follow parms without a gap! */
	int			gc_interval;
	int			gc_thresh1;
	int			gc_thresh2;
	int			gc_thresh3;
	unsigned long		last_flush;
	struct delayed_work	gc_work;
	struct timer_list	proxy_timer;
	struct sk_buff_head	proxy_queue;
	atomic_t		entries;
	rwlock_t		lock;
	unsigned long		last_rand;
	struct neigh_statistics	__percpu *stats;
	struct neigh_hash_table	__rcu *nht;
	struct pneigh_entry	**phash_buckets;
};
```

(include/net/neighbour.h)

Here are some important members of the neigh_table structure:

  * next: Each protocol creates its own neigh_table instance. There is a linked list of all the neighbouring tables in the system. The neigh_tables global variable is a pointer to the beginning of the list. The next variable points to the next item in this list.

  * family: The protocol family: AF_INET for the IPv4 neighbouring table (arp_tbl), and AF_INET6 for the IPv6 neighbouring table (nd_tbl).

  * entry_size: When allocating a neighbour entry by the neigh_alloc() method, the size for the allocation is tbl->entry_size + dev->neigh_priv_len. Usually the neigh_priv_len value is 0. Before kernel 3.3, entry_size was explicitly initialized to sizeof(struct neighbour) + 4 for ARP, and to sizeof(struct neighbour) + sizeof(struct in6_addr) for NDISC. The reason for this initialization was that when allocating a neighbour, you want to allocate space also for the primary_key[0] member. From kernel 3.3, the entry_size was removed from the static initialization of arp_tbl and nd_tbl, and the entry_size initialization is done based on the key_len in the core neighbouring layer, by the neigh_table_init_no_netlink() method.

  * key_len: The size of the lookup key; it is 4 bytes for IPv4, because the length of an IPv4 address is 4 bytes, and it is sizeof(struct in6_addr) for IPv6. The in6_addr structure represents an IPv6 address.

  * hash: The hash function for mapping a key (L3 address) to a specific hash value; for ARP it is the arp_hash() method, and for NDISC it is the ndisc_hash() method.

  * constructor: This method performs protocol-specific initialization when creating a neighbour object; for example, arp_constructor() for ARP in IPv4 and ndisc_constructor() for NDISC in IPv6. The constructor callback is invoked by the __neigh_create() method. It returns 0 on success.

  * pconstructor: A method for the creation of a neighbour proxy entry; it is not used by ARP, and it is pndisc_constructor for NDISC. This method should return 0 upon success. The pconstructor method is invoked from the pneigh_lookup() method if the lookup fails, on the condition that pneigh_lookup() was invoked with creat set to 1.

  * pdestructor: A method for destroying a neighbour proxy entry. Like the pconstructor callback, pdestructor is not used by ARP, and it is pndisc_destructor for NDISC. The pdestructor method is invoked from the pneigh_delete() method and from the pneigh_ifdown() method.

  * id: The name of the table; it is arp_cache for IPv4 and ndisc_cache for IPv6.
  * parms: A neigh_parms object: each neighbouring table has an associated neigh_parms object, which consists of various configuration settings, like reachability information, various timeouts, and more. The neigh_parms initialization is different in the ARP table and in the NDISC table.

  * gc_interval: Not used directly by the neighbouring core.

  * gc_thresh1, gc_thresh2, gc_thresh3: Thresholds on the number of neighbouring table entries. They are used as criteria for activating the synchronous garbage collector (neigh_forced_gc()) and in the neigh_periodic_work() asynchronous garbage collector handler. See the explanation about allocating a neighbour object in the "Creating and Freeing a Neighbour" section later in this chapter. In the ARP table, the default values are: gc_thresh1 is 128, gc_thresh2 is 512, and gc_thresh3 is 1024. These values can be set by procfs. The same default values are also used in the NDISC table in IPv6. The IPv4 procfs entries are:

    * /proc/sys/net/ipv4/neigh/default/gc_thresh1

    * /proc/sys/net/ipv4/neigh/default/gc_thresh2

    * /proc/sys/net/ipv4/neigh/default/gc_thresh3

    and for IPv6, these are the procfs entries:

    * /proc/sys/net/ipv6/neigh/default/gc_thresh1

    * /proc/sys/net/ipv6/neigh/default/gc_thresh2

    * /proc/sys/net/ipv6/neigh/default/gc_thresh3

  * last_flush: The most recent time when the neigh_forced_gc() method ran. It is initialized to the current time (jiffies) in the neigh_table_init_no_netlink() method.

  * gc_work: Asynchronous garbage collector handler. Set to the neigh_periodic_work() callback by the neigh_table_init_no_netlink() method. The delayed_work struct is a type of work queue. Before kernel 2.6.32, the neigh_periodic_timer() method was the asynchronous garbage collector handler; it processed only one bucket and not the entire neighbouring hash table. The neigh_periodic_work() method first checks whether the number of entries in the table is less than gc_thresh1, and if so, it exits without doing anything; then it recomputes the reachable time (the reachable_time field of parms, which is the neigh_parms object associated with the neighbouring table). Then it scans the neighbouring hash table and removes entries whose state is neither NUD_PERMANENT nor NUD_IN_TIMER, whose reference count is 1, and for which one of these conditions is met: either they are in the NUD_FAILED state, or the current time is later than their used timestamp plus gc_staletime (gc_staletime is a member of the neighbour parms object). Removal of a neighbour entry is done by setting the dead flag to 1 and calling the neigh_cleanup_and_release() method.

  * proxy_timer: When a host is configured as an ARP proxy, it is possible to avoid immediate processing of solicitation requests and to process them with some delay, because an ARP proxy host can receive a large number of solicitation requests (as opposed to a host that is not an ARP proxy, which usually receives a small number of ARP requests). Sometimes you may prefer to delay the reply to such broadcasts, so that the hosts that actually own the requested IP addresses get priority in being the first to answer. This delay is a random value up to the proxy_delay parameter. The ARP proxy timer handler is the neigh_proxy_process() method. The proxy_timer is initialized by the neigh_table_init_no_netlink() method.

  * proxy_queue: Proxy ARP queue of SKBs. SKBs are added with the pneigh_enqueue() method.
  * stats: The neighbour statistics (neigh_statistics) object; it consists of per-CPU counters, like allocs, which is the number of neighbour objects allocated by the neigh_alloc() method, or destroys, which is the number of neighbour objects freed by the neigh_destroy() method, and more. The neighbour statistics counters are incremented by the NEIGH_CACHE_STAT_INC macro. Note that because the statistics are per-CPU counters, this macro uses the this_cpu_inc() macro. You can display the ARP statistics and the NDISC statistics with cat /proc/net/stat/arp_cache and cat /proc/net/stat/ndisc_cache, respectively. In the "Quick Reference" section at the end of this chapter, there is a description of the neigh_statistics structure, specifying in which method each counter is incremented.

  * nht: The neighbour hash table (a neigh_hash_table object).

  * phash_buckets: The neighbouring proxy hash table; allocated in the neigh_table_init_no_netlink() method.

The initialization of the neighbouring table is done with the neigh_table_init() method:

  * In IPv4, the ARP module defines the ARP table (an instance of the neigh_table structure named arp_tbl) and passes it as an argument to the neigh_table_init() method (see the arp_init() method in net/ipv4/arp.c).

  * In IPv6, the NDISC module defines the NDISC table (also an instance of the neigh_table structure, named nd_tbl) and passes it as an argument to the neigh_table_init() method (see the ndisc_init() method in net/ipv6/ndisc.c).

The neigh_table_init() method also creates the neighbouring hash table (the nht object) by calling the neigh_hash_alloc() method in the neigh_table_init_no_netlink() method, allocating space for eight hash entries:

```c
static void neigh_table_init_no_netlink(struct neigh_table *tbl)
{
	. . .
	RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3));
	. . .
}
```

The size of the hash table is 1 << shift; when the size does not exceed PAGE_SIZE, the buckets are allocated with kzalloc(), and otherwise with __get_free_pages():

```c
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
	size_t size = (1 << shift) * sizeof(struct neighbour *);
	struct neigh_hash_table *ret;
	struct neighbour __rcu **buckets;
	int i;

	ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
	if (!ret)
		return NULL;
	if (size <= PAGE_SIZE)
		buckets = kzalloc(size, GFP_ATOMIC);
	else
		buckets = (struct neighbour __rcu **)
			  __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
					   get_order(size));
	. . .
}
```

You may wonder why the neigh_table_init_no_netlink() method exists at all: why not perform all of the initialization in the neigh_table_init() method? The neigh_table_init_no_netlink() method performs all of the initialization of a neighbouring table except for linking it to the global linked list of neighbouring tables, neigh_tables. Originally such initialization, without linking to the neigh_tables linked list, was needed for ATM, so the neigh_table_init() method was split, and the ATM clip module called the neigh_table_init_no_netlink() method instead of the neigh_table_init() method. Over time, a different solution was found for ATM; though the ATM clip module no longer invokes the neigh_table_init_no_netlink() method, the split remained, perhaps in case it is needed in the future.
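Because the bucket count is always a power of two (1 << shift), a 32-bit hash value can be mapped to a bucket by keeping only its top hash_shift bits, which is the indexing scheme the neighbouring core uses on lookups. The following standalone userspace sketch (plain C, not kernel code; the toy_* names are invented for illustration) mimics that allocation and indexing:

```c
#include <stdint.h>
#include <stdlib.h>

struct toy_entry;                         /* stand-in for struct neighbour */

struct toy_hash_table {
	struct toy_entry **buckets;
	unsigned int hash_shift;          /* table size is 1 << hash_shift */
};

/* Allocate a table with 1 << shift zeroed buckets, as neigh_hash_alloc() does. */
static struct toy_hash_table *toy_hash_alloc(unsigned int shift)
{
	struct toy_hash_table *ret = malloc(sizeof(*ret));

	if (!ret)
		return NULL;
	ret->buckets = calloc(1u << shift, sizeof(struct toy_entry *));
	if (!ret->buckets) {
		free(ret);
		return NULL;
	}
	ret->hash_shift = shift;
	return ret;
}

/* Map a 32-bit hash value to a bucket index by keeping its top bits,
 * mirroring the hash_val >> (32 - hash_shift) indexing used by the
 * neighbouring core. */
static unsigned int toy_bucket(const struct toy_hash_table *t, uint32_t hash_val)
{
	return hash_val >> (32 - t->hash_shift);
}
```

Taking the top bits rather than a modulo keeps the operation a single shift, which pairs well with hash functions that mix their output thoroughly, and it makes growing the table (incrementing the shift) cheap.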
I should mention that each L3 protocol that uses the neighbouring subsystem also registers a protocol handler. For IPv4, the handler for ARP packets (packets whose type in their Ethernet header is 0x0806) is the arp_rcv() method:

```c
static struct packet_type arp_packet_type __read_mostly = {
	.type =	cpu_to_be16(ETH_P_ARP),
	.func =	arp_rcv,
};

void __init arp_init(void)
{
	. . .
	dev_add_pack(&arp_packet_type);
	. . .
}
```

(net/ipv4/arp.c)

For IPv6, the neighbouring messages are ICMPv6 messages, so they are handled by the icmpv6_rcv() method, which is the ICMPv6 handler. There are five ICMPv6 neighbouring messages; when each of them is received (by the icmpv6_rcv() method), the ndisc_rcv() method is invoked to handle them (see net/ipv6/icmp.c). The ndisc_rcv() method is discussed in a later section of this chapter. Each neighbour object defines a set of methods via the neigh_ops structure. This is done by its constructor method. The neigh_ops structure contains a protocol family member and four function pointers:

```c
struct neigh_ops {
	int			family;
	void			(*solicit)(struct neighbour *, struct sk_buff *);
	void			(*error_report)(struct neighbour *, struct sk_buff *);
	int			(*output)(struct neighbour *, struct sk_buff *);
	int			(*connected_output)(struct neighbour *, struct sk_buff *);
};
```

(include/net/neighbour.h)

  * family: AF_INET for IPv4 and AF_INET6 for IPv6.

  * solicit: This method is responsible for sending neighbour solicitation requests: in ARP it is the arp_solicit() method, and in NDISC it is the ndisc_solicit() method.

  * error_report: This method is called from the neigh_invalidate() method when the neighbour state is NUD_FAILED. This happens, for example, when a solicitation request has not been answered after some timeout.

  * output: When the L3 address of the next hop is known but the L2 address is not yet resolved, the output callback should be neigh_resolve_output().

  * connected_output: The output method of the neighbour is set to the connected_output callback when the neighbour state enters the NUD_CONNECTED mask (which includes NUD_REACHABLE). See the invocations of the neigh_connect() method in the neigh_update() method and in the neigh_timer_handler() method.

### Creating and Freeing a Neighbour

A neighbour is created by the __neigh_create() method:

```c
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
				 struct net_device *dev, bool want_ref)
```

First, the __neigh_create() method allocates a neighbour object by calling the neigh_alloc() method, which also performs various initializations. In some cases the neigh_alloc() method calls the synchronous garbage collector (the neigh_forced_gc() method):

```c
static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
{
	struct neighbour *n = NULL;
	unsigned long now = jiffies;
	int entries;

	entries = atomic_inc_return(&tbl->entries) - 1;
```

If the number of table entries is greater than gc_thresh3 (1024 by default), or if the number of table entries is greater than gc_thresh2 (512 by default) and more than 5 seconds (5 * HZ jiffies) have passed since the last flush, the synchronous garbage collector method (the neigh_forced_gc() method) is invoked.
If, after running the neigh_forced_gc() method, the number of table entries is still greater than or equal to gc_thresh3, a neighbour object is not allocated and NULL is returned:

```c
	if (entries >= tbl->gc_thresh3 ||
	    (entries >= tbl->gc_thresh2 &&
	     time_after(now, tbl->last_flush + 5 * HZ))) {
		if (!neigh_forced_gc(tbl) &&
		    entries >= tbl->gc_thresh3)
			goto out_entries;
	}
```

Then the __neigh_create() method performs the protocol-specific setup by calling the constructor method of the specified neighbouring table (arp_constructor() for ARP, ndisc_constructor() for NDISC). In the constructor method, special cases like multicast and loopback addresses are handled. In the arp_constructor() method, for example, you call the arp_mc_map() method to set the hardware address of the neighbour (ha) according to the neighbour's IPv4 primary_key address, and you set nud_state to NUD_NOARP, because multicast addresses don't need ARP. In the ndisc_constructor() method, you do something quite similar when handling multicast addresses: you call the ndisc_mc_map() method to set the hardware address of the neighbour (ha) according to the neighbour's IPv6 primary_key address, and you again set nud_state to NUD_NOARP. There is also special treatment for broadcast addresses: in the arp_constructor() method, for example, when the neighbour type is RTN_BROADCAST, you set the neighbour hardware address (ha) to the network device broadcast address (the broadcast field of the net_device object), and you set nud_state to NUD_NOARP. Note that the IPv6 protocol does not implement traditional IP broadcast, so the notion of a broadcast address is irrelevant there (there is a link-local all-nodes multicast group at address ff02::1, though). There are two special cases when additional setup needs to be done:

  * When the ndo_neigh_construct() callback of the netdev_ops is defined, it is invoked. In fact, this is done only in the classical IP over ATM code (clip); see net/atm/clip.c.

  * When the neigh_setup() callback of the neigh_parms object is defined, it is invoked. This is used, for example, in the bonding driver; see drivers/net/bonding/bond_main.c.

When trying to create a neighbour object with the __neigh_create() method, if the number of neighbour entries exceeds the hash table size, the hash table must be enlarged. This is done by calling the neigh_hash_grow() method, like this:

```c
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
				 struct net_device *dev, bool want_ref)
{
	. . .
```

The hash table size is 1 << nht->hash_shift; the hash table must be enlarged if this is exceeded:

```c
	if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
		nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
	. . .
}
```

When the want_ref parameter is true, the neighbour reference count is incremented within this method. The confirmed field of the neighbour object is also initialized:

```c
	n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
```

It is initialized to be a little earlier than the current time, jiffies (for the simple reason that you want reachability confirmation to be required sooner). At the end of the __neigh_create() method, the dead flag is initialized to 0, and the neighbour object is added to the neighbour hash table.

The neigh_release() method decrements the reference counter of the neighbour and, when it reaches zero, frees the neighbour by calling the neigh_destroy() method.
The neigh_destroy() method verifies that the neighbour is marked as dead: neighbours whose dead flag is 0 will not be removed.

In this section, you learned about the kernel methods that create and free a neighbour. Next you will learn how adding and deleting a neighbour entry can be triggered from userspace, as well as how to display the neighbouring table, with the arp command for IPv4 and the ip command for IPv4/IPv6.

### Interaction Between Userspace and the Neighbouring Subsystem

Management of the ARP table is done with the ip neigh command of the iproute2 package or with the arp command of the net-tools package. Thus, you can display the ARP table by running, from the command line, one of the following commands:

  * arp: Handled by the arp_seq_show() method in net/ipv4/arp.c.

  * ip neigh show (or ip neighbour show): Handled by the neigh_dump_info() method in net/core/neighbour.c.

Note that the ip neigh show command shows the NUD states of the neighbouring table entries (like NUD_REACHABLE or NUD_STALE). Note also that the arp command can display only the IPv4 neighbouring table (the ARP table), whereas with the ip command you can display both the IPv4 ARP table and the IPv6 neighbouring table. If you want to display only the IPv6 neighbouring table, you should run ip -6 neigh show.

The ARP and NDISC modules also export data via procfs. That means you can display the ARP table by running cat /proc/net/arp (this procfs entry is handled by the arp_seq_show() method, the same method that handles the arp command, as mentioned earlier). You can display ARP statistics with cat /proc/net/stat/arp_cache, and NDISC statistics with cat /proc/net/stat/ndisc_cache (both are handled by the neigh_stat_seq_show() method).

You can add an entry with ip neigh add, which is handled by the neigh_add() method. When running ip neigh add, you can specify the state of the entry you are adding (like NUD_PERMANENT, NUD_STALE, NUD_REACHABLE, and so on). For example:

```
ip neigh add 192.168.0.121 dev eth0 lladdr 00:30:48:5b:cc:45 nud permanent
```

Deleting an entry can be done with ip neigh del, which is handled by the neigh_delete() method. For example:

```
ip neigh del 192.168.0.121 dev eth0
```

Adding an entry to the proxy ARP table can be done with ip neigh add proxy. For example:

```
ip neigh add proxy 192.168.2.11 dev eth0
```

The addition is again handled by the neigh_add() method. In this case, the NTF_PROXY flag is set in the data passed from userspace (see the ndm_flags field of the ndm object), and therefore the pneigh_lookup() method is called to perform a lookup in the proxy neighbouring hash table (phash_buckets). If the lookup fails, the pneigh_lookup() method adds an entry to the proxy neighbouring hash table.

Deleting an entry from the proxy ARP table can be done with ip neigh del proxy. For example:

```
ip neigh del proxy 192.168.2.11 dev eth0
```

The deletion is handled by the neigh_delete() method. Again, in this case the NTF_PROXY flag is set in the data passed from userspace (see the ndm_flags field of the ndm object), and therefore the pneigh_delete() method is called to delete the entry from the proxy neighbouring table.

With the ip ntable command, you can control the parameters of the neighbouring tables. For example:

  * ip ntable show: Shows the parameters of all the neighbouring tables.

  * ip ntable change: Changes the value of a parameter of a neighbouring table. Handled by the neightbl_set() method.
For example:

```
ip ntable change name arp_cache queue 20 dev eth0
```

You can also add entries to the ARP table with arp add, and it is possible to add static entries manually to the ARP table, like this: arp -s <IPAddress> <MACAddress>. Static ARP entries are not deleted by the neighbouring subsystem garbage collector, but they do not persist across reboots.

The next section briefly describes how network events are handled in the neighbouring subsystem.

### Handling Network Events

The neighbouring core does not register any events with the register_netdevice_notifier() method. On the other hand, the ARP module and the NDISC module do register network events. In ARP, the arp_netdev_event() method is registered as the callback for netdev events. It handles changes of MAC address events by calling the generic neigh_changeaddr() method and by calling the rt_cache_flush() method. From kernel 3.11, a NETDEV_CHANGE event is also handled, when there was a change of the IFF_NOARP flag, by calling the neigh_changeaddr() method. A NETDEV_CHANGE event is triggered when a device changes its flags, by the __dev_notify_flags() method, or when a device changes its state, by the netdev_state_change() method. In NDISC, the ndisc_netdev_event() method is registered as the callback for netdev events; it handles the NETDEV_CHANGEADDR, NETDEV_DOWN, and NETDEV_NOTIFY_PEERS events.

After describing the fundamental data structures common to IPv4 and IPv6, like the neighbouring table (neigh_table) and the neighbour structure, and after discussing how a neighbour object is created and freed, it is time to describe the implementation of the first neighbouring protocol, the ARP protocol.

## The ARP protocol (IPv4)

The ARP protocol is defined in RFC 826. When working with Ethernet, the addresses are called MAC addresses and are 48-bit values. MAC addresses should be unique, but you must take into account that you may encounter a non-unique MAC address. A common reason for this is that on most network interfaces, a system administrator can configure MAC addresses with userspace tools like ifconfig or ip.

When sending an IPv4 packet, you know the destination IPv4 address, and you should build an Ethernet header, which must include a destination MAC address. Finding the MAC address for a given IPv4 address is the job of the ARP protocol, as you will see shortly. If the MAC address is unknown, you send an ARP request as a broadcast. This ARP request contains the IPv4 address you are looking for. If there is a host with this IPv4 address, it sends a unicast ARP response as a reply. The ARP table (arp_tbl) is an instance of the neigh_table structure. The ARP header is represented by the arphdr structure:

```c
struct arphdr {
	__be16		ar_hrd;		/* format of hardware address	*/
	__be16		ar_pro;		/* format of protocol address	*/
	unsigned char	ar_hln;		/* length of hardware address	*/
	unsigned char	ar_pln;		/* length of protocol address	*/
	__be16		ar_op;		/* ARP opcode (command)		*/

#if 0
	/*
	 * Ethernet looks like this : This bit is variable sized however...
	 */
	unsigned char		ar_sha[ETH_ALEN];	/* sender hardware address	*/
	unsigned char		ar_sip[4];		/* sender IP address		*/
	unsigned char		ar_tha[ETH_ALEN];	/* target hardware address	*/
	unsigned char		ar_tip[4];		/* target IP address		*/
#endif
};
```

(include/uapi/linux/if_arp.h)

The following is a description of some of the important members of the arphdr structure:

  * ar_hrd is the hardware type; for Ethernet it is 0x01 (ARPHRD_ETHER).
For the full list of available ARP header hardware identifiers, see the ARPHRD_XXX definitions in include/uapi/linux/if_arp.h.

  * ar_pro is the protocol ID; for IPv4 it is 0x0800 (ETH_P_IP). For the full list of available protocol IDs, see the ETH_P_XXX definitions in include/uapi/linux/if_ether.h.

  * ar_hln is the hardware address length in bytes, which is 6 bytes for Ethernet addresses.

  * ar_pln is the length of the protocol address in bytes, which is 4 bytes for IPv4 addresses.

  * ar_op is the opcode: ARPOP_REQUEST for an ARP request and ARPOP_REPLY for an ARP reply. For the full list of available ARP header opcodes, look in include/uapi/linux/if_arp.h.

Immediately after the ar_op come the sender hardware (MAC) address and IPv4 address, and the target hardware (MAC) address and IPv4 address. These addresses are not part of the ARP header (arphdr) structure. In the arp_process() method, they are extracted by reading the corresponding offsets from the ARP header, as you can see in the explanation of the arp_process() method in the section "ARP: Receiving Solicitation Requests and Replies" later in this chapter. Figure 7-1 shows the ARP header of an ARP Ethernet packet.

Figure 7-1. ARP header (for Ethernet)

In ARP, four neigh_ops objects are defined: arp_direct_ops, arp_generic_ops, arp_hh_ops, and arp_broken_ops. The initialization of the ARP table neigh_ops object is done by the arp_constructor() method, based on the network device features:

  * If the header_ops of the net_device object is NULL, the neigh_ops object is set to arp_direct_ops. In this case, sending the packet is done with the neigh_direct_output() method, which is in fact a wrapper around dev_queue_xmit(). In most Ethernet network devices, however, the header_ops of the net_device object is initialized to eth_header_ops by the generic ether_setup() method; see net/ethernet/eth.c.

  * If the header_ops of the net_device object contains a NULL cache() callback, the neigh_ops object is set to arp_generic_ops.

  * If the header_ops of the net_device object contains a non-NULL cache() callback, the neigh_ops object is set to arp_hh_ops. When using the generic eth_header_ops object, the cache() callback is the eth_header_cache() callback.

  * For three types of devices, the neigh_ops object is set to arp_broken_ops (when the type of the net_device object is ARPHRD_ROSE, ARPHRD_AX25, or ARPHRD_NETROM).

Now that I've covered the ARP protocol and the ARP header (arphdr) structure, let's look at how ARP solicitation requests are sent.

### ARP: Sending Solicitation Requests

Where are solicitation requests sent from? The most common case is the Tx path, just before leaving the network layer (L3) and moving to the link layer (L2). In the ip_finish_output2() method, you first perform a lookup for the next hop IPv4 address in the ARP table by calling the __ipv4_neigh_lookup_noref() method, and if you don't find a matching neighbour entry, you create one by calling the __neigh_create() method:

```c
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	u32 nexthop;
	. . .

	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
	if (!IS_ERR(neigh)) {
		int res = dst_neigh_output(dst, neigh, skb);
		. . .
}
```

Let's take a look at the dst_neigh_output() method:

```c
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
				   struct sk_buff *skb)
{
	const struct hh_cache *hh;

	if (dst->pending_confirm) {
		unsigned long now = jiffies;

		dst->pending_confirm = 0;
		/* avoid dirtying neighbour */
		if (n->confirmed != now)
			n->confirmed = now;
	}
```

When you reach this method for the first time in this flow, nud_state is not NUD_CONNECTED, and the output callback is the neigh_resolve_output() method:

```c
	hh = &n->hh;
	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
		return neigh_hh_output(hh, skb);
	else
		return n->output(n, skb);
}
```

(include/net/dst.h)

In the neigh_resolve_output() method, you call the neigh_event_send() method, which eventually queues the SKB in the arp_queue of the neighbour with __skb_queue_tail(&neigh->arp_queue, skb). Later, the neigh_probe() method, invoked from the neighbour timer handler, neigh_timer_handler(), sends the packet by invoking the solicit() method (neigh->ops->solicit is the arp_solicit() method in our case):

```c
static void neigh_probe(struct neighbour *neigh)
	__releases(neigh->lock)
{
	struct sk_buff *skb = skb_peek(&neigh->arp_queue);
	. . .
	neigh->ops->solicit(neigh, skb);
	atomic_inc(&neigh->probes);
	kfree_skb(skb);
}
```

Let's take a look at the arp_solicit() method, which actually sends the ARP request:

```c
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
	__be32 saddr = 0;
	u8  dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
	struct net_device *dev = neigh->dev;
	__be32 target = *(__be32 *)neigh->primary_key;
	int probes = atomic_read(&neigh->probes);
	struct in_device *in_dev;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev) {
		rcu_read_unlock();
		return;
	}
```

With the arp_announce procfs entry, you can restrict which local source IP addresses may be used for the ARP packet you want to send:

  * 0: Use any local address, configured on any interface. This is the default value.

  * 1: First try to use addresses that are on the target subnet. If there are no such addresses, use level 2.

  * 2: Use the primary IP address.

Note that the maximum value of these two entries is used:

/proc/sys/net/ipv4/conf/all/arp_announce

/proc/sys/net/ipv4/conf/<netDevice>/arp_announce

See also the description of the IN_DEV_ARP_ANNOUNCE macro in the "Quick Reference" section at the end of this chapter.
```c
	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
	default:
	case 0:		/* By default announce any local IP */
		if (skb && inet_addr_type(dev_net(dev),
					  ip_hdr(skb)->saddr) == RTN_LOCAL)
			saddr = ip_hdr(skb)->saddr;
		break;
	case 1:		/* Restrict announcements of saddr in same subnet */
		if (!skb)
			break;
		saddr = ip_hdr(skb)->saddr;
		if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) {
```

The inet_addr_onlink() method checks whether the specified target address and the specified source address are on the same subnet:

```c
			/* saddr should be known to target */
			if (inet_addr_onlink(in_dev, target, saddr))
				break;
		}
		saddr = 0;
		break;
	case 2:		/* Avoid secondary IPs, get a primary/preferred one */
		break;
	}
	rcu_read_unlock();

	if (!saddr)
```

The inet_select_addr() method returns the address of the first primary interface address of the specified device whose scope is smaller than the specified scope (RT_SCOPE_LINK in this case) and which is in the same subnet as the target:

```c
		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);

	probes -= neigh->parms->ucast_probes;
	if (probes < 0) {
		if (!(neigh->nud_state & NUD_VALID))
			pr_debug("trying to ucast probe in NUD_INVALID\n");
		neigh_ha_snapshot(dst_ha, neigh, dev);
		dst_hw = dst_ha;
	} else {
		probes -= neigh->parms->app_probes;
		if (probes < 0) {
```

CONFIG_ARPD is set when working with a userspace ARP daemon; there are projects, like OpenNHRP, which are based on ARPD. The Next Hop Resolution Protocol (NHRP) is used to improve the efficiency of routing computer network traffic over Non-Broadcast, Multiple Access (NBMA) networks. (I don't discuss the ARPD userspace daemon in this book.)

```c
#ifdef CONFIG_ARPD
			neigh_app_ns(neigh);
#endif
			return;
		}
	}
```

Now you call the arp_send() method to send an ARP request. Note that the last parameter, target_hw, is NULL; you do not yet know the target hardware (MAC) address. When arp_send() is called with target_hw set to NULL, a broadcast ARP request is sent:

```c
	arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
		 dst_hw, dev->dev_addr, NULL);
}
```

Let's take a look at the arp_send() method, which is quite short:

```c
void arp_send(int type, int ptype, __be32 dest_ip,
	      struct net_device *dev, __be32 src_ip,
	      const unsigned char *dest_hw, const unsigned char *src_hw,
	      const unsigned char *target_hw)
{
	struct sk_buff *skb;

	/*
	 *	No arp on this interface.
	 */
```

You must check whether IFF_NOARP is set on this network device. There are cases in which ARP is disabled: an administrator can disable ARP, for example, with ifconfig eth1 -arp or with ip link set eth1 arp off. Some network devices set the IFF_NOARP flag upon creation; for example, IPv4 tunnel devices, or PPP devices, which do not need ARP. See the ipip_tunnel_setup() method in net/ipv4/ipip.c or the ppp_setup() method in drivers/net/ppp_generic.c.

```c
	if (dev->flags&IFF_NOARP)
		return;
```

The arp_create() method creates an SKB with an ARP header and initializes it according to the specified parameters:

```c
	skb = arp_create(type, ptype, dest_ip, dev, src_ip,
			 dest_hw, src_hw, target_hw);

	if (skb == NULL)
		return;
```

The only thing the arp_xmit() method does is call dev_queue_xmit() via the NF_HOOK() macro:

```c
	arp_xmit(skb);
}
```

Now it is time to learn how these ARP requests are processed and how ARP replies are processed.
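To make the wire format concrete, here is a hedged userspace sketch (not kernel code) that assembles roughly the same broadcast ARP request that arp_create() builds, using an AF_PACKET raw socket. The interface name and the two IPv4 addresses are illustrative assumptions you would adjust for your setup; running it requires CAP_NET_RAW:

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <net/ethernet.h>
#include <netinet/if_ether.h>   /* struct ether_arp, ARPOP_REQUEST */
#include <netpacket/packet.h>   /* struct sockaddr_ll */

int main(void)
{
	const char *ifname = "eth0";   /* assumption: adjust to your device */
	unsigned char bcast[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
	unsigned char frame[ETH_HLEN + sizeof(struct ether_arp)];
	struct ether_header *eh = (struct ether_header *)frame;
	struct ether_arp *arp = (struct ether_arp *)(frame + ETH_HLEN);
	struct sockaddr_ll sll;
	struct ifreq ifr;
	int fd;

	fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ARP));
	if (fd < 0) { perror("socket (needs CAP_NET_RAW)"); return 1; }

	/* Fetch our own MAC address to use as the sender hardware address */
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { perror("SIOCGIFHWADDR"); return 1; }

	/* Ethernet header: broadcast destination, EtherType 0x0806 (ARP) */
	memcpy(eh->ether_dhost, bcast, ETH_ALEN);
	memcpy(eh->ether_shost, ifr.ifr_hwaddr.sa_data, ETH_ALEN);
	eh->ether_type = htons(ETH_P_ARP);

	/* ARP header: Ethernet/IPv4 request; target MAC left as zeroes,
	 * just as arp_send() is called with target_hw == NULL */
	arp->arp_hrd = htons(ARPHRD_ETHER);
	arp->arp_pro = htons(ETH_P_IP);
	arp->arp_hln = ETH_ALEN;
	arp->arp_pln = 4;
	arp->arp_op  = htons(ARPOP_REQUEST);
	memcpy(arp->arp_sha, ifr.ifr_hwaddr.sa_data, ETH_ALEN);
	inet_pton(AF_INET, "192.168.0.1", arp->arp_spa);   /* assumption: sender IP */
	memset(arp->arp_tha, 0, ETH_ALEN);
	inet_pton(AF_INET, "192.168.0.121", arp->arp_tpa); /* assumption: target IP */

	memset(&sll, 0, sizeof(sll));
	sll.sll_family = AF_PACKET;
	sll.sll_ifindex = if_nametoindex(ifname);
	sll.sll_halen = ETH_ALEN;
	memcpy(sll.sll_addr, bcast, ETH_ALEN);

	if (sendto(fd, frame, sizeof(frame), 0,
		   (struct sockaddr *)&sll, sizeof(sll)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}
```

Note how the five fixed arphdr fields described earlier appear here, followed by the sender and target address pairs that are not part of struct arphdr itself.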
### ARP: Receiving Solicitation Requests and Replies

In IPv4, the arp_rcv() method is responsible for handling ARP packets, as mentioned earlier. Let's take a look at it:

```c
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
		   struct packet_type *pt, struct net_device *orig_dev)
{
	const struct arphdr *arp;
```

If the network device on which the ARP packet was received has the IFF_NOARP flag set, or if the packet is not destined for the local machine, or if it was received on a loopback device, the packet should be dropped. You continue with some more sanity checks, and if everything is okay, you proceed to the arp_process() method, which performs the real work of processing the ARP packet:

```c
	if (dev->flags & IFF_NOARP ||
	    skb->pkt_type == PACKET_OTHERHOST ||
	    skb->pkt_type == PACKET_LOOPBACK)
		goto freeskb;
```

If the SKB is shared, you must clone it, because it might be changed by someone else while being processed by the arp_rcv() method. The skb_share_check() method creates a clone of the SKB if it is shared (see Appendix A):

```c
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		goto out_of_mem;

	/* ARP header, plus 2 device addresses, plus 2 IP addresses.  */
	if (!pskb_may_pull(skb, arp_hdr_len(dev)))
		goto freeskb;

	arp = arp_hdr(skb);
```

The ar_hln of the ARP header represents the length of a hardware address, which should be 6 bytes for an Ethernet header and should be equal to the addr_len of the net_device object. The ar_pln of the ARP header represents the length of the protocol address and should be equal to the length of an IPv4 address, which is 4 bytes:

```c
	if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
		goto freeskb;

	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));

	return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);

freeskb:
	kfree_skb(skb);
out_of_mem:
	return 0;
}
```

Handling ARP requests is not restricted to packets that have the local host as their destination. When the local host is configured as an ARP proxy, or as a private VLAN proxy ARP (see RFC 3069), you also handle packets whose destination is not the local host. Support for private VLAN proxy ARP was added in kernel 2.6.34.

In the arp_process() method, you handle only ARP requests and ARP responses. For ARP requests, you perform a lookup in the routing subsystem with the ip_route_input_noref() method. If the ARP packet is for the local host (the rt_type of the routing entry is RTN_LOCAL), you proceed to check some conditions (described shortly). If all these checks pass, an ARP reply is sent back with the arp_send() method. If the ARP packet is not for the local host but should be forwarded (the rt_type of the routing entry is RTN_UNICAST), you check some other conditions (also described shortly), and if they are fulfilled, you perform a lookup in the proxy ARP table by calling the pneigh_lookup() method.

You will now see the implementation details of the main ARP method that handles ARP requests, the arp_process() method.
#### The arp_process() Method

Let's take a look at the arp_process() method, where the real work is done:

```c
static int arp_process(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct arphdr *arp;
	unsigned char *arp_ptr;
	struct rtable *rt;
	unsigned char *sha;
	__be32 sip, tip;
	u16 dev_type = dev->type;
	int addr_type;
	struct neighbour *n;
	struct net *net = dev_net(dev);

	/* arp_rcv below verifies the ARP header and verifies the device
	 * is ARP'able.
	 */

	if (in_dev == NULL)
		goto out;
```

Fetch the ARP header from the SKB (it is the network header; see the arp_hdr() method):

```c
	arp = arp_hdr(skb);

	switch (dev_type) {
	default:
		if (arp->ar_pro != htons(ETH_P_IP) ||
		    htons(dev_type) != arp->ar_hrd)
			goto out;
		break;
	case ARPHRD_ETHER:
		. . .
		if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
		     arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
		    arp->ar_pro != htons(ETH_P_IP))
			goto out;
		break;
	. . .
```

You want to handle only ARP requests and ARP responses in the arp_process() method, and discard all other packets:

```c
	/* Understand only these message types */

	if (arp->ar_op != htons(ARPOP_REPLY) &&
	    arp->ar_op != htons(ARPOP_REQUEST))
		goto out;

/*
 *	Extract fields
 */
	arp_ptr = (unsigned char *)(arp + 1);
```

#### The arp_process() Method: Extracting Headers

Immediately after the ARP header come the following fields (see the ARP header definition earlier):

  * sha: The source hardware address (the MAC address, which is 6 bytes).

  * sip: The source IPv4 address (4 bytes).

  * tha: The target hardware address (the MAC address, which is 6 bytes).

  * tip: The target IPv4 address (4 bytes).

Extract the sip and tip addresses:

```c
	sha	= arp_ptr;
	arp_ptr += dev->addr_len;
```

Set sip to the source IPv4 address after advancing arp_ptr by the corresponding offset:

```c
	memcpy(&sip, arp_ptr, 4);
	arp_ptr += 4;
	switch (dev_type) {
	. . .
	default:
		arp_ptr += dev->addr_len;
	}
```

Set tip to the target IPv4 address after advancing arp_ptr by the corresponding offset:

```c
	memcpy(&tip, arp_ptr, 4);
```

Discard these two types of packets:

  * Multicast packets.

  * Packets for the loopback device, if the use of local routing with loopback addresses is disabled; see also the description of the IN_DEV_ROUTE_LOCALNET macro in the "Quick Reference" section at the end of this chapter.

```c
/*
 *	Check for bad requests for 127.x.x.x and requests for multicast
 *	addresses.  If this is one such, delete it.
 */
	if (ipv4_is_multicast(tip) ||
	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
		goto out;

	. . .
```

The source IP (sip) is 0 when you use Duplicate Address Detection (DAD). DAD lets you detect the existence of duplicate L3 addresses on different hosts on a LAN. DAD is implemented in IPv6 as an integral part of the address configuration process, but not in IPv4. However, there is support for correctly handling DAD requests in IPv4, as you will soon see. The arping utility of the iputils package is an example of using DAD in IPv4. When sending an ARP request with arping -D, you send an ARP request where the sip of the ARP header is 0
(the -D modifier tells arping to operate in DAD mode). The tip is usually the sender's IPv4 address (because you want to check whether another host on the same LAN has the same IPv4 address as yours); if there is a host whose IP address equals the tip of the DAD ARP request, it sends back an ARP reply (without adding the sender to its neighbouring table):

```c
	/* Special case: IPv4 duplicate address detection packet (RFC2131) */
	if (sip == 0) {
		if (arp->ar_op == htons(ARPOP_REQUEST) &&
```

#### The arp_process() Method: The arp_ignore() and arp_filter() Methods

The arp_ignore procfs entry provides support for different modes of sending ARP replies in response to an ARP request. The value used is the maximum of /proc/sys/net/ipv4/conf/all/arp_ignore and /proc/sys/net/ipv4/conf/<netDevice>/arp_ignore. By default, the value of the arp_ignore procfs entry is 0, and in that case the arp_ignore() method returns 0. You reply to the ARP request with arp_send(), as you can see in the next code snippet (assuming that inet_addr_type(net, tip) returned RTN_LOCAL). The arp_ignore() method checks the value of IN_DEV_ARP_IGNORE(in_dev); for more details, see the arp_ignore() implementation in net/ipv4/arp.c and the description of the IN_DEV_ARP_IGNORE macro in the "Quick Reference" section at the end of this chapter:

```c
		    inet_addr_type(net, tip) == RTN_LOCAL &&
		    !arp_ignore(in_dev, sip, tip))
			arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
				 dev->dev_addr, sha);
		goto out;
	}

	if (arp->ar_op == htons(ARPOP_REQUEST) &&
	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {

		rt = skb_rtable(skb);
		addr_type = rt->rt_type;
```

When addr_type equals RTN_LOCAL, the packet is for local delivery:

```c
		if (addr_type == RTN_LOCAL) {
			int dont_send;

			dont_send = arp_ignore(in_dev, sip, tip);
```

The arp_filter() method fails (returns 1) in two cases:

  * When the lookup in the routing tables with the ip_route_output() method fails.

  * When the outgoing network device of the routing entry is different from the network device on which the ARP request was received.

In case of success, the arp_filter() method returns 0 (see also the description of the IN_DEV_ARPFILTER macro in the "Quick Reference" section at the end of this chapter):

```c
			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
				dont_send = arp_filter(sip, tip, dev);
			if (!dont_send) {
```

Before sending the ARP reply, you want to add the sender to your neighbouring table, or update the existing entry; this is done with the neigh_event_ns() method. The neigh_event_ns() method creates a new neighbouring table entry and sets its state to NUD_STALE. If such an entry already exists, it updates its state to NUD_STALE, with the neigh_update() method.
Adding entries this way is termed passive learning:

```c
				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
				if (n) {
					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
						 dev, tip, sha, dev->dev_addr,
						 sha);
					neigh_release(n);
				}
			}
			goto out;
		} else if (IN_DEV_FORWARD(in_dev)) {
```

The arp_fwd_proxy() method returns 1 when the device can be used as an ARP proxy; the arp_fwd_pvlan() method returns 1 when the device can be used as an ARP VLAN proxy:

```c
			if (addr_type == RTN_UNICAST &&
			    (arp_fwd_proxy(in_dev, dev, rt) ||
			     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
			     (rt->dst.dev != dev &&
			      pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
```

Again, the neigh_event_ns() method is called to create a neighbour entry for the sender with the NUD_STALE state, or, if such an entry exists, to update its state to NUD_STALE:

```c
				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
				if (n)
					neigh_release(n);

				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
				    skb->pkt_type == PACKET_HOST ||
				    in_dev->arp_parms->proxy_delay == 0) {
					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
						 dev, tip, sha, dev->dev_addr,
						 sha);
				} else {
```

Delay sending the ARP reply by putting the SKB at the tail of the proxy_queue, by calling the pneigh_enqueue() method. Note that the delay is a random value between 0 and in_dev->arp_parms->proxy_delay:

```c
					pneigh_enqueue(&arp_tbl,
						       in_dev->arp_parms, skb);
					return 0;
				}
				goto out;
			}
		}
	}

	/* Update our ARP tables */
```

Note that the last parameter in this call to the __neigh_lookup() method is 0, which means that you only perform a lookup in the neighbouring table (and do not create a new neighbour if the lookup fails):

```c
	n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
```

The IN_DEV_ARP_ACCEPT macro tells you whether the network device is set to accept ARP requests (see also the description of the IN_DEV_ARP_ACCEPT macro in the "Quick Reference" section at the end of this chapter):

```c
	if (IN_DEV_ARP_ACCEPT(in_dev)) {
		/* Unsolicited ARP is not accepted by default.
		   It is possible, that this option should be enabled for some
		   devices (strip is candidate)
		 */
```

Unsolicited ARP requests are sent only to update the neighbouring table. In such requests, tip is equal to sip (the arping utility supports sending unsolicited ARP requests with arping -U):

```c
		if (n == NULL &&
		    (arp->ar_op == htons(ARPOP_REPLY) ||
		     (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) &&
		    inet_addr_type(net, sip) == RTN_UNICAST)
			n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
	}

	if (n) {
		int state = NUD_REACHABLE;
		int override;

		/* If several different ARP replies follows back-to-back,
		   use the FIRST one. It is possible, if several proxy
		   agents are active. Taking the first reply prevents
		   arp trashing and chooses the fastest router.
		 */
		override = time_after(jiffies, n->updated + n->parms->locktime);

		/* Broadcast replies and request packets
		   do not assert neighbour reachability.
		 */
		if (arp->ar_op != htons(ARPOP_REPLY) ||
		    skb->pkt_type != PACKET_HOST)
			state = NUD_STALE;
```

Call the neigh_update() method to update the neighbouring table:

```c
		neigh_update(n, sha, state,
			     override ? NEIGH_UPDATE_F_OVERRIDE : 0);
		neigh_release(n);
	}

out:
	consume_skb(skb);
	return 0;
}
```

Now that you know about the IPv4 ARP protocol implementation, it is time to move on to the IPv6 NDISC protocol implementation. You will soon notice some of the differences between the neighbouring subsystem implementations in IPv4 and in IPv6.
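Before moving on, here is a short userspace sketch that complements the procfs and iproute2 interfaces described earlier: it queries a single resolved entry from the ARP table with the classic SIOCGARP ioctl. The neighbour IP address and the device name are illustrative assumptions; this is a hedged example, not part of the kernel code discussed above:

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if_arp.h>
#include <arpa/inet.h>

int main(void)
{
	struct arpreq req;
	struct sockaddr_in *sin;
	unsigned char *ha;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) { perror("socket"); return 1; }

	memset(&req, 0, sizeof(req));
	sin = (struct sockaddr_in *)&req.arp_pa;
	sin->sin_family = AF_INET;
	inet_pton(AF_INET, "192.168.0.121", &sin->sin_addr); /* assumption: neighbour IP */
	strncpy(req.arp_dev, "eth0", sizeof(req.arp_dev) - 1); /* assumption: device */

	if (ioctl(fd, SIOCGARP, &req) < 0) {
		perror("SIOCGARP");  /* e.g., no such entry in the ARP table */
	} else {
		ha = (unsigned char *)req.arp_ha.sa_data;
		printf("192.168.0.121 is at %02x:%02x:%02x:%02x:%02x:%02x (flags 0x%x)\n",
		       ha[0], ha[1], ha[2], ha[3], ha[4], ha[5], req.arp_flags);
	}
	close(fd);
	return 0;
}
```

This is the same kernel-resident table that arp, ip neigh show, and cat /proc/net/arp display; the ioctl is simply an older, per-entry interface to it.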
## The NDISC Protocol (IPv6)

The Neighbour Discovery (NDISC) protocol is based on RFC 2461, "Neighbor Discovery for IP Version 6 (IPv6)," which was later obsoleted by RFC 4861 from 2007. IPv6 nodes (hosts or routers) on the same link use the Neighbour Discovery protocol to discover each other's presence, to discover routers, to determine each other's L2 addresses, and to maintain neighbour reachability information. Duplicate Address Detection (DAD) was added to avoid duplicate L3 addresses on the same LAN. I discuss DAD and the handling of NDISC Neighbour Solicitations and Neighbour Advertisements shortly.

Next you learn how the IPv6 Neighbour Discovery protocol avoids creating duplicate IPv6 addresses.

### Duplicate Address Detection (DAD)

How can you be sure that no other node on the LAN has the same IPv6 address? The chances are low, but if such an address does exist, it may cause trouble. DAD is a solution. When a host tries to configure an address, it first creates a link-local address (a link-local address starts with FE80). This address is tentative (IFA_F_TENTATIVE), which means that the host can communicate only with ND messages. Then the host starts the DAD process by calling the addrconf_dad_start() method (net/ipv6/addrconf.c). The host sends a Neighbour Solicitation DAD message. The target is its tentative address, and the source is all zeroes (the unspecified address). If there is no answer within a specified time interval, the state is changed to permanent (IFA_F_PERMANENT). When Optimistic DAD (CONFIG_IPV6_OPTIMISTIC_DAD) is set, you don't wait until DAD is completed but allow hosts to communicate with peers before DAD has finished successfully. See RFC 4429, "Optimistic Duplicate Address Detection (DAD) for IPv6," from 2006.

The neighbouring table for IPv6 is called nd_tbl:

```c
struct neigh_table nd_tbl = {
	.family =	AF_INET6,
	.key_len =	sizeof(struct in6_addr),
	.hash =		ndisc_hash,
	.constructor =	ndisc_constructor,
	.pconstructor =	pndisc_constructor,
	.pdestructor =	pndisc_destructor,
	.proxy_redo =	pndisc_redo,
	.id =		"ndisc_cache",
	.parms = {
		.tbl			= &nd_tbl,
		.base_reachable_time	= ND_REACHABLE_TIME,
		.retrans_time		= ND_RETRANS_TIMER,
		.gc_staletime		= 60 * HZ,
		.reachable_time		= ND_REACHABLE_TIME,
		.delay_probe_time	= 5 * HZ,
		.queue_len_bytes	= 64*1024,
		.ucast_probes		= 3,
		.mcast_probes		= 3,
		.anycast_delay		= 1 * HZ,
		.proxy_delay		= (8 * HZ) / 10,
		.proxy_qlen		= 64,
	},
	.gc_interval =	  30 * HZ,
	.gc_thresh1 =	 128,
	.gc_thresh2 =	 512,
	.gc_thresh3 =	1024,
};
```

(net/ipv6/ndisc.c)

Note that some of the members of the NDISC table have the same values as the corresponding members of the ARP table; for example, the garbage collector thresholds (gc_thresh1, gc_thresh2, and gc_thresh3).

The Linux IPv6 Neighbour Discovery implementation is based on ICMPv6 messages to manage the interaction between neighbouring nodes. The Neighbour Discovery protocol defines the following five ICMPv6 message types:

```c
#define NDISC_ROUTER_SOLICITATION	133
#define NDISC_ROUTER_ADVERTISEMENT	134
#define NDISC_NEIGHBOUR_SOLICITATION	135
#define NDISC_NEIGHBOUR_ADVERTISEMENT	136
#define NDISC_REDIRECT			137
```

(include/net/ndisc.h)

Note that these five ICMPv6 message types are informational messages. ICMPv6 message types whose values are in the range 0 to 127 are error messages, and ICMPv6 message types whose values are in the range 128 to 255 are informational messages. For more on this, see Chapter 3, which discusses the ICMP protocol.
This chapter discusses only the Neighbour Solicitation and Neighbour Advertisement messages.

As mentioned at the beginning of this chapter, because neighbour discovery messages are ICMPv6 messages, they are handled by the icmpv6_rcv() method, which in turn invokes the ndisc_rcv() method for ICMPv6 packets whose message type is one of the five types mentioned earlier (see net/ipv6/icmp.c).

In NDISC, there are three neigh_ops objects: ndisc_generic_ops, ndisc_hh_ops, and ndisc_direct_ops:

  * If the header_ops of the net_device object is NULL, the neigh_ops object is set to ndisc_direct_ops. As in the case of arp_direct_ops, sending the packet is done with the neigh_direct_output() method, which is in fact a wrapper around dev_queue_xmit(). Note that, as mentioned in the ARP section earlier, in most Ethernet network devices the header_ops of the net_device object is not NULL.

  * If the header_ops of the net_device object contains a NULL cache() callback, the neigh_ops object is set to ndisc_generic_ops.

  * If the header_ops of the net_device object contains a non-NULL cache() callback, the neigh_ops object is set to ndisc_hh_ops.

This section discussed the DAD mechanism and how it helps to avoid duplicate addresses. The next section describes how solicitation requests are sent.

### NDISC: Sending Solicitation Requests

Similarly to what you saw in IPv4, you perform a lookup and, if no match is found, create an entry:

```c
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;
	. . .

	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);

	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
	. . .
```

Eventually, much like in the IPv4 Tx path, the solicit method, neigh->ops->solicit(neigh, skb), is called from the neigh_probe() method. The neigh->ops->solicit in this case is the ndisc_solicit() method.
The ndisc_solicit() method is very short; it is in fact a wrapper around the ndisc_send_ns() method:

```c
static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
	struct in6_addr *saddr = NULL;
	struct in6_addr mcaddr;
	struct net_device *dev = neigh->dev;
	struct in6_addr *target = (struct in6_addr *)&neigh->primary_key;
	int probes = atomic_read(&neigh->probes);

	if (skb && ipv6_chk_addr(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, 1))
		saddr = &ipv6_hdr(skb)->saddr;

	if ((probes -= neigh->parms->ucast_probes) < 0) {
		if (!(neigh->nud_state & NUD_VALID)) {
			ND_PRINTK(1, dbg,
				  "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
				  __func__, target);
		}
		ndisc_send_ns(dev, neigh, target, target, saddr);
	} else if ((probes -= neigh->parms->app_probes) < 0) {
#ifdef CONFIG_ARPD
		neigh_app_ns(neigh);
#endif
	} else {
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(dev, NULL, target, &mcaddr, saddr);
	}
}
```

In order to send the solicitation request, an nd_msg object must be built:

```c
struct nd_msg {
	struct icmp6hdr	icmph;
	struct in6_addr	target;
	__u8		opt[0];
};
```

(include/net/ndisc.h)

For a solicitation request, the ICMPv6 header type should be set to NDISC_NEIGHBOUR_SOLICITATION, and for a solicitation reply, the ICMPv6 header type should be set to NDISC_NEIGHBOUR_ADVERTISEMENT. Note that with Neighbour Advertisement messages, there are cases when you need to set flags in the ICMPv6 header. The ICMPv6 header includes a structure named icmpv6_nd_advt, which includes the override, solicited, and router flags:

```c
struct icmp6hdr {
	__u8		icmp6_type;
	__u8		icmp6_code;
	__sum16		icmp6_cksum;
	union {
		. . .
		struct icmpv6_nd_advt {
#if defined(__LITTLE_ENDIAN_BITFIELD)
			__u32		reserved:5,
					override:1,
					solicited:1,
					router:1,
					reserved2:24;
			. . .
#endif
		} u_nd_advt;
	} icmp6_dataun;
	. . .
#define icmp6_router		icmp6_dataun.u_nd_advt.router
#define icmp6_solicited		icmp6_dataun.u_nd_advt.solicited
#define icmp6_override		icmp6_dataun.u_nd_advt.override
	. . .
```

(include/uapi/linux/icmpv6.h)

  * When a message is sent in response to a Neighbour Solicitation, you set the solicited flag (icmp6_solicited).

  * When you want to override a neighbouring cache entry (that is, update its L2 address), you set the override flag (icmp6_override).

  * When the host sending the Neighbour Advertisement message is a router, you set the router flag (icmp6_router).

You can see the use of these three flags in the ndisc_send_na() method that follows.
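Before diving into the kernel's ndisc_send_ns(), here is a hedged userspace sketch of the same message type: it sends a minimal Neighbour Solicitation over a raw ICMPv6 socket (the kernel computes the ICMPv6 checksum for such sockets, and RFC 4861 requires the hop limit to be 255). The interface name and target address are illustrative assumptions, and the sketch omits the source link-layer address option that a real NS normally carries:

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>      /* struct nd_neighbor_solicit, ND_NEIGHBOR_SOLICIT */
#include <arpa/inet.h>
#include <net/if.h>

int main(void)
{
	const char *ifname = "eth0";           /* assumption: adjust to your device */
	const char *target = "fe80::1";        /* assumption: address being resolved */
	struct nd_neighbor_solicit ns;
	struct sockaddr_in6 dst;
	int hops = 255;
	int fd = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);

	if (fd < 0) { perror("socket (needs CAP_NET_RAW)"); return 1; }

	/* ICMPv6 type 135, code 0, plus the target address (cf. struct nd_msg) */
	memset(&ns, 0, sizeof(ns));
	ns.nd_ns_type = ND_NEIGHBOR_SOLICIT;
	ns.nd_ns_code = 0;
	inet_pton(AF_INET6, target, &ns.nd_ns_target);

	/* Destination: the target's solicited-node multicast address,
	 * ff02::1:ff00:0 with the low 24 bits of the target copied in */
	memset(&dst, 0, sizeof(dst));
	dst.sin6_family = AF_INET6;
	inet_pton(AF_INET6, "ff02::1:ff00:0", &dst.sin6_addr);
	memcpy(&dst.sin6_addr.s6_addr[13], &ns.nd_ns_target.s6_addr[13], 3);
	dst.sin6_scope_id = if_nametoindex(ifname);

	/* RFC 4861: hop limit must be 255 so receivers can verify the
	 * packet was not forwarded */
	setsockopt(fd, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hops, sizeof(hops));

	if (sendto(fd, &ns, sizeof(ns), 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}
```

If a node on the link owns the target address, it answers with a Neighbour Advertisement whose solicited flag is set, exactly the case the ndisc_recv_na() discussion covers later in this section.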
Let's take a look at the ndisc_send_ns() method:

```c
void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
		   const struct in6_addr *solicit,
		   const struct in6_addr *daddr, const struct in6_addr *saddr)
{
	struct sk_buff *skb;
	struct in6_addr addr_buf;
	int inc_opt = dev->addr_len;
	int optlen = 0;
	struct nd_msg *msg;

	if (saddr == NULL) {
		if (ipv6_get_lladdr(dev, &addr_buf,
				    (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)))
			return;
		saddr = &addr_buf;
	}

	if (ipv6_addr_any(saddr))
		inc_opt = 0;
	if (inc_opt)
		optlen += ndisc_opt_addr_space(dev);

	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
	if (!skb)
		return;
```

Build the ICMPv6 header, which is embedded in the nd_msg object:

```c
	msg = (struct nd_msg *)skb_put(skb, sizeof(*msg));
	*msg = (struct nd_msg) {
		.icmph = {
			.icmp6_type = NDISC_NEIGHBOUR_SOLICITATION,
		},
		.target = *solicit,
	};

	if (inc_opt)
		ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
				       dev->dev_addr);

	ndisc_send_skb(skb, daddr, saddr);
}
```

Let's take a look at the ndisc_send_na() method:

```c
static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
			  const struct in6_addr *daddr,
			  const struct in6_addr *solicited_addr,
			  bool router, bool solicited, bool override, bool inc_opt)
{
	struct sk_buff *skb;
	struct in6_addr tmpaddr;
	struct inet6_ifaddr *ifp;
	const struct in6_addr *src_addr;
	struct nd_msg *msg;
	int optlen = 0;
	. . .

	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
	if (!skb)
		return;
```

Build the ICMPv6 header, which is embedded in the nd_msg object:

```c
	msg = (struct nd_msg *)skb_put(skb, sizeof(*msg));
	*msg = (struct nd_msg) {
		.icmph = {
			.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT,
			.icmp6_router = router,
			.icmp6_solicited = solicited,
			.icmp6_override = override,
		},
		.target = *solicited_addr,
	};

	if (inc_opt)
		ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
				       dev->dev_addr);

	ndisc_send_skb(skb, daddr, src_addr);
}
```

This section described how solicitation requests are sent. The next section talks about how Neighbour Solicitations and Advertisements are handled.

### NDISC: Receiving Neighbour Solicitations and Advertisements

As mentioned, the ndisc_rcv() method handles all five neighbour discovery message types; let's take a look at it:

```c
int ndisc_rcv(struct sk_buff *skb)
{
	struct nd_msg *msg;

	if (skb_linearize(skb))
		return 0;

	msg = (struct nd_msg *)skb_transport_header(skb);

	__skb_push(skb, skb->data - skb_transport_header(skb));
```

According to RFC 4861, the hop limit of neighbour messages should be 255; the hop limit field is 8 bits wide, so 255 is its maximum value. A value of 255 ensures that the packet was not forwarded, which protects you from certain security attacks.
Packets that do not fulfill this requirement are discarded: + +if (ipv6_hdr(skb)->hop_limit != 255) { + +ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", + +ipv6_hdr(skb)->hop_limit); + +return 0; + +} + +According to RFC 4861, the ICMPv6 code of neighbour messages should be 0, so drop packets that do not fulfill this requirement: + +if (msg->icmph.icmp6_code != 0) { + +ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", + +msg->icmph.icmp6_code); + +return 0; + +} + +memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); + +switch (msg->icmph.icmp6_type) { + +case NDISC_NEIGHBOUR_SOLICITATION: + +ndisc_recv_ns(skb); + +break; + +case NDISC_NEIGHBOUR_ADVERTISEMENT: + +ndisc_recv_na(skb); + +break; + +case NDISC_ROUTER_SOLICITATION: + +ndisc_recv_rs(skb); + +break; + +case NDISC_ROUTER_ADVERTISEMENT: + +ndisc_router_discovery(skb); + +break; + +case NDISC_REDIRECT: + +ndisc_redirect_rcv(skb); + +break; + +} + +return 0; + +} + +I do not discuss router solicitations and router advertisements in this chapter, since they are discussed in Chapter 8. Let's take a look at the ndisc_recv_ns() method: + +static void ndisc_recv_ns(struct sk_buff *skb) + +{ + +struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); + +const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + +const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + +u8 *lladdr = NULL; + +u32 ndoptlen = skb->tail - (skb->transport_header + + +offsetof(struct nd_msg, opt)); + +struct ndisc_options ndopts; + +struct net_device *dev = skb->dev; + +struct inet6_ifaddr *ifp; + +struct inet6_dev *idev = NULL; + +struct neighbour *neigh; + +The ipv6_addr_any() method returns 1 when saddr is the unspecified address of all zeroes (IPV6_ADDR_ANY). When the source address is the unspecified address (all zeroes), this means that the request is DAD: + +int dad = ipv6_addr_any(saddr); + +bool inc; + +int is_router = -1; + +Perform some validity checks: + +if (skb->len < sizeof(struct nd_msg)) { + +ND_PRINTK(2, warn, "NS: packet too short\n"); + +return; + +} + +if (ipv6_addr_is_multicast(&msg->target)) { + +ND_PRINTK(2, warn, "NS: multicast target address\n"); + +return; + +} + +/* + +* RFC2461 7.1.1: + +* DAD has to be destined for solicited node multicast address. + +*/ + +if (dad && !ipv6_addr_is_solict_mult(daddr)) { + +ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); + +return; + +} + +if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + +ND_PRINTK(2, warn, "NS: invalid ND options\n"); + +return; + +} + +if (ndopts.nd_opts_src_lladdr) { + +lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); + +if (!lladdr) { + +ND_PRINTK(2, warn, + +"NS: invalid link-layer address length\n"); + +return; + +} + +/* RFC2461 7.1.1: + +* If the IP source address is the unspecified address, + +* there MUST NOT be source link-layer address option + +* in the message. + +*/ + +if (dad) { + +ND_PRINTK(2, warn, + +"NS: bad DAD packet (link-layer address option)\n"); + +return; + +} + +} + +inc = ipv6_addr_is_multicast(daddr); + +ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); + +if (ifp) { + +if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { + +if (dad) { + +/* + +* We are colliding with another node + +* who is doing DAD + +* so fail our DAD process + +*/ + +addrconf_dad_failure(ifp); + +return; + +} else { + +/* + +* This is not a dad solicitation. + +* If we are an optimistic node, + +* we should respond. + +* Otherwise, we should ignore it. 
+ +*/ + +if (!(ifp->flags & IFA_F_OPTIMISTIC)) + +goto out; + +} + +} + +idev = ifp->idev; + +} else { + +struct net *net = dev_net(dev); + +idev = in6_dev_get(dev); + +if (!idev) { + +/* XXX: count this drop? */ + +return; + +} + +if (ipv6_chk_acast_addr(net, dev, &msg->target) || + +(idev->cnf.forwarding && + +(net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) && + +(is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { + +if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && + +skb->pkt_type != PACKET_HOST && + +inc != 0 && + +idev->nd_parms->proxy_delay != 0) { + +/* + +* for anycast or proxy, + +* sender should delay its response + +* by a random time between 0 and + +* MAX_ANYCAST_DELAY_TIME seconds. + +* (RFC2461) -- yoshfuji + +*/ + +struct sk_buff *n = skb_clone(skb, GFP_ATOMIC); + +if (n) + +pneigh_enqueue(&nd_tbl, idev->nd_parms, n); + +goto out; + +} + +} else + +goto out; + +} + +if (is_router < 0) + +is_router = idev->cnf.forwarding; + +if (dad) { + +Send a neighbour advertisement message: + +ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &msg->target, + +!!is_router, false, (ifp != NULL), true); + +goto out; + +} + +if (inc) + +NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast); + +else + +NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast); + +/* + +* update / create cache entry + +* for the source address + +*/ + +neigh = __neigh_lookup(&nd_tbl, saddr, dev, + +!inc || lladdr || !dev->addr_len); + +if (neigh) + +Update your neighbouring table with the sender's L2 address; the nud_state will be set to be NUD_STALE: + +neigh_update(neigh, lladdr, NUD_STALE, + +NEIGH_UPDATE_F_WEAK_OVERRIDE| + +NEIGH_UPDATE_F_OVERRIDE); + +if (neigh || !dev->header_ops) { + +Send a Neighbour Advertisement message: + +ndisc_send_na(dev, neigh, saddr, &msg->target, + +!!is_router, + +true, (ifp != NULL && inc), inc); + +if (neigh) + +neigh_release(neigh); + +} + +out: + +if (ifp) + +in6_ifa_put(ifp); + +else + +in6_dev_put(idev); + +} + +Let's take a look at the method that handles Neighbour Advertisements, ndisc_recv_na(): + +static void ndisc_recv_na(struct sk_buff *skb) + +{ + +struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); + +const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + +const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + +u8 *lladdr = NULL; + +u32 ndoptlen = skb->tail - (skb->transport_header + + +offsetof(struct nd_msg, opt)); + +struct ndisc_options ndopts; + +struct net_device *dev = skb->dev; + +struct inet6_ifaddr *ifp; + +struct neighbour *neigh; + +if (skb->len < sizeof(struct nd_msg)) { + +ND_PRINTK(2, warn, "NA: packet too short\n"); + +return; + +} + +if (ipv6_addr_is_multicast(&msg->target)) { + +ND_PRINTK(2, warn, "NA: target address is multicast\n"); + +return; + +} + +if (ipv6_addr_is_multicast(daddr) && + +msg->icmph.icmp6_solicited) { + +ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); + +return; + +} + +if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + +ND_PRINTK(2, warn, "NS: invalid ND option\n"); + +return; + +} + +if (ndopts.nd_opts_tgt_lladdr) { + +lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); + +if (!lladdr) { + +ND_PRINTK(2, warn, + +"NA: invalid link-layer address length\n"); + +return; + +} + +} + +ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); + +if (ifp) { + +if (skb->pkt_type != PACKET_LOOPBACK + +&& (ifp->flags & IFA_F_TENTATIVE)) { + +addrconf_dad_failure(ifp); + +return; + +} + +/* What should we make now? 
		   The advertisement
		   is invalid, but ndisc specs say nothing
		   about it. It could be misconfiguration, or
		   an smart proxy agent tries to help us :-)
		   We should not print the error if NA has been
		   received from loopback - it is just our own
		   unsolicited advertisement.
		 */
		if (skb->pkt_type != PACKET_LOOPBACK)
			ND_PRINTK(1, warn,
				  "NA: someone advertises our address %pI6 on %s!\n",
				  &ifp->addr, ifp->idev->dev->name);
		in6_ifa_put(ifp);
		return;
	}

	neigh = neigh_lookup(&nd_tbl, &msg->target, dev);
	if (neigh) {
		u8 old_flags = neigh->flags;
		struct net *net = dev_net(dev);

		if (neigh->nud_state & NUD_FAILED)
			goto out;

		/*
		 * Don't update the neighbour cache entry on a proxy NA from
		 * ourselves because either the proxied node is off link or it
		 * has already sent a NA to us.
		 */
		if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
		    net->ipv6.devconf_all->forwarding &&
		    net->ipv6.devconf_all->proxy_ndp &&
		    pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) {
			/* XXX: idev->cnf.proxy_ndp */
			goto out;
		}
```

Update the neighbouring table. When the received advertisement is a response to a Neighbour Solicitation, its icmp6_solicited flag is set, so the state should be set to NUD_REACHABLE. When the icmp6_override flag is set, the override flag should be set (this means: update the L2 address with the specified lladdr, if it is different):

```c
		neigh_update(neigh, lladdr,
			     msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE,
			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
			     (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
			     (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0));

		if ((old_flags & ~neigh->flags) & NTF_ROUTER) {
			/*
			 * Change: router to host
			 */
			struct rt6_info *rt;
			rt = rt6_get_dflt_router(saddr, dev);
			if (rt)
				ip6_del_rt(rt);
		}

out:
		neigh_release(neigh);
	}
}
```

## Summary

This chapter described the neighbouring subsystem in IPv4 and in IPv6. First you learned about the goals of the neighbouring subsystem. Then you learned about ARP requests and ARP replies in IPv4, and about NDISC Neighbour Solicitations and NDISC Neighbour Advertisements in IPv6. You also learned how the DAD implementation avoids duplicate IPv6 addresses, and you saw various methods for handling the neighbouring subsystem requests and replies. Chapter 8 discusses the IPv6 subsystem implementation. The "Quick Reference" section that follows covers the top methods and macros related to the topics discussed in this chapter, ordered by their context. I also show the neigh_statistics structure, which represents statistics collected by the neighbouring subsystem.

## Quick Reference

The following are some important methods and macros of the neighbouring subsystem, and a description of the neigh_statistics structure.

Note

The core neighbouring code is in net/core/neighbour.c, include/net/neighbour.h, and include/uapi/linux/neighbour.h.

The ARP code (IPv4) is in net/ipv4/arp.c, include/net/arp.h, and include/uapi/linux/if_arp.h.

The NDISC code (IPv6) is in net/ipv6/ndisc.c and include/net/ndisc.h.

### Methods

Let's start by covering the methods.

#### void neigh_table_init(struct neigh_table *tbl)

This method invokes the neigh_table_init_no_netlink() method to perform the initialization of the neighbouring table, and links the table to the global neighbouring tables linked list (neigh_tables).
#### void neigh_table_init_no_netlink(struct neigh_table *tbl)

This method performs all the neighbour table initialization apart from linking the table to the global neighbouring tables linked list, which is done by the neigh_table_init() method, as mentioned earlier.

#### int neigh_table_clear(struct neigh_table *tbl)

This method frees the resources of the specified neighbouring table.

#### struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)

This method allocates a neighbour object.

#### struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)

This method allocates a neighbouring hash table.

#### struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref)

This method creates a neighbour object.

#### int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)

This method adds a neighbour entry; it is the handler for the netlink RTM_NEWNEIGH message.

#### int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)

This method deletes a neighbour entry; it is the handler for the netlink RTM_DELNEIGH message.

#### void neigh_probe(struct neighbour *neigh)

This method fetches an SKB from the neighbour arp_queue and calls the corresponding solicit() method to send it. In the case of ARP, it will be the arp_solicit() method. It increments the neighbour probes counter and frees the packet.

#### int neigh_forced_gc(struct neigh_table *tbl)

This method is a synchronous garbage collection method. It removes neighbour entries that are not in the permanent state (NUD_PERMANENT) and whose reference count equals 1. The removal and cleanup of a neighbour is done by first setting the dead flag of the neighbour to 1 and then calling the neigh_cleanup_and_release() method, which gets a neighbour object as a parameter. The neigh_forced_gc() method is invoked from the neigh_alloc() method under some conditions, as described in the "Creating and Freeing a Neighbour" section earlier in this chapter. The neigh_forced_gc() method returns 1 if at least one neighbour object was removed, and 0 otherwise.

#### void neigh_periodic_work(struct work_struct *work)

This method is the asynchronous garbage collector handler.

#### static void neigh_timer_handler(unsigned long arg)

This method is the per-neighbour periodic timer garbage collector handler.

#### struct neighbour *__neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat)

This method performs a lookup in the specified neighbouring table by the given key. If the creat parameter is 1 and the lookup fails, the neigh_create() method is called to create a neighbour entry in the specified neighbouring table, and that entry is returned.

#### neigh_hh_init(struct neighbour *n, struct dst_entry *dst)

This method initializes the L2 cache (hh_cache object) of the specified neighbour based on the specified routing cache entry.

#### void __init arp_init(void)

This method performs the setup for the ARP protocol: it initializes the ARP table, registers the arp_rcv() method as the handler for receiving ARP packets, initializes procfs entries, registers sysctl entries, and registers the ARP netdev notifier callback, arp_netdev_event().

#### int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)

This method is the Rx handler for ARP packets (Ethernet packets with type 0x0806).
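For reference, the registration that makes arp_rcv() the ARP Rx handler is a packet_type object; this is a slightly abridged view of net/ipv4/arp.c from the kernels discussed in this book, and it parallels the ipv6_packet_type registration shown in the next chapter:

```c
static struct packet_type arp_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_ARP),	/* 0x0806 */
	.func = arp_rcv,
};
```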
#### int arp_constructor(struct neighbour *neigh)

This method performs ARP neighbour initialization.

#### int arp_process(struct sk_buff *skb)

This method, invoked by the arp_rcv() method, handles the main processing of ARP requests and ARP responses.

#### void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)

This method sends the solicitation request (ARPOP_REQUEST), after some checks and initializations, by calling the arp_send() method.

#### void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw)

This method creates an ARP packet and initializes it with the specified parameters, by calling the arp_create() method, and sends it by calling the arp_xmit() method.

#### void arp_xmit(struct sk_buff *skb)

This method actually sends the packet, by calling the NF_HOOK macro with dev_queue_xmit().

#### struct arphdr *arp_hdr(const struct sk_buff *skb)

This method fetches the ARP header of the specified SKB.

#### int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)

This method translates an IPv4 address to an L2 (link-layer) address according to the network device type. When the device is an Ethernet device, for example, this is done with the ip_eth_mc_map() method; when the device is an Infiniband device, this is done with the ip_ib_mc_map() method.

#### static inline int arp_fwd_proxy(struct in_device *in_dev, struct net_device *dev, struct rtable *rt)

This method returns 1 if the specified device can use proxy ARP for the specified routing entry.

#### static inline int arp_fwd_pvlan(struct in_device *in_dev, struct net_device *dev, struct rtable *rt, __be32 sip, __be32 tip)

This method returns 1 if the specified device can use proxy ARP VLAN for the specified routing entry and the specified IPv4 source and destination addresses.

#### int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)

This method is the ARP handler for netdev notification events.

#### int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)

This method is the NDISC handler for netdev notification events.

#### int ndisc_rcv(struct sk_buff *skb)

This method is the main NDISC handler for receiving one of the five types of neighbour discovery messages.

#### static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)

This method discards the packet and returns the -ENETDOWN error (network is down).

#### static void ndisc_recv_ns(struct sk_buff *skb) and static void ndisc_recv_na(struct sk_buff *skb)

These methods handle receiving Neighbour Solicitations and Neighbour Advertisements, respectively.

#### static void ndisc_recv_rs(struct sk_buff *skb) and static void ndisc_router_discovery(struct sk_buff *skb)

These methods handle receiving Router Solicitations and Router Advertisements, respectively.

#### int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir)

This method translates an IPv6 address to an L2 (link-layer) address according to the network device type. In Ethernet under IPv6, this is done by the ipv6_eth_mc_map() method.

#### int ndisc_constructor(struct neighbour *neigh)

This method performs NDISC neighbour initialization.
#### void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)

This method sends the solicitation request, after some checks and initializations, by calling the ndisc_send_ns() method.

#### int icmpv6_rcv(struct sk_buff *skb)

This method is the handler for receiving ICMPv6 messages.

#### bool ipv6_addr_any(const struct in6_addr *a)

This method returns 1 when the given IPv6 address is the unspecified address of all zeroes (IPV6_ADDR_ANY).

#### int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)

This method checks whether the two specified addresses are on the same subnet.

### Macros

Now, let's look at the macros.

#### IN_DEV_PROXY_ARP(in_dev)

This macro returns true if /proc/sys/net/ipv4/conf/<netDevice>/proxy_arp is set or if /proc/sys/net/ipv4/conf/all/proxy_arp is set, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_PROXY_ARP_PVLAN(in_dev)

This macro returns true if /proc/sys/net/ipv4/conf/<netDevice>/proxy_arp_pvlan is set, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_ARPFILTER(in_dev)

This macro returns true if /proc/sys/net/ipv4/conf/<netDevice>/arp_filter is set or if /proc/sys/net/ipv4/conf/all/arp_filter is set, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_ARP_ACCEPT(in_dev)

This macro returns true if /proc/sys/net/ipv4/conf/<netDevice>/arp_accept is set or if /proc/sys/net/ipv4/conf/all/arp_accept is set, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_ARP_ANNOUNCE(in_dev)

This macro returns the max value of /proc/sys/net/ipv4/conf/<netDevice>/arp_announce and /proc/sys/net/ipv4/conf/all/arp_announce, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_ARP_IGNORE(in_dev)

This macro returns the max value of /proc/sys/net/ipv4/conf/<netDevice>/arp_ignore and /proc/sys/net/ipv4/conf/all/arp_ignore, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_ARP_NOTIFY(in_dev)

This macro returns the max value of /proc/sys/net/ipv4/conf/<netDevice>/arp_notify and /proc/sys/net/ipv4/conf/all/arp_notify, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_SHARED_MEDIA(in_dev)

This macro returns true if /proc/sys/net/ipv4/conf/<netDevice>/shared_media is set or if /proc/sys/net/ipv4/conf/all/shared_media is set, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_ROUTE_LOCALNET(in_dev)

This macro returns true if /proc/sys/net/ipv4/conf/<netDevice>/route_localnet is set or if /proc/sys/net/ipv4/conf/all/route_localnet is set, where netDevice is the network device associated with the specified in_dev.

#### neigh_hold()

This macro increments the reference count of the specified neighbour.

### The neigh_statistics Structure

The neigh_statistics structure is important for monitoring the neighbouring subsystem; as mentioned in the beginning of the chapter, both ARP and NDISC export this structure's members via procfs (/proc/net/stat/arp_cache and /proc/net/stat/ndisc_cache, respectively).
Following is the definition of the structure:

```c
struct neigh_statistics {
	unsigned long allocs;		/* number of allocated neighs */
	unsigned long destroys;		/* number of destroyed neighs */
	unsigned long hash_grows;	/* number of hash resizes */

	unsigned long res_failed;	/* number of failed resolutions */

	unsigned long lookups;		/* number of lookups */
	unsigned long hits;		/* number of hits (among lookups) */

	unsigned long rcv_probes_mcast;	/* number of received mcast ipv6 */
	unsigned long rcv_probes_ucast;	/* number of received ucast ipv6 */

	unsigned long periodic_gc_runs;	/* number of periodic GC runs */
	unsigned long forced_gc_runs;	/* number of forced GC runs */

	unsigned long unres_discards;	/* number of unresolved drops */
};
```

Here is a description of the members of the neigh_statistics structure, pointing out where each is incremented:

 * allocs: The number of allocated neighbours; incremented by the neigh_alloc() method.

 * destroys: The number of destroyed neighbours; incremented by the neigh_destroy() method.

 * hash_grows: The number of times a hash resize was done; incremented by the neigh_hash_grow() method.

 * res_failed: The number of failed resolutions; incremented by the neigh_invalidate() method.

 * lookups: The number of neighbour lookups that were done; incremented by the neigh_lookup() method and by the neigh_lookup_nodev() method.

 * hits: The number of hits when performing a neighbour lookup; incremented by the neigh_lookup() method and by the neigh_lookup_nodev() method when there is a hit.

 * rcv_probes_mcast: The number of received multicast probes (IPv6 only); incremented by the ndisc_recv_ns() method.

 * rcv_probes_ucast: The number of received unicast probes (IPv6 only); incremented by the ndisc_recv_ns() method.

 * periodic_gc_runs: The number of periodic GC invocations; incremented by the neigh_periodic_work() method.

 * forced_gc_runs: The number of forced GC invocations; incremented by the neigh_forced_gc() method.

 * unres_discards: The number of unresolved drops; incremented by the __neigh_event_send() method when an unresolved packet is discarded.

### Table

Here is the table that was covered in this chapter.

Table 7-1. Neighbour Unreachability Detection States

| Linux Symbol | Meaning |
|---|---|
| NUD_INCOMPLETE | Address resolution is in progress and the link-layer address of the neighbour has not yet been determined. This means that a solicitation request was sent, and you are waiting for a solicitation reply or a timeout. |
| NUD_REACHABLE | The neighbour is known to have been reachable recently. |
| NUD_STALE | More than ReachableTime milliseconds have elapsed since the last positive confirmation that the forward path was functioning properly. |
| NUD_DELAY | The neighbour is no longer known to be reachable. Probes are delayed for a short while in order to give upper-layer protocols a chance to provide reachability confirmation. |
| NUD_PROBE | The neighbour is no longer known to be reachable, and unicast Neighbour Solicitation probes are being sent to verify reachability. |
| NUD_FAILED | The neighbour is considered unreachable. When you delete a neighbour, it is set to the NUD_FAILED state. |

# 8. IPv6
In Chapter 7, I dealt with the Linux Neighbouring Subsystem and its implementation. In this chapter, I will discuss the IPv6 protocol and its implementation in Linux. IPv6 is the next-generation network layer protocol of the TCP/IP protocol stack. It was developed by the Internet Engineering Task Force (IETF), and it is intended to replace IPv4, which still carries the vast majority of Internet traffic.

In the early '90s, the IETF started an effort to develop the next generation of the IP protocol, due to the anticipated Internet growth. The first IPv6 RFC is from 1995: RFC 1883, "Internet Protocol, Version 6 (IPv6) Specification." Later, in 1998, RFC 2460 replaced it. The main problem IPv6 solves is the shortage of addresses: the length of an IPv6 address is 128 bits, so instead of 2^32 addresses in IPv4, we have 2^128 addresses in IPv6. This enlarges the address space significantly, probably far more than will be needed in the next few decades. But the extended address space is not the only advantage of IPv6, as some might think. Based on the experience gained with IPv4, many changes were made in IPv6 to improve the IP protocol. We will discuss many of these changes in this chapter.

The IPv6 protocol is now gaining momentum as an improved network layer protocol. The growing popularity of the Internet all over the globe, and the growing markets for smart mobile devices and tablets, make the exhaustion of IPv4 addresses an ever more evident problem. This gives rise to the need to transition to the IPv4 successor, the IPv6 protocol.

## IPv6 – Short Introduction

The IPv6 subsystem is undoubtedly a very broad subject, and it is growing steadily. Exciting features were added during the last decade. Some of these new features are based on IPv4, like ICMPv6 sockets, IPv6 Multicast Routing, and IPv6 NAT. IPsec is mandatory in IPv6 and optional in IPv4, though most operating systems also implement IPsec in IPv4. When we delve into the IPv6 kernel internals, we find many similarities; sometimes the names of the methods and even the names of some of the variables are similar, except for the addition of "v6" or "6". There are, however, some changes in the implementation in some places.

In this chapter, we discuss the important new features of IPv6, show some places where it differs from IPv4, and explain why a change was made. The extension headers, the Multicast Listener Discovery (MLD) protocol, and the Autoconfiguration process are some of the new features that we discuss and demonstrate with some userspace examples. We also discuss how receiving IPv6 packets works, how IPv6 forwarding works, and some points of difference when comparing them to IPv4. On the whole, it seems that the developers of IPv6 made a lot of improvements based on past experience with IPv4, and the IPv6 implementation brings a lot of benefits and advantages over IPv4. We will discuss IPv6 addresses in the following section, including multicast addresses and special addresses.
## IPv6 Addresses

The first step in learning IPv6 is to become familiar with the IPv6 Addressing Architecture, which is defined in RFC 4291. There are three types of IPv6 addresses:

 * Unicast: This address uniquely identifies an interface. A packet sent to a unicast address is delivered to the interface identified by that address.

 * Anycast: This address can be assigned to a set of interfaces (usually on different nodes). This type of address does not exist in IPv4. It is, in fact, a mixture of a unicast address and a multicast address: a packet sent to an anycast address is delivered to one of the interfaces identified by that address (the "nearest" one, according to the routing protocols).

 * Multicast: This address can be assigned to a set of interfaces (usually on different nodes). A packet sent to a multicast address is delivered to all the interfaces identified by that address. An interface can belong to any number of multicast groups.

There is no broadcast address in IPv6. To get the same result as a broadcast in IPv6, you can send a packet to the all-nodes group multicast address (ff02::1). In IPv4, a large part of the functionality of the Address Resolution Protocol (ARP) is based on broadcasts. The IPv6 subsystem uses neighbour discovery instead of ARP to map L3 addresses to L2 addresses. The IPv6 neighbour discovery protocol is based on ICMPv6, and it uses multicast addresses instead of broadcasts, as you saw in the previous chapter. You will see more examples of using multicast traffic later in this chapter.

An IPv6 address consists of 8 blocks of 16 bits, which is 128 bits in total. An IPv6 address looks like this: xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx (where each x is a hexadecimal digit). Sometimes you will encounter "::" inside an IPv6 address; this is shorthand for one or more consecutive groups of zeroes.

In IPv6, address prefixes are used. Prefixes are, in fact, the parallel of IPv4 subnet masks. IPv6 prefixes are described in RFC 4291, "IP Version 6 Addressing Architecture." An IPv6 address prefix is represented by the following notation: ipv6-address/prefix-length.

The prefix-length is a decimal value specifying how many of the leftmost contiguous bits of the address comprise the prefix. We use "/n" to denote a prefix n bits long. For example, for all IPv6 addresses that begin with the 32 bits 2001:0da7, the following prefix is used: 2001:da7::/32.

Now that you have learned about the types of IPv6 addresses, you will learn in the following section about some special IPv6 addresses and their usage.

### Special Addresses

In this section, I describe some special IPv6 addresses and their usage. It is recommended that you become familiar with these special addresses, because you will encounter some of them later in this chapter (like the unspecified address of all zeroes that is used in DAD, or Duplicate Address Detection) and while browsing the code. The following list contains special IPv6 addresses and explanations about their usage:

 * There should be at least one link-local unicast address on each interface. The link-local address allows communication with other nodes in the same physical network; it is required for neighbour discovery, automatic address configuration, and more. Routers must not forward any packets with link-local source or destination addresses. Link-local addresses are assigned with the prefix fe80::/64.
 * The Global Unicast Address general format is as follows: the first n bits are the global routing prefix, the next m bits are the subnet ID, and the remaining 128-n-m bits are the interface ID.

    * global routing prefix: A value assigned to a site. It represents the network ID or prefix of the address.

    * subnet ID: An identifier of a subnet within the site.

    * interface ID: An identifier whose value must be unique within the subnet. This is defined in RFC 3513, section 2.5.1.

The Global Unicast Address is described in RFC 3587, "IPv6 Global Unicast Address Format." The assignable Global Unicast Address space is defined in RFC 4291.

 * The IPv6 loopback address is 0:0:0:0:0:0:0:1, or ::1 in short notation.

 * The address of all zeroes (0:0:0:0:0:0:0:0) is called the unspecified address. It is used in DAD (Duplicate Address Detection), as you saw in the previous chapter. It should not be used as a destination address. You cannot assign the unspecified address to an interface by using userspace tools like the ip command or the ifconfig command.

 * IPv4-mapped IPv6 addresses are addresses that start with 80 bits of zero. The next 16 bits are all ones, and the remaining 32 bits are the IPv4 address. For example, ::ffff:192.0.2.128 represents the IPv4 address 192.0.2.128. For usage of these addresses, see RFC 4038, "Application Aspects of IPv6 Transition."

 * The IPv4-compatible format is deprecated; in this format, the IPv4 address is in the lower 32 bits of the IPv6 address and all remaining bits are 0. The address mentioned earlier would be ::192.0.2.128 in this format. See RFC 4291, section 2.5.5.1.

 * Site-local addresses were originally designed to be used for addressing inside a site without the need for a global prefix, but they were deprecated in RFC 3879, "Deprecating Site Local Addresses," in 2004.

An IPv6 address is represented in Linux by the in6_addr structure; using a union with three arrays (with 8-, 16-, and 32-bit elements) in the in6_addr structure helps in bit-manipulation operations:

```c
struct in6_addr {
	union {
		__u8	u6_addr8[16];
		__be16	u6_addr16[8];
		__be32	u6_addr32[4];
	} in6_u;
#define s6_addr		in6_u.u6_addr8
#define s6_addr16	in6_u.u6_addr16
#define s6_addr32	in6_u.u6_addr32
};
```
(include/uapi/linux/in6.h)

Multicast plays an important role in IPv6, especially for ICMPv6-based protocols like NDISC (which I discussed in Chapter 7, which dealt with the Linux Neighbouring Subsystem) and MLD (which is discussed later in this chapter). I will discuss multicast addresses in IPv6 in the next section.

### Multicast Addresses

Multicast addresses provide a way to define a multicast group; a node can belong to one or more multicast groups. Packets whose destination is a multicast address should be delivered to every node that belongs to that multicast group. In IPv6, all multicast addresses start with FF (the first 8 bits). Following that are 4 bits of flags and 4 bits of scope. Finally, the last 112 bits are the group ID. The 4 bits of the flags field have this meaning:

 * Bit 0: Reserved for future use.

 * Bit 1: A value of 1 indicates that a Rendezvous Point is embedded in the address. Discussion of Rendezvous Points is more related to userspace daemons and is not within the scope of this book. For more details, see RFC 3956, "Embedding the Rendezvous Point (RP) Address in an IPv6 Multicast Address." This bit is sometimes referred to as the R-flag (R for Rendezvous Point).
 * Bit 2: A value of 1 indicates a multicast address that is assigned based on the network prefix. (See RFC 3306.) This bit is sometimes referred to as the P-flag (P for Prefix information).

 * Bit 3: A value of 0 indicates a permanently-assigned ("well-known") multicast address, assigned by the Internet Assigned Numbers Authority (IANA). A value of 1 indicates a non-permanently-assigned ("transient") multicast address. This bit is sometimes referred to as the T-flag (T for Temporary).

The scope can be one of the entries in Table 8-1, which shows the various IPv6 scopes by their value and their Linux symbol.

Table 8-1. IPv6 Scopes

| Hex Value | Description | Linux Symbol |
|---|---|---|
| 0x01 | node local | IPV6_ADDR_SCOPE_NODELOCAL |
| 0x02 | link local | IPV6_ADDR_SCOPE_LINKLOCAL |
| 0x05 | site local | IPV6_ADDR_SCOPE_SITELOCAL |
| 0x08 | organization | IPV6_ADDR_SCOPE_ORGLOCAL |
| 0x0e | global | IPV6_ADDR_SCOPE_GLOBAL |

Now that you've learned about IPv6 multicast addresses, you will learn about some special multicast addresses in the next section.

#### Special Multicast Addresses

There are some special multicast addresses that I will mention in this chapter. Section 2.7.1 of RFC 4291 defines these special multicast addresses:

 * All Nodes Multicast Address group: ff01::1, ff02::1

 * All Routers Multicast Address group: ff01::2, ff02::2, ff05::2

RFC 3810 defines another special address: the All MLDv2-capable Routers Multicast Group, which is ff02::16. Version 2 Multicast Listener Reports are sent to this special address; I will discuss it in the "Multicast Listener Discovery (MLD)" section later in this chapter.

A node is required to compute and join (on the appropriate interface) the associated Solicited-Node multicast addresses for all unicast and anycast addresses that have been configured for the node's interfaces (manually or automatically). Solicited-Node multicast addresses are computed based on the node's unicast and anycast addresses: a Solicited-Node multicast address is formed by taking the low-order 24 bits of an address (unicast or anycast) and appending those bits to the prefix ff02:0:0:0:0:1:ff00::/104, resulting in a multicast address in the range ff02:0:0:0:0:1:ff00:0000 to ff02:0:0:0:0:1:ffff:ffff. See RFC 4291.

The method addrconf_addr_solict_mult() computes a link-local, solicited-node multicast address (include/net/addrconf.h). The method addrconf_join_solict() joins a solicited-node address multicast group (net/ipv6/addrconf.c).

In the previous chapter, you saw that a neighbour advertisement message is sent by the ndisc_send_na() method to the link-local all-nodes address (ff02::1). You will see more examples of using special addresses, like the all-nodes multicast group address or the all-routers multicast group address, in later sections of this chapter. In this section, you have seen some multicast addresses, which you will encounter later in this chapter and while browsing the IPv6 source code. I will now discuss the IPv6 header in the following section.

## IPv6 Header

Each IPv6 packet starts with an IPv6 header, and it is important to learn about its structure in order to fully understand the IPv6 Linux implementation. The IPv6 header has a fixed length of 40 bytes; for this reason, there is no field specifying the IPv6 header length (as opposed to IPv4, where the ihl member of the IPv4 header represents the header length).
Note that there is also no checksum field in the IPv6 header, which will be explained later in this chapter. In IPv6, there is no IP options mechanism as in IPv4; the IP options processing mechanism in IPv4 has a performance cost. Instead, IPv6 has a much more efficient mechanism of extension headers, which are discussed in the next section, "Extension Headers." Figure 8-1 shows the IPv6 header and its fields.

Figure 8-1. IPv6 header

Note that in the original IPv6 standard, RFC 2460, the priority (Traffic Class) field is 8 bits and the flow label is 20 bits. In the definition of the ipv6hdr structure, the priority (Traffic Class) field size is 4 bits; in fact, in the Linux IPv6 implementation, the first 4 bits of flow_lbl are glued to the priority (Traffic Class) field in order to form a "class." Figure 8-1 reflects the Linux definition of the ipv6hdr structure, which is shown here:

```c
struct ipv6hdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8			priority:4,
				version:4;
#elif defined(__BIG_ENDIAN_BITFIELD)
	__u8			version:4,
				priority:4;
#else
#error	"Please fix <asm/byteorder.h>"
#endif
	__u8			flow_lbl[3];

	__be16			payload_len;
	__u8			nexthdr;
	__u8			hop_limit;

	struct in6_addr		saddr;
	struct in6_addr		daddr;
};
```
(include/uapi/linux/ipv6.h)

The following is a description of the members of the ipv6hdr structure:

 * version: A 4-bit field. It should be set to 6.

 * priority: Indicates the traffic class or priority of the IPv6 packet. RFC 2460, the base of IPv6, does not define specific traffic class or priority values.

 * flow_lbl: The flow label field was regarded as experimental when the base IPv6 standard was written (RFC 2460). It provides a way to label sequences of packets of a particular flow; this labeling can be used by upper layers for various purposes. RFC 6437, "IPv6 Flow Label Specification," from 2011, suggests using flow labeling to detect address spoofing.

 * payload_len: A 16-bit field. The size of the packet, without the IPv6 header, can be up to 65,535 bytes. I will discuss larger packets ("jumbo frames") in the next section, when presenting the Hop-by-Hop Options header.

 * nexthdr: When there are no extension headers, this is the upper-layer protocol number, like IPPROTO_UDP (17) for UDP or IPPROTO_TCP (6) for TCP. The list of available protocols is in include/uapi/linux/in.h. When extension headers are used, this is the type of the next header immediately following the IPv6 header. I will discuss extension headers in the next section.

 * hop_limit: A one-byte field. Every forwarding device decrements the hop_limit counter by one; when it reaches zero, an ICMPv6 message is sent back and the packet is discarded. This parallels the TTL member in the IPv4 header. See the ip6_forward() method in net/ipv6/ip6_output.c.

 * saddr: The IPv6 source address (128 bits).

 * daddr: The IPv6 destination address (128 bits). This is possibly not the final packet destination if a Routing header is used.

Note that, as opposed to the IPv4 header, there is no checksum in the IPv6 header. Checksumming is assumed to be assured by both Layer 2 and Layer 4. UDP in IPv4 permits a checksum of 0, indicating no checksum; UDP in IPv6 normally requires its own checksum. There are some special cases in IPv6 where a zero UDP checksum is allowed, for IPv6 UDP tunnels; see RFC 6935, "IPv6 and UDP Checksums for Tunneled Packets."
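To make the header layout concrete, here is a minimal userspace sketch (not from the book's sources) that hand-builds an ipv6hdr and demonstrates its fixed 40-byte size; it assumes the kernel UAPI header <linux/ipv6.h> and glibc's <endian.h> helpers are available, as on any recent Linux system. The daddr assignment also uses the byte-wise s6_addr view of the in6_addr union shown earlier:

```c
#include <stdio.h>
#include <string.h>
#include <endian.h>		/* htobe16()/be16toh() */
#include <linux/ipv6.h>		/* struct ipv6hdr (kernel UAPI header) */

int main(void)
{
	struct ipv6hdr hdr;

	/* Hand-build a header: version 6, 8-byte payload, UDP, hop limit 64 */
	memset(&hdr, 0, sizeof(hdr));
	hdr.version = 6;
	hdr.payload_len = htobe16(8);	/* network byte order, like on the wire */
	hdr.nexthdr = 17;		/* IPPROTO_UDP */
	hdr.hop_limit = 64;

	/* Destination ff02::1 (all-nodes), set via the byte view of in6_addr */
	hdr.daddr.s6_addr[0] = 0xff;
	hdr.daddr.s6_addr[1] = 0x02;
	hdr.daddr.s6_addr[15] = 0x01;

	printf("header size: %zu bytes\n", sizeof(hdr));	/* always 40 */
	printf("version=%u payload_len=%u nexthdr=%u hop_limit=%u\n",
	       hdr.version, be16toh(hdr.payload_len),
	       hdr.nexthdr, hdr.hop_limit);
	return 0;
}
```

The 16- and 32-bit views of the union (s6_addr16, s6_addr32) allow the same kind of manipulation one halfword or word at a time, which is why the kernel code you will see in this chapter frequently uses them.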
In Chapter 4, which deals with the IPv4 subsystem, you saw that when forwarding a packet, the ip_decrease_ttl() method is invoked. This method recomputes the checksum of the IPv4 header, because the value of the ttl was changed. In IPv6, there is no such need to recompute the checksum when forwarding a packet, because there is no checksum at all in the IPv6 header. This results in a performance improvement in software-based routers.

In this section, you have seen how the IPv6 header is built, and you saw some differences between the IPv4 header and the IPv6 header; for example, the IPv6 header has no checksum field and no header-length field. The next section discusses the IPv6 extension headers, which are the counterpart of IPv4 options.

## Extension Headers

The IPv4 header can include IP options, which can extend the IPv4 header from a minimum size of 20 bytes to 60 bytes. In IPv6, we have optional extension headers instead. With one exception (the Hop-by-Hop Options header), extension headers are not processed by any node along a packet's delivery path until the packet reaches its final destination; this improves the performance of the forwarding process significantly. The base IPv6 standard defines extension headers. An IPv6 packet can include zero or more extension headers, placed between the IPv6 header and the upper-layer header in a packet. The nexthdr field of the IPv6 header is the number of the next header immediately after the IPv6 header. These extension headers are chained: every extension header has a Next Header field, and in the last extension header, the Next Header field indicates the upper-layer protocol (such as TCP, UDP, or ICMPv6). Another advantage of extension headers is that adding new extension headers in the future is easy and does not require any changes in the IPv6 header.

Extension headers must be processed strictly in the order they appear in the packet. Each extension header should occur at most once, except for the Destination Options header, which should occur at most twice. (See more detail later in this section in the description of the Destination Options header.) The Hop-by-Hop Options header must appear immediately after the IPv6 header; all other extension headers can appear in any order. Section 4.1 of RFC 2460 ("Extension Header Order") states a recommended order in which extension headers should appear, but this is not mandatory. When an unknown Next Header number is encountered while processing a packet, an ICMPv6 "Parameter Problem" message with a code of "unknown Next Header" (ICMPV6_UNK_NEXTHDR) is sent back to the sender by calling the icmpv6_param_prob() method. A description of the available ICMPv6 "Parameter Problem" codes appears in Table 8-4 in the "Quick Reference" section at the end of this chapter.

Each extension header must be aligned on an 8-byte boundary. Extension headers of variable size have a Header Extension Length field, and they use padding, if needed, to ensure that they are aligned on an 8-byte boundary. The numbers of all Linux IPv6 extension headers and their Linux Kernel symbol representation are displayed in Table 8-2, "IPv6 extension headers," in the "Quick Reference" section at the end of this chapter.

A protocol handler is registered for each of the extension headers (except the Hop-by-Hop Options header) with the inet6_add_protocol() method.
The reason for not registering a protocol handler for the Hop-by-Hop Options header is that there is a special method for parsing it, the ipv6_parse_hopopts() method, which is invoked before calling the protocol handlers. (See the ipv6_rcv() method, net/ipv6/ip6_input.c.) As mentioned before, the Hop-by-Hop Options header must be the first one, immediately following the IPv6 header. This is, for example, how the protocol handler for the Fragment extension header is registered:

```c
static const struct inet6_protocol frag_protocol = {
	.handler	= ipv6_frag_rcv,
	.flags		= INET6_PROTO_NOPOLICY,
};

int __init ipv6_frag_init(void)
{
	int ret;

	ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
	...
```
(net/ipv6/reassembly.c)

Here is a description of all the IPv6 extension headers:

 * Hop-by-Hop Options header: The Hop-by-Hop Options header must be processed on each node, and it must appear immediately after the IPv6 header. It is parsed by the ipv6_parse_hopopts() method (net/ipv6/exthdrs.c). It is used, for example, by the Multicast Listener Discovery protocol, as you will see in the "Multicast Listener Discovery (MLD)" section later in this chapter. The Hop-by-Hop Options header includes a variable-length option field, whose first byte is its type, which can be one of the following:

    * Router Alert (Linux Kernel symbol: IPV6_TLV_ROUTERALERT, value: 5). See RFC 6398, "IP Router Alert Considerations and Usage."

    * Jumbo (Linux Kernel symbol: IPV6_TLV_JUMBO, value: 194). The IPv6 packet payload normally can be up to 65,535 bytes long. With the jumbo option, it can be up to 2^32 bytes. See RFC 2675, "IPv6 Jumbograms."

    * Pad1 (Linux Kernel symbol: IPV6_TLV_PAD1, value: 0). The Pad1 option is used to insert one byte of padding. When more than one padding byte is needed, the PadN option (see next) should be used (and not multiple Pad1 options). See section 4.2 of RFC 2460.

    * PadN (Linux Kernel symbol: IPV6_TLV_PADN, value: 1). The PadN option is used to insert two or more octets of padding into the Options area of a header.

 * Routing Options header: This parallels the IPv4 Loose Source Record Route (IPOPT_LSRR) option, which is discussed in the "IP Options" section in Chapter 4. It provides the ability to specify one or more routers that should be visited along the packet's route to its final destination.

 * Fragment Options header: As opposed to IPv4, fragmentation in IPv6 can occur only on the host that sends the packet, not on any of the intermediate nodes. Fragmentation is implemented by the ip6_fragment() method, which is invoked from the ip6_finish_output() method. In the ip6_fragment() method, there is a slow path and a fast path, much the same as in IPv4 fragmentation. The implementation of IPv6 fragmentation is in net/ipv6/ip6_output.c, and the implementation of IPv6 defragmentation is in net/ipv6/reassembly.c.

 * Authentication Header: The Authentication Header (AH) provides data authentication, data integrity, and anti-replay protection. It is described in RFC 4302, "IP Authentication Header," which makes RFC 2402 obsolete.

 * Encapsulating Security Payload Options header: This header is described in RFC 4303, "IP Encapsulating Security Payload (ESP)," which makes RFC 2406 obsolete. Note: the Encapsulating Security Payload (ESP) protocol is discussed in Chapter 10, which covers the IPsec subsystem.
 * Destination Options header: The Destination Options header can appear twice in a packet: once before a Routing Options header and once after it. When it appears before the Routing Options header, it includes information that should be processed by the routers specified by the Routing Options header. When it appears after the Routing Options header, it includes information that should be processed by the final destination.

In the next section, you will see how the IPv6 protocol handler, which is the ipv6_rcv() method, is associated with IPv6 packets.

## IPv6 Initialization

The inet6_init() method performs various IPv6 initializations (like procfs initializations and registration of protocol handlers for TCPv6, UDPv6, and other protocols), initialization of IPv6 subsystems (like IPv6 neighbour discovery, IPv6 Multicast Routing, and the IPv6 routing subsystem), and more. For more details, look in net/ipv6/af_inet6.c. The ipv6_rcv() method is registered as the protocol handler for IPv6 packets by defining a packet_type object for IPv6 and registering it with the dev_add_pack() method, quite similarly to what is done in IPv4:

```c
static struct packet_type ipv6_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IPV6),
	.func = ipv6_rcv,
};

static int __init ipv6_packet_init(void)
{
	dev_add_pack(&ipv6_packet_type);
	return 0;
}
```
(net/ipv6/af_inet6.c)

As a result of the registration just shown, each Ethernet packet whose ethertype is ETH_P_IPV6 (0x86DD) will be handled by the ipv6_rcv() method. Next, I will discuss the IPv6 Autoconfiguration mechanism for setting IPv6 addresses.

## Autoconfiguration

Autoconfiguration is a mechanism that allows a host to obtain or create a unique address for each of its interfaces. The IPv6 autoconfiguration process is initiated at system startup: nodes (both hosts and routers) generate a link-local address for their interfaces. This address is regarded as "tentative" (the interface flag IFA_F_TENTATIVE is set); this means that the interface can communicate only with neighbour discovery messages. It must be verified that this address is not already in use by another node on the link. This is done with the DAD (Duplicate Address Detection) mechanism, which was described in the previous chapter, which deals with the Linux Neighbouring Subsystem. If the address is not unique, the autoconfiguration process stops and manual configuration is needed. If the address is unique, the autoconfiguration process continues. The next phase of autoconfiguration on hosts involves sending one or more Router Solicitations to the all-routers multicast group address (ff02::2). This is done by calling the ndisc_send_rs() method from the addrconf_dad_completed() method. Routers reply with a Router Advertisement message, which is sent to the all-hosts address, ff02::1. Both the Router Solicitation and the Router Advertisement use the Neighbour Discovery Protocol via ICMPv6 messages; the Router Solicitation ICMPv6 type is NDISC_ROUTER_SOLICITATION (133), and the Router Advertisement ICMPv6 type is NDISC_ROUTER_ADVERTISEMENT (134).

The radvd daemon is an example of an open source Router Advertisement daemon that is used for stateless autoconfiguration (http://www.litech.org/radvd/). You can set a prefix in the radvd configuration file, which will be sent in Router Advertisement messages; a sketch of such a configuration follows. The radvd daemon sends Router Advertisements periodically.
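For illustration, a minimal radvd.conf along these lines could look as follows. The interface name, prefix, and lifetime values here are made-up; consult the radvd.conf(5) man page for the authoritative option list:

```
interface eth0
{
	AdvSendAdvert on;
	prefix 2001:db8:1::/64
	{
		AdvOnLink on;
		AdvAutonomous on;
		AdvPreferredLifetime 3600;
		AdvValidLifetime 7200;
	};
};
```

With a configuration like this, radvd advertises the 64-bit prefix 2001:db8:1::/64, and hosts on the link derive their addresses from it, as described next; the two lifetime options correspond to the preferred and valid lifetimes discussed below.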
Apart from sending periodic Router Advertisements, radvd also listens to Router Solicitation (RS) requests and answers them with Router Advertisement (RA) reply messages. These Router Advertisement (RA) messages include a prefix field, which plays an important role in the autoconfiguration process, as you will immediately see. The prefix must be 64 bits long. When a host receives a Router Advertisement (RA) message, it configures its IP address based on this prefix and its own MAC address. If the Privacy Extensions feature (CONFIG_IPV6_PRIVACY) is set, an element of randomness is also added to the IPv6 address creation. The Privacy Extensions mechanism prevents deriving details about the identity of a machine from its IPv6 address (which is normally generated from its MAC address and a prefix) by adding randomness, as mentioned. For more details on Privacy Extensions, see RFC 4941, "Privacy Extensions for Stateless Address Autoconfiguration in IPv6."

When a host receives a Router Advertisement message, it can automatically configure its address and some other parameters. It can also choose a default router based on these advertisements. It is also possible to set a preferred lifetime and a valid lifetime for the addresses that are configured automatically on the hosts. The preferred lifetime value specifies the length of time, in seconds, that an address generated from the prefix via stateless address autoconfiguration remains in a preferred state; when the preferred lifetime expires, the address becomes deprecated and should not be used for new communication. The valid lifetime value specifies the length of time, in seconds, that the address is valid (that is, applications already using it can keep using it); when this time is over, the address is removed. The preferred lifetime and the valid lifetime are represented in the kernel by the prefered_lft and the valid_lft fields of the inet6_ifaddr object, respectively (include/net/if_inet6.h).

Renumbering is the process of replacing an old prefix with a new prefix, and changing the IPv6 addresses of hosts according to the new prefix. Renumbering can be done quite easily with radvd, by adding a new prefix to its configuration settings, setting a preferred lifetime and a valid lifetime, and restarting the radvd daemon. See also RFC 4192, "Procedures for Renumbering an IPv6 Network without a Flag Day," and RFCs 5887, 6866, and 6879.

The Dynamic Host Configuration Protocol version 6 (DHCPv6) is an example of stateful address configuration; in the stateful autoconfiguration model, hosts obtain interface addresses and/or configuration information and parameters from a server. Servers maintain a database that keeps track of which addresses have been assigned to which hosts. I will not delve into the details of the DHCPv6 protocol in this book. The DHCPv6 protocol is specified by RFC 3315, "Dynamic Host Configuration Protocol for IPv6 (DHCPv6)." The IPv6 Stateless Autoconfiguration standard is described in RFC 4862, "IPv6 Stateless Address Autoconfiguration."

You have learned in this section about the autoconfiguration process, and you saw how easy it is to replace an old prefix with a new one by configuring and restarting radvd. The next section discusses how the ipv6_rcv() method, which is the IPv6 protocol handler, handles the reception of IPv6 packets, in a somewhat similar way to what you saw in IPv4.
## Receiving IPv6 Packets

The main IPv6 receive method is the ipv6_rcv() method, which is the handler for all IPv6 packets (including multicasts; there are no broadcasts in IPv6, as mentioned before). There are many similarities between the Rx path in IPv4 and in IPv6. As in IPv4, we first make some sanity checks, like checking that the version of the IPv6 header is 6 and that the source address is not a multicast address (according to section 2.7 of RFC 4291, this is forbidden). If there is a Hop-by-Hop Options header, it must be the first one: if the value of the nexthdr field of the IPv6 header is 0, this indicates a Hop-by-Hop Options header, and it is parsed by calling the ipv6_parse_hopopts() method. The real work is done by the ip6_rcv_finish() method, which is invoked by calling the NF_HOOK() macro. If there is a netfilter callback registered at this point (NF_INET_PRE_ROUTING), it will be invoked. I will discuss netfilter hooks in the next chapter. Let's take a look at the ipv6_rcv() method:

```c
int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
	     struct net_device *orig_dev)
{
	const struct ipv6hdr *hdr;
	u32 pkt_len;
	struct inet6_dev *idev;
```

Fetch the network namespace from the network device that is associated with the Socket Buffer (SKB):

```c
	struct net *net = dev_net(skb->dev);
	...
```

Fetch the IPv6 header from the SKB:

```c
	hdr = ipv6_hdr(skb);
```

Perform some sanity checks, and discard the SKB if necessary:

```c
	if (hdr->version != 6)
		goto err;

	/*
	 * RFC4291 2.5.3
	 * A packet received on an interface with a destination address
	 * of loopback must be dropped.
	 */
	if (!(dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_loopback(&hdr->daddr))
		goto err;
	...
	/*
	 * RFC4291 2.7
	 * Multicast addresses must not be used as source addresses in IPv6
	 * packets or appear in any Routing header.
	 */
	if (ipv6_addr_is_multicast(&hdr->saddr))
		goto err;
	...
	if (hdr->nexthdr == NEXTHDR_HOP) {
		if (ipv6_parse_hopopts(skb) < 0) {
			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
			rcu_read_unlock();
			return NET_RX_DROP;
		}
	}
	...
	return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, dev, NULL,
		       ip6_rcv_finish);
err:
	IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return NET_RX_DROP;
}
```
(net/ipv6/ip6_input.c)

The ip6_rcv_finish() method first performs a lookup in the routing subsystem, by calling the ip6_route_input() method, in case there is no dst attached to the SKB. The ip6_route_input() method eventually invokes the fib6_rule_lookup() method:

```c
int ip6_rcv_finish(struct sk_buff *skb)
{
	...
	if (!skb_dst(skb))
		ip6_route_input(skb);
```

Invoke the input callback of the dst attached to the SKB:

```c
	return dst_input(skb);
}
```
(net/ipv6/ip6_input.c)

Note

There are two different implementations of the fib6_rule_lookup() method: one when Policy Routing (CONFIG_IPV6_MULTIPLE_TABLES) is set, in net/ipv6/fib6_rules.c, and one when Policy Routing is not set, in net/ipv6/ip6_fib.c.

As you saw in Chapter 5, which dealt with advanced topics of the IPv4 routing subsystem, the lookup in the routing subsystem builds a dst object and sets its input and output callbacks; in IPv6, similar tasks are performed.
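The dst_input() call at the end of ip6_rcv_finish() is just a thin inline wrapper; in the kernels this chapter is based on, it is essentially the following (a simplified view of include/net/dst.h):

```c
static inline int dst_input(struct sk_buff *skb)
{
	/* Invoke the input callback that the routing lookup set on the dst */
	return skb_dst(skb)->input(skb);
}
```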
After the ip6_rcv_finish() method performs the lookup in the routing subsystem, it calls the dst_input() method, which invokes the input callback of the dst object that is associated with the packet.

Figure 8-2 shows the receive path (Rx) of a packet that is received by the network driver. This packet can either be delivered to the local machine or be forwarded to another host; it is the result of the lookup in the routing tables that determines which of these two options takes place.

Figure 8-2. Receiving IPv6 packets

Note

For simplicity, the diagram does not include fragmentation/defragmentation, parsing of extension headers, or the IPsec methods.

The lookup in the IPv6 routing subsystem sets the input callback of the destination cache (dst) to be:

 * ip6_input() when the packet is destined to the local machine.

 * ip6_forward() when the packet is to be forwarded.

 * ip6_mc_input() when the packet is destined to a multicast address.

 * ip6_pkt_discard() when the packet is to be discarded. The ip6_pkt_discard() method drops the packet and replies to the sender with a destination unreachable (ICMPV6_DEST_UNREACH) ICMPv6 message.

Incoming IPv6 packets can be locally delivered or forwarded; in the next section, you will learn about local delivery of IPv6 packets.

### Local Delivery

Let's look first at the local delivery case. The ip6_input() method is a very short method:

```c
int ip6_input(struct sk_buff *skb)
{
	return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip6_input_finish);
}
```
(net/ipv6/ip6_input.c)

If there is a netfilter hook registered at this point (NF_INET_LOCAL_IN), it will be invoked. Otherwise, we proceed to the ip6_input_finish() method:

```c
static int ip6_input_finish(struct sk_buff *skb)
{
	struct net *net = dev_net(skb_dst(skb)->dev);
	const struct inet6_protocol *ipprot;
```

The inet6_dev structure (include/net/if_inet6.h) is the IPv6 parallel of the IPv4 in_device structure. It contains IPv6-related configuration, such as the network interface unicast address list (addr_list) and the network interface multicast address list (mc_list). This IPv6-related configuration can be set by the user with the ip command or with the ifconfig command.

```c
	struct inet6_dev *idev;
	unsigned int nhoff;
	int nexthdr;
	bool raw;

	/*
	 * Parse extension headers
	 */
	rcu_read_lock();
resubmit:
	idev = ip6_dst_idev(skb_dst(skb));
	if (!pskb_pull(skb, skb_transport_offset(skb)))
		goto discard;
	nhoff = IP6CB(skb)->nhoff;
```

Fetch the next header number from the SKB:

```c
	nexthdr = skb_network_header(skb)[nhoff];
```

First, in the case of a raw socket packet, we try to deliver it to a raw socket:

```c
	raw = raw6_local_deliver(skb, nexthdr);
```

Every extension header (except the Hop-by-Hop Options header) has a protocol handler that was registered by the inet6_add_protocol() method; this method in fact adds an entry to the global inet6_protos array (see net/ipv6/protocol.c):

```c
	if ((ipprot = rcu_dereference(inet6_protos[nexthdr])) != NULL) {
		int ret;

		if (ipprot->flags & INET6_PROTO_FINAL) {
			const struct ipv6hdr *hdr;

			/* Free reference early: we don't need it any more,
			   and it may hold ip_conntrack module loaded
			   indefinitely.
			 */
			nf_reset(skb);

			skb_postpull_rcsum(skb, skb_network_header(skb),
					   skb_network_header_len(skb));
			hdr = ipv6_hdr(skb);
```

RFC 3810, which is the MLDv2 specification, says: "Note that MLDv2 messages are not subject to source filtering and must always be processed by hosts and routers." We do not want to discard MLD multicast packets due to source filtering, since these MLD packets should always be processed, according to the RFC. Therefore, before discarding the packet, we make sure that if the destination address of the packet is a multicast address, the packet is not an MLD packet. This is done by calling the ipv6_is_mld() method before discarding it; if this method indicates that the packet is an MLD packet, it is not discarded. You can see more about this in the "Multicast Listener Discovery (MLD)" section later in this chapter.

```c
			if (ipv6_addr_is_multicast(&hdr->daddr) &&
			    !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr,
						 &hdr->saddr) &&
			    !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb)))
				goto discard;
		}
```

When the INET6_PROTO_NOPOLICY flag is set, this indicates that there is no need to perform IPsec policy checks for this protocol:

```c
		if (!(ipprot->flags & INET6_PROTO_NOPOLICY) &&
		    !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
			goto discard;

		ret = ipprot->handler(skb);
		if (ret > 0)
			goto resubmit;
		else if (ret == 0)
			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS);
	} else {
		if (!raw) {
			if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				IP6_INC_STATS_BH(net, idev,
						 IPSTATS_MIB_INUNKNOWNPROTOS);
				icmpv6_send(skb, ICMPV6_PARAMPROB,
					    ICMPV6_UNK_NEXTHDR, nhoff);
			}
			kfree_skb(skb);
		} else {
```

Everything went fine, so increment the INDELIVERS SNMP MIB counter (/proc/net/snmp6/Ip6InDelivers) and free the packet with the consume_skb() method:

```c
			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS);
			consume_skb(skb);
		}
	}
	rcu_read_unlock();
	return 0;

discard:
	IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS);
	rcu_read_unlock();
	kfree_skb(skb);
	return 0;
}
```
(net/ipv6/ip6_input.c)

You have seen the implementation details of local delivery, which is performed by the ip6_input() and ip6_input_finish() methods. Now it is time to turn to the implementation details of forwarding in IPv6. Here, too, there are many similarities between forwarding in IPv4 and forwarding in IPv6.

### Forwarding

Forwarding in IPv6 is very similar to forwarding in IPv4, with some slight changes. For example, in IPv6 a checksum is not calculated when forwarding a packet (there is no checksum field at all in the IPv6 header, as was mentioned before). Let's take a look at the ip6_forward() method:

```c
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;
```

The IPv6 procfs forwarding entry (/proc/sys/net/ipv6/conf/all/forwarding) must be set:

```c
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;
```

When working with Large Receive Offload (LRO), the packet length will exceed the Maximum Transmission Unit (MTU).
As in IPv4, when LRO is enabled, the SKB is freed and an error of –EINVAL is returned: + +if (skb_warn_if_lro(skb)) + +goto drop; + +if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { + +IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + +goto drop; + +} + +Drop packets that are not destined to go to the local host. The pkt_type associated with an SKB is determined according to the destination MAC address in the Ethernet header of an incoming packet. This is done by the eth_type_trans() method, which is typically called in the network device driver when handling an incoming packet. See the eth_type_trans() method, net/ethernet/eth.c. + +if (skb->pkt_type != PACKET_HOST) + +goto drop; + +skb_forward_csum(skb); + +/* + +* We DO NOT make any processing on + +* RA packets, pushing them to user level AS IS + +* without any WARRANTY that application will be able + +* to interpret them. The reason is that we + +* cannot make anything clever here. + +* + +* We are not end-node, so that if packet contains + +* AH/ESP, we cannot make anything. + +* Defragmentation also would be mistake, RA packets + +* cannot be fragmented, because there is no warranty + +* that different fragments will go along one path. --ANK + +*/ + +if (opt->ra) { + +u8 *ptr = skb_network_header(skb) + opt->ra; + +We should try to deliver the packet to sockets that had the IPV6_ROUTER_ALERT socket option set by setsockopt(). This is done by calling the ip6_call_ra_chain() method; if the delivery in ip6_call_ra_chain() succeeded, the ip6_forward() method returns 0 and the packet is not forwarded. See the implementation of the ip6_call_ra_chain() method in net/ipv6/ip6_output.c. + +if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) + +return 0; + +} + +/* + +* check and decrement ttl + +*/ + +if (hdr->hop_limit <= 1) { + +/* Force OUTPUT device used as source address */ + +skb->dev = dst->dev; + +Send back an ICMP error message when the Hop Limit is 1 (or less), much like what we have in IPv4 when forwarding a packet and the TTL reaches 0. In this case, the packet is discarded: + +icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); + +IP6_INC_STATS_BH(net, + +ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + +kfree_skb(skb); + +return -ETIMEDOUT; + +} + +/* XXX: idev->cnf.proxy_ndp? */ + +if (net->ipv6.devconf_all->proxy_ndp && + +pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { + +int proxied = ip6_forward_proxy_check(skb); + +if (proxied > 0) + +return ip6_input(skb); + +else if (proxied < 0) { + +IP6_INC_STATS(net, ip6_dst_idev(dst), + +IPSTATS_MIB_INDISCARDS); + +goto drop; + +} + +} + +if (!xfrm6_route_forward(skb)) { + +IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + +goto drop; + +} + +dst = skb_dst(skb); + +/* IPv6 specs say nothing about it, but it is clear that we cannot + +send redirects to source routed frames. + +We don't send redirects to frames decapsulated from IPsec. + +*/ + +if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) { + +struct in6_addr *target = NULL; + +struct inet_peer *peer; + +struct rt6_info *rt; + +/* + +* incoming and outgoing devices are the same + +* send a redirect. 
         */
        rt = (struct rt6_info *) dst;
        if (rt->rt6i_flags & RTF_GATEWAY)
                target = &rt->rt6i_gateway;
        else
                target = &hdr->daddr;

        peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

        /* Limit redirects both by destination (here)
           and by source (inside ndisc_send_redirect)
         */
        if (inet_peer_xrlim_allow(peer, 1*HZ))
                ndisc_send_redirect(skb, target);
        if (peer)
                inet_putpeer(peer);
} else {
        int addrtype = ipv6_addr_type(&hdr->saddr);

        /* This check is security critical. */
        if (addrtype == IPV6_ADDR_ANY ||
            addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                goto error;
        if (addrtype & IPV6_ADDR_LINKLOCAL) {
                icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                            ICMPV6_NOT_NEIGHBOUR, 0);
                goto error;
        }
}

Note that IPV6_MIN_MTU is 1280 bytes, according to section 5, "Packet Size Issues," of the base IPv6 standard, RFC 2460.

        mtu = dst_mtu(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
            (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;

Reply back to the sender with an ICMPv6 message of "Packet Too Big," and free the SKB; the ip6_forward() method returns -EMSGSIZE in this case:

                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

The packet is to be forwarded, so decrement the hop_limit of the IPv6 header:

        /* Mangling hops number delayed to point after skb COW */
        hdr->hop_limit--;

        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

(net/ipv6/ip6_output.c)

The ip6_forward_finish() method is a one-line method, which simply invokes the destination cache (dst) output callback:

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

(net/ipv6/ip6_output.c)

You have seen in this section how the reception of IPv6 packets is handled, either by local delivery or by forwarding. You have also seen some differences between receiving IPv6 packets and receiving IPv4 packets. In the next section, I will discuss the Rx path for multicast traffic.

## Receiving IPv6 Multicast Packets

The ipv6_rcv() method is the IPv6 handler for both unicast packets and multicast packets. As mentioned above, after some sanity checks, it invokes the ip6_rcv_finish() method, which performs a lookup in the routing subsystem by calling the ip6_route_input() method. In the ip6_route_input() method, the input callback is set to be the ip6_mc_input() method when a multicast packet is received.
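As a reminder of how this dispatch takes place, the dst_input() method mentioned at the beginning of this discussion is essentially the following one-liner (include/net/dst.h, slightly simplified):

/* Simplified sketch of dst_input(): the routing lookup installed
 * skb_dst(skb)->input, which is one of ip6_input(), ip6_forward(),
 * ip6_mc_input(), or ip6_pkt_discard().
 */
static inline int dst_input(struct sk_buff *skb)
{
        return skb_dst(skb)->input(skb);
}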
Let's take a look at the ip6_mc_input() method:

int ip6_mc_input(struct sk_buff *skb)
{
        const struct ipv6hdr *hdr;
        bool deliver;

        IP6_UPD_PO_STATS_BH(dev_net(skb_dst(skb)->dev),
                            ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST,
                            skb->len);

        hdr = ipv6_hdr(skb);

The ipv6_chk_mcast_addr() method (net/ipv6/mcast.c) checks whether the multicast address list (mc_list) of the specified network device contains the specified multicast address (which is the destination address in the IPv6 header in this case, hdr->daddr). Note that because the third parameter is NULL, we do not check in this invocation whether there are any source filters for the source address; handling source filtering is discussed later in this chapter.

        deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL);

If the local machine is a multicast router (that is, CONFIG_IPV6_MROUTE is set), we proceed, after some checks, to the ip6_mr_input() method. The IPv6 multicast routing implementation is very similar to the IPv4 multicast routing implementation, which was discussed in Chapter 6, so I will not discuss it in this book. The IPv6 multicast routing implementation is in net/ipv6/ip6mr.c. Support for IPv6 Multicast Routing was added in kernel 2.6.26 (2008), based on a patch by Mickael Hoerdt.

#ifdef CONFIG_IPV6_MROUTE
        . . .
        if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding &&
            !(ipv6_addr_type(&hdr->daddr) &
              (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
            likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
                /*
                 * Okay, we try to forward - split and duplicate
                 * packets.
                 */
                struct sk_buff *skb2;

                if (deliver)
                        skb2 = skb_clone(skb, GFP_ATOMIC);
                else {
                        skb2 = skb;
                        skb = NULL;
                }

                if (skb2) {

Continue to the IPv6 Multicast Routing code, via the ip6_mr_input() method (net/ipv6/ip6mr.c):

                        ip6_mr_input(skb2);
                }
        }
#endif

        if (likely(deliver))
                ip6_input(skb);
        else {
                /* discard */
                kfree_skb(skb);
        }

        return 0;
}

(net/ipv6/ip6_input.c)

When the multicast packet is not destined to be forwarded by multicast routing (for example, when CONFIG_IPV6_MROUTE is not set), we will continue to the ip6_input() method, which is in fact a wrapper around the ip6_input_finish() method, as you already saw. In the ip6_input_finish() method, we again call the ipv6_chk_mcast_addr() method, but this time the third parameter is not NULL; it is the source address from the IPv6 header. This time we do check in the ipv6_chk_mcast_addr() method whether source filtering is set, and we handle the packet accordingly. Source filtering is discussed in the "Multicast Source Filtering (MSF)" section later in this chapter. Next, I will describe the Multicast Listener Discovery protocol, which parallels the IPv4 IGMPv3 protocol.

## Multicast Listener Discovery (MLD)

The MLD protocol is used to exchange group information between multicast hosts and routers. The MLD protocol is an asymmetric protocol; it specifies different behavior for Multicast Routers and for Multicast Listeners. In IPv4, multicast group management is handled by the Internet Group Management Protocol (IGMP), as you saw in Chapter 6. In IPv6, multicast group management is handled by the MLDv2 protocol, which is specified in RFC 3810, from 2004. The MLDv2 protocol is derived from the IGMPv3 protocol, which is used by IPv4.
However, as opposed to the IGMPv3 protocol, MLDv2 is part of the ICMPv6 protocol, while IGMPv3 is a standalone protocol that does not use any of the ICMPv4 services; this is the main reason why the IGMPv3 protocol is not used in IPv6. Note that you might encounter the term GMP (Group Management Protocol), which is used to refer to both IGMP and MLD.

The earlier version of the Multicast Listener Discovery protocol is MLDv1, which is specified in RFC 2710; it is derived from IGMPv2. MLDv1 is based on the Any-Source Multicast (ASM) model; this means that you do not specify interest in receiving multicast traffic from a single source address or from a set of addresses. MLDv2 extends MLDv1 by adding support for Source Specific Multicast (SSM); this means the ability of a node to specify interest in including or excluding packets from specific unicast source addresses. This feature is referred to as source filtering. Later in this section, I will show a detailed userspace example of how to use source filtering. See more in RFC 4604, "Using Internet Group Management Protocol Version 3 (IGMPv3) and Multicast Listener Discovery Protocol Version 2 (MLDv2) for Source-Specific Multicast."

The MLDv2 protocol is based on Multicast Listener Reports and Multicast Listener Queries. An MLDv2 Router (which is also sometimes termed "Querier") periodically sends Multicast Listener Queries in order to learn about the state of multicast groups of nodes. If there are several MLDv2 Routers on the same link, only one of them is selected to be the Querier, and all the other routers are set to be in a Non-Querier state. This is done by a Querier Election mechanism, as described in section 7.6.2 of RFC 3810. Nodes respond to these queries with Multicast Listener Reports, in which they provide information about multicast groups to which they belong. When a listener wants to stop listening on some multicast group, it informs the Querier about it, and the Querier must query for other listeners of that multicast group address before deleting it from its Multicast Address Listener state. An MLDv2 router can provide state information about listeners to multicast routing protocols.

Now that you have learned generally what the MLD protocol is, I will turn your attention in the following section to how joining and leaving a multicast group is handled.

### Joining and Leaving a Multicast Group

There are two ways to join or leave a multicast group in IPv6. The first one is from within the kernel, by calling the ipv6_dev_mc_inc() method, which gets as parameters a network device object and a multicast group address. For example, when registering a network device, the ipv6_add_dev() method is invoked; each device should join the interface-local all-nodes multicast group (ff01::1) and the link-local all-nodes multicast group (ff02::1):

static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
{
        . . .
        /* Join interface-local all-node multicast group */
        ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);

        /* Join all-node multicast group */
        ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
        . . .
}

(net/ipv6/addrconf.c)

Routers are devices that have their procfs forwarding entry, /proc/sys/net/ipv6/conf/all/forwarding, set. Routers join three multicast address groups, in addition to the two multicast groups that each host joins and that were mentioned earlier.
These are the link-local all-routers multicast group (ff02::2), the interface-local all-routers multicast group (ff01::2), and the site-local all-routers multicast group (ff05::2).

Note that setting the IPv6 procfs forwarding entry value is handled by the addrconf_fixup_forwarding() method, which eventually calls the dev_forward_change() method, which causes the specified network interface to join or leave these three multicast address groups according to the value of the procfs entry (which is represented by idev->cnf.forwarding, as you can see in the following code snippet):

static void dev_forward_change(struct inet6_dev *idev)
{
        struct net_device *dev;
        struct inet6_ifaddr *ifa;
        . . .
        dev = idev->dev;
        . . .
        if (dev->flags & IFF_MULTICAST) {
                if (idev->cnf.forwarding) {
                        ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
                        ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allrouters);
                        ipv6_dev_mc_inc(dev, &in6addr_sitelocal_allrouters);
                } else {
                        ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters);
                        ipv6_dev_mc_dec(dev, &in6addr_interfacelocal_allrouters);
                        ipv6_dev_mc_dec(dev, &in6addr_sitelocal_allrouters);
                }
        }
        . . .
}

(net/ipv6/addrconf.c)

To leave a multicast group from within the kernel, you should call the ipv6_dev_mc_dec() method. The second way of joining a multicast group is by opening an IPv6 socket in userspace, creating a multicast request (ipv6_mreq object), setting the ipv6mr_multiaddr of the request to be the multicast group address that this host wants to join, and setting the ipv6mr_interface to the ifindex of the network interface it wants to use. Then it should call setsockopt() with the IPV6_JOIN_GROUP socket option:

int sockd;
struct ipv6_mreq mcgroup;
struct addrinfo *results;
. . .
/* read an IPv6 multicast group address that we want to join */
/* into the address info object (results) */
. . .

Set the network interface that we want to use (by its ifindex value):

mcgroup.ipv6mr_interface = 3;

Set the multicast group address for the group that we want to join in the request (ipv6mr_multiaddr):

memcpy(&(mcgroup.ipv6mr_multiaddr),
       &(((struct sockaddr_in6 *) results->ai_addr)->sin6_addr),
       sizeof(struct in6_addr));

sockd = socket(AF_INET6, SOCK_DGRAM, 0);

Call setsockopt() with IPV6_JOIN_GROUP to join the multicast group; this call is handled in the kernel by the ipv6_sock_mc_join() method (net/ipv6/mcast.c):

status = setsockopt(sockd, IPPROTO_IPV6, IPV6_JOIN_GROUP,
                    &mcgroup, sizeof(mcgroup));
. . .

The IPV6_ADD_MEMBERSHIP socket option can be used instead of IPV6_JOIN_GROUP. (They are equivalent.) Note that we can set the same multicast group address on more than one network device by setting different values of network interfaces to mcgroup.ipv6mr_interface. The value of mcgroup.ipv6mr_interface is passed as the ifindex parameter to the ipv6_sock_mc_join() method. In such a case, the kernel builds and sends an MLDv2 Multicast Listener Report packet (ICMPV6_MLD2_REPORT), where the destination address is ff02::16 (the all MLDv2-capable routers Multicast Group Address). According to section 5.2.14 in RFC 3810, all MLDv2-capable multicast routers should listen to this multicast address. The number of Multicast Address Records in the MLDv2 header (shown in Figure 8-3) will be 1, because only one Multicast Address Record is used, containing the address of the multicast group that we want to join.
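For convenience, here is a complete, minimal version of the userspace join example above. It is a sketch only: the multicast group address (ff15::abcd) and the interface name (eth0) are example values that you should adapt to your setup.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>

int main(void)
{
        struct ipv6_mreq mcgroup;
        int sockd;

        memset(&mcgroup, 0, sizeof(mcgroup));
        /* Example group address and interface name; adapt as needed.
           if_nametoindex() returns 0 if the interface does not exist. */
        if (inet_pton(AF_INET6, "ff15::abcd", &mcgroup.ipv6mr_multiaddr) != 1)
                return 1;
        mcgroup.ipv6mr_interface = if_nametoindex("eth0");

        sockd = socket(AF_INET6, SOCK_DGRAM, 0);
        if (sockd < 0) {
                perror("socket");
                return 1;
        }

        /* This call triggers the MLDv2 Multicast Listener Report
           described in the text */
        if (setsockopt(sockd, IPPROTO_IPV6, IPV6_JOIN_GROUP,
                       &mcgroup, sizeof(mcgroup)) < 0) {
                perror("setsockopt(IPV6_JOIN_GROUP)");
                return 1;
        }

        sleep(60); /* stay joined for a minute */

        /* Closing the socket (or IPV6_LEAVE_GROUP) leaves the group */
        close(sockd);
        return 0;
}

While this program sleeps, you can observe the MLDv2 report sent to ff02::16 with a sniffer such as tcpdump.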
The multicast group address that a host wants to join is part of the ICMPv6 header. The Hop-by-Hop Options header with Router Alert is set in this packet. MLD packets contain a Hop-by-Hop Options header, which in turn contains a Router Alert options header; the next header of the Hop-by-Hop extension header is IPPROTO_ICMPV6 (58), because following the Hop-by-Hop header is the ICMPv6 packet, which contains the MLDv2 message. + +Figure 8-3. + +MLDv2 Multicast Listener Report + +A host can leave a multicast group by calling setsockopt() with the IPV6_DROP_MEMBERSHIP socket option, which is handled in the kernel by calling the ipv6_sock_mc_drop() method or by closing the socket. Note that IPV6_LEAVE_GROUP is equivalent to IPV6_DROP_MEMBERSHIP. + +After talking about how joining and leaving a multicast group is handled, it is time to see what an MLDv2 Multicast Listener Report is. + +### MLDv2 Multicast Listener Report + +The MLDv2 Multicast Listener Report is represented in the kernel by the mld2_report structure: + +struct mld2_report { + +struct icmp6hdr mld2r_hdr; + +struct mld2_grec mld2r_grec[0]; + +}; + +(include/net/mld.h) + +The first member of the mld2_report structure is the mld2r_hdr, which is an ICMPv6 header; its icmp6_type should be set to ICMPV6_MLD2_REPORT (143). The second member of the mld2_report structure is the mld2r_grec[0], an instance of the mld2_grec structure, which represents the MLDv2 group record. (This is the Multicast Address Record in Figure 8-3.) Following is the definition of the mld2_grec structure: + +struct mld2_grec { + +__u8 grec_type; + +__u8 grec_auxwords; + +__be16 grec_nsrcs; + +struct in6_addr grec_mca; + +struct in6_addr grec_src[0]; + +}; + +(include/net/mld.h) + +The following is a description of the members of the mld2_grec structure: + + * grec_type: Specifies the type of the Multicast Address Record. See Table 8-3, "Multicast Address Record (record types)" in the "Quick Reference" section at the end of this chapter. + + * grec_auxwords: The length of the Auxiliary Data (aux data len in Figure 8-3). The Auxiliary Data field, if present, contains additional information that pertains to this Multicast Address Record. Usually it is 0. See also section 5.2.10 in RFC 3810. + + * grec_nsrcs: The number of source addresses. + + * grec_mca: The multicast address to which this Multicast Address Record pertains. + + * grec_src[0]: A unicast source address (or an array of unicast source addresses). These are addresses that we want to filter (block or allow). + +In the next section, I will discuss the Multicast Source Filtering (MSF) feature. You will find in it detailed examples of how a Multicast Address Record is used in source filtering. + +### Multicast Source Filtering (MSF) + +With Multicast Source Filtering, the kernel will drop the multicast traffic from sources other than the expected ones. This feature, which is also known as Source-Specific Multicast (SSM) was not part of MLDv1. It was introduced in MLDv2; see RFC 3810. It is the opposite of Any-Source Multicast (ASM), where a receiver expresses interest in a destination multicast address. To understand better what Multicast Source Filtering is all about, I will show here an example of a userspace application demonstrating how to join and leave a multicast group with source filtering. 
#### Joining and Leaving a Multicast Group with Source Filtering

A host can join a multicast group with source filtering by opening an IPv6 socket in userspace, creating a multicast group source request (group_source_req object), and setting three parameters in the request:

  * gsr_group: The multicast group address that this host wants to join

  * gsr_source: The multicast group source address that it wants to allow

  * gsr_interface: The ifindex of the network interface it wants to use

Then it should call setsockopt() with the MCAST_JOIN_SOURCE_GROUP socket option. Following is a code snippet of a userspace application demonstrating this (checks of the return values of the system calls were removed, for brevity):

int sockd;
struct group_source_req mreq;
struct addrinfo *results1;
struct addrinfo *results2;

/* read an IPv6 multicast group address that we want to join into results1 */
/* read an IPv6 unicast source address that we want to allow into results2 */

memcpy(&(mreq.gsr_group), results1->ai_addr, sizeof(struct sockaddr_in6));
memcpy(&(mreq.gsr_source), results2->ai_addr, sizeof(struct sockaddr_in6));
mreq.gsr_interface = 3;

sockd = socket(AF_INET6, SOCK_DGRAM, 0);
setsockopt(sockd, IPPROTO_IPV6, MCAST_JOIN_SOURCE_GROUP, &mreq, sizeof(mreq));

This request is handled in the kernel first by the ipv6_sock_mc_join() method, and then by the ip6_mc_source() method. To leave the group, you should call setsockopt() with the MCAST_LEAVE_SOURCE_GROUP socket option or close the socket that you opened.

You can set another address that you want to allow and call setsockopt() again on this socket, this time with the MCAST_UNBLOCK_SOURCE socket option. This will add additional addresses to the source filter list. Each such call to setsockopt() will trigger sending an MLDv2 Multicast Listener Report message with one Multicast Address Record; the Record Type will be 5 ("Allow new sources"), and the number of sources will be 1 (the unicast address that you want to unblock). I will show now an example of using the MCAST_MSFILTER socket option for source filtering.

#### Example: Using MCAST_MSFILTER for Source Filtering

You can also block or permit multicast traffic from several source addresses in one setsockopt() call, using MCAST_MSFILTER and a group_filter object. First, let's take a look at the definition of the group_filter structure in userspace, which is quite self-explanatory:

struct group_filter
{
        /* Interface index.  */
        uint32_t gf_interface;

        /* Group address.  */
        struct sockaddr_storage gf_group;

        /* Filter mode.  */
        uint32_t gf_fmode;

        /* Number of source addresses.  */
        uint32_t gf_numsrc;

        /* Source addresses.  */
        struct sockaddr_storage gf_slist[1];
};

(include/netinet/in.h)

The filter mode (gf_fmode) can be MCAST_INCLUDE (when you want to allow multicast traffic from some unicast address) or MCAST_EXCLUDE (when you want to disallow multicast traffic from some unicast address). Following are two examples of this; the first will allow multicast traffic from three sources, and the second will disallow multicast traffic from two sources:

struct ipv6_mreq mcgroup;
struct group_filter filter;
struct sockaddr_in6 *psin6;
int sockd[2];

Set the multicast group address that we want to join, ffff::9.
inet_pton(AF_INET6, "ffff::9", &mcgroup.ipv6mr_multiaddr);

Set the network interface that we want to use by its ifindex (here, we use eth0, which has an ifindex value of 2):

mcgroup.ipv6mr_interface = 2;

Set the filter parameters: use the same ifindex (2), use MCAST_INCLUDE to set the filter to allow traffic from the sources that are specified by the filter, and set gf_numsrc to 3, because we want to prepare a filter of 3 unicast addresses:

filter.gf_interface = 2;

We want to prepare two filters: the first one will allow traffic from a set of three unicast source addresses, and the second one will block traffic from a set of two unicast source addresses. First, set the filter mode to MCAST_INCLUDE, which means to allow traffic from the sources in this filter:

filter.gf_fmode = MCAST_INCLUDE;

Set the number of source addresses of the filter (gf_numsrc) to be 3:

filter.gf_numsrc = 3;

Set the group address of the filter (gf_group) to be the same one that we used for mcgroup earlier, ffff::9:

psin6 = (struct sockaddr_in6 *)&filter.gf_group;
psin6->sin6_family = AF_INET6;
inet_pton(PF_INET6, "ffff::9", &psin6->sin6_addr);

The three unicast addresses that we want to allow are 2000::1, 2000::2, and 2000::3. Set filter.gf_slist[0], filter.gf_slist[1], and filter.gf_slist[2] accordingly:

psin6 = (struct sockaddr_in6 *)&filter.gf_slist[0];
psin6->sin6_family = AF_INET6;
inet_pton(PF_INET6, "2000::1", &psin6->sin6_addr);

psin6 = (struct sockaddr_in6 *)&filter.gf_slist[1];
psin6->sin6_family = AF_INET6;
inet_pton(PF_INET6, "2000::2", &psin6->sin6_addr);

psin6 = (struct sockaddr_in6 *)&filter.gf_slist[2];
psin6->sin6_family = AF_INET6;
inet_pton(PF_INET6, "2000::3", &psin6->sin6_addr);

Create a socket, and join the multicast group:

sockd[0] = socket(AF_INET6, SOCK_DGRAM, 0);
status = setsockopt(sockd[0], IPPROTO_IPV6, IPV6_JOIN_GROUP,
                    &mcgroup, sizeof(mcgroup));

Activate the filter we created:

status = setsockopt(sockd[0], IPPROTO_IPV6, MCAST_MSFILTER, &filter,
                    GROUP_FILTER_SIZE(filter.gf_numsrc));

This will trigger the sending of an MLDv2 Multicast Listener Report (ICMPV6_MLD2_REPORT) to all MLDv2 routers (ff02::16), with a Multicast Address Record object (mld2_grec) embedded in it. (See the description of the mld2_report structure and Figure 8-3 earlier.) The values of the fields of mld2_grec will be as follows:

  * grec_type will be MLD2_CHANGE_TO_INCLUDE (3).

  * grec_auxwords will be 0. (We do not use Auxiliary Data.)

  * grec_nsrcs is 3 (because we want to use a filter with 3 source addresses and we set gf_numsrc to 3).

  * grec_mca will be ffff::9; this is the multicast group address that the Multicast Address Record pertains to.

The Multicast Address Record will contain the following three unicast source addresses:

  * grec_src[0] is 2000::1

  * grec_src[1] is 2000::2

  * grec_src[2] is 2000::3

Now we want to create a filter of 2 unicast source addresses that we want to exclude.
So first create a new userspace socket:

sockd[1] = socket(AF_INET6, SOCK_DGRAM, 0);

Set the filter mode to MCAST_EXCLUDE, and set the number of sources of the filter to be 2:

filter.gf_fmode = MCAST_EXCLUDE;
filter.gf_numsrc = 2;

Set the two addresses we want to exclude, 2001::1 and 2001::2:

psin6 = (struct sockaddr_in6 *)&filter.gf_slist[0];
psin6->sin6_family = AF_INET6;
inet_pton(PF_INET6, "2001::1", &psin6->sin6_addr);

psin6 = (struct sockaddr_in6 *)&filter.gf_slist[1];
psin6->sin6_family = AF_INET6;
inet_pton(PF_INET6, "2001::2", &psin6->sin6_addr);

Join the multicast group with the new socket:

status = setsockopt(sockd[1], IPPROTO_IPV6, IPV6_JOIN_GROUP,
                    &mcgroup, sizeof(mcgroup));

Activate the filter:

status = setsockopt(sockd[1], IPPROTO_IPV6, MCAST_MSFILTER, &filter,
                    GROUP_FILTER_SIZE(filter.gf_numsrc));

This again will trigger the sending of an MLDv2 Multicast Listener Report (ICMPV6_MLD2_REPORT) to all MLDv2 routers (ff02::16). This time the content of the Multicast Address Record object (mld2_grec) will be different:

  * grec_type will be MLD2_CHANGE_TO_EXCLUDE (4).

  * grec_auxwords will be 0. (We do not use Auxiliary Data.)

  * grec_nsrcs is 2 (because we want to use 2 source addresses and we set gf_numsrc to 2).

  * grec_mca will be ffff::9, as before; this is the multicast group address that the Multicast Address Record pertains to.

The Multicast Address Record will contain the following two unicast source addresses:

  * grec_src[0] is 2001::1

  * grec_src[1] is 2001::2

Note

We can display the source filtering mapping that we created by running cat /proc/net/mcfilter6; this is handled in the kernel by the igmp6_mcf_seq_show() method.

For example, the first three entries in this mapping will show that for the ffff::9 multicast address, we permit (INCLUDE) multicast traffic from 2000::1, 2000::2, and 2000::3. Note that for the first three entries the value in the INC (Include) column is 1. For the fourth and fifth entries, we disallow traffic from 2001::1 and 2001::2. Note that the value in the EXC (Exclude) column is 1 for the fourth and fifth entries.

cat /proc/net/mcfilter6
Idx Device Multicast Address                Source Address                   INC EXC
2   eth0   ffff0000000000000000000000000009 20000000000000000000000000000001 1   0
2   eth0   ffff0000000000000000000000000009 20000000000000000000000000000002 1   0
2   eth0   ffff0000000000000000000000000009 20000000000000000000000000000003 1   0
2   eth0   ffff0000000000000000000000000009 20010000000000000000000000000001 0   1
2   eth0   ffff0000000000000000000000000009 20010000000000000000000000000002 0   1

Note

Creating filters by calling the setsockopt() method with MCAST_MSFILTER is handled in the kernel by the ip6_mc_msfilter() method, in net/ipv6/mcast.c.

An MLD router (which is also sometimes known as the "Querier") joins the all MLDv2-capable routers Multicast Group (ff02::16) when it is started. It periodically sends Multicast Listener Query packets in order to learn which hosts belong to which multicast groups. These are ICMPv6 packets whose type is ICMPV6_MGM_QUERY. The destination address of these query packets is the all-hosts multicast group (ff02::1). When a host receives an ICMPv6 Multicast Listener Query packet, the ICMPv6 Rx handler (the icmpv6_rcv() method) calls the igmp6_event_query() method to handle that query. Note that the igmp6_event_query() method handles both MLDv2 queries and MLDv1 queries (because both use ICMPV6_MGM_QUERY as the ICMPv6 type).
The igmp6_event_query() method finds out whether the message is MLDv1 or MLDv2 by checking its length; in MLDv1 the length is 24 bytes, and in MLDv2 it is at least 28 bytes. MLDv1 and MLDv2 messages are handled differently: for MLDv2, we should support source filtering, as was mentioned before in this section, while this feature is not available in MLDv1. The host sends back a Multicast Listener Report by calling the igmp6_send() method. The Multicast Listener Report packet is an ICMPv6 packet.

An example of an IPv6 MLD router is the mld6igmp daemon of the open source XORP project: http://www.xorp.org. The MLD router keeps information about the multicast address groups of network nodes (MLD listeners) and updates this information dynamically. This information can be provided to Multicast Routing daemons. Delving into the implementation of MLDv2 routing daemons like the mld6igmp daemon, or into the implementation of other Multicast Routing daemons, is beyond the scope of this book, because they are implemented in userspace.

According to RFC 3810, MLDv2 should be interoperable with nodes that implement MLDv1; an implementation of MLDv2 must support the following two MLDv1 message types:

  * MLDv1 Multicast Listener Report (ICMPV6_MGM_REPORT, decimal 131)

  * MLDv1 Multicast Listener Done (ICMPV6_MGM_REDUCTION, decimal 132)

We can use the MLDv1 protocol for Multicast Listener messages instead of MLDv2; this can be done by using the following:

echo 1 > /proc/sys/net/ipv6/conf/all/force_mld_version

In such a case, when a host joins a multicast group, a Multicast Listener Report message will be sent by the igmp6_send() method. This message will use ICMPV6_MGM_REPORT (131) of MLDv1 as the ICMPv6 type, not ICMPV6_MLD2_REPORT (143) as in MLDv2. Note that in this case you cannot use a source filtering request for this message, as MLDv1 does not support it. The host joins the multicast group by calling the igmp6_join_group() method. When you leave the multicast group, a Multicast Listener Done message will be sent. In this message, the ICMPv6 type is ICMPV6_MGM_REDUCTION (132).

In the next section, I will very briefly talk about the IPv6 Tx path, which is quite similar to the IPv4 Tx path, and which I do not cover in depth in this chapter.

## Sending IPv6 Packets

The IPv6 Tx path is very similar to the IPv4 Tx path; even the names of the methods are very similar. Also in IPv6, there are two main methods for sending IPv6 packets from Layer 4, the transport layer: the first is the ip6_xmit() method, which is used by the TCP, Stream Control Transmission Protocol (SCTP), and Datagram Congestion Control Protocol (DCCP) protocols. The second method is the ip6_append_data() method, which is used, for example, by UDP and Raw sockets. Packets that are created on the local host are sent out by the ip6_local_out() method. The ip6_output() method is set to be the output callback of the protocol-independent dst_entry; it first calls the NF_HOOK() macro for the NF_INET_POST_ROUTING hook, and then it calls the ip6_finish_output() method. If fragmentation is needed, the ip6_finish_output() method calls the ip6_fragment() method to handle it; otherwise, it calls the ip6_finish_output2() method, which eventually sends the packet. For implementation details, look in the IPv6 Tx path code; it is mostly in net/ipv6/ip6_output.c.

In the next section, I will very briefly talk about IPv6 routing, which is, again, quite similar to IPv4 routing, and which I do not cover in depth in this chapter. But first, the short sketch below shows the fragment-or-send decision just described.
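This is the gist of the ip6_finish_output() method, lightly simplified from net/ipv6/ip6_output.c (for the kernel version covered in this book):

static int ip6_finish_output(struct sk_buff *skb)
{
        /* Fragment when the packet exceeds the path MTU and is not GSO
           (or when the route requires fragmenting); otherwise, hand the
           packet to ip6_finish_output2() for transmission */
        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)))
                return ip6_fragment(skb, ip6_finish_output2);
        else
                return ip6_finish_output2(skb);
}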
+ +## IPv6 Routing + +The implementation of IPv6 routing is very similar to the IPv4 routing implementation that was discussed in Chapter 5, which dealt with the IPv4 routing subsystem. Like in the IPv4 routing subsystem, Policy routing is also supported in IPv6 (when CONFIG_IPV6_MULTIPLE_TABLES is set). A routing entry is represented in IPv6 by the rt6_info structure (include/net/ip6_fib.h). The rt6_info object parallels the IPv4 rtable structure, and the flowi6 structure (include/net/flow.h) parallels the IPv4 flowi4 structure. (In fact, they both have as their first member the same flowi_common object.) For implementation details, look in the IPv6 routing modules: net/ipv6/route.c, net/ipv6/ip6_fib.c, and the policy routing module, net/ipv6/fib6_rules.c. + +## Summary + +I dealt with the IPv6 subsystem and its implementation in this chapter. I discussed various IPv6 topics, like IPv6 addresses (including Special Addresses and Multicast Addresses), how the IPv6 header is built, what the IPv6 extension headers are, the autoconfiguration process, the Rx path in IPv6, and the MLD protocol. In the next chapter, we will continue our journey into the kernel networking internals and discuss the netfilter subsystem and its implementation. In the "Quick Reference" section that follows, we will cover the top methods related to the topics we discussed in this chapter, ordered by their context. + +## Quick Reference + +I conclude this chapter with a short list of important methods of the IPv6 subsystem. Some of them were mentioned in this chapter. Subsequently, there are three tables and two short sections about IPv6 Special Addresses and about the management of routing tables in IPv6. + +### Methods + +Let's start with the methods. + +#### bool ipv6_addr_any(const struct in6_addr *a); + +This method returns true if the specified address is the all-zeroes address ("unspecified address"). + +#### bool ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2); + +This method returns true if the two specified IPv6 addresses are equal. + +#### static inline void ipv6_addr_set(struct in6_addr *addr, __be32 w1, __be32 w2, __be32 w3, __be32 w4); + +This method sets the IPv6 address according to the four 32-bit input parameters. + +#### bool ipv6_addr_is_multicast(const struct in6_addr *addr); + +This method returns true if the specified address is a multicast address. + +#### bool ipv6_ext_hdr(u8 nexthdr); + +This method returns true if the specified nexthdr is a well-known extension header. + +#### struct ipv6hdr *ipv6_hdr(const struct sk_buff *skb); + +This method returns the IPv6 header (ipv6hdr) of the specified skb. + +#### struct inet6_dev *in6_dev_get(const struct net_device *dev); + +This method returns the inet6_dev object associated with the specified device. + +#### bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset); + +This method returns true if the specified nexthdr is ICMPv6 (IPPROTO_ICMPV6) and the type of the ICMPv6 header located at the specified offset is an MLD type. It should be one of the following: + + * ICMPV6_MGM_QUERY + + * ICMPV6_MGM_REPORT + + * ICMPV6_MGM_REDUCTION + + * ICMPV6_MLD2_REPORT + +#### bool raw6_local_deliver(struct sk_buff *, int); + +This method tries to deliver the packet to a raw socket. It returns true on success. + +#### int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); + +This method is the main Rx handler for IPv6 packets. 
+ +#### bool ipv6_accept_ra(struct inet6_dev *idev); + +This method returns true if a host is configured to accept Router Advertisements, in these cases: + + * If forwarding is enabled, the special hybrid mode should be set, which means that /proc/sys/net/ipv6/conf//accept_ra is 2. + + * If forwarding is not enabled, /proc/sys/net/ipv6/conf//accept_ra should be 1. + +#### void ip6_route_input(struct sk_buff *skb); + +This method is the main IPv6 routing subsystem lookup method in the Rx path. It sets the dst entry of the specified skb according to the results of the lookup in the routing subsystem. + +#### int ip6_forward(struct sk_buff *skb); + +This method is the main forwarding method. + +#### struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, struct flowi6 *fl6); + +This method is the main IPv6 routing subsystem lookup method in the Tx path. The return value is the destination cache entry (dst). + +Note + +Both the ip6_route_input() method and the ip6_route_output() method eventually perform the lookup by calling the fib6_lookup() method. + +#### void in6_dev_hold(struct inet6_dev *idev); and void __in6_dev_put(struct inet6_dev *idev); + +This method increments and decrements the reference counter of the specified idev object, respectively. + +#### int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); + +This method handles a setsockopt() call with MCAST_MSFILTER. + +#### int ip6_mc_input(struct sk_buff *skb); + +This method is the main Rx handler for multicast packets. + +#### int ip6_mr_input(struct sk_buff *skb); + +This method is the main Rx handler for multicast packets that are to be forwarded. + +#### int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr); + +This method adds the specified device to a multicast group specified by addr, or creates such a group if not found. + +#### int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr); + +This method removes the specified device from the specified address group. + +#### bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); + +This method checks if the specified network device belongs to the specified multicast address group. If the third parameter is not NULL, it will also check whether source filtering permits receiving multicast traffic from the specified address (src_addr) that is destined to the specified multicast address group. + +#### inline void addrconf_addr_solict_mult(const struct in6_addr *addr, struct in6_addr *solicited) + +This method computes link-local solicited-node multicast addresses. + +#### void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); + +This method joins to a solicited address multicast group. + +#### int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); + +This method handles socket join on a multicast group. + +#### int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); + +This method handles socket leave on a multicast group. + +#### int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol); + +This method registers an IPv6 protocol handler. It's used with L4 protocol registration (UDPv6, TCPv6, and more) and also with extension headers (like the Fragment Extension Header). + +#### int ipv6_parse_hopopts(struct sk_buff *skb); + +This method parses the Hop-by-Hop Options header, which must be the first extension header immediately after the IPv6 header. 
#### int ip6_local_out(struct sk_buff *skb);

This method sends out packets that were generated on the local host.

#### int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

This method handles IPv6 fragmentation. It is called from the ip6_finish_output() method.

#### void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos);

This method sends an ICMPv6 parameter problem (ICMPV6_PARAMPROB) error. It is called when there is some problem in parsing extension headers or in the defragmentation process.

#### int do_ipv6_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen, unsigned int flags);

These methods are the generic IPv6 handlers for calling the setsockopt() and getsockopt() methods on IPv6 sockets, respectively (net/ipv6/ipv6_sockglue.c).

#### int igmp6_event_query(struct sk_buff *skb);

This method handles MLDv2 and MLDv1 queries.

#### void ip6_route_input(struct sk_buff *skb);

This method performs a routing lookup by building a flowi6 object based on the specified skb and invoking the ip6_route_input_lookup() method.

### Macros

And here are the macros.

#### IPV6_ADDR_MC_SCOPE()

This macro returns the scope of the specified IPv6 Multicast address, which is located in bits 11-14 of the multicast address.

#### IPV6_ADDR_MC_FLAG_TRANSIENT()

This macro returns 1 if the T bit of the flags of the specified multicast address is set.

#### IPV6_ADDR_MC_FLAG_PREFIX()

This macro returns 1 if the P bit of the flags of the specified multicast address is set.

#### IPV6_ADDR_MC_FLAG_RENDEZVOUS()

This macro returns 1 if the R bit of the flags of the specified multicast address is set.

### Tables

Here are the tables.

Table 8-2 shows the IPv6 extension headers by their Linux symbol, value, and description. You can find more details in the "extension headers" section of this chapter.

Table 8-2.

IPv6 extension headers

Linux Symbol | Value | Description
---|---|---
NEXTHDR_HOP | 0 | Hop-by-Hop Options header.
NEXTHDR_TCP | 6 | TCP segment.
NEXTHDR_UDP | 17 | UDP message.
NEXTHDR_IPV6 | 41 | IPv6 in IPv6.
NEXTHDR_ROUTING | 43 | Routing header.
NEXTHDR_FRAGMENT | 44 | Fragmentation/reassembly header.
NEXTHDR_GRE | 47 | GRE header.
NEXTHDR_ESP | 50 | Encapsulating security payload.
NEXTHDR_AUTH | 51 | Authentication header.
NEXTHDR_ICMP | 58 | ICMP for IPv6.
NEXTHDR_NONE | 59 | No next header.
NEXTHDR_DEST | 60 | Destination options header.
NEXTHDR_MOBILITY | 135 | Mobility header.

Table 8-3 shows the Multicast Address Record types by their Linux symbol and value. For more details see the "MLDv2 Multicast Listener Report" section in this chapter.

Table 8-3.

Multicast Address Record (record types)

Linux Symbol | Value
---|---
MLD2_MODE_IS_INCLUDE | 1
MLD2_MODE_IS_EXCLUDE | 2
MLD2_CHANGE_TO_INCLUDE | 3
MLD2_CHANGE_TO_EXCLUDE | 4
MLD2_ALLOW_NEW_SOURCES | 5
MLD2_BLOCK_OLD_SOURCES | 6

(include/uapi/linux/icmpv6.h)

Table 8-4 shows the codes of the ICMPv6 "Parameter Problem" message by their Linux symbol, value, and description. These codes give more information about the type of problem that occurred.

Table 8-4.
ICMPv6 Parameter Problem codes

Linux Symbol | Value | Description
---|---|---
ICMPV6_HDR_FIELD | 0 | Erroneous header field encountered
ICMPV6_UNK_NEXTHDR | 1 | Unknown next header type encountered
ICMPV6_UNK_OPTION | 2 | Unknown IPv6 option encountered

### Special Addresses

All of the following variables are instances of the in6_addr structure:

  * in6addr_any: Represents the unspecified address of all zeroes (::).

  * in6addr_loopback: Represents the loopback address (::1).

  * in6addr_linklocal_allnodes: Represents the link-local all-nodes multicast address (ff02::1).

  * in6addr_linklocal_allrouters: Represents the link-local all-routers multicast address (ff02::2).

  * in6addr_interfacelocal_allnodes: Represents the interface-local all-nodes multicast address (ff01::1).

  * in6addr_interfacelocal_allrouters: Represents the interface-local all-routers multicast address (ff01::2).

  * in6addr_sitelocal_allrouters: Represents the site-local all-routers multicast address (ff05::2).

(include/linux/in6.h)

### Routing Tables Management in IPv6

As in IPv4, we can manage adding and deleting routing entries and displaying the routing tables with the ip route command of iproute2 and with the route command of net-tools:

  * Adding a route by ip -6 route add is handled by the inet6_rtm_newroute() method by invoking the ip6_route_add() method.

  * Deleting a route by ip -6 route del is handled by the inet6_rtm_delroute() method by invoking the ip6_route_del() method.

  * Displaying the routing table by ip -6 route show is handled by the inet6_dump_fib() method.

  * Adding a route by route -A inet6 add is implemented by sending an SIOCADDRT IOCTL, which is handled by the ipv6_route_ioctl() method by invoking the ip6_route_add() method.

  * Deleting a route by route -A inet6 del is implemented by sending an SIOCDELRT IOCTL, which is handled by the ipv6_route_ioctl() method by invoking the ip6_route_del() method.

# 9. Netfilter

Chapter 8 discusses the IPv6 subsystem implementation. This chapter discusses the netfilter subsystem. The netfilter framework was started in 1998 by Rusty Russell, one of the most widely known Linux kernel developers, as an improvement of the older implementations of ipchains (Linux 2.2.x) and ipfwadm (Linux 2.0.x). The netfilter subsystem provides a framework that enables registering callbacks at various points (netfilter hooks) in the packet traversal in the network stack and performing various operations on packets, such as changing addresses or ports, dropping packets, logging, and more.
These netfilter hooks provide the infrastructure for netfilter kernel modules that register callbacks in order to perform various tasks of the netfilter subsystem.

## Netfilter Frameworks

The netfilter subsystem provides the following functionalities, discussed in this chapter:

  * Packet selection (iptables)

  * Packet filtering

  * Network Address Translation (NAT)

  * Packet mangling (modifying the contents of packet headers before or after routing)

  * Connection tracking

  * Gathering network statistics

Here are some common frameworks that are based on the Linux kernel netfilter subsystem:

  * IPVS (IP Virtual Server): A transport-layer load-balancing solution (net/netfilter/ipvs). There is support for IPv4 IPVS from very early kernels, and support for IPVS in IPv6 has been included since kernel 2.6.28. The IPv6 kernel support for IPVS was developed by Julius Volz and Vince Busam from Google. For more details, see the IPVS official website, www.linuxvirtualserver.org.

  * IP sets: A framework that consists of a userspace tool called ipset and a kernel part (net/netfilter/ipset). An IP set is basically a set of IP addresses. The IP sets framework was developed by Jozsef Kadlecsik. For more details, see http://ipset.netfilter.org.

  * iptables: Probably the most popular Linux firewall, iptables is the front end of netfilter, and it provides a management layer for netfilter: for example, adding and deleting netfilter rules, displaying statistics, adding a table, zeroing the counters of a table, and more.

There are different iptables implementations in the kernel, according to the protocol:

  * iptables for IPv4: (net/ipv4/netfilter/ip_tables.c)

  * ip6tables for IPv6: (net/ipv6/netfilter/ip6_tables.c)

  * arptables for ARP: (net/ipv4/netfilter/arp_tables.c)

  * ebtables for Ethernet: (net/bridge/netfilter/ebtables.c)

In userspace, you have the iptables and the ip6tables command-line tools, which are used to set up, maintain, and inspect the IPv4 and IPv6 tables, respectively. See man 8 iptables and man 8 ip6tables. Both iptables and ip6tables use the setsockopt()/getsockopt() system calls to communicate with the kernel from userspace. I should mention here two interesting ongoing netfilter projects. The xtables2 project, which is being developed primarily by Jan Engelhardt and is a work in progress as of this writing, uses a netlink-based interface to communicate with the kernel netfilter subsystem. See more details on the project website, http://xtables.de. The second project, the nftables project, is a new packet filtering engine that is a candidate to replace iptables. The nftables solution is based on using a virtual machine and a single unified implementation instead of the four iptables objects mentioned earlier (iptables, ip6tables, arptables, and ebtables). The nftables project was first presented in a netfilter workshop in 2008 by Patrick McHardy. The kernel infrastructure and userspace utility have been developed by Patrick McHardy and Pablo Neira Ayuso. For more details, see http://netfilter.org/projects/nftables and "Nftables: a new packet filtering engine" at http://lwn.net/Articles/324989/.

There are a lot of netfilter modules that extend the functionality of the core netfilter subsystem; apart from some examples, I do not describe these modules here in depth. There are a lot of information resources about these netfilter extensions, from the administration perspective, on the web and in various administration guides.
See also the official netfilter project website: www.netfilter.org.

## Netfilter Hooks

There are five points in the network stack where netfilter hooks can be registered; you have encountered these points in previous chapters' discussions of the Rx and Tx paths in IPv4 and in IPv6. Note that the names of the hooks are common to IPv4 and IPv6:

  * NF_INET_PRE_ROUTING: This hook is in the ip_rcv() method in IPv4, and in the ipv6_rcv() method in IPv6. The ip_rcv() method is the protocol handler of IPv4, and the ipv6_rcv() method is the protocol handler of IPv6. It is the first hook point that all incoming packets reach, before performing a lookup in the routing subsystem.

  * NF_INET_LOCAL_IN: This hook is in the ip_local_deliver() method in IPv4, and in the ip6_input() method in IPv6. All incoming packets addressed to the local host reach this hook point after first passing via the NF_INET_PRE_ROUTING hook point and after performing a lookup in the routing subsystem.

  * NF_INET_FORWARD: This hook is in the ip_forward() method in IPv4, and in the ip6_forward() method in IPv6. All forwarded packets reach this hook point after first passing via the NF_INET_PRE_ROUTING hook point and after performing a lookup in the routing subsystem.

  * NF_INET_POST_ROUTING: This hook is in the ip_output() method in IPv4, and in the ip6_finish_output2() method in IPv6. Packets that are forwarded reach this hook point after passing the NF_INET_FORWARD hook point. Packets that are created on the local machine and sent out also arrive at NF_INET_POST_ROUTING, after passing the NF_INET_LOCAL_OUT hook point.

  * NF_INET_LOCAL_OUT: This hook is in the __ip_local_out() method in IPv4, and in the __ip6_local_out() method in IPv6. All outgoing packets that were created on the local host reach this point before reaching the NF_INET_POST_ROUTING hook point.

(include/uapi/linux/netfilter.h)

The NF_HOOK macro, mentioned in previous chapters, is called at some distinct points along the packet traversal in the kernel network stack; it is defined in include/linux/netfilter.h:

static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct sk_buff *skb,
                          struct net_device *in, struct net_device *out,
                          int (*okfn)(struct sk_buff *))
{
        return NF_HOOK_THRESH(pf, hook, skb, in, out, okfn, INT_MIN);
}

The parameters of the NF_HOOK() macro are as follows:

  * pf: Protocol family. NFPROTO_IPV4 for IPv4 and NFPROTO_IPV6 for IPv6.

  * hook: One of the five netfilter hooks mentioned earlier (for example, NF_INET_PRE_ROUTING or NF_INET_LOCAL_OUT).

  * skb: The SKB object, which represents the packet that is being processed.

  * in: The input network device (net_device object).

  * out: The output network device (net_device object). There are cases when the output device is NULL, as it is yet unknown; for example, in the ip_rcv() method (net/ipv4/ip_input.c), which is called before a routing lookup is performed, and you don't know yet which is the output device; the NF_HOOK() macro is invoked in this method with a NULL output device.

  * okfn: A pointer to a continuation function that will be called when the hook terminates. It gets one argument, the SKB.

The return value from a netfilter hook must be one of the following values (which are also termed netfilter verdicts):

  * NF_DROP (0): Discard the packet silently.

  * NF_ACCEPT (1): The packet continues its traversal in the kernel network stack as usual.

  * NF_STOLEN (2): Do not continue traversal.
The packet is processed by the hook method. + + * NF_QUEUE (3): Queue the packet for user space. + + * NF_REPEAT (4): The hook function should be called again. + +(include/uapi/linux/netfilter.h) + +Now that you know about the various netfilter hooks, the next section covers how netfilter hooks are registered. + +### Registration of Netfilter Hooks + +To register a hook callback at one of the five hook points mentioned earlier, you first define an nf_hook_ops object (or an array of nf_hook_ops objects) and then register it; the nf_hook_ops structure is defined in include/linux/netfilter.h: + +struct nf_hook_ops { + +struct list_head list; + +/* User fills in from here down. */ + +nf_hookfn *hook; + +struct module *owner; + +u_int8_t pf; + +unsigned int hooknum; + +/* Hooks are ordered in ascending priority. */ + +int priority; + +}; + +The following introduces some of the important members of the nf_hook_ops structure: + + * hook: The hook callback you want to register. Its prototype is: + +unsigned int nf_hookfn(unsigned int hooknum, + +struct sk_buff *skb, + +const struct net_device *in, + +const struct net_device *out, + +int (*okfn)(struct sk_buff *)); + + * pf: The protocol family (NFPROTO_IPV4 for IPv4 and NFPROTO_IPV6 for IPv6). + + * hooknum: One of the five netfilter hooks mentioned earlier. + + * priority: More than one hook callback can be registered on the same hook. Hook callbacks with lower priorities are called first. The nf_ip_hook_priorities enum defines possible values for IPv4 hook priorities (include/uapi/linux/netfilter_ipv4.h). See also Table 9-4 in the "Quick Reference" section at the end of this chapter. + +There are two methods to register netfilter hooks: + + * int nf_register_hook(struct nf_hook_ops *reg): Registers a single nf_hook_ops object. + + * int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n): Registers an array of n nf_hook_ops objects; the second parameter is the number of the elements in the array. + +You will see two examples of registration of an array of nf_hook_ops objects in the next two sections. Figure 9-1 in the next section illustrates the use of priorities when registering more than one hook callback on the same hook point. + +## Connection Tracking + +It is not enough to filter traffic only according to the L4 and L3 headers in modern networks. You should also take into account cases when the traffic is based on sessions, such as an FTP session or a SIP session. By FTP session, I mean this sequence of events, for example: the client first creates a TCP control connection on TCP port 21, which is the default FTP port. Commands sent from the FTP client (such as listing the contents of a directory) to the server are sent on this control port. The FTP server opens a data socket on port 20, where the destination port on the client side is dynamically allocated. Traffic should be filtered according to other parameters, such as the state of a connection or timeout. This is one of the main reasons for using the Connection Tracking layer. + +Connection Tracking allows the kernel to keep track of sessions. The Connection Tracking layer's primary goal is to serve as the basis of NAT. The IPv4 NAT module (net/ipv4/netfilter/iptable_nat.c) cannot be built if CONFIG_NF_CONNTRACK_IPV4 is not set. Similarly, the IPv6 NAT module (net/ipv6/netfilter/ip6table_nat.c) cannot be built if the CONFIG_NF_CONNTRACK_IPV6 is not set. 
However, Connection Tracking does not depend on NAT; you can run the Connection Tracking module without activating any NAT rule. The IPv4 and IPv6 NAT modules are discussed later in this chapter.

Note

There are some userspace tools (conntrack-tools) for Connection Tracking administration, mentioned in the "Quick Reference" section at the end of this chapter. These tools may help you better understand the Connection Tracking layer.

### Connection Tracking Initialization

An array of nf_hook_ops objects, called ipv4_conntrack_ops, is defined as follows:

static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
    {
        .hook     = ipv4_conntrack_in,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_PRE_ROUTING,
        .priority = NF_IP_PRI_CONNTRACK,
    },
    {
        .hook     = ipv4_conntrack_local,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_OUT,
        .priority = NF_IP_PRI_CONNTRACK,
    },
    {
        .hook     = ipv4_helper,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP_PRI_CONNTRACK_HELPER,
    },
    {
        .hook     = ipv4_confirm,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
    },
    {
        .hook     = ipv4_helper,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_CONNTRACK_HELPER,
    },
    {
        .hook     = ipv4_confirm,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
    },
};

(net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c)

The two most important Connection Tracking hooks you register are the NF_INET_PRE_ROUTING hook, handled by the ipv4_conntrack_in() method, and the NF_INET_LOCAL_OUT hook, handled by the ipv4_conntrack_local() method. These two hooks have a priority of NF_IP_PRI_CONNTRACK (-200). The other hooks in the ipv4_conntrack_ops array have a priority of NF_IP_PRI_CONNTRACK_HELPER (300) or NF_IP_PRI_CONNTRACK_CONFIRM (INT_MAX, which is 2^31 - 1). In netfilter hooks, a callback with a lower priority value is executed first. (The nf_ip_hook_priorities enum in include/uapi/linux/netfilter_ipv4.h represents the possible priority values for IPv4 hooks.) Both the ipv4_conntrack_local() method and the ipv4_conntrack_in() method invoke the nf_conntrack_in() method, passing the corresponding hooknum as a parameter. The nf_conntrack_in() method belongs to the protocol-independent Connection Tracking core (net/netfilter/nf_conntrack_core.c) and is used both in IPv4 Connection Tracking and in IPv6 Connection Tracking; its second parameter is the protocol family, specifying whether it is IPv4 (PF_INET) or IPv6 (PF_INET6). I start the discussion with the nf_conntrack_in() method. The other hook callbacks, ipv4_confirm() and ipv4_helper(), are discussed later in this section.

Note

When the kernel is built with Connection Tracking support (CONFIG_NF_CONNTRACK is set), the Connection Tracking hook callbacks are called even if no iptables rules are activated. Naturally, this has some performance cost. If performance is very important, and you know beforehand that the device will not use the netfilter subsystem, consider building the kernel without Connection Tracking support, or building Connection Tracking as a kernel module and not loading it.
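Before moving on to how the Connection Tracking hooks are registered, here is a minimal, self-contained sketch of the registration API described in the previous section. The module and callback names (udp_count_*) are hypothetical; the callback follows the 3.x-era nf_hookfn prototype shown earlier, merely logs incoming UDP packets at the NF_INET_PRE_ROUTING point, and accepts all traffic:

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

/* Hypothetical hook callback: log UDP packets, accept everything. */
static unsigned int udp_count_hook(unsigned int hooknum,
                                   struct sk_buff *skb,
                                   const struct net_device *in,
                                   const struct net_device *out,
                                   int (*okfn)(struct sk_buff *))
{
    if (ip_hdr(skb)->protocol == IPPROTO_UDP)
        pr_debug("udp_count: UDP packet on %s\n",
                 in ? in->name : "?");

    return NF_ACCEPT; /* let the packet continue its traversal */
}

static struct nf_hook_ops udp_count_ops __read_mostly = {
    .hook     = udp_count_hook,
    .owner    = THIS_MODULE,
    .pf       = NFPROTO_IPV4,
    .hooknum  = NF_INET_PRE_ROUTING,
    .priority = NF_IP_PRI_FIRST, /* run before conntrack and NAT */
};

static int __init udp_count_init(void)
{
    return nf_register_hook(&udp_count_ops);
}

static void __exit udp_count_exit(void)
{
    nf_unregister_hook(&udp_count_ops);
}

module_init(udp_count_init);
module_exit(udp_count_exit);
MODULE_LICENSE("GPL");

Registering at NF_IP_PRI_FIRST (INT_MIN) means this callback runs before the Connection Tracking callbacks described next; returning NF_ACCEPT hands the packet on to them unchanged.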
Registration of the IPv4 Connection Tracking hooks is done by calling the nf_register_hooks() method in the nf_conntrack_l3proto_ipv4_init() method (net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c):

static int __init nf_conntrack_l3proto_ipv4_init(void)
{
    . . .
    ret = nf_register_hooks(ipv4_conntrack_ops,
                            ARRAY_SIZE(ipv4_conntrack_ops));
    . . .
}

In Figure 9-1, you can see the Connection Tracking callbacks (ipv4_conntrack_in(), ipv4_conntrack_local(), ipv4_helper(), and ipv4_confirm()), according to the hook points where they are registered.

Figure 9-1. Connection Tracking hooks (IPv4)

Note

For the sake of simplicity, Figure 9-1 does not include more complex scenarios, such as using IPsec, fragmentation, or multicasting. It also omits the functions that are called for packets generated on the local host and sent out (like the ip_queue_xmit() method or the ip_build_and_send_pkt() method).

The basic element of Connection Tracking is the nf_conntrack_tuple structure:

struct nf_conntrack_tuple {
    struct nf_conntrack_man src;

    /* These are the parts of the tuple which are fixed. */
    struct {
        union nf_inet_addr u3;
        union {
            /* Add other protocols here. */
            __be16 all;

            struct {
                __be16 port;
            } tcp;
            struct {
                __be16 port;
            } udp;
            struct {
                u_int8_t type, code;
            } icmp;
            struct {
                __be16 port;
            } dccp;
            struct {
                __be16 port;
            } sctp;
            struct {
                __be16 key;
            } gre;
        } u;

        /* The protocol. */
        u_int8_t protonum;

        /* The direction (for tuplehash) */
        u_int8_t dir;
    } dst;
};

(include/net/netfilter/nf_conntrack_tuple.h)

The nf_conntrack_tuple structure represents a flow in one direction. The union inside the dst structure includes various protocol objects (like TCP, UDP, ICMP, and more). For each transport layer (L4) protocol, there is a Connection Tracking module that implements the protocol-specific part. Thus, for example, there is net/netfilter/nf_conntrack_proto_tcp.c for the TCP protocol, net/netfilter/nf_conntrack_proto_udp.c for the UDP protocol, net/netfilter/nf_conntrack_ftp.c for the FTP protocol, and more; these modules support both IPv4 and IPv6. You will see examples of how protocol-specific implementations of Connection Tracking modules differ later in this section.

### Connection Tracking Entries

The nf_conn structure represents a Connection Tracking entry:

struct nf_conn {
    /* Usage count in here is 1 for hash table/destruct timer, 1 per skb,
       plus 1 for any connection(s) we are `master' for */
    struct nf_conntrack ct_general;

    spinlock_t lock;

    /* XXX should I move this to the tail ? - Y.K */
    /* These are my tuples; original and reply */
    struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];

    /* Have we seen traffic both ways yet? (bitset) */
    unsigned long status;

    /* If we were expected by an expectation, this will be it */
    struct nf_conn *master;

    /* Timer function; drops refcnt when it goes off. */
    struct timer_list timeout;
    . . .
    /* Extensions */
    struct nf_ct_ext *ext;
#ifdef CONFIG_NET_NS
    struct net *ct_net;
#endif
    /* Storage reserved for other modules, must be the last member */
    union nf_conntrack_proto proto;
};

(include/net/netfilter/nf_conntrack.h)

The following is a description of some of the important members of the nf_conn structure:

* ct_general: A reference count.
* tuplehash: There are two tuplehash objects: tuplehash[0] is the original direction, and tuplehash[1] is the reply. They are usually referred to as tuplehash[IP_CT_DIR_ORIGINAL] and tuplehash[IP_CT_DIR_REPLY], respectively.

* status: The status of the entry. When you start to track a connection entry, it is IP_CT_NEW; later, when the connection is established, it becomes IP_CT_ESTABLISHED. See the ip_conntrack_info enum in include/uapi/linux/netfilter/nf_conntrack_common.h.

* master: An expected connection. It is set by the init_conntrack() method when an expected packet arrives (that is, when the nf_ct_find_expectation() method, which is invoked by the init_conntrack() method, finds an expectation). See also the "Connection Tracking Helpers and Expectations" section later in this chapter.

* timeout: The timer of the connection entry. Each connection entry expires after some time interval in which there is no traffic; the interval is determined according to the protocol. When an nf_conn object is allocated with the __nf_conntrack_alloc() method, the timeout timer is set to the death_by_timeout() method.

Now that you know about the nf_conn structure and some of its members, let's take a look at the nf_conntrack_in() method:

unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
                             struct sk_buff *skb)
{
    struct nf_conn *ct, *tmpl = NULL;
    enum ip_conntrack_info ctinfo;
    struct nf_conntrack_l3proto *l3proto;
    struct nf_conntrack_l4proto *l4proto;
    unsigned int *timeouts;
    unsigned int dataoff;
    u_int8_t protonum;
    int set_reply = 0;
    int ret;

    if (skb->nfct) {
        /* Previously seen (loopback or untracked)?  Ignore. */
        tmpl = (struct nf_conn *)skb->nfct;
        if (!nf_ct_is_template(tmpl)) {
            NF_CT_STAT_INC_ATOMIC(net, ignore);
            return NF_ACCEPT;
        }
        skb->nfct = NULL;
    }

First you find out whether the network layer (L3) protocol can be tracked:

    l3proto = __nf_ct_l3proto_find(pf);

Now you find out whether the transport layer (L4) protocol can be tracked. For IPv4, this is done by the ipv4_get_l4proto() method (net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c):

    ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
                               &dataoff, &protonum);
    if (ret <= 0) {
        . . .
        ret = -ret;
        goto out;
    }

    l4proto = __nf_ct_l4proto_find(pf, protonum);

    /* It may be an special packet, error, unclean...
     * inverse of the return code tells to the netfilter
     * core what to do with the packet. */

Now you check protocol-specific error conditions (see, for example, the udp_error() method in net/netfilter/nf_conntrack_proto_udp.c, which checks for malformed packets, packets with an invalid checksum, and more, or the tcp_error() method in net/netfilter/nf_conntrack_proto_tcp.c):

    if (l4proto->error != NULL) {
        ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
                             pf, hooknum);
        if (ret <= 0) {
            NF_CT_STAT_INC_ATOMIC(net, error);
            NF_CT_STAT_INC_ATOMIC(net, invalid);
            ret = -ret;
            goto out;
        }
        /* ICMP[v6] protocol trackers may assign one conntrack. */
        if (skb->nfct)
            goto out;
    }

The resolve_normal_ct() method, which is invoked immediately hereafter, performs the following:

* Calculates the hash of the tuple by calling the hash_conntrack_raw() method.

* Performs a lookup for a tuple match by calling the __nf_conntrack_find_get() method, passing the hash as a parameter.
* If no match is found, it creates a new nf_conntrack_tuple_hash object by calling the init_conntrack() method. This nf_conntrack_tuple_hash object is added to the list of unconfirmed tuplehash objects. This list is embedded in the network namespace object: the net structure contains a netns_ct object, which consists of network-namespace-specific Connection Tracking information, and one of its members, unconfirmed, is a list of unconfirmed tuplehash objects (see include/net/netns/conntrack.h). Later, in the __nf_conntrack_confirm() method, the object is removed from the unconfirmed list. I discuss the __nf_conntrack_confirm() method later in this section.

* Each SKB has a member called nfctinfo, which represents the connection state (for example, IP_CT_NEW for new connections), and a member called nfct (an instance of the nf_conntrack structure), which is in fact a reference counter. The resolve_normal_ct() method initializes both of them.

    ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
                           l3proto, l4proto, &set_reply, &ctinfo);
    if (!ct) {
        /* Not valid part of a connection */
        NF_CT_STAT_INC_ATOMIC(net, invalid);
        ret = NF_ACCEPT;
        goto out;
    }
    if (IS_ERR(ct)) {
        /* Too stressed to deal. */
        NF_CT_STAT_INC_ATOMIC(net, drop);
        ret = NF_DROP;
        goto out;
    }

    NF_CT_ASSERT(skb->nfct);

You now call the nf_ct_timeout_lookup() method to decide which timeout policy to apply to this flow. For example, for UDP the timeout is 30 seconds for unidirectional connections and 180 seconds for bidirectional connections; see the definition of the udp_timeouts array in net/netfilter/nf_conntrack_proto_udp.c. For TCP, which is a much more complex protocol, there are 11 entries in the tcp_timeouts array (net/netfilter/nf_conntrack_proto_tcp.c):

    /* Decide what timeout policy we want to apply to this flow. */
    timeouts = nf_ct_timeout_lookup(net, ct, l4proto);

You now call the protocol-specific packet() method (for example, udp_packet() for UDP or tcp_packet() for TCP). The udp_packet() method extends the timeout according to the status of the connection by calling the nf_ct_refresh_acct() method: for unreplied connections (where the IPS_SEEN_REPLY_BIT flag is not set), the timeout is set to 30 seconds, and for replied connections, it is set to 180 seconds. Again, in the case of TCP, the tcp_packet() method is much more complex, due to the advanced TCP state machine. Moreover, the udp_packet() method always returns a verdict of NF_ACCEPT, whereas the tcp_packet() method may sometimes fail:

    ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
    if (ret <= 0) {
        /* Invalid: inverse of the return code tells
         * the netfilter core what to do */
        pr_debug("nf_conntrack_in: Can't track with proto module\n");
        nf_conntrack_put(skb->nfct);
        skb->nfct = NULL;
        NF_CT_STAT_INC_ATOMIC(net, invalid);
        if (ret == -NF_DROP)
            NF_CT_STAT_INC_ATOMIC(net, drop);
        ret = -ret;
        goto out;
    }

    if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
        nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
    if (tmpl) {
        /* Special case: we have to repeat this hook, assign the
         * template again to this packet. We assume that this packet
         * has no conntrack assigned. This is used by nf_ct_tcp.
         */
        if (ret == NF_REPEAT)
            skb->nfct = (struct nf_conntrack *)tmpl;
        else
            nf_ct_put(tmpl);
    }

    return ret;
}

The ipv4_confirm() method, which is called in the NF_INET_POST_ROUTING hook and in the NF_INET_LOCAL_IN hook, normally calls the __nf_conntrack_confirm() method, which removes the tuple from the unconfirmed list.

### Connection Tracking Helpers and Expectations

Some protocols have different flows for data and for control: for example, FTP, the File Transfer Protocol, and SIP, the Session Initiation Protocol, which is a VoIP protocol. Usually in these protocols, the control channel negotiates some configuration setup with the other side and agrees with it on which parameters to use for the data flow. These protocols are more difficult for the netfilter subsystem to handle, because the netfilter subsystem needs to be aware that the flows are related to each other. To support these types of protocols, the netfilter subsystem provides Connection Tracking Helpers, which extend the basic Connection Tracking functionality. These modules create expectations (nf_conntrack_expect objects), and these expectations tell the kernel that it should expect some traffic on a specified connection and that two connections are related. Knowing that two connections are related lets you define rules on the master connection that also pertain to the related connections. You can use a simple iptables rule based on the Connection Tracking state to accept packets whose Connection Tracking state is RELATED:

iptables -A INPUT -m conntrack --ctstate RELATED -j ACCEPT

Note

Connections can be related not only as a result of an expectation. For example, an ICMPv4 error packet such as "ICMP fragmentation needed" is related if netfilter finds a conntrack entry that matches the tuple in the ICMP-embedded L3/L4 header. See the icmp_error_message() method (net/ipv4/netfilter/nf_conntrack_proto_icmp.c) for more details.

Connection Tracking Helpers are represented by the nf_conntrack_helper structure (include/net/netfilter/nf_conntrack_helper.h). They are registered and unregistered by the nf_conntrack_helper_register() method and the nf_conntrack_helper_unregister() method, respectively. Thus, for example, the nf_conntrack_helper_register() method is invoked by nf_conntrack_ftp_init() (net/netfilter/nf_conntrack_ftp.c) to register the FTP Connection Tracking Helpers. The Connection Tracking Helpers are kept in a hash table (nf_ct_helper_hash). The ipv4_helper() hook callback is registered at two hook points, NF_INET_POST_ROUTING and NF_INET_LOCAL_IN (see the definition of the ipv4_conntrack_ops array in the "Connection Tracking Initialization" section earlier). Because of this, when an FTP packet reaches the NF_INET_POST_ROUTING callback, ip_output(), or the NF_INET_LOCAL_IN callback, ip_local_deliver(), the ipv4_helper() method is invoked, and this method eventually calls the callbacks of the registered Connection Tracking Helpers. In the case of FTP, the registered helper method is the help() method in net/netfilter/nf_conntrack_ftp.c. This method looks for FTP-specific patterns, like the "PORT" FTP command; see the invocation of the find_pattern() method in the help() method, in the following code snippet (net/netfilter/nf_conntrack_ftp.c).
If there is a match, an nf_conntrack_expect object is created by calling the nf_ct_expect_init() method:

static int help(struct sk_buff *skb,
                unsigned int protoff,
                struct nf_conn *ct,
                enum ip_conntrack_info ctinfo)
{
    struct nf_conntrack_expect *exp;
    . . .
    for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
        found = find_pattern(fb_ptr, datalen,
                             search[dir][i].pattern,
                             search[dir][i].plen,
                             search[dir][i].skip,
                             search[dir][i].term,
                             &matchoff, &matchlen,
                             &cmd,
                             search[dir][i].getnum);
        if (found) break;
    }

    if (found == -1) {
        /* We don't usually drop packets.  After all, this is
           connection tracking, not packet filtering.
           However, it is necessary for accurate tracking in
           this case. */
        nf_ct_helper_log(skb, ct, "partial matching of `%s'",
                         search[dir][i].pattern);

Note

Normally, Connection Tracking does not drop packets. In some cases, due to an error or an abnormal situation, packets are dropped. The preceding code is an example of such a case: the invocation of find_pattern() returned -1, which means that there is only a partial match, and the packet is dropped because a full pattern match was not found.

        ret = NF_DROP;
        goto out;
    } else if (found == 0) { /* No match */
        ret = NF_ACCEPT;
        goto out_update_nl;
    }

    pr_debug("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
             matchlen, fb_ptr + matchoff,
             matchlen, ntohl(th->seq) + matchoff);

    exp = nf_ct_expect_alloc(ct);
    . . .
    nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, cmd.l3num,
                      &ct->tuplehash[!dir].tuple.src.u3, daddr,
                      IPPROTO_TCP, NULL, &cmd.u.tcp.port);
    . . .
}

(net/netfilter/nf_conntrack_ftp.c)

Later, when a new connection is created by the init_conntrack() method, you check whether it has an expectation, and if it does, you set the IPS_EXPECTED_BIT flag and set the master of the connection (ct->master) to refer to the connection that created the expectation:

static struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
               const struct nf_conntrack_tuple *tuple,
               struct nf_conntrack_l3proto *l3proto,
               struct nf_conntrack_l4proto *l4proto,
               struct sk_buff *skb,
               unsigned int dataoff, u32 hash)
{
    struct nf_conn *ct;
    struct nf_conn_help *help;
    struct nf_conntrack_tuple repl_tuple;
    struct nf_conntrack_ecache *ecache;
    struct nf_conntrack_expect *exp;
    u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
    struct nf_conn_timeout *timeout_ext;
    unsigned int *timeouts;
    . . .
    ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
                              hash);
    . . .
    exp = nf_ct_find_expectation(net, zone, tuple);
    if (exp) {
        pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
                 ct, exp);
        /* Welcome, Mr. Bond.  We've been expecting you... */
        __set_bit(IPS_EXPECTED_BIT, &ct->status);
        ct->master = exp->master;
        if (exp->helper) {
            help = nf_ct_helper_ext_add(ct, exp->helper,
                                        GFP_ATOMIC);
            if (help)
                rcu_assign_pointer(help->helper, exp->helper);
        }
    . . .

Note that helpers listen on a predefined port. For example, the FTP Connection Tracking Helper listens on port 21 (see the FTP_PORT definition in include/linux/netfilter/nf_conntrack_ftp.h).
You can set a different port (or ports) in one of two ways. The first way is by a module parameter: you can override the default port value by supplying a single port or a comma-separated list of ports to the modprobe command:

modprobe nf_conntrack_ftp ports=2121
modprobe nf_conntrack_ftp ports=2022,2023,2024

The second way is by using the CT target:

iptables -A PREROUTING -t raw -p tcp --dport 8888 -j CT --helper ftp

Note that the CT target (net/netfilter/xt_CT.c) was added in kernel 2.6.34.

Note

Xtables target extensions are represented by the xt_target structure and are registered by the xt_register_target() method for a single target, or by the xt_register_targets() method for an array of targets. Xtables match extensions are represented by the xt_match structure and are registered by the xt_register_match() method, or by the xt_register_matches() method for an array of matches. Match extensions inspect a packet according to some criterion defined by the match extension module; thus, for example, the xt_length match module (net/netfilter/xt_length.c) inspects packets according to their length (the tot_len of the SKB in the case of an IPv4 packet), and the xt_connlimit module (net/netfilter/xt_connlimit.c) limits the number of parallel TCP connections per IP address. A minimal sketch of such a match extension appears below.
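To make the Xtables extension API concrete, here is a minimal, hypothetical sketch of a match extension in the spirit of xt_length. The module name ("evenlen") and its criterion (matching IPv4 packets whose total length is an even number of bytes) are invented for illustration, and a real match would also need a corresponding userspace libxt_ plugin so that the iptables command can parse it:

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter/x_tables.h>

/* Hypothetical match callback: true for even-length IPv4 packets. */
static bool evenlen_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
    return (ntohs(ip_hdr(skb)->tot_len) & 1) == 0;
}

static struct xt_match evenlen_mt_reg __read_mostly = {
    .name     = "evenlen",
    .revision = 0,
    .family   = NFPROTO_IPV4,
    .match    = evenlen_mt,
    .me       = THIS_MODULE,
};

static int __init evenlen_mt_init(void)
{
    return xt_register_match(&evenlen_mt_reg);
}

static void __exit evenlen_mt_exit(void)
{
    xt_unregister_match(&evenlen_mt_reg);
}

module_init(evenlen_mt_init);
module_exit(evenlen_mt_exit);
MODULE_LICENSE("GPL");

With the userspace plugin in place, such a match could then be used like any other, for example with iptables -A INPUT -m evenlen -j LOG.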
This section detailed the Connection Tracking initialization. The next section deals with iptables, which is probably the best-known part of the netfilter framework.

### IPTables

There are two parts to iptables: the kernel part, whose core is net/ipv4/netfilter/ip_tables.c for IPv4 and net/ipv6/netfilter/ip6_tables.c for IPv6, and the userspace part, which provides a front end for accessing the kernel iptables layer (for example, adding and deleting rules with the iptables command). Each table is represented by the xt_table structure (defined in include/linux/netfilter/x_tables.h). Registration and unregistration of a table are done by the ipt_register_table() and ipt_unregister_table() methods, respectively; these methods are implemented in net/ipv4/netfilter/ip_tables.c. In IPv6, you also use the xt_table structure for creating tables, but registration and unregistration of a table are done by the ip6t_register_table() method and the ip6t_unregister_table() method, respectively.

The network namespace object contains IPv4- and IPv6-specific objects (netns_ipv4 and netns_ipv6, respectively). The netns_ipv4 and netns_ipv6 objects, in turn, contain pointers to xt_table objects. For IPv4, in struct netns_ipv4 you have, for example, iptable_filter, iptable_mangle, nat_table, and more (include/net/netns/ipv4.h). In struct netns_ipv6 you have, for example, ip6table_filter, ip6table_mangle, ip6table_nat, and more (include/net/netns/ipv6.h). For a full list of the IPv4 and IPv6 network namespace netfilter tables and the corresponding kernel modules, see Tables 9-2 and 9-3 in the "Quick Reference" section at the end of this chapter.

To understand how iptables works, let's look at a real example with the filter table. For the sake of simplicity, let's assume that the filter table is the only one that is built and that the LOG target is supported; the only rule I am using is for logging, as you will shortly see.

First, let's take a look at the definition of the filter table:

#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
                            (1 << NF_INET_FORWARD) | \
                            (1 << NF_INET_LOCAL_OUT))

static const struct xt_table packet_filter = {
    .name        = "filter",
    .valid_hooks = FILTER_VALID_HOOKS,
    .me          = THIS_MODULE,
    .af          = NFPROTO_IPV4,
    .priority    = NF_IP_PRI_FILTER,
};

(net/ipv4/netfilter/iptable_filter.c)

Initialization of the table is done by first calling the xt_hook_link() method, which sets the iptable_filter_hook() method as the hook callback of the nf_hook_ops objects of the packet_filter table:

static struct nf_hook_ops *filter_ops __read_mostly;

static int __init iptable_filter_init(void)
{
    . . .
    filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
    . . .
}

Then you call the ipt_register_table() method (note that the IPv4 netns object, net->ipv4, keeps a pointer to the filter table, iptable_filter):

static int __net_init iptable_filter_net_init(struct net *net)
{
    . . .
    net->ipv4.iptable_filter =
        ipt_register_table(net, &packet_filter, repl);
    . . .
    return PTR_RET(net->ipv4.iptable_filter);
}

(net/ipv4/netfilter/iptable_filter.c)

Note that there are three hooks in the filter table:

* NF_INET_LOCAL_IN

* NF_INET_FORWARD

* NF_INET_LOCAL_OUT

For this example, you set the following rule, using the iptables command line:

iptables -A INPUT -p udp --dport=5001 -j LOG --log-level 1

The meaning of this rule is that incoming UDP packets with destination port 5001 are dumped to the syslog. The log-level modifier is the standard syslog level in the range 0 through 7; 0 is emergency and 7 is debug. Note that when running an iptables command, you should specify the table you want to use with the -t modifier; for example, iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE adds a rule to the NAT table. When you do not specify a table name with the -t modifier, the filter table is used by default. So by running iptables -A INPUT -p udp --dport=5001 -j LOG --log-level 1, you add a rule to the filter table.

Note

You can set targets on iptables rules; usually these are targets from the Linux netfilter subsystem (see the earlier example of using the LOG target). You can also write your own targets and extend the iptables userspace code to support them. See "Writing Netfilter modules," by Jan Engelhardt and Nicolas Bouliane: http://inai.de/documents/Netfilter_Modules.pdf .

Note that CONFIG_NETFILTER_XT_TARGET_LOG must be set in order to use the LOG target in an iptables rule, as shown in the earlier example. You can refer to the code of net/netfilter/xt_LOG.c as an example of an iptables target module.

When a UDP packet with destination port 5001 reaches the network driver and goes up to the network layer (L3), the first hook it encounters is the NF_INET_PRE_ROUTING hook; the filter table does not register a callback at NF_INET_PRE_ROUTING, though. It has only three hooks: NF_INET_LOCAL_IN, NF_INET_FORWARD, and NF_INET_LOCAL_OUT, as mentioned earlier. So you continue to the ip_rcv_finish() method and perform a lookup in the routing subsystem. Now there are two cases: the packet is intended to be delivered to the local host, or it is intended to be forwarded (let's ignore cases when the packet is to be discarded). In Figure 9-2, you can see the packet traversal in both cases.

Figure 9-2. Traffic for me and Forwarded Traffic with a Filter table rule
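As a quick sanity check of the LOG rule above, before tracing the code paths, you could send a matching packet from a neighboring host and inspect the kernel log. This is a hypothetical session; the address 192.168.1.9 is illustrative:

echo hello | nc -u 192.168.1.9 5001

dmesg | tail

The LOG target prints the packet headers (IN=, SRC=, DST=, PROTO=UDP, DPT=5001, and so on) to the kernel log, so you can confirm that the rule matched before following the ipt_do_table() path described next.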
### Delivery to the Local Host

First you reach the ip_local_deliver() method; take a short look at this method:

int ip_local_deliver(struct sk_buff *skb)
{
    . . .
    return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
                   ip_local_deliver_finish);
}

As you can see, the NF_INET_LOCAL_IN hook is in this method, and as mentioned earlier, NF_INET_LOCAL_IN is one of the filter table hooks, so the NF_HOOK() macro invokes the iptable_filter_hook() method. Now take a look at the iptable_filter_hook() method:

static unsigned int iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
                                        const struct net_device *in,
                                        const struct net_device *out,
                                        int (*okfn)(struct sk_buff *))
{
    const struct net *net;
    . . .
    net = dev_net((in != NULL) ? in : out);
    . . .
    return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter);
}

(net/ipv4/netfilter/iptable_filter.c)

The ipt_do_table() method in fact invokes the LOG target callback, ipt_log_packet(), which writes the packet headers to the syslog. If there were more rules, they would be evaluated at this point. Because there are no more rules, you continue to the ip_local_deliver_finish() method, and the packet continues its traversal to the transport layer (L4), to be handled by a corresponding socket.

### Forwarding the Packet

In the second case, after a lookup in the routing subsystem, you find that the packet is to be forwarded, so the ip_forward() method is called:

int ip_forward(struct sk_buff *skb)
{
    . . .
    return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
                   rt->dst.dev, ip_forward_finish);
    . . .

Because the filter table has a registered hook callback at NF_INET_FORWARD, as mentioned, you again invoke the iptable_filter_hook() method. Consequently, as before, you again call the ipt_do_table() method, which in turn calls the ipt_log_packet() method again. You then continue to the ip_forward_finish() method (note that ip_forward_finish is the last argument of the NF_HOOK macro above, representing the continuation method). Then you call the ip_output() method, and because the filter table has no NF_INET_POST_ROUTING hook, you continue to the ip_finish_output() method.

Note

You can filter packets according to their Connection Tracking state. The following rule dumps to the syslog packets whose Connection Tracking state is ESTABLISHED:

iptables -A INPUT -p tcp -m conntrack --ctstate ESTABLISHED -j LOG --log-level 1

### Network Address Translation (NAT)

The Network Address Translation (NAT) module deals mostly with IP address translation, as the name implies, and with port manipulation. One of the most common uses of NAT is to enable a group of hosts with private IP addresses on a Local Area Network to access the Internet via some residential gateway: a NAT rule installed on the gateway gives the hosts the ability to access the Web (an example rule for this scenario appears just below). The netfilter subsystem has NAT implementations for IPv4 and for IPv6. The IPv6 NAT implementation is mainly based on the IPv4 implementation and provides, from a user perspective, an interface similar to IPv4's. IPv6 NAT support was merged in kernel 3.7. It provides some features, like an easy solution for load balancing (by setting a DNAT on incoming traffic), and more. The IPv6 NAT module is in net/ipv6/netfilter/ip6table_nat.c.
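To make the residential-gateway scenario concrete, a typical configuration on the gateway could look like the following two commands; this is a hypothetical setup, in which the LAN subnet 192.168.1.0/24 and the external interface eth0 are assumptions for illustration:

sysctl -w net.ipv4.ip_forward=1

iptables -t nat -A POSTROUTING -s 192.168.1.0/24 -o eth0 -j MASQUERADE

The first command enables IPv4 forwarding on the gateway; the second adds a MASQUERADE rule to the NAT table. MASQUERADE is a form of source NAT (SNAT, described next) that always uses the current address of the outgoing interface, which is convenient when the gateway's external address is assigned dynamically.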
There are many types of NAT setups, and there is a lot of documentation on the Web about NAT administration. I talk about two common configurations: SNAT, source NAT, where the source IP address is changed, and DNAT, destination NAT, where the destination IP address is changed. You use the -j flag to select SNAT or DNAT. The implementation of both DNAT and SNAT is in net/netfilter/xt_nat.c. The next section discusses NAT initialization.

#### NAT initialization

The NAT table, like the filter table of the previous section, is also an xt_table object. It is registered at all hook points except the NF_INET_FORWARD hook:

static const struct xt_table nf_nat_ipv4_table = {
    .name        = "nat",
    .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
                   (1 << NF_INET_POST_ROUTING) |
                   (1 << NF_INET_LOCAL_OUT) |
                   (1 << NF_INET_LOCAL_IN),
    .me          = THIS_MODULE,
    .af          = NFPROTO_IPV4,
};

(net/ipv4/netfilter/iptable_nat.c)

Registration and unregistration of the NAT table are done by calling the ipt_register_table() and ipt_unregister_table() methods, respectively (net/ipv4/netfilter/iptable_nat.c). The network namespace (struct net) includes an IPv4-specific object (netns_ipv4), which includes a pointer to the IPv4 NAT table (nat_table), as mentioned in the earlier "IPTables" section. The xt_table object created by the ipt_register_table() method is assigned to this nat_table pointer. You also define an array of nf_hook_ops objects and register it:

static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
    /* Before packet filtering, change destination */
    {
        .hook     = nf_nat_ipv4_in,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_PRE_ROUTING,
        .priority = NF_IP_PRI_NAT_DST,
    },
    /* After packet filtering, change source */
    {
        .hook     = nf_nat_ipv4_out,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP_PRI_NAT_SRC,
    },
    /* Before packet filtering, change destination */
    {
        .hook     = nf_nat_ipv4_local_fn,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_OUT,
        .priority = NF_IP_PRI_NAT_DST,
    },
    /* After packet filtering, change source */
    {
        .hook     = nf_nat_ipv4_fn,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_NAT_SRC,
    },
};

Registration of the nf_nat_ipv4_ops array is done in the iptable_nat_init() method:

static int __init iptable_nat_init(void)
{
    int err;
    . . .
    err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
    if (err < 0)
        goto err2;
    return 0;
    . . .
}

(net/ipv4/netfilter/iptable_nat.c)

### NAT Hook Callbacks and Connection Tracking Hook Callbacks

On some hooks, both NAT callbacks and Connection Tracking callbacks are registered. For example, on the NF_INET_PRE_ROUTING hook (the first hook an incoming packet arrives at), there are two registered callbacks: the Connection Tracking callback, ipv4_conntrack_in(), and the NAT callback, nf_nat_ipv4_in(). The priority of the Connection Tracking callback, ipv4_conntrack_in(), is NF_IP_PRI_CONNTRACK (-200), and the priority of the NAT callback, nf_nat_ipv4_in(), is NF_IP_PRI_NAT_DST (-100). Because callbacks on the same hook with lower priorities are invoked first, the Connection Tracking ipv4_conntrack_in() callback, which has a priority of -200, is invoked before the NAT nf_nat_ipv4_in() callback, which has a priority of -100.
See Figure 9-1 for the location of the ipv4_conntrack_in() method and Figure 9-4 for the location of nf_nat_ipv4_in(); both are at the same place, the NF_INET_PRE_ROUTING point. The reason for this ordering is that NAT performs a lookup in the Connection Tracking layer, and if it does not find an entry, NAT does not perform any address translation:

static unsigned int nf_nat_ipv4_fn(unsigned int hooknum,
                                   struct sk_buff *skb,
                                   const struct net_device *in,
                                   const struct net_device *out,
                                   int (*okfn)(struct sk_buff *))
{
    struct nf_conn *ct;
    . . .
    /* Don't try to NAT if this packet is not conntracked */
    if (nf_ct_is_untracked(ct))
        return NF_ACCEPT;
    . . .
}

(net/ipv4/netfilter/iptable_nat.c)

Note

The nf_nat_ipv4_fn() method is called from the NAT PRE_ROUTING callback, nf_nat_ipv4_in().

On the NF_INET_POST_ROUTING hook, there are two registered Connection Tracking callbacks: the ipv4_helper() callback (with a priority of NF_IP_PRI_CONNTRACK_HELPER, which is 300) and the ipv4_confirm() callback (with a priority of NF_IP_PRI_CONNTRACK_CONFIRM, which is INT_MAX, the highest possible priority value). There is also a registered NAT hook callback, nf_nat_ipv4_out(), with a priority of NF_IP_PRI_NAT_SRC, which is 100. As a result, when the NF_INET_POST_ROUTING hook is reached, first the NAT callback, nf_nat_ipv4_out(), is called; then the ipv4_helper() method is called; and the ipv4_confirm() callback is the last to be called. See Figure 9-4.

Let's take a look at a simple DNAT rule and see the traversal of a forwarded packet and the order in which the Connection Tracking callbacks and the NAT callbacks are called (for the sake of simplicity, assume that the filter table is not built in this kernel image). In the setup shown in Figure 9-3, the middle host (the AMD server) runs this DNAT rule:

iptables -t nat -A PREROUTING -j DNAT -p udp --dport 9999 --to-destination 192.168.1.8

The meaning of this DNAT rule is that incoming UDP packets sent to UDP destination port 9999 get their destination IP address changed to 192.168.1.8. The machine on the right (the Linux desktop) sends UDP packets to 192.168.1.9 with a UDP destination port of 9999. On the AMD server, the destination IPv4 address is changed to 192.168.1.8 by the DNAT rule, and the packets are sent to the laptop on the left.

Figure 9-3. A simple setup with a DNAT rule

In Figure 9-4, you can see the traversal of the first UDP packet sent in the setup just described.

Figure 9-4. NAT and netfilter hooks

The generic NAT module is net/netfilter/nf_nat_core.c. The basic elements of the NAT implementation are the nf_nat_l4proto structure (include/net/netfilter/nf_nat_l4proto.h) and the nf_nat_l3proto structure. In kernels prior to 3.7, you will encounter the nf_nat_protocol structure instead, which these two structures replaced as part of adding IPv6 NAT support. These two structures provide protocol-independent NAT core support.

Both of these structures contain a manip_pkt() function pointer that changes the packet headers.
Let's look at an example of the manip_pkt() implementation for the TCP protocol, in net/netfilter/nf_nat_proto_tcp.c:

static bool tcp_manip_pkt(struct sk_buff *skb,
                          const struct nf_nat_l3proto *l3proto,
                          unsigned int iphdroff, unsigned int hdroff,
                          const struct nf_conntrack_tuple *tuple,
                          enum nf_nat_manip_type maniptype)
{
    struct tcphdr *hdr;
    __be16 *portptr, newport, oldport;
    int hdrsize = 8; /* TCP connection tracking guarantees this much */

    /* this could be an inner header returned in icmp packet; in such
       cases we cannot update the checksum field since it is outside of
       the 8 bytes of transport layer headers we are guaranteed */
    if (skb->len >= hdroff + sizeof(struct tcphdr))
        hdrsize = sizeof(struct tcphdr);

    if (!skb_make_writable(skb, hdroff + hdrsize))
        return false;

    hdr = (struct tcphdr *)(skb->data + hdroff);

Set newport according to maniptype:

* If you need to change the source port, maniptype is NF_NAT_MANIP_SRC, so you extract the port from tuple->src.

* If you need to change the destination port, maniptype is NF_NAT_MANIP_DST, so you extract the port from tuple->dst:

    if (maniptype == NF_NAT_MANIP_SRC) {
        /* Get rid of src port */
        newport = tuple->src.u.tcp.port;
        portptr = &hdr->source;
    } else {
        /* Get rid of dst port */
        newport = tuple->dst.u.tcp.port;
        portptr = &hdr->dest;
    }

You are going to change the source port (when maniptype is NF_NAT_MANIP_SRC) or the destination port (when maniptype is NF_NAT_MANIP_DST) of the TCP header, so you need to recalculate the checksum. You must keep the old port for the checksum recalculation, which is done immediately afterward by calling the csum_update() method and the inet_proto_csum_replace2() method:

    oldport = *portptr;
    *portptr = newport;

    if (hdrsize < sizeof(*hdr))
        return true;

Recalculate the checksum:

    l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
    inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
    return true;
}

### NAT Hook Callbacks

The protocol-specific NAT module is net/ipv4/netfilter/iptable_nat.c for the IPv4 protocol and net/ipv6/netfilter/ip6table_nat.c for the IPv6 protocol. These two NAT modules have four hook callbacks each, shown in Table 9-1.

Table 9-1. IPv4 and IPv6 NAT Callbacks

Hook | Hook Callback (IPv4) | Hook Callback (IPv6)
---|---|---
NF_INET_PRE_ROUTING | nf_nat_ipv4_in | nf_nat_ipv6_in
NF_INET_POST_ROUTING | nf_nat_ipv4_out | nf_nat_ipv6_out
NF_INET_LOCAL_OUT | nf_nat_ipv4_local_fn | nf_nat_ipv6_local_fn
NF_INET_LOCAL_IN | nf_nat_ipv4_fn | nf_nat_ipv6_fn

The nf_nat_ipv4_fn() method is the most important of these methods (for IPv4). The other three methods, nf_nat_ipv4_in(), nf_nat_ipv4_out(), and nf_nat_ipv4_local_fn(), all invoke the nf_nat_ipv4_fn() method. Let's take a look at the nf_nat_ipv4_fn() method:

static unsigned int nf_nat_ipv4_fn(unsigned int hooknum,
                                   struct sk_buff *skb,
                                   const struct net_device *in,
                                   const struct net_device *out,
                                   int (*okfn)(struct sk_buff *))
{
    struct nf_conn *ct;
    enum ip_conntrack_info ctinfo;
    struct nf_conn_nat *nat;
    /* maniptype == SRC for postrouting. */
    enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);

    /* We never see fragments: conntrack defrags on pre-routing
     * and local-out, and nf_nat_out protects post-routing.
     */
    NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));

    ct = nf_ct_get(skb, &ctinfo);

    /* Can't track?
     * It's not due to stress, or conntrack would
     * have dropped it. Hence it's the user's responsibilty to
     * packet filter it out, or implement conntrack/NAT for that
     * protocol. 8) --RR
     */
    if (!ct)
        return NF_ACCEPT;

    /* Don't try to NAT if this packet is not conntracked */
    if (nf_ct_is_untracked(ct))
        return NF_ACCEPT;

    nat = nfct_nat(ct);
    if (!nat) {
        /* NAT module was loaded late. */
        if (nf_ct_is_confirmed(ct))
            return NF_ACCEPT;
        nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
        if (nat == NULL) {
            pr_debug("failed to add NAT extension\n");
            return NF_ACCEPT;
        }
    }

    switch (ctinfo) {
    case IP_CT_RELATED:
    case IP_CT_RELATED_REPLY:
        if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
            if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
                                               hooknum))
                return NF_DROP;
            else
                return NF_ACCEPT;
        }
        /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
    case IP_CT_NEW:
        /* Seen it before?  This can happen for loopback, retrans,
         * or local packets.
         */
        if (!nf_nat_initialized(ct, maniptype)) {
            unsigned int ret;

The nf_nat_rule_find() method calls the ipt_do_table() method, which iterates through all the matches of an entry in the specified table, and if there is a match, calls the target callback:

            ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
            if (ret != NF_ACCEPT)
                return ret;
        } else {
            pr_debug("Already setup manip %s for ct %p\n",
                     maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
                     ct);
            if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
                goto oif_changed;
        }
        break;
    default:
        /* ESTABLISHED */
        NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
                     ctinfo == IP_CT_ESTABLISHED_REPLY);
        if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
            goto oif_changed;
    }

    return nf_nat_packet(ct, ctinfo, hooknum, skb);

oif_changed:
    nf_ct_kill_acct(ct, ctinfo, skb);
    return NF_DROP;
}

### Connection Tracking Extensions

Connection Tracking (CT) Extensions were added in kernel 2.6.23. The main point of Connection Tracking Extensions is to allocate only what is required: for example, if the NAT module is not loaded, the extra memory needed for NAT in the Connection Tracking layer is not allocated. Some extensions are enabled by sysctls, or even depending on whether certain iptables rules are used (for example, -m connlabel). Each Connection Tracking Extension module should define an nf_ct_ext_type object and register it with the nf_ct_extend_register() method (unregistration is done by the nf_ct_extend_unregister() method). Each extension should define a method that attaches its Connection Tracking Extension to a connection (nf_conn) object; this method should be called from the init_conntrack() method. Thus, for example, there is the nf_ct_tstamp_ext_add() method for the timestamp CT Extension and the nf_ct_labels_ext_add() method for the labels CT Extension. The Connection Tracking Extensions infrastructure is implemented in net/netfilter/nf_conntrack_extend.c. These are the Connection Tracking Extensions modules as of this writing (all under net/netfilter):

* nf_conntrack_timestamp.c

* nf_conntrack_timeout.c

* nf_conntrack_acct.c

* nf_conntrack_ecache.c

* nf_conntrack_labels.c

* nf_conntrack_helper.c

## Summary

This chapter described the netfilter subsystem implementation. I covered the netfilter hooks and how they are registered. I also discussed important subjects such as the Connection Tracking mechanism, iptables, and NAT.
Chapter 10 deals with the IPsec subsystem and its implementation.

## Quick Reference

This section covers the top methods related to the topics discussed in this chapter, ordered by their context, followed by three tables and a short section about tools and libraries.

### Methods

The following is a short list of important methods of the netfilter subsystem. Some of them were mentioned in this chapter.

#### struct xt_table *ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl);

This method registers a table in the netfilter subsystem.

#### void ipt_unregister_table(struct net *net, struct xt_table *table);

This method unregisters a table in the netfilter subsystem.

#### int nf_register_hook(struct nf_hook_ops *reg);

This method registers a single nf_hook_ops object.

#### int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);

This method registers an array of nf_hook_ops objects; the second parameter is the number of elements in the array.

#### void nf_unregister_hook(struct nf_hook_ops *reg);

This method unregisters a single nf_hook_ops object.

#### void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);

This method unregisters an array of nf_hook_ops objects; the second parameter is the number of elements in the array.

#### static inline void nf_conntrack_get(struct nf_conntrack *nfct);

This method increments the reference count of the associated nf_conntrack object.

#### static inline void nf_conntrack_put(struct nf_conntrack *nfct);

This method decrements the reference count of the associated nf_conntrack object. If it reaches 0, the nf_conntrack_destroy() method is called.

#### int nf_conntrack_helper_register(struct nf_conntrack_helper *me);

This method registers an nf_conntrack_helper object.

#### static inline struct nf_conn *resolve_normal_ct(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, struct nf_conntrack_l3proto *l3proto, struct nf_conntrack_l4proto *l4proto, int *set_reply, enum ip_conntrack_info *ctinfo);

This method tries to find an nf_conntrack_tuple_hash object according to the specified SKB by calling the __nf_conntrack_find_get() method, and if it does not find such an entry, it creates one by calling the init_conntrack() method. The resolve_normal_ct() method is called from the nf_conntrack_in() method (net/netfilter/nf_conntrack_core.c).

#### struct nf_conntrack_tuple_hash *init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_l3proto *l3proto, struct nf_conntrack_l4proto *l4proto, struct sk_buff *skb, unsigned int dataoff, u32 hash);

This method allocates a Connection Tracking nf_conntrack_tuple_hash object. Invoked from the resolve_normal_ct() method, it tries to find an expectation for this connection by calling the nf_ct_find_expectation() method.

#### static struct nf_conn *__nf_conntrack_alloc(struct net *net, u16 zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash);

This method allocates an nf_conn object. It sets the timeout timer of the nf_conn object to the death_by_timeout() method.

#### int xt_register_target(struct xt_target *target);

This method registers an Xtables target extension.

#### void xt_unregister_target(struct xt_target *target);

This method unregisters an Xtables target extension.
#### int xt_register_targets(struct xt_target *target, unsigned int n);

This method registers an array of Xtables target extensions; n is the number of targets.

#### void xt_unregister_targets(struct xt_target *target, unsigned int n);

This method unregisters an array of Xtables target extensions; n is the number of targets.

#### int xt_register_match(struct xt_match *target);

This method registers an Xtables match extension.

#### void xt_unregister_match(struct xt_match *target);

This method unregisters an Xtables match extension.

#### int xt_register_matches(struct xt_match *match, unsigned int n);

This method registers an array of Xtables match extensions; n is the number of matches.

#### void xt_unregister_matches(struct xt_match *match, unsigned int n);

This method unregisters an array of Xtables match extensions; n is the number of matches.

#### int nf_ct_extend_register(struct nf_ct_ext_type *type);

This method registers a Connection Tracking Extension object.

#### void nf_ct_extend_unregister(struct nf_ct_ext_type *type);

This method unregisters a Connection Tracking Extension object.

#### int __init iptable_nat_init(void);

This method initializes the IPv4 NAT table.

#### int __init nf_conntrack_ftp_init(void);

This method initializes the Connection Tracking FTP Helper. It calls the nf_conntrack_helper_register() method to register the FTP helpers.

### MACRO

Let's look at the macro used in this chapter.

#### NF_CT_DIRECTION(hash)

This macro gets an nf_conntrack_tuple_hash object as a parameter and returns the direction (IP_CT_DIR_ORIGINAL, which is 0, or IP_CT_DIR_REPLY, which is 1) of the destination (the dst object) of the associated tuple (include/net/netfilter/nf_conntrack_tuple.h).

### Tables

And here are the tables, showing the netfilter tables in the IPv4 and IPv6 network namespaces and the netfilter hook priorities.

Table 9-2. IPv4 Network Namespace (netns_ipv4) Tables (xt_table Objects)

Linux Symbol (netns_ipv4) | Linux Module
---|---
iptable_filter | net/ipv4/netfilter/iptable_filter.c
iptable_mangle | net/ipv4/netfilter/iptable_mangle.c
iptable_raw | net/ipv4/netfilter/iptable_raw.c
arptable_filter | net/ipv4/netfilter/arp_tables.c
nat_table | net/ipv4/netfilter/iptable_nat.c
iptable_security | net/ipv4/netfilter/iptable_security.c (Note: CONFIG_SECURITY should be set.)

Table 9-3. IPv6 Network Namespace (netns_ipv6) Tables (xt_table Objects)

Linux Symbol (netns_ipv6) | Linux Module
---|---
ip6table_filter | net/ipv6/netfilter/ip6table_filter.c
ip6table_mangle | net/ipv6/netfilter/ip6table_mangle.c
ip6table_raw | net/ipv6/netfilter/ip6table_raw.c
ip6table_nat | net/ipv6/netfilter/ip6table_nat.c
ip6table_security | net/ipv6/netfilter/ip6table_security.c (Note: CONFIG_SECURITY should be set.)

Table 9-4. Netfilter Hook Priorities

Linux Symbol | Value
---|---
NF_IP_PRI_FIRST | INT_MIN
NF_IP_PRI_CONNTRACK_DEFRAG | -400
NF_IP_PRI_RAW | -300
NF_IP_PRI_SELINUX_FIRST | -225
NF_IP_PRI_CONNTRACK | -200
NF_IP_PRI_MANGLE | -150
NF_IP_PRI_NAT_DST | -100
NF_IP_PRI_FILTER | 0
NF_IP_PRI_SECURITY | 50
NF_IP_PRI_NAT_SRC | 100
NF_IP_PRI_SELINUX_LAST | 225
NF_IP_PRI_CONNTRACK_HELPER | 300
NF_IP_PRI_CONNTRACK_CONFIRM | INT_MAX
NF_IP_PRI_LAST | INT_MAX

See the nf_ip_hook_priorities enum definition in include/uapi/linux/netfilter_ipv4.h.
#### Tools and Libraries

The conntrack-tools consist of a userspace daemon, conntrackd, and a command line tool, conntrack. They provide a tool with which system administrators can interact with the netfilter Connection Tracking layer. See http://conntrack-tools.netfilter.org/ .

Some libraries developed by the netfilter project allow you to perform various userspace tasks; these libraries are prefixed with "libnetfilter", for example, libnetfilter_conntrack, libnetfilter_log, and libnetfilter_queue. For more details, see the official netfilter website, www.netfilter.org .

# 10. IPsec

Rami Rosen

Haifa, Israel

Abstract

Chapter 9 dealt with the netfilter subsystem and its kernel implementation. This chapter discusses the Internet Protocol Security (IPsec) subsystem. IPsec is a group of protocols for securing IP traffic by authenticating and encrypting each IP packet in a communication session. Most security services are provided by two major IPsec protocols: the Authentication Header (AH) protocol and the Encapsulating Security Payload (ESP) protocol. Moreover, IPsec provides protection against replay attacks, in which an attacker eavesdrops on packets and sends them again. IPsec is mandatory according to the IPv6 specification and optional in IPv4; nevertheless, most modern operating systems, including Linux, support IPsec in both IPv4 and IPv6. The first IPsec protocols were defined in 1995 (RFCs 1825-1829). In 1998, these RFCs were deprecated by RFCs 2401-2412, which were in turn updated in 2005 by RFCs 4301-4309.

The IPsec subsystem is very complex, perhaps the most complex part of the Linux kernel network stack. Its importance is paramount considering the growing security requirements of organizations and of private citizens. This chapter gives you a basis for delving into this complex subsystem.

## General

IPsec has become the standard for most IP Virtual Private Network (VPN) technology in the world. That said, there are also VPNs based on different technologies, such as Secure Sockets Layer (SSL) and PPTP (tunneling a PPP connection over the GRE protocol). Among IPsec's several modes of operation, the most important are transport mode and tunnel mode. In transport mode, only the payload of the IP packet is encrypted, whereas in tunnel mode, the entire IP packet is encrypted and inserted into a new IP packet with a new IP header.
When using a VPN with IPsec, you usually work in tunnel mode, although there are cases in which you work in transport mode (L2TP/IPsec, for example).

I start with a short discussion of the Internet Key Exchange (IKE) userspace daemons and of cryptography in IPsec. These topics are mostly not part of the kernel networking stack, but they are related to IPsec operation and are needed for a better understanding of the kernel IPsec subsystem. I follow with a discussion of the XFRM framework, which is the configuration and monitoring interface between the IPsec userspace part and the IPsec kernel components, and explain the traversal of IPsec packets in the Tx and Rx paths. I conclude the chapter with a short section about NAT traversal in IPsec, which is an important and interesting feature, and a "Quick Reference" section. The next section begins the discussion with the IKE protocol.

## IKE (Internet Key Exchange)

The most popular open source userspace Linux IPsec solutions are Openswan (and libreswan, which forked from Openswan), strongSwan, and racoon (part of ipsec-tools). Racoon is part of the Kame project, which aimed to provide a free IPv6 and IPsec protocol stack implementation for variants of BSD.

To establish an IPsec connection, you need to set up a Security Association (SA). You do that with the help of the userspace projects just mentioned. An SA is defined by two parameters: a source address and a 32-bit Security Parameter Index (SPI). The two sides (called the initiator and the responder in IPsec terminology) should agree on parameters such as a key (or more than one key); the authentication, encryption, data integrity, and key exchange algorithms; and other parameters such as key lifetime (IKEv1 only). Keys can be distributed in two different ways: by manual key exchange, which is rarely used since it is less secure, or by the IKE protocol. The Openswan and strongSwan implementations provide an IKE daemon (pluto in Openswan and charon in strongSwan) that uses UDP port 500 (both source and destination) to send and receive IKE messages. Both use the XFRM netlink interface to communicate with the native IPsec stack of the Linux kernel. The strongSwan project is the only complete open source implementation of RFC 5996, "Internet Key Exchange Protocol Version 2 (IKEv2)," whereas the Openswan project implements only a small mandatory subset.

You can use IKEv1 Aggressive Mode in Openswan and in strongSwan 5.x (in strongSwan, it must be explicitly configured, and the charon daemon is renamed weakSwan in this case), but this option is regarded as unsafe. IKEv1 is still used by Apple operating systems (iOS and Mac OS X) because of the built-in racoon legacy client. Though many implementations use IKEv1, IKEv2 brings many improvements and advantages. I mention some of them very briefly. In IKEv1, more messages are needed to establish an SA than in IKEv2. IKEv1 is very complex, whereas IKEv2 is considerably simpler and more robust, mainly because each IKEv2 request message must be acknowledged by an IKEv2 response message. In IKEv1, there are no acknowledgements, but there is a backoff algorithm that, in case of packet loss, keeps trying forever. However, in IKEv1 there can be a race when the two sides perform retransmission, whereas in IKEv2 that can't happen, because the responsibility for retransmission rests with the initiator only.
Among the other important IKEv2 features are integrated NAT traversal support; automatic narrowing of Traffic Selectors (left|rightsubnet on both sides don't have to match exactly, but one proposal can be a subset of the other); the IKEv2 configuration payload, which allows assigning virtual IPv4/IPv6 addresses and internal DNS information (a replacement for IKEv1 Mode Config); and, finally, IKEv2 EAP authentication (a replacement for the dangerous IKEv1 XAUTH protocol), which solves the problem of potentially weak PSKs by requesting a VPN server certificate and digital signature first, before the client uses a potentially weak EAP authentication algorithm (for example, EAP-MSCHAPv2).

There are two phases in IKE. The first is called Main Mode. In this stage, each side verifies the identity of the other side, and a common session key is established using the Diffie-Hellman key exchange algorithm. This mutual authentication is based on RSA or ECDSA certificates or on pre-shared secrets (pre-shared keys, PSKs), which are password based and assumed to be weaker. Other parameters, like the encryption algorithm and the authentication method to be used, are also negotiated. If this phase completes successfully, the two peers are said to have established an ISAKMP SA (Internet Security Association Key Management Protocol Security Association). The second phase is called Quick Mode. In this phase, both sides agree on the cryptographic algorithms to use. The IKEv2 protocol does not differentiate between phase 1 and phase 2 but establishes the first CHILD_SA as part of the IKE_AUTH message exchange. The CREATE_CHILD_SA message exchange is used only to establish additional CHILD_SAs or for the periodic rekeying of the IKE and IPsec SAs. This is why IKEv1 needs nine messages to establish a single IPsec SA, whereas IKEv2 does the same in just four messages.

The next section briefly discusses cryptography in the context of IPsec (a fuller treatment of the subject would be beyond the scope of this book).

## IPsec and Cryptography

There are two widely used IPsec stacks for Linux: the native Netkey stack (developed by Alexey Kuznetsov and David S. Miller), introduced with the 2.6 kernel, and the KLIPS stack, originally written for the 2.0 kernel (it predates netfilter!). Netkey uses the Linux kernel Crypto API, whereas KLIPS might support more crypto hardware through the Open Cryptography Framework (OCF). OCF's advantage is that it enables using asynchronous calls to encrypt/decrypt data. In the Linux kernel, most of the Crypto API performs synchronous calls. I should mention the acrypto kernel code, which is the asynchronous crypto layer of the Linux kernel. There are asynchronous implementations for all algorithm types. A lot of hardware crypto accelerators use the asynchronous crypto interface for crypto request offloading, simply because they can't block until the crypto job is done; they have to use the asynchronous API.

It is also possible to use software-implemented algorithms with the asynchronous API. For example, the cryptd crypto template can run arbitrary algorithms in asynchronous mode. And you can use the pcrypt crypto template when working in a multicore environment. This template parallelizes the crypto layer by sending incoming crypto requests to a configurable set of CPUs. It also takes care of the order of the crypto requests, so it does not introduce packet reordering when used with IPsec. The use of pcrypt can speed up IPsec by magnitudes in some situations.
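To make this more concrete, the following is a minimal, illustrative sketch (not taken from the kernel sources; the example_alloc_gcm() helper name is hypothetical) of how kernel code might allocate an AEAD transform through the crypto API. Users of this API get an asynchronous completion model; requesting a name such as "pcrypt(rfc4106(gcm(aes)))" instead would wrap the same algorithm with the pcrypt template described above:

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/crypto.h>
#include <crypto/aead.h>

/* Illustrative only: allocate and key an AEAD transform (AES-GCM, as
 * used by ESP). Error handling is trimmed for brevity. */
static struct crypto_aead *example_alloc_gcm(const u8 *key, unsigned int keylen)
{
        struct crypto_aead *tfm = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);

        if (IS_ERR(tfm))
                return tfm;
        if (crypto_aead_setkey(tfm, key, keylen)) {
                crypto_free_aead(tfm);
                return ERR_PTR(-EINVAL);
        }
        return tfm;
}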
The crypto layer has a userspace management API, which is used by the crconf ( http://sourceforge.net/projects/crconf/ ) tool to configure the crypto layer, so asynchronous crypto algorithms can be configured whenever needed. With the Linux 2.6.25 kernel, released in 2008, the XFRM framework started to offer support for the very efficient AEAD (Authenticated Encryption with Associated Data) algorithms (for example, AES-GCM); especially when the Intel AES-NI instruction set is available, data integrity comes nearly for free. Delving deeply into the details of cryptography in IPsec is beyond the scope of this book. For further information, I suggest reading the relevant chapters in Network Security Essentials, Fifth Edition by William Stallings (Prentice Hall, 2013).

The next section discusses the XFRM framework, which is the infrastructure of IPsec.

## The XFRM Framework

IPsec is implemented by the XFRM (pronounced "transform") framework, which originated in the USAGI project, an effort to provide a production-quality IPv6 and IPsec protocol stack. The term transform refers to an incoming packet or an outgoing packet being transformed in the kernel stack according to some IPsec rule. The XFRM framework was introduced in kernel 2.5. The XFRM infrastructure is protocol-family independent, which means that there is a generic part common to both IPv4 and IPv6, located under net/xfrm. Both IPv4 and IPv6 have their own implementations of ESP, AH, and IPCOMP. For example, the IPv4 ESP module is net/ipv4/esp4.c, and the IPv6 ESP module is net/ipv6/esp6.c. Apart from that, IPv4 and IPv6 implement some protocol-specific modules for supporting the XFRM infrastructure, such as net/ipv4/xfrm4_policy.c and net/ipv6/xfrm6_policy.c.

The XFRM framework supports network namespaces, a form of lightweight process virtualization that enables a single process or a group of processes to have their own network stack (I discuss network namespaces in Chapter 14). Each network namespace (instance of struct net) includes a member called xfrm, which is an instance of the netns_xfrm structure. This object includes many data structures and variables that you will encounter in this chapter, such as the hash tables of XFRM policies and the hash tables of XFRM states, sysctl parameters, the XFRM state garbage collector, counters, and more:

struct netns_xfrm {
        struct hlist_head *state_bydst;
        struct hlist_head *state_bysrc;
        struct hlist_head *state_byspi;
        . . .
        unsigned int state_num;
        . . .
        struct work_struct state_gc_work;
        . . .
        u32 sysctl_aevent_etime;
        u32 sysctl_aevent_rseqth;
        int sysctl_larval_drop;
        u32 sysctl_acq_expires;
};

(include/net/netns/xfrm.h)

### XFRM Initialization

In IPv4, XFRM initialization is done by calling the xfrm_init() method and the xfrm4_init() method from the ip_rt_init() method in net/ipv4/route.c. In IPv6, the xfrm6_init() method is invoked from the ip6_route_init() method to perform XFRM initialization. Communication between the userspace and the kernel is done by creating a NETLINK_XFRM netlink socket and sending and receiving netlink messages. The NETLINK_XFRM netlink kernel socket is created in the following method:

static int __net_init xfrm_user_net_init(struct net *net)
{
        struct sock *nlsk;
        struct netlink_kernel_cfg cfg = {
                .groups = XFRMNLGRP_MAX,
                .input  = xfrm_netlink_rcv,
        };

        nlsk = netlink_kernel_create(net, NETLINK_XFRM, &cfg);
        . . .
        return 0;
}

Messages sent from userspace (like XFRM_MSG_NEWPOLICY for creating a new Security Policy or XFRM_MSG_NEWSA for creating a new Security Association) are handled by the xfrm_netlink_rcv() method (net/xfrm/xfrm_user.c), which in turn calls the xfrm_user_rcv_msg() method (I discuss netlink sockets in Chapter 2).

The XFRM policy and the XFRM state are the fundamental data structures of the XFRM framework. I start by describing what an XFRM policy is, and subsequently I describe what an XFRM state is.

### XFRM Policies

A Security Policy is a rule that tells IPsec whether a certain flow should be processed or whether it can bypass IPsec processing. The xfrm_policy structure represents an IPsec policy. A policy includes a selector (an xfrm_selector object). A policy is applied when its selector matches a flow. The XFRM selector consists of fields like source and destination addresses, source and destination ports, protocol, and more, which can identify a flow:

struct xfrm_selector {
        xfrm_address_t daddr;
        xfrm_address_t saddr;
        __be16 dport;
        __be16 dport_mask;
        __be16 sport;
        __be16 sport_mask;
        __u16 family;
        __u8 prefixlen_d;
        __u8 prefixlen_s;
        __u8 proto;
        int ifindex;
        __kernel_uid32_t user;
};

(include/uapi/linux/xfrm.h)

The xfrm_selector_match() method, which gets an XFRM selector, a flow, and a family (AF_INET for IPv4 or AF_INET6 for IPv6) as parameters, returns true when the specified flow matches the specified XFRM selector. Note that the xfrm_selector structure is also used in XFRM states, as you will see later in this section. A Security Policy is represented by the xfrm_policy structure:

struct xfrm_policy {
        . . .
        struct hlist_node bydst;
        struct hlist_node byidx;

        /* This lock only affects elements except for entry. */
        rwlock_t lock;
        atomic_t refcnt;
        struct timer_list timer;

        struct flow_cache_object flo;
        atomic_t genid;
        u32 priority;
        u32 index;
        struct xfrm_mark mark;
        struct xfrm_selector selector;
        struct xfrm_lifetime_cfg lft;
        struct xfrm_lifetime_cur curlft;
        struct xfrm_policy_walk_entry walk;
        struct xfrm_policy_queue polq;
        u8 type;
        u8 action;
        u8 flags;
        u8 xfrm_nr;
        u16 family;
        struct xfrm_sec_ctx *security;
        struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH];
};

(include/net/xfrm.h)

The following description covers the important members of the xfrm_policy structure:

  * refcnt: The XFRM policy reference counter; initialized to 1 in the xfrm_policy_alloc() method, incremented by the xfrm_pol_hold() method, and decremented by the xfrm_pol_put() method.

  * timer: Per-policy timer; the timer callback is set to xfrm_policy_timer() in the xfrm_policy_alloc() method. The xfrm_policy_timer() method handles policy expiration: it is responsible for deleting an expired policy by calling the xfrm_policy_delete() method, and for sending an event (XFRM_MSG_POLEXPIRE) to all registered Key Managers by calling the km_policy_expired() method.

  * lft: The XFRM policy lifetime (xfrm_lifetime_cfg object). Every XFRM policy has a lifetime, which is a time interval (expressed as a time or a byte count).

You can set XFRM policy lifetime values with the ip command and the limit parameter—for example:

ip xfrm policy add src 172.16.2.0/24 dst 172.16.1.0/24 limit byte-soft 6000 ...

sets the soft_byte_limit of the XFRM policy lifetime (lft) to 6000; see man 8 ip xfrm.
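As an illustration of how the soft and hard limits relate, here is a simplified, hypothetical sketch (the helper name and the enum are mine; the real handling lives in the xfrm_policy_timer() method) of comparing a policy's current usage (curlft) against its configured lifetime (lft):

/* Illustrative sketch only: a soft limit should trigger a warning to the
 * key managers (so rekeying can start), whereas a hard limit means the
 * policy must expire. */
enum example_lft_state { LFT_OK, LFT_SOFT_EXPIRED, LFT_HARD_EXPIRED };

static enum example_lft_state example_check_lifetime(const struct xfrm_lifetime_cfg *lft,
                                                     const struct xfrm_lifetime_cur *cur)
{
        if ((lft->hard_byte_limit && cur->bytes >= lft->hard_byte_limit) ||
            (lft->hard_packet_limit && cur->packets >= lft->hard_packet_limit))
                return LFT_HARD_EXPIRED;
        if ((lft->soft_byte_limit && cur->bytes >= lft->soft_byte_limit) ||
            (lft->soft_packet_limit && cur->packets >= lft->soft_packet_limit))
                return LFT_SOFT_EXPIRED;
        return LFT_OK;
}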
You can display the lifetime (lft) of an XFRM policy by inspecting the lifetime configuration entry when running ip -stat xfrm policy show.

  * curlft: The XFRM policy current lifetime, which reflects the current status of the policy in the context of lifetime. The curlft is an xfrm_lifetime_cur object. It consists of four members (all of them unsigned 64-bit fields):

  * bytes: The number of bytes that were processed by the IPsec subsystem, incremented in the Tx path by the xfrm_output_one() method and in the Rx path by the xfrm_input() method.

  * packets: The number of packets that were processed by the IPsec subsystem, incremented in the Tx path by the xfrm_output_one() method and in the Rx path by the xfrm_input() method.

  * add_time: The timestamp of adding the policy, initialized when adding a policy, in the xfrm_policy_insert() method and in the xfrm_sk_policy_insert() method.

  * use_time: The timestamp of the last access to the policy. The use_time timestamp is updated, for example, in the xfrm_lookup() method or in the __xfrm_policy_check() method. Initialized to 0 when adding the XFRM policy, in the xfrm_policy_insert() method and in the xfrm_sk_policy_insert() method.

Note

You can display the current lifetime (curlft) object of an XFRM policy by inspecting the lifetime current entry when running ip -stat xfrm policy show.

  * polq: A queue to hold packets that are sent while there are still no XFRM states associated with the policy. By default, such packets are discarded by calling the make_blackhole() method. When setting the xfrm_larval_drop sysctl entry to 0 (/proc/sys/net/core/xfrm_larval_drop), these packets are kept in a queue (polq.hold_queue) of SKBs; up to 100 packets (XFRM_MAX_QUEUE_LEN) can be kept in this queue. This is done by creating a dummy XFRM bundle with the xfrm_create_dummy_bundle() method (see more in the "XFRM Lookup" section later in this chapter). By default, the xfrm_larval_drop sysctl entry is set to 1 (see the __xfrm_sysctl_init() method in net/xfrm/xfrm_sysctl.c).

  * type: Usually the type is XFRM_POLICY_TYPE_MAIN (0). When the kernel has support for subpolicies (CONFIG_XFRM_SUB_POLICY is set), two policies can be applied to the same packet, and you can use the XFRM_POLICY_TYPE_SUB (1) type. A policy that lives a shorter time in the kernel should be a subpolicy. This feature is usually needed only for developers/debugging and for Mobile IPv6, because you might apply one policy for IPsec and one for Mobile IPv6. The IPsec policy is usually the main policy, with a longer lifetime than the Mobile IPv6 (sub) policy.

  * action: Can have one of these two values:

  * XFRM_POLICY_ALLOW (0): Permit the traffic.

  * XFRM_POLICY_BLOCK (1): Disallow the traffic (for example, when using type=reject or type=drop in /etc/ipsec.conf).

  * xfrm_nr: Number of templates associated with the policy—can be up to six templates (XFRM_MAX_DEPTH). The xfrm_tmpl structure is an intermediate structure between the XFRM state and the XFRM policy. It is initialized in the copy_templates() method, net/xfrm/xfrm_user.c.

  * family: IPv4 or IPv6.

  * security: A security context (xfrm_sec_ctx object) that allows the XFRM subsystem to restrict the sockets that can send or receive packets via Security Associations (XFRM states). For more details, see http://lwn.net/Articles/156604/ .

  * xfrm_vec: An array of XFRM templates (xfrm_tmpl objects).

The kernel stores the IPsec Security Policies in the Security Policy Database (SPD).
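Conceptually, an SPD lookup walks the policies and returns the first one whose selector matches the flow. The following helper is purely hypothetical (the kernel's real lookup is hash-based and honors policy priorities), but it sketches the idea using the xfrm_selector_match() method described earlier:

/* Illustrative sketch only: a naive SPD lookup over one hash chain. */
static struct xfrm_policy *example_spd_lookup(struct hlist_head *chain,
                                              const struct flowi *fl,
                                              unsigned short family)
{
        struct xfrm_policy *pol;

        hlist_for_each_entry(pol, chain, bydst)
                if (xfrm_selector_match(&pol->selector, fl, family))
                        return pol;   /* first matching policy wins here */
        return NULL;                  /* no policy: traffic bypasses IPsec */
}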
Management of the SPD is done by sending messages from a userspace socket. For example:

  * Adding an XFRM policy (XFRM_MSG_NEWPOLICY) is handled by the xfrm_add_policy() method.

  * Deleting an XFRM policy (XFRM_MSG_DELPOLICY) is handled by the xfrm_get_policy() method.

  * Displaying the SPD (XFRM_MSG_GETPOLICY) is handled by the xfrm_dump_policy() method.

  * Flushing the SPD (XFRM_MSG_FLUSHPOLICY) is handled by the xfrm_flush_policy() method.

The next section describes what an XFRM state is.

### XFRM States (Security Associations)

The xfrm_state structure represents an IPsec Security Association (SA) (include/net/xfrm.h). It represents unidirectional traffic and includes information such as cryptographic keys, flags, request id, statistics, replay parameters, and more. You add XFRM states by sending a request (XFRM_MSG_NEWSA) from a userspace socket; it is handled in the kernel by the xfrm_state_add() method (net/xfrm/xfrm_user.c). Likewise, you delete a state by sending an XFRM_MSG_DELSA message, which is handled in the kernel by the xfrm_del_sa() method:

struct xfrm_state {
        . . .
        union {
                struct hlist_node gclist;
                struct hlist_node bydst;
        };
        struct hlist_node bysrc;
        struct hlist_node byspi;

        atomic_t refcnt;
        spinlock_t lock;

        struct xfrm_id id;
        struct xfrm_selector sel;
        struct xfrm_mark mark;
        u32 tfcpad;

        u32 genid;

        /* Key manager bits */
        struct xfrm_state_walk km;

        /* Parameters of this state. */
        struct {
                u32 reqid;
                u8 mode;
                u8 replay_window;
                u8 aalgo, ealgo, calgo;
                u8 flags;
                u16 family;
                xfrm_address_t saddr;
                int header_len;
                int trailer_len;
        } props;

        struct xfrm_lifetime_cfg lft;

        /* Data for transformer */
        struct xfrm_algo_auth *aalg;
        struct xfrm_algo *ealg;
        struct xfrm_algo *calg;
        struct xfrm_algo_aead *aead;

        /* Data for encapsulator */
        struct xfrm_encap_tmpl *encap;

        /* Data for care-of address */
        xfrm_address_t *coaddr;

        /* IPComp needs an IPIP tunnel for handling uncompressed packets */
        struct xfrm_state *tunnel;

        /* If a tunnel, number of users + 1 */
        atomic_t tunnel_users;

        /* State for replay detection */
        struct xfrm_replay_state replay;
        struct xfrm_replay_state_esn *replay_esn;

        /* Replay detection state at the time we sent the last notification */
        struct xfrm_replay_state preplay;
        struct xfrm_replay_state_esn *preplay_esn;

        /* The functions for replay detection. */
        struct xfrm_replay *repl;

        /* internal flag that only holds state for delayed aevent at the
         * moment
         */
        u32 xflags;

        /* Replay detection notification settings */
        u32 replay_maxage;
        u32 replay_maxdiff;

        /* Replay detection notification timer */
        struct timer_list rtimer;

        /* Statistics */
        struct xfrm_stats stats;

        struct xfrm_lifetime_cur curlft;
        struct tasklet_hrtimer mtimer;

        /* used to fix curlft->add_time when changing date */
        long saved_tmo;

        /* Last used time */
        unsigned long lastused;

        /* Reference to data common to all the instances of this
         * transformer. */
        const struct xfrm_type *type;
        struct xfrm_mode *inner_mode;
        struct xfrm_mode *inner_mode_iaf;
        struct xfrm_mode *outer_mode;

        /* Security context */
        struct xfrm_sec_ctx *security;

        /* Private data of this transformer, format is opaque,
         * interpreted by xfrm_type methods.
         */
        void *data;
};

(include/net/xfrm.h)

The following description details some of the important members of the xfrm_state structure:

  * refcnt: A reference counter, incremented by the xfrm_state_hold() method and decremented by the __xfrm_state_put() method or by the xfrm_state_put() method (the latter also releases the XFRM state by calling the __xfrm_state_destroy() method when the reference counter reaches 0).

  * id: The id (xfrm_id object) consists of three fields, which uniquely define the state: destination address, spi, and security protocol (AH, ESP, or IPCOMP).

  * props: The properties of the XFRM state. For example:

  * mode: Can be one of five modes (for example, XFRM_MODE_TRANSPORT for transport mode or XFRM_MODE_TUNNEL for tunnel mode; see include/uapi/linux/xfrm.h).

  * flags: For example, XFRM_STATE_ICMP. These flags are available in include/uapi/linux/xfrm.h. They can be set from userspace, for example, with the ip command and the flag option: ip xfrm state add flag icmp ...

  * family: IPv4 or IPv6.

  * saddr: The source address of the XFRM state.

  * lft: The XFRM state lifetime (xfrm_lifetime_cfg object).

  * stats: An xfrm_stats object, representing XFRM state statistics. You can display the XFRM state statistics with ip -stat xfrm state show.

The kernel stores the IPsec Security Associations in the Security Associations Database (SAD). The xfrm_state objects are stored in three hash tables in netns_xfrm (the XFRM namespace, discussed earlier): state_bydst, state_bysrc, and state_byspi. The keys to these tables are computed by the xfrm_dst_hash(), xfrm_src_hash(), and xfrm_spi_hash() methods, respectively. When an xfrm_state object is added, it is inserted into these three hash tables. If the value of the spi is 0 (the value 0 is normally not used for an spi—I will shortly mention when it is 0), the xfrm_state object is not added to the state_byspi hash table (see the __xfrm_state_insert() method in net/xfrm/xfrm_state.c).

Note

An spi with a value of 0 is used only for acquire states. The kernel sends an acquire message to the key manager and adds a temporary acquire state with spi 0 if traffic matches a policy but the state is not yet resolved. The kernel does not bother to send a further acquire as long as the acquire state exists; the lifetime can be configured at net->xfrm.sysctl_acq_expires. If the state gets resolved, this acquire state is replaced by the actual state.

Lookup in the SAD can be done by the following:

  * xfrm_state_lookup() method: In the state_byspi hash table.

  * xfrm_state_lookup_byaddr() method: In the state_bysrc hash table.

  * xfrm_state_find() method: In the state_bydst hash table.

The ESP protocol is the most commonly used IPsec protocol; it supports both encryption and authentication. The next section discusses the IPv4 ESP implementation.

## ESP Implementation (IPv4)

The ESP protocol is specified in RFC 4303; it supports both encryption and authentication. Though it also supports encryption-only and authentication-only modes, it is usually used with both encryption and authentication, because that is safer. I should also mention here the newer AEAD (Authenticated Encryption with Associated Data) methods, like AES-GCM, which can do the encryption and data integrity computations in a single pass and can be highly parallelized on multiple cores, so that with the Intel AES-NI instruction set, an IPsec throughput of several Gbit/s can be achieved.
The ESP protocol supports both tunnel mode and transport mode; the protocol identifier is 50 (IPPROTO_ESP). ESP adds a new header and a trailer to each packet. According to the ESP format, illustrated in Figure 10-1, it contains the following fields:

  * SPI: A 32-bit Security Parameter Index. Together with the destination address, it identifies an SA.

  * Sequence Number: 32 bits, incremented by 1 for each transmitted packet in order to protect against replay attacks.

  * Payload Data: A variable-size encrypted data block.

  * Padding: Padding for the encrypted data block in order to satisfy alignment requirements (0–255 bytes).

  * Pad Length: The size of the padding in bytes (1 byte).

  * Next Header: The type of the next header (1 byte).

  * Authentication Data: The Integrity Check Value (ICV).

Figure 10-1.

ESP format

The next section discusses IPv4 ESP initialization.

### IPv4 ESP Initialization

We first define an esp_type (xfrm_type object) and an esp4_protocol (net_protocol object) and register them thus:

static const struct xfrm_type esp_type = {
        .description    = "ESP4",
        .owner          = THIS_MODULE,
        .proto          = IPPROTO_ESP,
        .flags          = XFRM_TYPE_REPLAY_PROT,
        .init_state     = esp_init_state,
        .destructor     = esp_destroy,
        .get_mtu        = esp4_get_mtu,
        .input          = esp_input,
        .output         = esp_output
};

static const struct net_protocol esp4_protocol = {
        .handler        = xfrm4_rcv,
        .err_handler    = esp4_err,
        .no_policy      = 1,
        .netns_ok       = 1,
};

static int __init esp4_init(void)
{

Each protocol family has an instance of an xfrm_state_afinfo object, which includes protocol-family-specific state methods; thus there is xfrm4_state_afinfo for IPv4 (net/ipv4/xfrm4_state.c) and xfrm6_state_afinfo for IPv6. This object includes an array of xfrm_type objects called type_map. Registering an XFRM type by calling the xfrm_register_type() method sets the specified xfrm_type as an element in this array:

        if (xfrm_register_type(&esp_type, AF_INET) < 0) {
                pr_info("%s: can't add xfrm type\n", __func__);
                return -EAGAIN;
        }

Registering the IPv4 ESP protocol is done like registering any other IPv4 protocol, by calling the inet_add_protocol() method. Note that the protocol handler used by IPv4 ESP, namely the xfrm4_rcv() method, is also used by the IPv4 AH protocol (net/ipv4/ah4.c) and by the IPv4 IPCOMP (IP Payload Compression Protocol) protocol (net/ipv4/ipcomp.c):

        if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
                pr_info("%s: can't add protocol\n", __func__);
                xfrm_unregister_type(&esp_type, AF_INET);
                return -EAGAIN;
        }
        return 0;
}

(net/ipv4/esp4.c)

## Receiving an IPsec Packet (Transport Mode)

Suppose you work in transport mode in IPv4, and you receive an ESP packet that is destined to the local host. ESP in transport mode does not encrypt the IP header, only the IP payload. Figure 10-2 shows the traversal of an incoming IPv4 ESP packet, and its stages are described in this section. The packet passes all the usual stages of local delivery, starting with the ip_rcv() method, until it reaches the ip_local_deliver_finish() method. Because the value of the protocol field in the IPv4 header is ESP (50), its handler, the xfrm4_rcv() method, is invoked, as you saw earlier. The xfrm4_rcv() method further calls the generic xfrm_input() method, which performs a lookup in the SAD by calling the xfrm_state_lookup() method. If the lookup fails, the packet is dropped.
In case of a lookup hit, the input callback method of the corresponding IPsec protocol is invoked:

int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
{
        struct xfrm_state *x;
        do {
        . . .

Perform a lookup in the state_byspi hash table:

                x = xfrm_state_lookup(net, skb->mark, daddr, spi, nexthdr, family);

Drop the packet silently if the lookup failed:

                if (x == NULL) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
                        xfrm_audit_state_notfound(skb, family, spi, seq);
                        goto drop;
                }

In this case of IPv4 ESP incoming traffic, the XFRM type associated with the state (x->type) is the ESP XFRM type (esp_type); its input callback was set to esp_input(), as mentioned earlier in the "IPv4 ESP Initialization" section.

In the following line, calling x->type->input() invokes the esp_input() method; this method returns the protocol number of the original packet, before it was encrypted by ESP:

                nexthdr = x->type->input(x, skb);
                . . .

The original protocol number is kept in the control buffer (cb) of the SKB by using the XFRM_MODE_SKB_CB macro; it will be used later for modifying the IPv4 header of the packet, as you will see:

                XFRM_MODE_SKB_CB(skb)->protocol = nexthdr;

After the esp_input() method terminates, the xfrm4_transport_finish() method is invoked. This method modifies various fields of the IPv4 header. Take a look at the xfrm4_transport_finish() method:

int xfrm4_transport_finish(struct sk_buff *skb, int async)
{
        struct iphdr *iph = ip_hdr(skb);

The protocol of the IPv4 header (iph->protocol) is 50 (ESP) at this point; you should set it to the protocol number of the original packet (before it was encrypted by ESP) so that it will be processed by L4 sockets. The protocol number of the original packet was kept in XFRM_MODE_SKB_CB(skb)->protocol, as you saw earlier in this section:

        iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
        . . .
        __skb_push(skb, skb->data - skb_network_header(skb));
        iph->tot_len = htons(skb->len);

Recalculate the checksum, since the IPv4 header was modified:

        ip_send_check(iph);

Invoke any netfilter NF_INET_PRE_ROUTING hook callback and then call the xfrm4_rcv_encap_finish() method:

        NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
                xfrm4_rcv_encap_finish);
        return 0;
}

The xfrm4_rcv_encap_finish() method calls the ip_local_deliver() method. Now the value of the protocol member in the IPv4 header is the original transport protocol (UDPv4, TCPv4, and so on), so from now on you proceed with the usual packet traversal, and the packet is passed to the transport layer (L4).

Figure 10-2.

Receiving an IPv4 ESP packet, local delivery, transport mode. Note: The figure describes an IPv4 ESP packet. For IPv4 AH packets, the ah_input() method is invoked instead of the esp_input() method; likewise, for IPv4 IPCOMP packets, the ipcomp_input() method is invoked instead of the esp_input() method

## Sending an IPsec Packet (Transport Mode)

Figure 10-3 shows the Tx path of an outgoing packet sent via IPv4 ESP in transport mode. The first step, after performing a lookup in the routing subsystem (by calling the ip_route_output_flow() method), is to perform a lookup for an XFRM policy that can be applied to this flow. You do that by calling the xfrm_lookup() method (I discuss the internals of this method later in this section).
If there is a lookup hit, you continue to the ip_local_out() method, and then, after calling several methods, as you can see in Figure 10-3, you eventually reach the esp_output() method, which encrypts the packet and then sends it out by calling the ip_output() method.

Figure 10-3.

Transmitting an IPv4 ESP packet, transport mode. For the sake of simplicity, the case of creating a dummy bundle (when there are no XFRM states) and some other details are omitted

The following section discusses how a lookup is performed in XFRM.

## XFRM Lookup

The xfrm_lookup() method is called for each packet that is sent out of the system. You want this lookup to be as efficient as possible. To achieve this goal, bundles are used. Bundles let you cache important information such as the route, the policies, the number of policies, and more; these bundles, which are instances of the xfrm_dst structure, are stored by using the flow cache. When the first packet of some flow arrives, you create an entry in the generic flow cache and subsequently create a bundle (xfrm_dst object). The bundle creation is done after a lookup for this bundle fails, because it is the first packet of this flow. When subsequent packets of this flow arrive, you will get a hit when performing a flow cache lookup:

struct xfrm_dst {
        union {
                struct dst_entry dst;
                struct rtable rt;
                struct rt6_info rt6;
        } u;
        struct dst_entry *route;
        struct flow_cache_object flo;
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        int num_pols, num_xfrms;
#ifdef CONFIG_XFRM_SUB_POLICY
        struct flowi *origin;
        struct xfrm_selector *partner;
#endif
        u32 xfrm_genid;
        u32 policy_genid;
        u32 route_mtu_cached;
        u32 child_mtu_cached;
        u32 route_cookie;
        u32 path_cookie;
};

(include/net/xfrm.h)

The xfrm_lookup() method is a very complex method. I discuss its important parts but don't delve into all its nuances. Figure 10-4 shows a block diagram of the internals of the xfrm_lookup() method.

Figure 10-4.

xfrm_lookup() internals

Let's take a look at the xfrm_lookup() method:

struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
                              const struct flowi *fl, struct sock *sk, int flags)
{

The xfrm_lookup() method handles only the Tx path, so the flow direction (dir) is set to FLOW_DIR_OUT by:

        u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);

If a policy is associated with this socket, you perform a lookup with the xfrm_sk_policy_lookup() method, which checks whether the packet flow matches the policy selector. Note that if the packet is to be forwarded, the xfrm_lookup() method was invoked from the __xfrm_route_forward() method, and there is no socket associated with the packet, because it was not generated on the local host; in this case, the specified sk argument is NULL:

        if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
                num_pols = 1;
                pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
                . . .
        }

If there is no policy associated with this socket, you perform a lookup in the generic flow cache by calling the flow_cache_lookup() method, passing as an argument a function pointer to the xfrm_bundle_lookup() method (the resolver callback). The key to the lookup is the flow object (the specified fl parameter). If you don't find an entry in the flow cache, you allocate a new flow cache entry. If you find an entry with the same genid, you call the xfrm_bundle_flo_get() method by invoking flo->ops->get(flo).
Eventually, you call the xfrm_bundle_lookup() method by invoking the resolver callback, which gets the flow object as a parameter (oldflo). See the flow_cache_lookup() method implementation in net/core/flow.c:

        flo = flow_cache_lookup(net, fl, family, dir, xfrm_bundle_lookup, dst_orig);

Fetch the bundle (xfrm_dst object) that contains the flow cache object as a member:

        xdst = container_of(flo, struct xfrm_dst, flo);

Fetch the cached data, like the number of policies, the number of templates, the policies, and the route:

        num_pols = xdst->num_pols;
        num_xfrms = xdst->num_xfrms;
        memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols);
        route = xdst->route;
        }

        dst = &xdst->u.dst;

Next comes the handling of a dummy bundle. A dummy bundle is a bundle whose route member is NULL. It is created in the XFRM bundle lookup process (by the xfrm_bundle_lookup() method), by calling the xfrm_create_dummy_bundle() method, when no XFRM states were found. In such a case, one of two options is available, according to the value of sysctl_larval_drop (/proc/sys/net/core/xfrm_larval_drop):

  * If sysctl_larval_drop is set (which means its value is 1—it is so by default, as mentioned earlier in this chapter), the packet should be discarded.

  * If sysctl_larval_drop is not set (its value is 0), the packets are kept in a per-policy queue (polq.hold_queue), which can contain up to 100 (XFRM_MAX_QUEUE_LEN) SKBs; this is implemented by the xdst_queue_output() method. These packets are kept until the XFRM states are resolved or until some timeout elapses. Once the states are resolved, the packets are sent out of the queue. If the XFRM states are not resolved after some time interval (the timeout of the xfrm_policy_queue object), the queue is flushed by the xfrm_queue_purge() method:

        if (route == NULL && num_xfrms > 0) {
                /* The only case when xfrm_bundle_lookup() returns a
                 * bundle with null route, is when the template could
                 * not be resolved. It means policies are there, but
                 * bundle could not be created, since we don't yet
                 * have the xfrm_state's. We need to wait for KM to
                 * negotiate new SA's or bail out with error.*/
                if (net->xfrm.sysctl_larval_drop) {

For IPv4, the make_blackhole() method calls the ipv4_blackhole_route() method. For IPv6, it calls the ip6_blackhole_route() method:

                        return make_blackhole(net, family, dst_orig);
                }

The next section covers one of the most important features of IPsec—NAT traversal—and explains what it is and why it is needed.

## NAT Traversal in IPsec

Why don't NAT devices allow IPsec traffic to pass? NAT changes the IP addresses and sometimes also the port numbers of the packet. As a result, it recalculates the checksum of the TCP or the UDP header. The transport layer checksum calculation takes into account the source and destination IP addresses. So even if only the IP addresses were changed, the TCP or UDP checksum must be recalculated. However, with ESP encryption in transport mode, the NAT device can't update the checksum, because the TCP or UDP headers are encrypted with ESP. There are protocols where the checksum does not cover the IP header (like SCTP), so this problem does not occur there. To solve these problems, the NAT traversal standard for IPsec was developed (or, as officially termed in RFC 3948, "UDP Encapsulation of IPsec ESP Packets"). UDP Encapsulation can be applied to IPv4 packets as well as to IPv6 packets.
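It helps to visualize the encapsulation itself. The following struct is purely illustrative (the kernel builds such packets field by field rather than with a single struct): it shows the RFC 3948 layout, in which a NAT device can rewrite the outer IP and UDP headers and fix the UDP checksum without touching the encrypted ESP payload:

#include <linux/ip.h>
#include <linux/udp.h>

/* Illustrative layout only: a UDP-encapsulated ESP packet (RFC 3948). */
struct example_udp_encap_esp {
        struct iphdr      ip;   /* outer IPv4 header, protocol = IPPROTO_UDP */
        struct udphdr     udp;  /* source and destination ports are 4500 */
        struct ip_esp_hdr esp;  /* SPI and sequence number */
        /* followed by the encrypted payload, padding, pad length,
         * next header, and the ICV (Authentication Data) */
};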
NAT traversal solutions are not limited to IPsec traffic; these techniques are typically required for client-to-client networking applications, especially for peer-to-peer and Voice over Internet Protocol (VoIP) applications.

There are some partial solutions for VoIP NAT traversal, such as STUN, TURN, ICE, and more. I should mention here that strongSwan implements the IKEv2 Mediation Extension service ( http://tools.ietf.org/html/draft-brunner-ikev2-mediation-00 ), which allows two VPN endpoints located behind a NAT router each to establish a direct peer-to-peer IPsec tunnel using a mechanism similar to TURN and ICE. STUN, for example, is used in the open source VoIP client Ekiga (formerly GnomeMeeting). The problem with these solutions is that there are NAT devices they don't cope with. Devices called SBCs (session border controllers) provide a full solution for NAT traversal in VoIP. SBCs can be implemented in hardware (Juniper Networks, for example, provides a router-integrated SBC solution) or in software. These SBC solutions perform NAT traversal of the media traffic, which is sent by the Real-time Transport Protocol (RTP), and sometimes also of the signaling traffic, which is sent by the Session Initiation Protocol (SIP). NAT traversal is optional in IKEv2. Openswan, strongSwan, and racoon support NAT traversal, but Openswan and racoon support NAT-T only with IKEv1, whereas strongSwan supports NAT traversal in both IKEv1 and IKEv2.

### NAT-T Mode of Operation

How does NAT traversal work? First, keep in mind that NAT-T is a good solution only for ESP traffic and not for AH. Another restriction is that NAT-T can't be used with manual keying, but only with IKEv1 and IKEv2. This is because NAT-T is tied to the exchange of IKEv1/IKEv2 messages. First, you must tell the userspace daemon (pluto) that you want to use the NAT traversal feature, because it is not activated by default. You do that in Openswan by adding nat_traversal=yes to the connection parameters in /etc/ipsec.conf. Clients not behind a NAT are not affected by the addition of this entry. In strongSwan, the IKEv2 charon daemon always supports NAT traversal, and this feature cannot be deactivated. In the first phase of IKE (Main Mode), you check whether both peers support NAT-T. In IKEv1, when a peer supports NAT-T, one of the ISAKMP header members (the vendor ID) announces NAT-T support. In IKEv2, NAT-T is part of the standard and does not have to be announced. If this condition is met, you check whether there are one or more NAT devices in the path between the two IPsec peers by sending NAT-D payload messages. If this condition is also met, NAT-T protects the original IPsec encoded packet by inserting a UDP header between the IP header and the ESP header. Both the source and destination ports in the UDP header are 4500. In addition, NAT-T sends keep-alive messages every 20 seconds so that the NAT retains its mapping. Keep-alive messages are also sent on UDP port 4500 and are recognized by their content and value (a single byte, 0xFF). When this packet reaches the IPsec peer, after going through the NAT, the kernel strips the UDP header and decrypts the ESP payload. See the xfrm4_udp_encap_rcv() method in net/ipv4/xfrm4_input.c.

## Summary

This chapter covered IPsec and the XFRM framework, which is the infrastructure of IPsec, as well as XFRM policies and states, which are the fundamental data structures of the XFRM framework. I also discussed IKE, the ESP4 implementation, the Rx/Tx path of ESP4 in transport mode, and NAT traversal in IPsec.
Chapter 11 deals with the following transport layer (L4) protocols: UDP, TCP, SCTP, and DCCP. The "Quick Reference" section that follows covers the top methods related to the topics discussed in this chapter, ordered by their context.

## Quick Reference

I conclude this chapter with a short list of important methods of IPsec. Some of them were mentioned in this chapter. Afterward, I include a table of XFRM SNMP MIB counters.

### Methods

Let's start with the methods.

#### bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, unsigned short family);

This method returns true when the specified flow matches the specified XFRM selector. It invokes the __xfrm4_selector_match() method for IPv4 or the __xfrm6_selector_match() method for IPv6.

#### int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, u8 type, u16 family, int dir);

This method returns 0 if the specified policy can be applied to the specified flow; otherwise, it returns an -errno value.

#### struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp);

This method allocates and initializes an XFRM policy. It sets its reference counter to 1, initializes the read-write lock, assigns the policy namespace (xp_net) to be the specified network namespace, sets its timer callback to be xfrm_policy_timer(), and sets its state resolution packet queue timer (policy->polq.hold_timer) callback to be xfrm_policy_queue_process().

#### void xfrm_policy_destroy(struct xfrm_policy *policy);

This method removes the timer of the specified XFRM policy object and releases the specified XFRM policy memory.

#### void xfrm_pol_hold(struct xfrm_policy *policy);

This method increments by 1 the reference count of the specified XFRM policy.

#### static inline void xfrm_pol_put(struct xfrm_policy *policy);

This method decrements by 1 the reference count of the specified XFRM policy. If the reference count reaches 0, the xfrm_policy_destroy() method is called.

#### struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);

This method returns the xfrm_state_afinfo object associated with the specified protocol family.

#### struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, const struct flowi *fl, struct dst_entry *dst);

This method creates an XFRM bundle. It is called from the xfrm_resolve_and_create_bundle() method.

#### int policy_to_flow_dir(int dir);

This method returns the flow direction according to the specified policy direction. For example, it returns FLOW_DIR_IN when the specified direction is XFRM_POLICY_IN, and so on.

#### static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, struct dst_entry *dst, const struct flowi *fl, int num_xfrms, u16 family);

This method creates a dummy bundle. It is called from the xfrm_bundle_lookup() method when policies were found but there are no matching states.

#### struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family);

This method allocates an XFRM bundle object. It is called from the xfrm_bundle_create() method and from the xfrm_create_dummy_bundle() method.

#### int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);

This method adds an XFRM policy to the SPD. It is invoked from the xfrm_add_policy() method (net/xfrm/xfrm_user.c) or from the pfkey_spdadd() method (net/key/af_key.c).

#### int xfrm_policy_delete(struct xfrm_policy *pol, int dir);

This method releases the resources of the specified XFRM policy object.
The direction argument (dir) is needed to decrement by 1 the corresponding XFRM policy counter (policy_count) in the per-namespace netns_xfrm object.

#### int xfrm_state_add(struct xfrm_state *x);

This method adds the specified XFRM state to the SAD.

#### int xfrm_state_delete(struct xfrm_state *x);

This method deletes the specified XFRM state from the SAD.

#### void __xfrm_state_destroy(struct xfrm_state *x);

This method releases the resources of an XFRM state by adding it to the XFRM states garbage list and activating the XFRM state garbage collector.

#### int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, int (*func)(struct xfrm_state *, int, void*), void *data);

This method iterates over all XFRM states (net->xfrm.state_all) and invokes the specified func callback.

#### struct xfrm_state *xfrm_state_alloc(struct net *net);

This method allocates and initializes an XFRM state.

#### void xfrm_queue_purge(struct sk_buff_head *list);

This method flushes the state resolution per-policy queue (polq.hold_queue).

#### int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);

This method is the main Rx IPsec handler.

#### static struct dst_entry *make_blackhole(struct net *net, u16 family, struct dst_entry *dst_orig);

This method is invoked from the xfrm_lookup() method when there are no resolved states and sysctl_larval_drop is set. For IPv4, the make_blackhole() method calls the ipv4_blackhole_route() method; for IPv6, it calls the ip6_blackhole_route() method.

#### int xdst_queue_output(struct sk_buff *skb);

This method handles adding packets to the per-policy state resolution packet queue (polq.hold_queue). This queue can contain up to 100 (XFRM_MAX_QUEUE_LEN) packets.

#### struct net *xs_net(struct xfrm_state *x);

This method returns the namespace object (xs_net) associated with the specified xfrm_state object.

#### struct net *xp_net(const struct xfrm_policy *xp);

This method returns the namespace object (xp_net) associated with the specified xfrm_policy object.

#### int xfrm_policy_id2dir(u32 index);

This method returns the direction of the policy according to the specified index.

#### int esp_input(struct xfrm_state *x, struct sk_buff *skb);

This method is the main IPv4 ESP protocol handler.

#### struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb);

This method returns the ESP header associated with the specified SKB.

#### int verify_newpolicy_info(struct xfrm_userpolicy_info *p);

This method verifies that the specified xfrm_userpolicy_info object contains valid values (xfrm_userpolicy_info is the object that is passed from userspace). It returns 0 if it is a valid object, and -EINVAL or -EAFNOSUPPORT if not.

### Table

Table 10-1 lists the XFRM SNMP MIB counters.

Table 10-1.
XFRM SNMP MIB counters

Linux Symbol | SNMP (procfs) Symbol | Methods in Which the Counter Might Be Incremented
---|---|---
LINUX_MIB_XFRMINERROR | XfrmInError | xfrm_input()
LINUX_MIB_XFRMINBUFFERERROR | XfrmInBufferError | xfrm_input(), __xfrm_policy_check()
LINUX_MIB_XFRMINHDRERROR | XfrmInHdrError | xfrm_input(), __xfrm_policy_check()
LINUX_MIB_XFRMINNOSTATES | XfrmInNoStates | xfrm_input()
LINUX_MIB_XFRMINSTATEPROTOERROR | XfrmInStateProtoError | xfrm_input()
LINUX_MIB_XFRMINSTATEMODEERROR | XfrmInStateModeError | xfrm_input()
LINUX_MIB_XFRMINSTATESEQERROR | XfrmInStateSeqError | xfrm_input()
LINUX_MIB_XFRMINSTATEEXPIRED | XfrmInStateExpired | xfrm_input()
LINUX_MIB_XFRMINSTATEMISMATCH | XfrmInStateMismatch | xfrm_input(), __xfrm_policy_check()
LINUX_MIB_XFRMINSTATEINVALID | XfrmInStateInvalid | xfrm_input()
LINUX_MIB_XFRMINTMPLMISMATCH | XfrmInTmplMismatch | __xfrm_policy_check()
LINUX_MIB_XFRMINNOPOLS | XfrmInNoPols | __xfrm_policy_check()
LINUX_MIB_XFRMINPOLBLOCK | XfrmInPolBlock | __xfrm_policy_check()
LINUX_MIB_XFRMINPOLERROR | XfrmInPolError | __xfrm_policy_check()
LINUX_MIB_XFRMOUTERROR | XfrmOutError | xfrm_output_one(), xfrm_output()
LINUX_MIB_XFRMOUTBUNDLEGENERROR | XfrmOutBundleGenError | xfrm_resolve_and_create_bundle()
LINUX_MIB_XFRMOUTBUNDLECHECKERROR | XfrmOutBundleCheckError | xfrm_resolve_and_create_bundle()
LINUX_MIB_XFRMOUTNOSTATES | XfrmOutNoStates | xfrm_lookup()
LINUX_MIB_XFRMOUTSTATEPROTOERROR | XfrmOutStateProtoError | xfrm_output_one()
LINUX_MIB_XFRMOUTSTATEMODEERROR | XfrmOutStateModeError | xfrm_output_one()
LINUX_MIB_XFRMOUTSTATESEQERROR | XfrmOutStateSeqError | xfrm_output_one()
LINUX_MIB_XFRMOUTSTATEEXPIRED | XfrmOutStateExpired | xfrm_output_one()
LINUX_MIB_XFRMOUTPOLBLOCK | XfrmOutPolBlock | xfrm_lookup()
LINUX_MIB_XFRMOUTPOLDEAD | XfrmOutPolDead | n/a
LINUX_MIB_XFRMOUTPOLERROR | XfrmOutPolError | xfrm_bundle_lookup(), xfrm_resolve_and_create_bundle()
LINUX_MIB_XFRMFWDHDRERROR | XfrmFwdHdrError | __xfrm_route_forward()
LINUX_MIB_XFRMOUTSTATEINVALID | XfrmOutStateInvalid | xfrm_output_one()

Note

The IPsec git tree: git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git. The ipsec tree is for fixes for the IPsec networking subsystem; the development in this tree is done against David Miller's net git tree.

The ipsec-next git tree: git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git. The ipsec-next tree is for changes for IPsec with linux-next as the target; the development in this tree is done against David Miller's net-next git tree.

The IPsec subsystem maintainers are Steffen Klassert, Herbert Xu, and David S. Miller.

# 11. Layer 4 Protocols

Abstract

Chapter 10 discussed the Linux IPsec subsystem and its implementation. In this chapter, I will discuss four transport layer (L4) protocols. I will start our discussion with the two most commonly used transport layer (L4) protocols, the User Datagram Protocol (UDP) and the Transmission Control Protocol (TCP), which have been used for many years. Subsequently, I will discuss the newer Stream Control Transmission Protocol (SCTP) and Datagram Congestion Control Protocol (DCCP) protocols, which combine features of TCP and UDP.
I will start the chapter by describing the sockets API, which is the interface between the transport layer (L4) and the userspace. I will discuss how sockets are implemented in the kernel and how data flows from the userspace to the transport layer and from the transport layer to the userspace. I will also deal with passing packets from the network layer (L3) to the transport layer (L4) when working with these protocols. I will discuss here mainly the IPv4 implementation of these four protocols, though some of the code is common to IPv4 and IPv6.

## Sockets

Every operating system has to provide an entry point and an API to its networking subsystems. The Linux kernel networking subsystem provides an interface to the userspace via the standard POSIX sockets API, which was specified by the IEEE (IEEE Std 1003.1g-2000, describing networking APIs, also known as POSIX.1g). This API is based on the Berkeley sockets API (also known as BSD sockets), which originated in the 4.2BSD Unix operating system and is an industry standard in several operating systems. In Linux, everything above the transport layer belongs to the userspace. Conforming to the Unix paradigm that "everything is a file," sockets are associated with files, as you will see later in this chapter. Using the uniform sockets API makes porting applications easier. These are the available socket types:

  * Stream sockets (SOCK_STREAM): Provide a reliable, byte-stream communication channel. TCP sockets are an example of stream sockets.

  * Datagram sockets (SOCK_DGRAM): Provide for the exchange of messages (called datagrams). Datagram sockets provide an unreliable communication channel, because packets can be discarded, arrive out of order, or be duplicated. UDP sockets are an example of datagram sockets.

  * Raw sockets (SOCK_RAW): Use direct access to the IP layer and allow sending or receiving traffic without any protocol-specific, transport-layer formatting.

  * Reliably delivered message (SOCK_RDM): Used by the Transparent Inter-Process Communication (TIPC) protocol, which was originally developed at Ericsson from 1996–2005 and was used in cluster applications. See http://tipc.sourceforge.net .

  * Sequenced packet stream (SOCK_SEQPACKET): This socket type is similar to the SOCK_STREAM type and is also connection-oriented. The only difference between these types is that record boundaries are maintained when using the SOCK_SEQPACKET type.
Record boundaries are visible to the receiver via the MSG_EOR (End of Record) flag. The sequenced packet stream type is not discussed in this chapter.

  * DCCP sockets (SOCK_DCCP): The Datagram Congestion Control Protocol is a transport protocol that provides a congestion-controlled flow of unreliable datagrams. It combines features of both TCP and UDP. It is discussed in a later section of this chapter.

  * Data link sockets (SOCK_PACKET): The SOCK_PACKET type is considered obsolete in the AF_INET family. See the __sock_create() method in net/socket.c.

The following is a description of some of the methods that the sockets API provides (all the kernel methods that appear in the following list are implemented in net/socket.c):

  * socket(): Creates a new socket; it will be discussed in the subsection "Creating Sockets."

  * bind(): Associates a socket with a local port and an IP address; implemented in the kernel by the sys_bind() method.

  * send(): Sends a message; implemented in the kernel by the sys_send() method.

  * recv(): Receives a message; implemented in the kernel by the sys_recv() method.

  * listen(): Allows a socket to receive connections from other sockets; implemented in the kernel by the sys_listen() method. Not relevant to datagram sockets.

  * accept(): Accepts a connection on a socket; implemented in the kernel by the sys_accept() method. Relevant only to connection-based socket types (SOCK_STREAM, SOCK_SEQPACKET).

  * connect(): Establishes a connection to a peer socket; implemented in the kernel by the sys_connect() method. Relevant to connection-based socket types (SOCK_STREAM or SOCK_SEQPACKET) as well as to connectionless socket types (SOCK_DGRAM).

This book focuses on the kernel network implementation, so I will not delve into the details of the userspace socket API. If you want more information, I recommend the following books:

  * Unix Network Programming, Volume 1: The Sockets Networking API (3rd Edition) by W. Richard Stevens, Bill Fenner, and Andrew M. Rudoff (Addison-Wesley Professional, 2003).

  * The Linux Programming Interface by Michael Kerrisk (No Starch Press, 2010).

Note

All the socket API calls are handled by the socketcall() method, in net/socket.c.

Now that you have learned about some socket types, you will learn what happens in the kernel when a socket is created. In the next section, I will introduce the two structures that implement sockets, struct socket and struct sock, and describe the difference between them; I will also describe the msghdr struct and its members.

## Creating Sockets

Two structures represent a socket in the kernel. The first is struct socket, which provides an interface to the userspace and is created by the sys_socket() method; I will discuss the sys_socket() method later in this section. The second is struct sock, which provides an interface to the network layer (L3). Since the sock structure resides in the network layer, it is a protocol-agnostic structure. I will also discuss the sock structure later in this section. The socket structure is short:

struct socket {
        socket_state state;

        kmemcheck_bitfield_begin(type);
        short type;
        kmemcheck_bitfield_end(type);

        unsigned long flags;
        . . .
        struct file *file;
        struct sock *sk;
        const struct proto_ops *ops;
};

(include/linux/net.h)

The following is a description of the members of the socket structure:

  * state: A socket can be in one of several states, like SS_UNCONNECTED, SS_CONNECTED, and more. When an INET socket is created, its state is SS_UNCONNECTED; see the inet_create() method. After a stream socket connects successfully to another host, its state is SS_CONNECTED. See the socket_state enum in include/uapi/linux/net.h.

  * type: The type of the socket, like SOCK_STREAM or SOCK_RAW; see the sock_type enum in include/linux/net.h.

  * flags: The socket flags; for example, the SOCK_EXTERNALLY_ALLOCATED flag is set in the TUN device when allocating a socket, not by the socket() system call. See the tun_chr_open() method in drivers/net/tun.c. The socket flags are defined in include/linux/net.h.

  * file: The file associated with the socket.

  * sk: The sock object associated with the socket. The sock object represents the interface to the network layer (L3). When creating a socket, the associated sk object is created. For example, in IPv4, the inet_create() method, which is invoked when creating a socket, allocates a sock object, sk, and associates it with the specified socket object.

  * ops: This object (an instance of the proto_ops object) consists mostly of callbacks for this socket, like connect(), listen(), sendmsg(), recvmsg(), and more. These callbacks are the interface to the userspace. The sendmsg() callback implements several library-level routines, such as write(), send(), sendto(), and sendmsg(). Quite similarly, the recvmsg() callback implements several library-level routines, such as read(), recv(), recvfrom(), and recvmsg(). Each protocol defines a proto_ops object of its own according to the protocol requirements. Thus, for TCP, its proto_ops object includes a listen callback, inet_listen(), and an accept callback, inet_accept(). On the other hand, the UDP protocol, which does not work in the client-server model, defines the listen() callback to be the sock_no_listen() method, and it defines the accept() callback to be the sock_no_accept() method. The only thing that both these methods do is return an error of -EOPNOTSUPP. See Table 11-1 in the "Quick Reference" section at the end of this chapter for the definitions of the TCP and UDP proto_ops objects. The proto_ops structure is defined in include/linux/net.h.

The sock structure is the network-layer representation of sockets; it is quite long, so the following shows only some of its fields that are important for our discussion:

struct sock {
        struct sk_buff_head sk_receive_queue;
        int sk_rcvbuf;
        unsigned long sk_flags;
        int sk_sndbuf;
        struct sk_buff_head sk_write_queue;
        . . .
        unsigned int sk_shutdown : 2,
                     sk_no_check : 2,
                     sk_protocol : 8,
                     sk_type     : 16;
        . . .
        void (*sk_data_ready)(struct sock *sk, int bytes);
        void (*sk_write_space)(struct sock *sk);
};

(include/net/sock.h)

The following is a description of the members of the sock structure:

  * sk_receive_queue: A queue for incoming packets.

  * sk_rcvbuf: The size of the receive buffer in bytes.

  * sk_flags: Various flags, like SOCK_DEAD or SOCK_DBG; see the sock_flags enum definition in include/net/sock.h.

  * sk_sndbuf: The size of the send buffer in bytes.

  * sk_write_queue: A queue for outgoing packets.
Note

You will see later, in the "TCP Socket Initialization" section, how sk_rcvbuf and sk_sndbuf are initialized, and how this can be changed by writing to procfs entries.

* sk_no_check: Disable checksum flag. Can be set with the SO_NO_CHECK socket option.

* sk_protocol: The protocol identifier, which is set according to the third parameter (protocol) of the socket() system call.

* sk_type: The type of the socket, like SOCK_STREAM or SOCK_RAW; see the enum sock_type in include/linux/net.h.

* sk_data_ready: A callback to notify the socket that new data has arrived.

* sk_write_space: A callback to indicate that there is free memory available to proceed with data transmission.

Creating sockets is done by calling the socket() system call from userspace:

sockfd = socket(int socket_family, int socket_type, int protocol);

The following is a description of the parameters of the socket() system call:

* socket_family: Can be, for example, AF_INET for IPv4, AF_INET6 for IPv6, or AF_UNIX for UNIX domain sockets, and so on. (UNIX domain sockets are a form of Inter-Process Communication (IPC) that allows communication between processes running on the same host.)

* socket_type: Can be, for example, SOCK_STREAM for stream sockets, SOCK_DGRAM for datagram sockets, or SOCK_RAW for raw sockets, and so on.

* protocol: Can be any of the following:

  * 0 or IPPROTO_TCP for TCP sockets.

  * 0 or IPPROTO_UDP for UDP sockets.

  * A valid IP protocol identifier (like IPPROTO_TCP or IPPROTO_ICMP) for raw sockets; see RFC 1700, "Assigned Numbers."

The return value of the socket() system call (sockfd) is the file descriptor that should be passed as a parameter to subsequent calls with this socket. The socket() system call is handled in the kernel by the sys_socket() method. Let's take a look at the implementation of the socket() system call:

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
    int retval;
    struct socket *sock;
    int flags;
    . . .
    retval = sock_create(family, type, protocol, &sock);
    if (retval < 0)
        goto out;
    . . .
    retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
    if (retval < 0)
        goto out_release;
out:
    . . .
    return retval;
}

(net/socket.c)

The sock_create() method calls the address-family-specific socket creation method, create(); in the case of IPv4, it is the inet_create() method. (See the inet_family_ops definition in net/ipv4/af_inet.c.) The inet_create() method creates the sock object (sk) that is associated with the socket; the sock object represents the network layer socket interface. The sock_map_fd() method returns an fd (file descriptor) that is associated with the socket; normally, the socket() system call returns this fd.

Sending data from a userspace socket, or receiving data in a userspace socket from the transport layer, is handled in the kernel by the sendmsg() and recvmsg() methods, respectively, which get a msghdr object as a parameter. The msghdr object includes the data blocks to send or to fill, as well as some other parameters.
struct msghdr {
    void *msg_name;                 /* Socket name */
    int msg_namelen;                /* Length of name */
    struct iovec *msg_iov;          /* Data blocks */
    __kernel_size_t msg_iovlen;     /* Number of blocks */
    void *msg_control;              /* Per protocol magic (eg BSD file descriptor passing) */
    __kernel_size_t msg_controllen; /* Length of cmsg list */
    unsigned int msg_flags;
};

(include/linux/socket.h)

The following is a description of some of the important members of the msghdr structure:

* msg_name: The destination socket address. To get the destination socket, you usually cast the msg_name opaque pointer to a struct sockaddr_in pointer. See, for example, the udp_sendmsg() method.

* msg_namelen: The length of the address.

* msg_iov: A vector of data blocks.

* msg_iovlen: The number of blocks in the msg_iov vector.

* msg_control: Control information (also known as ancillary data).

* msg_controllen: The length of the control information.

* msg_flags: Flags of received messages, like MSG_MORE. (See, for example, the section "Sending Packets with UDP" later in this chapter.)

Note that the maximum control buffer length that the kernel can process is limited per socket by the value in sysctl_optmem_max (/proc/sys/net/core/optmem_max).

In this section, I described the kernel implementation of the socket and the msghdr struct, which is used when sending and receiving packets. In the next section, I start the discussion of the transport layer (L4) protocols by describing the UDP protocol, which is the simplest of the protocols discussed in this chapter.

## UDP (User Datagram Protocol)

The UDP protocol, described in RFC 768 from 1980, is a thin layer around the IP layer that adds only port, length, and checksum information. It provides unreliable, message-oriented transport without congestion control. Many protocols use UDP. I will mention, for example, the RTP protocol (Real-time Transport Protocol), which is used for the delivery of audio and video over IP networks. This type of traffic can tolerate some packet loss. RTP is commonly used in VoIP applications, usually in conjunction with SIP (Session Initiation Protocol) based clients. (It should be mentioned here that the RTP protocol can also use TCP, as specified in RFC 4571, but this is not used much.) I should also mention UDP-Lite, which is an extension of the UDP protocol to support variable-length checksums (RFC 3828). Most of UDP-Lite is implemented in net/ipv4/udplite.c, but you will encounter it also in the main UDP module, net/ipv4/udp.c. The UDP header length is 8 bytes:

struct udphdr {
    __be16 source;
    __be16 dest;
    __be16 len;
    __sum16 check;
};

(include/uapi/linux/udp.h)

The following is a description of the members of the UDP header:

* source: The source port (16 bit), in the range 1-65535.

* dest: The destination port (16 bit), in the range 1-65535.

* len: The length in bytes (the payload length and the UDP header length).

* check: The checksum of the packet.

Figure 11-1 shows a UDP header (IPv4).

In this section, you learned about the UDP header and its members. To understand how userspace applications, which use the sockets API, communicate with the kernel (sending and receiving packets), you should know how UDP initialization is done, which is described in the next section.
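Before moving into the kernel, here is a minimal userspace sketch that exercises the sockets API described above: it creates a UDP socket and sends one datagram with sendto(). The address and port are arbitrary values chosen for illustration, not anything mandated by the kernel.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    /* Create a UDP socket; protocol 0 selects UDP for SOCK_DGRAM. */
    int sockfd = socket(AF_INET, SOCK_DGRAM, 0);
    if (sockfd < 0) {
        perror("socket");
        return 1;
    }

    struct sockaddr_in dst;
    memset(&dst, 0, sizeof(dst));
    dst.sin_family = AF_INET;
    dst.sin_port = htons(9999);                 /* arbitrary port */
    inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);

    const char msg[] = "hello";
    /* sendto() is handled in the kernel by udp_sendmsg(). */
    if (sendto(sockfd, msg, strlen(msg), 0,
               (struct sockaddr *)&dst, sizeof(dst)) < 0)
        perror("sendto");

    close(sockfd);
    return 0;
}

Note that the destination address passed to sendto() here is what ends up in the msg_name member of the msghdr object that the kernel's udp_sendmsg() method receives.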
### UDP Initialization

We define the udp_protocol object (a net_protocol object) and add it with the inet_add_protocol() method. This sets the udp_protocol object to be an element in the global protocols array (inet_protos):

static const struct net_protocol udp_protocol = {
    .handler     = udp_rcv,
    .err_handler = udp_err,
    .no_policy   = 1,
    .netns_ok    = 1,
};

(net/ipv4/af_inet.c)

static int __init inet_init(void)
{
    . . .
    if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
        pr_crit("%s: Cannot add UDP protocol\n", __func__);
    . . .
}

(net/ipv4/af_inet.c)

We further define a udp_prot object and register it by calling the proto_register() method. This object contains mostly callbacks; these callbacks are invoked when opening a UDP socket in userspace and using the sockets API. For example, calling the setsockopt() system call on a UDP socket will invoke the udp_setsockopt() callback.

struct proto udp_prot = {
    .name       = "UDP",
    .owner      = THIS_MODULE,
    .close      = udp_lib_close,
    .connect    = ip4_datagram_connect,
    .disconnect = udp_disconnect,
    .ioctl      = udp_ioctl,
    . . .
    .setsockopt = udp_setsockopt,
    .getsockopt = udp_getsockopt,
    .sendmsg    = udp_sendmsg,
    .recvmsg    = udp_recvmsg,
    .sendpage   = udp_sendpage,
    . . .
};

(net/ipv4/udp.c)

int __init inet_init(void)
{
    int rc = -EINVAL;
    . . .
    rc = proto_register(&udp_prot, 1);
    . . .
}

(net/ipv4/af_inet.c)

Note

The UDP protocol, along with other core protocols, is initialized via the inet_init() method at boot time.

Now that you know about UDP initialization and its callback for sending packets, which is the udp_sendmsg() callback of the udp_prot object shown in this section, it is time to learn how packets are sent by UDP in IPv4.

### Sending Packets with UDP

Sending data from a UDP userspace socket can be done by several system calls: send(), sendto(), sendmsg(), and write(); eventually all of them are handled by the udp_sendmsg() method in the kernel. The userspace application builds a msghdr object that contains the data blocks and passes this msghdr object to the kernel. Let's take a look at this method:

int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                size_t len)
{

In general, UDP packets are sent immediately. This behavior can be changed with the UDP_CORK socket option (introduced in kernel 2.5.44), which causes packet data passed to the udp_sendmsg() method to be accumulated until the final packet is released by unsetting the option. The same result can be achieved by setting the MSG_MORE flag:

    int corkreq = up->corkflag || msg->msg_flags & MSG_MORE;
    struct inet_sock *inet = inet_sk(sk);
    . . .

First we make some sanity checks. The specified len, for example, cannot be greater than 65535 (remember that the len field in the UDP header is 16 bits):

    if (len > 0xFFFF)
        return -EMSGSIZE;

We need to know the destination address and the destination port in order to build a flowi4 object, which is needed for sending the SKB with the udp_send_skb() method or with the ip_append_data() method. The destination port should not be 0. There are two cases here: either the destination is specified in the msg_name of the msghdr, or the socket is connected and its state is TCP_ESTABLISHED. Note that UDP (in contrast to TCP) is almost a fully stateless protocol; the notion of TCP_ESTABLISHED in UDP mostly means that the socket has passed some sanity checks.
    if (msg->msg_name) {
        struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
        if (msg->msg_namelen < sizeof(*usin))
            return -EINVAL;
        if (usin->sin_family != AF_INET) {
            if (usin->sin_family != AF_UNSPEC)
                return -EAFNOSUPPORT;
        }
        daddr = usin->sin_addr.s_addr;
        dport = usin->sin_port;

The Linux code honors the fact that zero UDP/TCP ports are reserved by the IANA. The reservation of port 0 in TCP and UDP dates back to RFC 1010, "Assigned Numbers" (1987); it was still present in RFC 1700, which was obsoleted by the online IANA database (see RFC 3232), where port 0 remains reserved. See www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml.

        if (dport == 0)
            return -EINVAL;
    } else {
        if (sk->sk_state != TCP_ESTABLISHED)
            return -EDESTADDRREQ;
        daddr = inet->inet_daddr;
        dport = inet->inet_dport;
        /* Open fast path for connected socket.
           Route will not be used, if at least one option is set.
         */
        connected = 1;
    }
    . . .

A userspace application can send control information (also known as ancillary data) by setting msg_control and msg_controllen in the msghdr object. Ancillary data is, in fact, a sequence of cmsghdr objects with appended data. (For more details, see man 3 cmsg.) You can send and receive ancillary data by calling the sendmsg() and recvmsg() methods, respectively. For example, you can create an IP_PKTINFO ancillary message to set a source route on an unconnected UDP socket. (See man 7 ip.) When msg_controllen is not 0, this is a control information message, which is handled by the ip_cmsg_send() method. The ip_cmsg_send() method builds an ipcm_cookie (IP Control Message Cookie) object by parsing the specified msghdr object. The ipcm_cookie structure includes information that is used further when processing the packet. For example, when using an IP_PKTINFO ancillary message, you can set the source address by setting an address field in the control messages, which eventually sets the addr field in the ipcm_cookie object. The ipcm_cookie is a short structure:

struct ipcm_cookie {
    __be32 addr;
    int oif;
    struct ip_options_rcu *opt;
    __u8 tx_flags;
};

(include/net/ip.h)

Let's continue our discussion of the udp_sendmsg() method:

    if (msg->msg_controllen) {
        err = ip_cmsg_send(sock_net(sk), msg, &ipc);
        if (err)
            return err;
        if (ipc.opt)
            free = 1;
        connected = 0;
    }
    . . .
    if (connected)
        rt = (struct rtable *)sk_dst_check(sk, 0);
    . . .

If the routing entry is NULL, a routing lookup should be performed:

    if (rt == NULL) {
        struct net *net = sock_net(sk);

        fl4 = &fl4_stack;
        flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
                           RT_SCOPE_UNIVERSE, sk->sk_protocol,
                           inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP,
                           faddr, saddr, dport, inet->inet_sport);

        security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
        rt = ip_route_output_flow(net, fl4, sk);
        if (IS_ERR(rt)) {
            err = PTR_ERR(rt);
            rt = NULL;
            if (err == -ENETUNREACH)
                IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
            goto out;
        }
    . . .

In kernel 2.6.39, a lockless transmit fast path was added. This means that when the corking feature is not set, we do not hold the socket lock, and we call the udp_send_skb() method; when the corking feature is set, we hold the socket lock by calling the lock_sock() method and then send the packet:

    /* Lockless fast path for the non-corking case.
     */
    if (!corkreq) {
        skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen,
                          sizeof(struct udphdr), &ipc, &rt,
                          msg->msg_flags);
        err = PTR_ERR(skb);
        if (!IS_ERR_OR_NULL(skb))
            err = udp_send_skb(skb, fl4);
        goto out;
    }

Now we handle the case when the corking feature is set:

    lock_sock(sk);
do_append_data:
    up->len += ulen;

The ip_append_data() method buffers the data for transmission but does not transmit it yet. Subsequently calling the udp_push_pending_frames() method will actually perform the transmission. Note that the udp_push_pending_frames() method also handles fragmentation by the specified getfrag callback:

    err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen,
                         sizeof(struct udphdr), &ipc, &rt,
                         corkreq ? msg->msg_flags | MSG_MORE : msg->msg_flags);

If the method failed, we should flush all pending SKBs. This is achieved by calling the udp_flush_pending_frames() method, which will free all the SKBs in the write queue of the socket (sk_write_queue) by the ip_flush_pending_frames() method:

    if (err)
        udp_flush_pending_frames(sk);
    else if (!corkreq)
        err = udp_push_pending_frames(sk);
    else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
        up->pending = 0;
    release_sock(sk);

You learned in this section about sending packets with UDP. Now, to complete our discussion of UDP in IPv4, it is time to learn how packets from the network layer (L3) are received with UDP in IPv4.

### Receiving Packets from the Network Layer (L3) with UDP

The main handler for receiving UDP packets from the network layer (L3) is the udp_rcv() method. All it does is invoke the __udp4_lib_rcv() method (net/ipv4/udp.c):

int udp_rcv(struct sk_buff *skb)
{
    return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}

Let's take a look at the __udp4_lib_rcv() method:

int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                   int proto)
{
    struct sock *sk;
    struct udphdr *uh;
    unsigned short ulen;
    struct rtable *rt = skb_rtable(skb);
    __be32 saddr, daddr;
    struct net *net = dev_net(skb->dev);
    . . .

We fetch the UDP header, header length, and source and destination addresses from the SKB:

    uh = udp_hdr(skb);
    ulen = ntohs(uh->len);
    saddr = ip_hdr(skb)->saddr;
    daddr = ip_hdr(skb)->daddr;

We will skip some sanity checks that are performed here, like making sure that the UDP header length is not greater than the length of the packet and that the specified proto is the UDP protocol identifier (IPPROTO_UDP). If the packet is a broadcast or a multicast packet, it will be handled by the __udp4_lib_mcast_deliver() method:

    if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
        return __udp4_lib_mcast_deliver(net, skb, uh,
                                        saddr, daddr, udptable);

Next we perform a lookup in the UDP sockets hash table:

    sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
    if (sk != NULL) {

We arrive here because the lookup we performed found a matching socket. So we process the SKB further by calling the udp_queue_rcv_skb() method, which invokes the generic sock_queue_rcv_skb() method, which in turn adds the specified SKB to the tail of sk->sk_receive_queue (by calling the __skb_queue_tail() method):

        int ret = udp_queue_rcv_skb(sk, skb);
        sock_put(sk);

        /* a return value > 0 means to resubmit the input, but
         * it wants the return to be -protocol, or 0
         */
        if (ret > 0)
            return -ret;

Everything is fine; return 0 to denote success:

        return 0;
    }
    . . .
We arrive here when the lookup for a socket failed, which means that we should not handle the packet. This can occur, for example, when there is no listening UDP socket on the destination port. If the checksum is incorrect, we drop the packet silently. If it is correct, we send an ICMP reply back to the sender: an ICMP "Destination Unreachable" message with code "Port Unreachable." Further on, we free the packet and update an SNMP MIB counter:

    /* No socket. Drop packet silently, if checksum is wrong */
    if (udp_lib_checksum_complete(skb))
        goto csum_error;

The next command increments the UDP_MIB_NOPORTS (NoPorts) MIB counter. Note that you can query various UDP MIB counters with cat /proc/net/snmp or with netstat -s:

    UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
    icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

    /*
     * Hmm. We got an UDP packet to a port to which we
     * don't wanna listen. Ignore it.
     */
    kfree_skb(skb);
    return 0;

Figure 11-2 illustrates our discussion in this section of receiving UDP packets.

Our discussion of UDP is now finished. The next section describes the TCP protocol, which is the most complex of the protocols discussed in this chapter.

## TCP (Transmission Control Protocol)

The TCP protocol is described in RFC 793 from 1981. In the years since then, there have been many updates, variations, and additions to the base TCP protocol. Some additions were for specific types of networks (high-speed, satellite), whereas others were for performance improvements.

The TCP protocol is the most commonly used transport protocol on the Internet today. Many well-known protocols are based upon TCP. The most well-known is probably HTTP, and we should also mention other well-known protocols such as FTP, SSH, Telnet, SMTP, and SSL. The TCP protocol provides a reliable and connection-oriented transport, as opposed to UDP. Transmission is made reliable by using sequence numbers and acknowledgments.

TCP is a very complex protocol; we will not discuss all the details, optimizations, and nuances of the TCP implementation in this chapter, as that would require a separate book in itself. TCP functionality consists of two ingredients: management of connections, and transmitting and receiving data. We focus in this section on TCP initialization and TCP connection setup, which pertain to the first ingredient, connection management, and on receiving and sending packets, which pertain to the second ingredient. These are the important basics that enable further delving into the TCP protocol implementation. We should note that the TCP protocol self-regulates the byte-stream flow via congestion control. Many different congestion-control algorithms have been specified, and Linux provides a pluggable and configurable architecture to support a wide variety of algorithms. Delving into the details of the individual congestion-control algorithms is beyond the scope of this book.

Every TCP packet starts with a TCP header. You must learn about the TCP header in order to understand the operation of TCP. The next section describes the IPv4 TCP header.
### TCP Header

The TCP header length is 20 bytes, but it scales up to 60 bytes when TCP options are used:

struct tcphdr {
    __be16 source;
    __be16 dest;
    __be32 seq;
    __be32 ack_seq;
#if defined(__LITTLE_ENDIAN_BITFIELD)
    __u16 res1:4,
          doff:4,
          fin:1,
          syn:1,
          rst:1,
          psh:1,
          ack:1,
          urg:1,
          ece:1,
          cwr:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
    __u16 doff:4,
          res1:4,
          cwr:1,
          ece:1,
          urg:1,
          ack:1,
          psh:1,
          rst:1,
          syn:1,
          fin:1;
#else
#error "Adjust your defines"
#endif
    __be16 window;
    __sum16 check;
    __be16 urg_ptr;
};

(include/uapi/linux/tcp.h)

The following is a description of the members of the tcphdr structure:

* source: The source port (16 bit), in the range 1-65535.

* dest: The destination port (16 bit), in the range 1-65535.

* seq: The sequence number (32 bits).

* ack_seq: The acknowledgment number (32 bits). If the ACK flag is set, the value of this field is the next sequence number that the receiver is expecting.

* res1: Reserved for future use (4 bits). It should always be set to 0.

* doff: Data offset (4 bits). The size of the TCP header in multiples of 4 bytes; the minimum is 5 (20 bytes) and the maximum is 15 (60 bytes).

The following are the TCP flags; each is 1 bit:

* fin: No more data from sender (used when one of the endpoints wants to close the connection).

* syn: The SYN flag is initially sent when establishing the 3-way handshake between two endpoints.

* rst: The Reset flag is used when a segment that is not intended for the current connection arrives.

* psh: The data should be passed to userspace as soon as possible.

* ack: Signifies that the acknowledgment number (ack_seq) value in the TCP header is meaningful.

* urg: Signifies that the urgent pointer is meaningful.

* ece: The ECN-Echo flag. ECN stands for "Explicit Congestion Notification." ECN provides a mechanism that sends end-to-end notification about network congestion without dropping packets. It was added by RFC 3168, "The Addition of Explicit Congestion Notification (ECN) to IP," from 2001.

* cwr: The Congestion Window Reduced flag.

After the flags come the remaining members:

* window: The TCP receive window size in bytes (16 bit).

* check: The checksum of the TCP header and TCP data.

* urg_ptr: Has significance only when the urg flag is set. It represents an offset from the sequence number indicating the last urgent data byte (16 bit).

Figure 11-3 shows a diagram of a TCP header (IPv4).

In this section, I described the IPv4 TCP header and its members. You saw that, as opposed to the UDP header, which has only 4 members, the TCP header has many more members, since TCP is a much more complex protocol. In the following section, I describe how TCP initialization is done, so that you will learn how and where the callbacks for receiving and sending TCP packets are initialized.

### TCP Initialization

We define the tcp_protocol object (a net_protocol object) and add it with the inet_add_protocol() method:

static const struct net_protocol tcp_protocol = {
    .early_demux = tcp_v4_early_demux,
    .handler     = tcp_v4_rcv,
    .err_handler = tcp_v4_err,
    .no_policy   = 1,
    .netns_ok    = 1,
};

(net/ipv4/af_inet.c)

static int __init inet_init(void)
{
    . . .
    if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
        pr_crit("%s: Cannot add TCP protocol\n", __func__);
    . . .
}

(net/ipv4/af_inet.c)

We further define a tcp_prot object and register it by calling the proto_register() method, as we did with UDP:

struct proto tcp_prot = {
    .name       = "TCP",
    .owner      = THIS_MODULE,
    .close      = tcp_close,
    .connect    = tcp_v4_connect,
    .disconnect = tcp_disconnect,
    .accept     = inet_csk_accept,
    .ioctl      = tcp_ioctl,
    .init       = tcp_v4_init_sock,
    . . .
};

(net/ipv4/tcp_ipv4.c)

static int __init inet_init(void)
{
    int rc;
    . . .
    rc = proto_register(&tcp_prot, 1);
    . . .
}

(net/ipv4/af_inet.c)

Note that in the tcp_prot definition, the init function pointer is set to the tcp_v4_init_sock() callback, which performs various initializations, like setting the timers by calling the tcp_init_xmit_timers() method, setting the socket state, and more. Conversely, in UDP, which is a much simpler protocol, the init function pointer is not defined at all, because there are no special initializations to perform in UDP. We will discuss the tcp_v4_init_sock() callback later in this section.

In the next section, I briefly describe the timers used by the TCP protocol.

### TCP Timers

TCP timers are handled in net/ipv4/tcp_timer.c. There are four timers used by TCP:

* Retransmit timer: Responsible for resending packets that were not acknowledged within a specified time interval. This can happen when a packet gets lost or corrupted. This timer is started after each segment is sent; if an ACK arrives before the timer expires, the timer is canceled.

* Delayed ACK timer: Delays sending ACK packets. It is set when TCP receives data that must be acknowledged but does not need to be acknowledged immediately.

* Keep Alive timer: Checks whether the connection is down. There are cases when sessions are idle for a long time and one side goes down. The Keep Alive timer detects such cases and calls the tcp_send_active_reset() method to reset the connection.

* Zero window probe timer (also known as the persistent timer): When the receive buffer is full, the receiver advertises a zero window and the sender stops sending. Now, if the receiver sends a segment with a new window size and this segment is lost, the sender will keep waiting forever. The solution is this: when the sender gets a zero window, it uses a persistent timer to probe the receiver for its window size; when it gets a non-zero window size, the persistent timer is stopped.

### TCP Socket Initialization

To use a TCP socket, a userspace application should create a SOCK_STREAM socket by calling the socket() system call. This is handled in the kernel by the tcp_v4_init_sock() callback, which invokes the tcp_init_sock() method to do the real work. Note that the tcp_init_sock() method performs address-family-independent initializations, and it is also invoked from the tcp_v6_init_sock() method. The important tasks of the tcp_init_sock() method are the following:

* Set the state of the socket to TCP_CLOSE.

* Initialize the TCP timers by calling the tcp_init_xmit_timers() method.

* Initialize the socket send buffer (sk_sndbuf) and receive buffer (sk_rcvbuf); sk_sndbuf is set to sysctl_tcp_wmem[1], which is by default 16384 bytes, and sk_rcvbuf is set to sysctl_tcp_rmem[1], which is by default 87380 bytes. These default values are set in the tcp_init() method; the default values of the sysctl_tcp_wmem and sysctl_tcp_rmem arrays can be overridden by writing to /proc/sys/net/ipv4/tcp_wmem and /proc/sys/net/ipv4/tcp_rmem, respectively.
See the "TCP Variables" section in Documentation/networking/ip-sysctl.txt.

* Initialize the out-of-order queue and the prequeue.

* Initialize various parameters. For example, the TCP initial congestion window is initialized to 10 segments (TCP_INIT_CWND), according to RFC 6928, "Increasing TCP's Initial Window," from 2013.

Now that you have learned how a TCP socket is initialized, I will discuss how a TCP connection is set up.

### TCP Connection Setup

TCP connection setup and teardown, as well as TCP connection properties, are described as transitions in a state machine. At any given moment, a TCP socket is in exactly one state; for example, the socket enters the TCP_LISTEN state when the listen() system call is invoked. The state of the sock object is represented by its sk_state member. For a list of all available states, refer to include/net/tcp_states.h.

A three-way handshake is used to set up a TCP connection between a TCP client and a TCP server:

* First, the client sends a SYN request to the server. Its state changes to TCP_SYN_SENT.

* The server socket, which is listening (its state is TCP_LISTEN), creates a request socket to represent the new connection in the TCP_SYN_RECV state and sends back a SYN ACK.

* The client that receives the SYN ACK changes its state to TCP_ESTABLISHED and sends an ACK to the server.

* The server receives the ACK and changes the request socket into a child socket in the TCP_ESTABLISHED state, as the connection is now established and data can be sent.

Note

To look further into the details of the TCP state machine, refer to the tcp_rcv_state_process() method (net/ipv4/tcp_input.c), which is the state machine engine, both for IPv4 and for IPv6. (It is called both from the tcp_v4_do_rcv() method and from the tcp_v6_do_rcv() method.)

The next section describes how packets are received from the network layer (L3) with TCP in IPv4.

### Receiving Packets from the Network Layer (L3) with TCP

The main handler for receiving TCP packets from the network layer (L3) is the tcp_v4_rcv() method (net/ipv4/tcp_ipv4.c). Let's take a look at this function:

int tcp_v4_rcv(struct sk_buff *skb)
{
    struct sock *sk;
    . . .

First we make some sanity checks (for example, checking whether the packet type is not PACKET_HOST, or whether the packet size is shorter than the TCP header) and discard the packet if there are any problems. Then some initializations are made, and a lookup for a corresponding socket is performed by calling the __inet_lookup_skb() method, which first performs a lookup in the established sockets hash table by calling the __inet_lookup_established() method. In the case of a lookup miss, it performs a lookup in the listening sockets hash table by calling the __inet_lookup_listener() method. If no socket is found, the packet is discarded at this stage.

    sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
    . . .
    if (!sk)
        goto no_tcp_socket;

Now we check whether the socket is owned by some application. The sock_owned_by_user() macro returns 1 when an application currently owns the socket, and 0 when no application owns it:

    if (!sock_owned_by_user(sk)) {
        . . .
        {

We arrive here if no application owns the socket, so it can accept packets. First we try to put the packet in the prequeue by calling the tcp_prequeue() method, as packets in the prequeue are processed more efficiently.
The tcp_prequeue() method will return false if processing in the prequeue is not possible (for example, when the queue has no space); in such a case, we call the tcp_v4_do_rcv() method, which we discuss shortly:

            if (!tcp_prequeue(sk, skb))
                ret = tcp_v4_do_rcv(sk, skb);
        }

When an application owns the socket, the socket is in a locked state, so it cannot accept packets. In such a case, we add the packet to the backlog by calling the sk_add_backlog() method:

    } else if (unlikely(sk_add_backlog(sk, skb,
                        sk->sk_rcvbuf + sk->sk_sndbuf))) {
        bh_unlock_sock(sk);
        NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
        goto discard_and_relse;
    }
}

Let's take a look at the tcp_v4_do_rcv() method:

int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{

If the socket is in the TCP_ESTABLISHED state, we call the tcp_rcv_established() method:

    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
        . . .
        if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
            rsk = sk;
            goto reset;
        }
        return 0;

If the socket is in the TCP_LISTEN state, we call the tcp_v4_hnd_req() method:

    if (sk->sk_state == TCP_LISTEN) {
        struct sock *nsk = tcp_v4_hnd_req(sk, skb);
    }

If we are not in the TCP_LISTEN state, we invoke the tcp_rcv_state_process() method:

    if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
        rsk = sk;
        goto reset;
    }
    return 0;

reset:
    tcp_v4_send_reset(rsk, skb);
}

In this section, you learned about the reception of a TCP packet. In the next section, we conclude the TCP part of this chapter by describing how packets are sent with TCP in IPv4.

### Sending Packets with TCP

As with UDP, sending packets from TCP sockets that were created in userspace can be done by several system calls: send(), sendto(), sendmsg(), and write(). Eventually, all of them are handled by the tcp_sendmsg() method (net/ipv4/tcp.c). This method copies the payload from userspace into the kernel and sends it as TCP segments. It is much more complicated than the udp_sendmsg() method.

int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                size_t size)
{
    struct iovec *iov;
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;
    int iovlen, flags, err, copied = 0;
    int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
    bool sg;
    long timeo;
    . . .

I will not delve into all the details of copying the data from userspace into the SKB in this method. Once the SKB is built, it is sent with the tcp_push_one() method, which calls the tcp_write_xmit() method, which in turn invokes the tcp_transmit_skb() method:

static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                            gfp_t gfp_mask)
{

The icsk_af_ops object (INET Connection Socket ops) is an address-family-specific object. In the case of IPv4 TCP, it is set to an inet_connection_sock_af_ops object named ipv4_specific in the tcp_v4_init_sock() method. The queue_xmit() callback is set to the generic ip_queue_xmit() method. See net/ipv4/tcp_ipv4.c.

    . . .
    err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
    . . .
}

(net/ipv4/tcp_output.c)

Now that you have learned about TCP and UDP, you are ready to proceed to the next section, which deals with SCTP (the Stream Control Transmission Protocol). SCTP combines features of both UDP and TCP, and it is newer than both of them.
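Before turning to SCTP, here is a minimal userspace sketch that ties together the TCP socket calls discussed in this section: socket() triggers tcp_v4_init_sock(), connect() starts the three-way handshake via tcp_v4_connect(), and send() ends up in tcp_sendmsg(). The address and port are arbitrary illustration values.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    /* socket() ends up in inet_create(); the init callback invoked
     * for a SOCK_STREAM INET socket is tcp_v4_init_sock(). */
    int sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (sockfd < 0) {
        perror("socket");
        return 1;
    }

    struct sockaddr_in srv;
    memset(&srv, 0, sizeof(srv));
    srv.sin_family = AF_INET;
    srv.sin_port = htons(5001);                 /* arbitrary port */
    inet_pton(AF_INET, "127.0.0.1", &srv.sin_addr);

    /* connect() invokes tcp_v4_connect(); the socket moves to
     * TCP_SYN_SENT and, after the handshake, to TCP_ESTABLISHED. */
    if (connect(sockfd, (struct sockaddr *)&srv, sizeof(srv)) < 0) {
        perror("connect");
        close(sockfd);
        return 1;
    }

    /* send() is handled in the kernel by tcp_sendmsg(). */
    const char msg[] = "hello";
    if (send(sockfd, msg, strlen(msg), 0) < 0)
        perror("send");

    close(sockfd);
    return 0;
}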
## SCTP (Stream Control Transmission Protocol)

The SCTP protocol is specified in RFC 4960 from 2007; it was first specified in 2000. It was designed for Public Switched Telephone Network (PSTN) signaling over IP networks, but it can be used for other applications as well. The IETF SIGTRAN (Signaling Transport) working group originally developed the SCTP protocol and later handed it over to the Transport Area working group (TSVWG) for the continued evolution of SCTP as a general-purpose transport protocol. LTE (Long Term Evolution) uses SCTP; one of the main reasons for this is that SCTP is able to detect very quickly when a link goes down or when packets are dropped, whereas TCP does not have this ability. The SCTP flow-control and congestion-control algorithms are very similar to those of TCP. The SCTP protocol uses a variable for the advertised receiver window size (a_rwnd); this variable represents the current available space in the receiver buffer. The sender cannot send any new data if the receiver indicates that a_rwnd is 0 (no receive space available). The important features of SCTP are the following:

* SCTP combines features of TCP and UDP. It is a reliable transport protocol with congestion control like TCP; it is a message-oriented protocol like UDP, whereas TCP is stream-oriented.

* The SCTP protocol provides improved security with its 4-way handshake (compared to the TCP 3-way handshake) to protect against SYN flooding attacks. I will discuss the 4-way handshake later in this chapter, in the "Setting Up an SCTP Association" section.

* SCTP supports multihoming, that is, multiple IP addresses on both endpoints. This provides a network-level fault-tolerance capability. I will discuss SCTP multihoming later in this chapter.

* SCTP supports multistreaming, which means that it can send streams of data chunks in parallel. This can reduce the latency of streaming multimedia in some environments. I will discuss SCTP chunks later in this section.

* SCTP uses a heartbeat mechanism to detect idle/unreachable peers in the case of multihoming. I will discuss the SCTP heartbeat mechanism later in this chapter.

After this short description of the SCTP protocol, we will now discuss how SCTP initialization is done. The sctp_init() method allocates memory for various structures, initializes some sysctl variables, and registers the SCTP protocol in IPv4 and in IPv6:

int sctp_init(void)
{
    int status = -EINVAL;
    . . .
    status = sctp_v4_add_protocol();
    if (status)
        goto err_add_protocol;

    /* Register SCTP with inet6 layer. */
    status = sctp_v6_add_protocol();
    if (status)
        goto err_v6_add_protocol;
    . . .
}

(net/sctp/protocol.c)

The registration of the SCTP protocol is done by defining an instance of net_protocol (named sctp_protocol for IPv4 and sctpv6_protocol for IPv6) and calling the inet_add_protocol() method, quite similarly to what you saw in other transport protocols, like UDP. We also call the register_inetaddr_notifier() method to receive notifications about the addition or deletion of a network address. These events are handled by the sctp_inetaddr_event() method, which updates the SCTP global address list (sctp_local_addr_list) accordingly.

static const struct net_protocol sctp_protocol = {
    .handler     = sctp_rcv,
    .err_handler = sctp_v4_err,
    .no_policy   = 1,
};

(net/sctp/protocol.c)

static int sctp_v4_add_protocol(void)
{
    /* Register notifier for inet address additions/deletions.
     */
    register_inetaddr_notifier(&sctp_inetaddr_notifier);

    /* Register SCTP with inet layer. */
    if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0)
        return -EAGAIN;

    return 0;
}

(net/sctp/protocol.c)

Note

The sctp_v6_add_protocol() method (net/sctp/ipv6.c) is very similar, so it is not shown here.

Each SCTP packet starts with an SCTP common header. The next section describes this header and the SCTP chunks that follow it.

### SCTP Packets and Chunks

Each SCTP packet has an SCTP common header, which is followed by one or more chunks. Each chunk can contain either data or SCTP control information. Several chunks can be bundled into one SCTP packet (except for three chunks that are used when establishing and terminating a connection: INIT, INIT_ACK, and SHUTDOWN_COMPLETE). These chunks use the Type-Length-Value (TLV) format that you first encountered in Chapter 2.

#### SCTP Common Header

typedef struct sctphdr {
    __be16 source;
    __be16 dest;
    __be32 vtag;
    __le32 checksum;
} __attribute__((packed)) sctp_sctphdr_t;

(include/linux/sctp.h)

The following is a description of the members of the sctphdr structure:

* source: The SCTP source port.

* dest: The SCTP destination port.

* vtag: The Verification Tag, which is a 32-bit random value.

* checksum: The checksum of the SCTP common header and all chunks.

#### SCTP Chunk Header

The SCTP chunk header is represented by struct sctp_chunkhdr:

typedef struct sctp_chunkhdr {
    __u8 type;
    __u8 flags;
    __be16 length;
} __packed sctp_chunkhdr_t;

(include/linux/sctp.h)

The following is a description of the members of the sctp_chunkhdr structure:

* type: The SCTP chunk type. For example, the type of data chunks is SCTP_CID_DATA. See Table 11-2, "Chunk types," in the "Quick Reference" section at the end of this chapter, and also see the chunk ID enum definition (sctp_cid_t) in include/linux/sctp.h.

* flags: Usually, all 8 bits should be set to 0 by the sender and ignored by the receiver. There are cases where different values are used. For example, the ABORT chunk uses the T bit (the LSB): it is set to 0 if the sender filled in the Verification Tag, and it is set to 1 if the Verification Tag is reflected.

* length: The length of the SCTP chunk.

#### SCTP Chunk

The SCTP chunk is represented by struct sctp_chunk. Each chunk object contains the source and destination address for the chunk and a subheader (a member of the subh union) according to its type. For example, data packets have the sctp_datahdr subheader, and the INIT type has the sctp_inithdr subheader:

struct sctp_chunk {
    . . .
    atomic_t refcnt;

    union {
        __u8 *v;
        struct sctp_datahdr *data_hdr;
        struct sctp_inithdr *init_hdr;
        struct sctp_sackhdr *sack_hdr;
        struct sctp_heartbeathdr *hb_hdr;
        struct sctp_sender_hb_info *hbs_hdr;
        struct sctp_shutdownhdr *shutdown_hdr;
        struct sctp_signed_cookie *cookie_hdr;
        struct sctp_ecnehdr *ecne_hdr;
        struct sctp_cwrhdr *ecn_cwr_hdr;
        struct sctp_errhdr *err_hdr;
        struct sctp_addiphdr *addip_hdr;
        struct sctp_fwdtsn_hdr *fwdtsn_hdr;
        struct sctp_authhdr *auth_hdr;
    } subh;

    struct sctp_chunkhdr *chunk_hdr;
    struct sctphdr *sctp_hdr;
    struct sctp_association *asoc;

    /* What endpoint received this chunk? */
    struct sctp_ep_common *rcvr;
    . . .
    /* What is the origin IP address for this chunk? */
    union sctp_addr source;
    /* Destination address for this chunk.
     */
    union sctp_addr dest;
    . . .
    /* For an inbound chunk, this tells us where it came from.
     * For an outbound chunk, it tells us where we'd like it to
     * go. It is NULL if we have no preference.
     */
    struct sctp_transport *transport;
};

(include/net/sctp/structs.h)

We will now describe the SCTP association, which is the counterpart of a TCP connection.

### SCTP Associations

In SCTP, we use the term association instead of connection; a connection refers to communication between two IP addresses, whereas an association refers to communication between two endpoints that might have multiple IP addresses. An SCTP association is represented by struct sctp_association:

struct sctp_association {
    . . .
    sctp_assoc_t assoc_id;

    /* These are those association elements needed in the cookie. */
    struct sctp_cookie c;

    /* This is all information about our peer. */
    struct {
        struct list_head transport_addr_list;
        . . .
        __u16 transport_count;
        __u16 port;
        . . .
        struct sctp_transport *primary_path;
        struct sctp_transport *active_path;
    } peer;

    sctp_state_t state;
    . . .
    struct sctp_priv_assoc_stats stats;
};

(include/net/sctp/structs.h)

The following is a description of some of the important members of the sctp_association structure:

* assoc_id: The unique id of the association. It is set by the sctp_assoc_set_id() method.

* c: The state cookie (an sctp_cookie object) that is attached to the association.

* peer: An inner structure representing the peer endpoint of the association. Adding a peer is done by the sctp_assoc_add_peer() method; removing a peer is done by the sctp_assoc_rm_peer() method. The following is a description of some of the important members of the peer structure:

  * transport_addr_list: Represents one or more addresses of the peer. We can add addresses to this list, or remove addresses from it, with the sctp_connectx() method when an association is established.

  * transport_count: The number of peer addresses in the peer address list (transport_addr_list).

  * primary_path: Represents the address to which the initial connection was made (the INIT <--> INIT_ACK exchange). The association will attempt to always use the primary path if it is active.

  * active_path: The address of the peer that is currently used when sending data.

* state: The state the association is in, like SCTP_STATE_CLOSED or SCTP_STATE_ESTABLISHED. The various SCTP states are discussed later in this section.

Adding multiple local addresses to an SCTP association, or removing multiple addresses from one, can be done with the sctp_bindx() system call, in order to support the multihoming feature mentioned earlier. Every SCTP association includes a peer object, which represents the remote endpoint; the peer object includes a list of one or more addresses of the remote endpoint (transport_addr_list). We can add one or more addresses to this list by calling the sctp_connectx() system call when establishing an association. An SCTP association is created by the sctp_association_new() method and initialized by the sctp_association_init() method. At any given moment, an SCTP association is in one of 8 states; for example, when it is created, its state is SCTP_STATE_CLOSED. Later on, these states can change; see, for example, the "Setting Up an SCTP Association" section later in this chapter. These states are represented by the sctp_state_t enum (include/net/sctp/constants.h).
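The following userspace sketch illustrates sctp_bindx() and sctp_connectx(), using the libsctp library from the lksctp-tools project (mentioned again in a note later in this chapter). All addresses and ports are arbitrary illustration values, and the exact sctp_connectx() signature may vary slightly between lksctp-tools versions; treat this as a sketch under those assumptions, not a definitive recipe.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/sctp.h>   /* from lksctp-tools; link with -lsctp */

int main(void)
{
    int sd = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
    if (sd < 0) {
        perror("socket");
        return 1;
    }

    /* Bind two local addresses to support multihoming. */
    struct sockaddr_in laddrs[2];
    memset(laddrs, 0, sizeof(laddrs));
    laddrs[0].sin_family = AF_INET;
    laddrs[0].sin_port = htons(5000);
    inet_pton(AF_INET, "192.0.2.1", &laddrs[0].sin_addr);    /* example addresses */
    laddrs[1].sin_family = AF_INET;
    laddrs[1].sin_port = htons(5000);
    inet_pton(AF_INET, "198.51.100.1", &laddrs[1].sin_addr);

    if (sctp_bindx(sd, (struct sockaddr *)laddrs, 2, SCTP_BINDX_ADD_ADDR) < 0)
        perror("sctp_bindx");

    /* Connect to a peer reachable at two addresses; both end up in the
     * transport_addr_list of the association's peer object. */
    struct sockaddr_in paddrs[2];
    memset(paddrs, 0, sizeof(paddrs));
    paddrs[0].sin_family = AF_INET;
    paddrs[0].sin_port = htons(6000);
    inet_pton(AF_INET, "203.0.113.1", &paddrs[0].sin_addr);
    paddrs[1].sin_family = AF_INET;
    paddrs[1].sin_port = htons(6000);
    inet_pton(AF_INET, "203.0.113.2", &paddrs[1].sin_addr);

    if (sctp_connectx(sd, (struct sockaddr *)paddrs, 2, NULL) < 0)
        perror("sctp_connectx");

    close(sd);
    return 0;
}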
To send data between two endpoints, an initialization process must be completed. In this process, an SCTP association between these two endpoints is set up, and a cookie mechanism is used to provide protection against synchronization attacks. This process is discussed in the following section.

### Setting Up an SCTP Association

The initialization process is a 4-way handshake that consists of the following steps:

* One endpoint ("A") sends an INIT chunk to the endpoint it wants to communicate with ("Z"). This chunk includes a locally generated Tag in the Initiate Tag field of the INIT chunk, and a verification tag (vtag in the SCTP header) with a value of 0 (zero).

* After sending the INIT chunk, the association enters the SCTP_STATE_COOKIE_WAIT state.

* The other endpoint ("Z") sends an INIT-ACK chunk to "A" as a reply. This chunk includes a locally generated Tag in the Initiate Tag field of the INIT-ACK chunk and the remote Initiate Tag as the verification tag (vtag in the SCTP header). "Z" should also generate a state cookie and send it with the INIT-ACK reply.

* When "A" receives the INIT-ACK chunk, it leaves the SCTP_STATE_COOKIE_WAIT state. "A" will use the remote Initiate Tag as the verification tag (vtag in the SCTP header) in all transmitted packets from now on. "A" sends the state cookie it received in a COOKIE ECHO chunk and enters the SCTP_STATE_COOKIE_ECHOED state.

* When "Z" receives the COOKIE ECHO chunk, it builds a TCB (Transmission Control Block). The TCB is a data structure containing connection information on either side of an SCTP connection. "Z" then changes its state to SCTP_STATE_ESTABLISHED and replies with a COOKIE ACK chunk. This is where the association is finally established on "Z", and from this point on, the association uses the saved tags.

* When "A" receives the COOKIE ACK, it moves from the SCTP_STATE_COOKIE_ECHOED state to the SCTP_STATE_ESTABLISHED state.

Note

An endpoint might respond to an INIT, INIT ACK, or COOKIE ECHO chunk with an ABORT chunk when some mandatory parameters are missing, or when it receives invalid parameter values. The cause of the ABORT should be specified in the reply.

Now that you have learned about SCTP associations and how they are created, you will see how packets are received and sent with SCTP.

### Receiving Packets with SCTP

The main handler for receiving SCTP packets is the sctp_rcv() method, which gets an SKB as its single parameter (net/sctp/input.c). First some sanity checks are made (size, checksum, and so on). If everything is fine, we proceed to check whether this packet is an "Out of the Blue" (OOTB) packet. A packet is an OOTB packet if it is correctly formed (that is, has no checksum error), but the receiver is not able to identify the SCTP association to which the packet belongs. (See section 8.4 in RFC 4960.) OOTB packets are handled by the sctp_rcv_ootb() method, which iterates over all the chunks of the packet and takes an action according to the chunk type, as specified in the RFC. Thus, for example, an ABORT chunk is discarded. If the packet is not an OOTB packet, it is put into an SCTP inqueue by calling the sctp_inq_push() method and proceeds on its journey via the sctp_assoc_bh_rcv() method or the sctp_endpoint_bh_rcv() method.

### Sending Packets with SCTP

Writing to a userspace SCTP socket reaches the sctp_sendmsg() method (net/sctp/socket.c).
The packet is passed to the lower layers by calling the sctp_primitive_SEND() method, which in turn calls the state machine callback, sctp_do_sm() (net/sctp/sm_sideeffect.c), with SCTP_ST_PRIMITIVE_SEND. The next stage is to call sctp_side_effects(), and eventually the sctp_packet_transmit() method.

### SCTP HEARTBEAT

The HEARTBEAT mechanism tests the connectivity of a transport or path by exchanging HEARTBEAT and HEARTBEAT-ACK SCTP packets. It declares the transport IP address to be down once the threshold of unreturned heartbeat acknowledgments is reached. A HEARTBEAT chunk is sent every 30 seconds by default to monitor the reachability of an idle destination transport address. This time interval is configurable by setting /proc/sys/net/sctp/hb_interval; the default is 30000 milliseconds (30 seconds). Sending heartbeat chunks is performed by the sctp_sf_sendbeat_8_3() method. The reason for the 8_3 in the method name is that it refers to section 8.3 (Path Heartbeat) in RFC 4960. When an endpoint receives a HEARTBEAT chunk, it replies with a HEARTBEAT-ACK chunk if it is in the SCTP_STATE_COOKIE_ECHOED state or the SCTP_STATE_ESTABLISHED state.

### SCTP Multistreaming

Streams are unidirectional data flows within a single association. The number of Outbound Streams and the number of Inbound Streams are declared during the association setup (by the INIT chunk), and the streams are valid during the entire association lifetime. A userspace application can set the number of streams by creating an sctp_initmsg object, initializing its sinit_num_ostreams and sinit_max_instreams members, and then calling the setsockopt() method with SCTP_INITMSG. Initialization of the number of streams can also be done with the sendmsg() system call. This, in turn, sets the corresponding fields in the initmsg object of the sctp_sock object. One of the biggest reasons streams were added was to remove the Head-of-Line blocking (HoL blocking) condition. Head-of-line blocking is a performance-limiting phenomenon that occurs when a line of packets is held up by the first packet, for example, with multiple requests in HTTP pipelining. With SCTP multistreaming, this problem does not exist, because each stream is sequenced separately and guaranteed to be delivered in order; if one of the streams is blocked due to loss or congestion, the other streams might not be blocked, and data continues to be delivered on them.

Note

Regarding using sockets for SCTP, I should mention the lksctp-tools project ( http://lksctp.sourceforge.net/ ). This project provides a Linux userspace library for SCTP (libsctp), including C language header files (netinet/sctp.h), for accessing SCTP-specific application programming interfaces not provided by the standard sockets, as well as some helper utilities around SCTP. I should also mention RFC 6458, "Sockets API Extensions for Stream Control Transmission Protocol (SCTP)," which describes a mapping of the Stream Control Transmission Protocol (SCTP) into the sockets API.

### SCTP Multihoming

SCTP multihoming refers to having multiple IP addresses on both endpoints. One of the really nice features of SCTP is that endpoints are multihomed by default if the local IP address was specified as a wildcard. There has also been a lot of confusion about the multihoming feature, because people expect that simply by binding to multiple addresses, the associations will end up being multihomed.
This is not true, because only destination multihoming is implemented. In other words, both connected endpoints have to be multihomed for true failover capability. If the local association knows about only a single destination address, there will be only one path and thus no multihoming.

This description of SCTP multihoming concludes the SCTP part of this chapter. The next section describes the DCCP protocol, which is the last transport protocol discussed in this chapter.

## DCCP: The Datagram Congestion Control Protocol

DCCP is an unreliable, congestion-controlled transport layer protocol and, as such, it borrows from both UDP and TCP while adding new features. Like UDP, it is message-oriented and unreliable. Like TCP, it is a connection-oriented protocol, and it also uses a 3-way handshake to set up the connection. The development of DCCP was helped by ideas from academia, through the participation of several research institutes, but it has not been tested so far in larger-scale Internet setups. The use of DCCP would make sense, for instance, in applications that require low delays and where a small degree of data loss is permitted, like telephony and streaming media applications.

Congestion control in DCCP differs from that in TCP in that the congestion-control algorithm (called a CCID) can be negotiated between endpoints, and congestion control can be applied on both the forward and reverse paths of a connection (called half-connections in DCCP). Two classes of pluggable congestion control have been specified so far. The first type is a rate-based, smooth "TCP-friendly" algorithm (CCID-3, RFC 4342 and RFC 5348), for which there is an experimental small-packet variation called CCID-4 (RFC 5622, RFC 4828). The second type of congestion control, "TCP-like" (RFC 4341), applies a basic TCP congestion-control algorithm with selective acknowledgments (SACK, RFC 2018) to DCCP flows. At least one CCID needs to be implemented by endpoints in order to function. The first DCCP Linux implementation was released in Linux kernel 2.6.14 (2005). This chapter describes the implementation principles of DCCPv4 (IPv4). Delving into the implementation details of the individual DCCP congestion-control algorithms is beyond the scope of this book.

Now that I have introduced the DCCP protocol in general, I will describe the DCCP header.

### DCCP Header

Every DCCP packet starts with a DCCP header. The minimum DCCP header length is 12 bytes. DCCP uses a variable-length header, which can range from 12 to 1020 bytes, depending on whether short sequence numbers are used and on which TLV packet options are used. DCCP sequence numbers are incremented for each packet (not per byte as in TCP) and can be shortened from 6 to 3 bytes.
struct dccp_hdr {
    __be16 dccph_sport,
           dccph_dport;
    __u8 dccph_doff;
#if defined(__LITTLE_ENDIAN_BITFIELD)
    __u8 dccph_cscov:4,
         dccph_ccval:4;
#elif defined(__BIG_ENDIAN_BITFIELD)
    __u8 dccph_ccval:4,
         dccph_cscov:4;
#else
#error "Adjust your defines"
#endif
    __sum16 dccph_checksum;
#if defined(__LITTLE_ENDIAN_BITFIELD)
    __u8 dccph_x:1,
         dccph_type:4,
         dccph_reserved:3;
#elif defined(__BIG_ENDIAN_BITFIELD)
    __u8 dccph_reserved:3,
         dccph_type:4,
         dccph_x:1;
#else
#error "Adjust your defines"
#endif
    __u8 dccph_seq2;
    __be16 dccph_seq;
};

(include/uapi/linux/dccp.h)

The following is a description of the important members of the dccp_hdr structure:

* dccph_sport: The source port (16 bit).

* dccph_dport: The destination port (16 bit).

* dccph_doff: Data offset (8 bits). The size of the DCCP header in multiples of 4 bytes.

* dccph_cscov: Determines which part of the packet is covered by the checksum. Using partial checksumming might improve performance when it is used with applications that can tolerate corruption of some low percentage of the data.

* dccph_ccval: CCID-specific information from sender to receiver (not always used).

* dccph_x: The Extended Sequence Numbers bit (1 bit). This flag is set when using 48-bit Extended Sequence and Acknowledgment Numbers.

* dccph_type: The DCCP header type (4 bits). This can be, for example, DCCP_PKT_DATA for a data packet or DCCP_PKT_ACK for an ACK. See Table 11-3, "DCCP packet types," in the "Quick Reference" section at the end of this chapter.

* dccph_reserved: Reserved for future use (3 bits).

* dccph_checksum: The checksum (16 bit). The Internet checksum of the DCCP header and data, computed similarly to UDP and TCP. If partial checksums are used, only the length of the application data specified by dccph_cscov is checksummed.

* dccph_seq2: Sequence number. This is used when working with Extended Sequence Numbers (8 bit).

* dccph_seq: Sequence number. It is incremented by 1 for each packet (16 bit).

Note

DCCP sequence numbers depend on dccph_x. (For details, refer to the dccp_hdr_seq() method, include/linux/dccp.h.)

Figure 11-4 shows a DCCP header with the dccph_x flag set, so 48-bit Extended Sequence Numbers are used.

Figure 11-5 shows a DCCP header with the dccph_x flag not set, so 24-bit Sequence Numbers are used.

### DCCP Initialization

DCCP initialization happens much as in TCP and UDP. Considering the DCCPv4 case (net/dccp/ipv4.c), first a proto object (dccp_v4_prot) is defined and its DCCP-specific callbacks are set; we also define a net_protocol object (dccp_v4_protocol) and initialize it:

static struct proto dccp_v4_prot = {
    .name       = "DCCP",
    .owner      = THIS_MODULE,
    .close      = dccp_close,
    .connect    = dccp_v4_connect,
    .disconnect = dccp_disconnect,
    .ioctl      = dccp_ioctl,
    .init       = dccp_v4_init_sock,
    . . .
    .sendmsg    = dccp_sendmsg,
    .recvmsg    = dccp_recvmsg,
    . . .
+ +} + +(net/dccp/ipv4.c) + +static const struct net_protocol dccp_v4_protocol = { + +.handler = dccp_v4_rcv, + +.err_handler = dccp_v4_err, + +.no_policy = 1, + +.netns_ok = 1, + +}; + +(net/dccp/ipv4.c) + +We register the dccp_v4_prot object and the dccp_v4_protocol object in the dccp_v4_init() method: + +static int __init dccp_v4_init(void) + +{ + +int err = proto_register(&dccp_v4_prot, 1); + +if (err != 0) + +goto out; + +err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP); + +if (err != 0) + +goto out_proto_unregister; + +(net/dccp/ipv4.c) + +### DCCP Socket Initialization + +Socket creation in DCCP from userspace uses the socket() system call, where the domain argument (SOCK_DCCP) indicates that a DCCP socket is to be created. Within the kernel, this causes DCCP socket initialization via the dccp_v4_init_sock() callback, which relies on the dccp_init_sock() method to perform the actual work: + +static int dccp_v4_init_sock(struct sock *sk) + +{ + +static __u8 dccp_v4_ctl_sock_initialized; + +int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized); + +if (err == 0) { + +if (unlikely(!dccp_v4_ctl_sock_initialized)) + +dccp_v4_ctl_sock_initialized = 1; + +inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops; + +} + +return err; + +} + +(net/dccp/ipv4.c) + +The most important tasks of the dccp_init_sock() method are these: + + * Initialization of the DCCP socket fields with sane default values (for example, the socket state is set to be DCCP_CLOSED) + + * Initialization of the DCCP timers (via the dccp_init_xmit_timers() method) + + * Initialization of the feature-negotiation part via calling the dccp_feat_init() method. Feature negotiation is a distinguishing feature of DCCP by which endpoints can mutually agree on properties of each side of the connection. It extends TCP feature negotiation and is described further in RFC 4340, sec. 6. + +### Receiving Packets from the Network Layer (L3) with DCCP + +The main handler for receiving DCCP packets from the network layer (L3) is the dccp_v4_rcv () method: + +static int dccp_v4_rcv(struct sk_buff *skb) + +{ + +const struct dccp_hdr *dh; + +const struct iphdr *iph; + +struct sock *sk; + +int min_cov; + +First we discard invalid packets. For example, if the packet is not for this host (the packet type is not PACKET_HOST), or if the packet size is shorter than the DCCP header (which is 12 bytes): + +if (dccp_invalid_packet(skb)) + +goto discard_it; + +Then we perform a lookup according to the flow: + +sk = __inet_lookup_skb(&dccp_hashinfo, skb, + +dh->dccph_sport, dh->dccph_dport); + +If no socket was found, the packet is dropped: + +if (sk == NULL) { + +. . . + +goto no_dccp_socket; + +} + +We make some more checks relating to Minimum Checksum Coverage, and if everything is fine, we proceed to the generic sk_receive_skb() method to pass the packet to the transport layer (L4). Note that the dccp_v4_rcv() method is very similar in structure and function to the tcp_v4_rcv() method. This is because the original author of DCCP in Linux, Arnaldo Carvalho de Melo, has worked quite hard to make the similarities between TCP and DCCP obvious and clear in the code. + +. . . + +return sk_receive_skb(sk, skb, 1); + +} + +(net/dccp/ipv4.c) + +### Sending Packets with DCCP + +Sending data from a DCCP userspace socket is eventually handled by the dccp_sendmsg() method in the kernel (net/dccp/proto.c). This parallels the TCP case, where the tcp_sendmsg() kernel method handles sending data from a TCP userspace socket. 
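Before looking at the kernel path, it may help to see the userspace side that eventually triggers it. The following is a minimal, illustrative DCCP client sketch (not taken from the book or the kernel tree); the server address and port are arbitrary example values:

```c
/* Minimal DCCP client sketch (illustrative only). SOCK_DCCP (6) and
 * IPPROTO_DCCP (33) may already be defined by the C library headers. */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#ifndef SOCK_DCCP
#define SOCK_DCCP 6
#endif
#ifndef IPPROTO_DCCP
#define IPPROTO_DCCP 33
#endif

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port   = htons(5001),	/* arbitrary example port */
	};
	int fd = socket(AF_INET, SOCK_DCCP, IPPROTO_DCCP);

	if (fd < 0) {
		perror("socket");	/* fails if DCCP support is missing */
		return 1;
	}
	inet_pton(AF_INET, "192.0.2.1", &addr.sin_addr);
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0)
		write(fd, "hello", 5);	/* ends up in dccp_sendmsg() in the kernel */
	close(fd);
	return 0;
}
```

Note that SOCK_DCCP is the socket type (unlike the more common SOCK_STREAM or SOCK_DGRAM), and that IPPROTO_DCCP is passed explicitly as the protocol.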
Let's take a look at the dccp_sendmsg() method:

```c
int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		 size_t len)
{
	const struct dccp_sock *dp = dccp_sk(sk);
	const int flags = msg->msg_flags;
	const int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int rc, size;
	long timeo;
```

Allocate an SKB:

```c
	/* size is set to sk->sk_prot->max_header + len earlier (not shown) */
	skb = sock_alloc_send_skb(sk, size, noblock, &rc);
	lock_sock(sk);
	if (skb == NULL)
		goto out_release;

	skb_reserve(skb, sk->sk_prot->max_header);
```

Copy the data blocks from the msghdr object to the SKB:

```c
	rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	if (rc != 0)
		goto out_discard;

	if (!timer_pending(&dp->dccps_xmit_timer))
		dccp_write_xmit(sk);
```

Depending on the type of congestion control (window-based or rate-based) chosen for the connection, the dccp_write_xmit() method will either cause the packet to be sent later (upon dccps_xmit_timer expiry) or pass it on for immediate sending by the dccp_xmit_packet() method. This, in turn, relies on the dccp_transmit_skb() method to initialize the outgoing DCCP header and pass it to the L3-specific queue_xmit sending callback (the ip_queue_xmit() method for IPv4, and the inet6_csk_xmit() method for IPv6). I will conclude our discussion of DCCP with a short section about DCCP and NAT.

### DCCP and NAT

Some NAT devices do not let DCCP through (usually because their firmware is typically small and hence does not support "exotic" IP protocols such as DCCP). RFC 5597 (September 2009) suggested behavioral requirements for NATs to support NATed DCCP communications; however, it is not clear to what extent these recommendations have been implemented in consumer devices. One of the motivations for DCCP-UDP was the absence of NAT devices that would let DCCP through (RFC 6773, sec. 1). There is a detail that might be interesting in comparison with TCP: the latter, by default, supports simultaneous open (RFC 793, section 3.4), whereas the initial specification of DCCP in RFC 4340, section 4.6, disallowed the use of simultaneous open. To support NAPT traversal, RFC 5596 updated RFC 4340 in September 2009 with a "near simultaneous open" technique, which added one packet type (DCCP-LISTEN, RFC 5596, section 2.2.1) to the list and changed the state machine to support two more states (section 2.2.2) for near-simultaneous open. The motivation was a NAT "hole punching" technique, which would require, however, that NATs supporting DCCP existed (the same problem as above). As a result of this chicken-and-egg problem, DCCP has not seen much exposure over the Internet. Perhaps the UDP encapsulation will change that, but then it would no longer really be considered a transport layer protocol.

## Summary

This chapter discussed four transport protocols: UDP and TCP, which are the most commonly used, and SCTP and DCCP, which are newer protocols. You learned the basic differences between these protocols. You learned that TCP is a much more complex protocol than UDP, as it uses a state machine and several timers and requires acknowledgments. You learned about the header of each of these protocols and about sending and receiving packets with these protocols. I discussed some unique features of the SCTP protocol, like multihoming and multistreaming.

The next chapter will deal with the wireless subsystem and its implementation in Linux.
In the "Quick Reference" section that follows, I will cover the top methods related to the topics discussed in this chapter, ordered by their context, and I will also present the tables that were mentioned in this chapter.

## Quick Reference

I will conclude this chapter with a short list of the important methods of the sockets and transport-layer protocols discussed in this chapter. Afterward, there is one macro and there are three tables.

### Methods

Here are the methods.

#### int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc);

This method builds an ipcm_cookie object by parsing the specified msghdr object.

#### void sock_put(struct sock *sk);

This method decrements the reference count of the specified sock object.

#### void sock_hold(struct sock *sk);

This method increments the reference count of the specified sock object.

#### int sock_create(int family, int type, int protocol, struct socket **res);

This method performs some sanity checks, and if everything is fine, it allocates a socket by calling the sock_alloc() method and then calls net_families[family]->create. (In the case of IPv4, this is the inet_create() method.)

#### int sock_map_fd(struct socket *sock, int flags);

This method allocates a file descriptor and fills in the file entry.

#### bool sock_flag(const struct sock *sk, enum sock_flags flag);

This method returns true if the specified flag is set in the specified sock object.

#### int tcp_v4_rcv(struct sk_buff *skb);

This method is the main handler for processing incoming TCP packets arriving from the network layer (L3).

#### void tcp_init_sock(struct sock *sk);

This method performs address-family-independent socket initializations.

#### struct tcphdr *tcp_hdr(const struct sk_buff *skb);

This method returns the TCP header associated with the specified skb.

#### int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size);

This method handles sending TCP packets from userspace.

#### struct tcp_sock *tcp_sk(const struct sock *sk);

This method returns the tcp_sock object associated with the specified sock object (sk).

#### int udp_rcv(struct sk_buff *skb);

This method is the main handler for processing incoming UDP packets arriving from the network layer (L3).

#### struct udphdr *udp_hdr(const struct sk_buff *skb);

This method returns the UDP header associated with the specified skb.

#### int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len);

This method handles UDP packets that are sent from userspace.

#### struct sctphdr *sctp_hdr(const struct sk_buff *skb);

This method returns the SCTP header associated with the specified skb.

#### struct sctp_sock *sctp_sk(const struct sock *sk);

This method returns the SCTP socket (sctp_sock object) associated with the specified sock object.

#### int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t msg_len);

This method handles SCTP packets that are sent from userspace.

#### struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, const struct sock *sk, sctp_scope_t scope, gfp_t gfp);

This method allocates and initializes a new SCTP association.

#### void sctp_association_free(struct sctp_association *asoc);

This method frees the resources of an SCTP association.
#### void sctp_chunk_hold(struct sctp_chunk *ch);

This method increments the reference count of the specified SCTP chunk.

#### void sctp_chunk_put(struct sctp_chunk *ch);

This method decrements the reference count of the specified SCTP chunk. If the reference count reaches 0, it frees the chunk by calling the sctp_chunk_destroy() method.

#### int sctp_rcv(struct sk_buff *skb);

This method is the main Rx handler for incoming SCTP packets.

#### static int dccp_v4_rcv(struct sk_buff *skb);

This method is the main Rx handler for processing incoming DCCP packets that arrive from the network layer (L3).

#### int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len);

This method handles DCCP packets that are sent from userspace.

### Macros

And here is the macro.

#### sctp_chunk_is_data()

This macro returns 1 if the specified chunk is a data chunk; otherwise, it returns 0.

### Tables

Take a look at the tables used in this chapter.

Table 11-1.

TCP and UDP proto_ops objects

| proto_ops callback | TCP | UDP |
|---|---|---|
| release | inet_release | inet_release |
| bind | inet_bind | inet_bind |
| connect | inet_stream_connect | inet_dgram_connect |
| socketpair | sock_no_socketpair | sock_no_socketpair |
| accept | inet_accept | sock_no_accept |
| getname | inet_getname | inet_getname |
| poll | tcp_poll | udp_poll |
| ioctl | inet_ioctl | inet_ioctl |
| listen | inet_listen | sock_no_listen |
| shutdown | inet_shutdown | inet_shutdown |
| setsockopt | sock_common_setsockopt | sock_common_setsockopt |
| getsockopt | sock_common_getsockopt | sock_common_getsockopt |
| sendmsg | inet_sendmsg | inet_sendmsg |
| recvmsg | inet_recvmsg | inet_recvmsg |
| mmap | sock_no_mmap | sock_no_mmap |
| sendpage | inet_sendpage | inet_sendpage |
| splice_read | tcp_splice_read | - |
| compat_setsockopt | compat_sock_common_setsockopt | compat_sock_common_setsockopt |
| compat_getsockopt | compat_sock_common_getsockopt | compat_sock_common_getsockopt |
| compat_ioctl | inet_compat_ioctl | inet_compat_ioctl |

Note

See the inet_stream_ops and the inet_dgram_ops definitions in net/ipv4/af_inet.c.

Table 11-2.

Chunk types

| Chunk Type | Linux Symbol | Value |
|---|---|---|
| Payload Data | SCTP_CID_DATA | 0 |
| Initiation | SCTP_CID_INIT | 1 |
| Initiation Acknowledgment | SCTP_CID_INIT_ACK | 2 |
| Selective Acknowledgment | SCTP_CID_SACK | 3 |
| Heartbeat Request | SCTP_CID_HEARTBEAT | 4 |
| Heartbeat Acknowledgment | SCTP_CID_HEARTBEAT_ACK | 5 |
| Abort | SCTP_CID_ABORT | 6 |
| Shutdown | SCTP_CID_SHUTDOWN | 7 |
| Shutdown Acknowledgment | SCTP_CID_SHUTDOWN_ACK | 8 |
| Operation Error | SCTP_CID_ERROR | 9 |
| State Cookie | SCTP_CID_COOKIE_ECHO | 10 |
| Cookie Acknowledgment | SCTP_CID_COOKIE_ACK | 11 |
| Explicit Congestion Notification Echo (ECNE) | SCTP_CID_ECN_ECNE | 12 |
| Congestion Window Reduced (CWR) | SCTP_CID_ECN_CWR | 13 |
| Shutdown Complete | SCTP_CID_SHUTDOWN_COMPLETE | 14 |
| SCTP Authentication Chunk (RFC 4895) | SCTP_CID_AUTH | 0x0F |
| Forward Transmission Sequence Number (FWD TSN) | SCTP_CID_FWD_TSN | 0xC0 |
| Address Configuration Change Chunk | SCTP_CID_ASCONF | 0xC1 |
| Address Configuration Acknowledgment Chunk | SCTP_CID_ASCONF_ACK | 0x80 |

Table 11-3.

DCCP packet types

| Linux Symbol | Description |
|---|---|
| DCCP_PKT_REQUEST | Sent by the client to initiate a connection (the first part of the three-way initiation handshake). |
| DCCP_PKT_RESPONSE | Sent by the server in response to a DCCP-Request (the second part of the three-way initiation handshake). |
| DCCP_PKT_DATA | Used to transmit application data. |
| DCCP_PKT_ACK | Used to transmit pure acknowledgments. |
| DCCP_PKT_DATAACK | Used to transmit application data with piggybacked acknowledgment information. |
| DCCP_PKT_CLOSEREQ | Sent by the server to request that the client close the connection. |
| DCCP_PKT_CLOSE | Used by the client or the server to close the connection; elicits a DCCP-Reset packet in response. |
| DCCP_PKT_RESET | Used to terminate the connection, either normally or abnormally. |
| DCCP_PKT_SYNC | Used to resynchronize sequence numbers after large bursts of packet loss. |
| DCCP_PKT_SYNCACK | Acknowledges a DCCP_PKT_SYNC. |

# 12. Wireless in Linux

Abstract

Chapter 11 deals with Layer 4 protocols, which enable us to communicate with userspace. This chapter deals with the wireless stack in the Linux kernel. I describe the Linux wireless stack (the mac80211 subsystem) and discuss some implementation details of important mechanisms in it, such as packet aggregation and block acknowledgment, used in IEEE 802.11n, and power save mode. Becoming familiar with the 802.11 MAC header is essential in order to understand the wireless subsystem implementation. The 802.11 MAC header, its members, and their usage are described in depth in this chapter. I also discuss some common wireless topologies, like infrastructure BSS, independent BSS, and Mesh networking.

## Mac80211 Subsystem

At the end of the 1990s, there were discussions in IEEE regarding a protocol for wireless local area networks (WLANs). The original version of the IEEE 802.11 spec for WLANs was released in 1997 and revised in 1999. In the following years, some extensions were added, formally termed 802.11 amendments. These extensions can be divided into PHY (Physical) layer extensions, MAC (Medium Access Control) layer extensions, regulatory extensions, and others. PHY layer extensions are, for example, 802.11b from 1999, 802.11a (also from 1999), and 802.11g from 2003. MAC layer extensions are, for example, 802.11e for QoS and 802.11s for Mesh networking. The "Mesh Networking" section of this chapter deals with the Linux kernel implementation of the IEEE 802.11s amendment. The IEEE 802.11 spec was revised, and in 2007 a second version of 1,232 pages was released. In 2012, a spec of 2,793 pages was released, available from http://standards.ieee.org/findstds/standard/802.11-2012.html . I refer to this spec as IEEE 802.11-2012 in this chapter.
Following is a partial list of important 802.11 amendments:

* IEEE 802.11d: International (country-to-country) roaming extensions (2001).

* IEEE 802.11e: Enhancements: QoS, including packet bursting (2005).

* IEEE 802.11h: Spectrum Managed 802.11a for European compatibility (2004).

* IEEE 802.11i: Enhanced security (2004).

* IEEE 802.11j: Extensions for Japan (2004).

* IEEE 802.11k: Radio resource measurement enhancements (2008).

* IEEE 802.11n: Higher-throughput improvements using MIMO (multiple input, multiple output antennas) (2009).

* IEEE 802.11p: WAVE: Wireless Access for the Vehicular Environment (such as ambulances and passenger cars). It has some peculiarities, such as not using the BSS concept and using narrower (5/10 MHz) channels. Note that IEEE 802.11p isn't supported in Linux as of this writing.

* IEEE 802.11v: Wireless network management.

* IEEE 802.11w: Protected Management Frames.

* IEEE 802.11y: 3650–3700 MHz operation in the U.S. (2008).

* IEEE 802.11z: Extensions to Direct Link Setup (DLS) (Aug 2007–Dec 2011).

It was only around 2001, about four years after the first IEEE 802.11 spec was approved, that laptops became very popular; many of these laptops were sold with wireless network interfaces. Today every laptop includes WiFi as standard equipment. It was important to the Linux community at that time to provide Linux drivers for these wireless network interfaces and to provide a Linux wireless network stack, in order to stay competitive with other OSes (such as Windows and Mac OS). Less effort went into architecture and design; "They just want their hardware to work," as Jeff Garzik, the Linux kernel wireless maintainer at that time, put it. When the first wireless drivers for Linux were developed, there was no general wireless API. As a result, there were many cases of code duplication between drivers, as developers implemented their drivers from scratch. Some drivers were based on FullMAC, which means that most of the management layer (MLME) is managed in hardware. In the years since, a new 802.11 wireless stack called mac80211 was developed. It was integrated into the Linux kernel in July 2007, for the 2.6.22 Linux kernel. The mac80211 stack is based on the d80211 stack, which is an open source, GPL-licensed stack by a company named Devicescape.

I cannot delve into the details of the PHY layer, because that subject is very wide and deserves a book of its own. However, I must note that there are many differences between 802.11 and 802.3 wired Ethernet. Here are two major differences:

* Ethernet works with CSMA/CD, whereas 802.11 works with CSMA/CA. CSMA/CA stands for carrier sense multiple access/collision avoidance, and CSMA/CD stands for carrier sense multiple access/collision detection. The difference, as you might guess, is the collision detection. With Ethernet, a station starts to transmit when the medium is idle; if a collision is detected during transmission, it stops, and a random backoff period starts. Wireless stations cannot detect collisions while transmitting, whereas wired stations can. With CSMA/CA, the wireless station waits for a free medium and only then transmits the frame. In case of a collision, the station will not notice it, but because no acknowledgment frame will be sent for this packet, it is retransmitted after a timeout has elapsed if an acknowledgment is not received.

* Wireless traffic is sensitive to interference.
As a result, the 802.11 spec requires that every frame, except for broadcast and multicast frames, be acknowledged when it is received. Packets that are not acknowledged in time should be retransmitted. Note that since IEEE 802.11e there is a mode that does not require acknowledgment (the QoSNoAck mode), but it is rarely used in practice.

## The 802.11 MAC Header

Each MAC frame consists of a MAC header, a frame body of variable length, and an FCS (Frame Check Sequence), which is a 32-bit CRC. Figure 12-1 shows the 802.11 header.

Figure 12-1.

IEEE 802.11 header. Note that not all members are always used, as this section will shortly explain

The 802.11 header is represented in mac80211 by the ieee80211_hdr structure:

```c
struct ieee80211_hdr {
	__le16 frame_control;
	__le16 duration_id;
	u8 addr1[6];
	u8 addr2[6];
	u8 addr3[6];
	__le16 seq_ctrl;
	u8 addr4[6];
} __packed;
```
(include/linux/ieee80211.h)

In contrast to an Ethernet header (struct ethhdr), which contains only three fields (source MAC address, destination MAC address, and Ethertype), the 802.11 header contains up to six addresses and some other fields. For a typical data frame, though, only three addresses are used (for example, for Access Point (AP)/client communication). With an ACK frame, only the receiver address is used. Note that Figure 12-1 shows only four addresses, but when working with Mesh networking, a Mesh extension header with two additional addresses is used.

I now turn to a description of the 802.11 header fields, starting with the first field in the 802.11 header, called the frame control. This is an important field, and in many cases its contents determine the meaning of other fields of the 802.11 MAC header (especially addresses).

### The Frame Control

The frame control length is 16 bits. Figure 12-2 shows its fields and the size of each field.

Figure 12-2.

Frame control fields

The following is a description of the frame control members:

* Protocol version: The version of the 802.11 MAC in use. Currently there is only one version of the MAC, so this field is always 0.

* Type: There are three types of packets in 802.11: management, control, and data.

  * Management packets (IEEE80211_FTYPE_MGMT) are for management actions like association, authentication, scanning, and more.

  * Control packets (IEEE80211_FTYPE_CTL) usually have some relevance to data packets; for example, a PS-Poll packet is for retrieving packets from an AP buffer. Another example: a station that wants to transmit first sends a control packet named RTS (request to send); if the medium is free, the destination station will send back a control packet named CTS (clear to send).

  * Data packets (IEEE80211_FTYPE_DATA) are the raw data packets. Null packets are a special case of raw packets, carrying no data and used mostly for power management control purposes. I discuss null packets in the "Power Save Mode" section later in this chapter.

* Subtype: For all three of the aforementioned packet types (management, control, and data), there is a sub-type field which identifies the character of the packet used. For example:

  * A value of 0100 for the sub-type field in a management frame denotes that the packet is a Probe Request (IEEE80211_STYPE_PROBE_REQ) management packet, which is used in a scan operation.

  * A value of 1011 for the sub-type field in a control packet denotes that this is a request to send (IEEE80211_STYPE_RTS) control packet.
  * A value of 0100 for the sub-type field of a data packet denotes that this is a null data (IEEE80211_STYPE_NULLFUNC) packet, which is used for power management control.

  * A value of 1000 (IEEE80211_STYPE_QOS_DATA) for the sub-type of a data packet means that this is a QoS data packet; this sub-type was added by the IEEE 802.11e amendment, which dealt with QoS enhancements.

* ToDS: When this bit is set, it means the packet is destined for the distribution system.

* FromDS: When this bit is set, it means the packet comes from the distribution system.

* More Frag: When fragmentation is used, this bit is set to 1.

* Retry: When a packet is retransmitted, this bit is set to 1. A typical case of retransmission is when a packet that was sent did not receive an acknowledgment in time. The acknowledgments are usually sent by the firmware of the wireless device.

* Pwr Mgmt: When the power management bit is set, it means that the station will enter power save mode. I discuss power save mode in the "Power Save Mode" section later in this chapter.

* More Data: When an AP sends packets that it buffered for a sleeping station, it sets the More Data bit to 1 as long as the buffer is not empty. Thus the station knows that there are more packets it should retrieve. When the buffer has been emptied, this bit is set to 0.

* Protected Frame: This bit is set to 1 when the frame body is encrypted; only data frames and authentication frames can be encrypted.

* Order: With the MAC service called strict ordering, the order of frames is important. When this service is in use, the order bit is set to 1. It is rarely used.

Note

The action frame (IEEE80211_STYPE_ACTION) was introduced with the 802.11h amendment, which dealt with spectrum and transmit power management. However, because of a lack of space for management packet sub-types, action frames are also used in various newer amendments to the standard; for example, HT action frames in 802.11n.

## The Other 802.11 MAC Header Members

The following describes the other members of the 802.11 header, after the frame control:

* Duration/ID: The duration holds values for the Network Allocation Vector (NAV) in microseconds, and it consists of 15 bits of the Duration/ID field; the sixteenth bit is 0. When working in power save mode, it is the AID (association id) of a station for PS-Poll frames (see 8.2.4.2 (a) in IEEE 802.11-2012). The Network Allocation Vector (NAV) is a virtual carrier-sensing mechanism. I do not delve into NAV internals because that is beyond the scope of this chapter.

* Sequence Control: This is a 2-byte field specifying the sequence control. In 802.11, it is possible for a packet to be received more than once, most commonly when an acknowledgment is not received for some reason. The sequence control field consists of a fragment number (4 bits) and a sequence number (12 bits). The sequence number is generated by the transmitting station, in the ieee80211_tx_h_sequence() method. In the case of a duplicate frame in a retransmission, the frame is dropped, and a counter of dropped duplicate frames (dot11FrameDuplicateCount) is incremented by 1; this is done in the ieee80211_rx_h_check() method. The Sequence Control field is not present in control packets.

* Address1 – Address4: There are four addresses, but you don't always use all of them. Address 1 is the Receive Address (RA) and is used in all packets. Address 2 is the Transmit Address (TA), and it exists in all packets except ACK and CTS packets.
Address 3 is used only for management and data packets. Address 4 is used when the ToDS and FromDS bits of the frame control are set; this happens when operating in a Wireless Distribution System.

* QoS Control: The QoS control field was added by the 802.11e amendment and is present only in QoS data packets. Because it is not part of the original 802.11 spec, it is not part of the original mac80211 implementation either, so it is not a member of the ieee80211_hdr struct. In fact, it was added at the end of the 802.11 header, and it can be accessed by the ieee80211_get_qos_ctl() method. The QoS control field includes the tid (Traffic Identifier), the ACK Policy, and a field called A-MSDU Present, which tells whether an A-MSDU is present. I discuss the A-MSDU later in this chapter, in the "High Throughput (ieee802.11n)" section.

* HT Control Field: The HT (high throughput) control field was added by the 802.11n amendment (see 7.1.3.5(a) of the 802.11n-2009 spec).

This section covered the 802.11 MAC header, with a description of its members and their use. Becoming familiar with the 802.11 MAC header is essential for understanding the mac80211 stack.

## Network Topologies

There are two popular network topologies in 802.11 wireless networks. The first topology I discuss is Infrastructure BSS mode, which is the most popular. You encounter Infrastructure BSS wireless networks in home wireless networks and offices. Later I discuss the IBSS (Ad Hoc) mode. Note that IBSS is not Infrastructure BSS; IBSS is Independent BSS, which is an ad hoc network, discussed later in this section.

### Infrastructure BSS

When working in Infrastructure BSS mode, there is a central device, called an Access Point (AP), and some client stations. Together they form a BSS (Basic Service Set). The client stations must first perform association and authentication against the AP to be able to transmit packets via the AP. On many occasions, client stations perform scanning prior to authentication and association, in order to get details about the AP. Association is exclusive: a client can be associated with only one AP at a given moment. When a client associates with an AP successfully, it gets an AID (association id), which is a unique number (within this BSS) in the range 1–2007. An AP is in fact a wireless network device with some hardware additions (like Ethernet ports, LEDs, a button to reset to manufacturer defaults, and more). A management daemon runs on the AP device. An example of such software is the hostapd daemon. This software handles some of the management tasks of the MLME layer, such as authentication and association requests. It achieves this by registering itself to receive the relevant management frames via nl80211. The hostapd project is an open source project which enables several wireless network devices to operate as an AP.

Clients can communicate with other clients (or with stations in a different network that is bridged to the AP) by sending packets to the AP, which relays them to their final destination. To cover a large area, you can deploy multiple APs and connect them by wire. This type of deployment is called an Extended Service Set (ESS). Within an ESS deployment, there are two or more BSSs. Multicasts and broadcasts sent in one BSS, which may arrive at a nearby BSS, are rejected by the stations of the nearby BSS (the bssid in the 802.11 header does not match). Within such a deployment, each AP usually uses a different channel to minimize interference.
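Since hostapd was just mentioned: as a rough, hypothetical illustration of the AP side, a minimal hostapd configuration might look like the following sketch (the interface name, SSID, and channel are arbitrary example values, and the exact set of required options depends on the hostapd version):

```
# Minimal hostapd.conf sketch (illustrative values only)
# Wireless interface that will act as the AP:
interface=wlan0
# Use the kernel nl80211/mac80211 interface:
driver=nl80211
# Human-readable network name (the SSID):
ssid=TestBss
# 2.4 GHz band (802.11b/g):
hw_mode=g
# Operating channel:
channel=1
```

With such a configuration, hostapd registers via nl80211 to receive the relevant management frames and performs the authentication and association handling described above.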
### IBSS, or Ad Hoc Mode

An IBSS network is often formed without preplanning, for only as long as the WLAN is needed. An IBSS network is also called an ad hoc network. Creating an IBSS is a simple procedure. You can set up an IBSS by running this iw command from the command line (note that the 2412 parameter is the frequency, in MHz, of channel 1):

```
iw wlan0 ibss join AdHocNetworkName 2412
```

Or, when using the iwconfig tool, with these two commands:

```
iwconfig wlan0 mode ad-hoc
iwconfig wlan0 essid AdHocNetworkName
```

This triggers IBSS creation by calling the ieee80211_sta_create_ibss() method (net/mac80211/ibss.c). Then the ssid (AdHocNetworkName in this case) has to be distributed manually (or otherwise) to everyone who wants to connect to the ad hoc network. When working with an IBSS, you do not have an AP. The bssid of the IBSS is a random 48-bit address (based on calling the get_random_bytes() method). Power management in Ad Hoc mode is a bit more complex than power management in Infrastructure BSS; it uses Announcement Traffic Indication Map (ATIM) messages. ATIM is not supported by mac80211 and is not discussed in this chapter.

The next section describes power save mode, which is one of the most important mechanisms of the mac80211 network stack.

## Power Save Mode

Apart from relaying packets, there is another important function of the AP: buffering packets for client stations that enter power save mode. Clients are usually battery-powered devices. From time to time, the wireless network interface enters power save mode.

### Entering Power Save Mode

When a client station enters power save mode, it informs the AP about it, usually by sending a null data packet. In fact, technically speaking, it does not have to be a null data packet; it is enough that it is a packet with PM=1 (PM is the Power Management flag in the frame control). An AP that gets such a packet starts keeping unicast packets destined for that station in a special buffer called ps_tx_buf; there is such a buffer for every station. This buffer is in fact a linked list of packets, and it can hold up to 128 packets (STA_MAX_TX_BUFFER) for each station. If the buffer fills up, the packets that were received first are discarded (FIFO). Apart from this, there is a single buffer called bc_buf, for multicast and broadcast packets (in the 802.11 stack, multicast packets should be received and processed by all the stations in the same BSS). The bc_buf buffer can also hold up to 128 packets (AP_MAX_BC_BUFFER). When a wireless network interface is in power save mode, it cannot receive or send packets.

### Exiting Power Save Mode

From time to time, an associated station wakes up by itself (via some timer); it then checks for special management packets, called beacons, which the AP sends periodically. Typically, an AP sends 10 beacons per second; on most APs, this is a configurable parameter. These beacons contain data in information elements, which constitute the data in the management packet. The station that awoke checks a specific information element called the TIM (Traffic Indication Map), by calling the ieee80211_check_tim() method (include/linux/ieee80211.h). The TIM is an array of 2008 entries. Because the full TIM is 251 bytes (2008 bits) in size, you are allowed to send a partial virtual bitmap, which is smaller.
If the entry in the TIM for a station is set, it means that the AP has saved unicast packets for that station, so the station should empty the buffer of packets that the AP kept for it. The station starts sending null packets (or, more rarely, special control packets called PS-Poll packets) to retrieve these buffered packets from the AP. Usually, after the buffer has been emptied, the station goes back to sleep (however, this is not mandatory according to the spec).

### Handling the Multicast/Broadcast Buffer

The AP buffers multicast and broadcast packets whenever at least one station is in sleeping mode. The AID for multicast/broadcast stations is 0; so, in such a case, TIM[0] is set to true. The Delivery Traffic Indication Map (DTIM), which is a special type of TIM, is sent not in every beacon, but once in a predefined number of beacon intervals (the DTIM period). After a DTIM is sent, the AP sends its buffered broadcast and multicast packets. You retrieve packets from the multicast/broadcast buffer (bc_buf) by calling the ieee80211_get_buffered_bc() method. In Figure 12-3 you can see an AP that contains a linked list of stations (sta_info objects), each of them with a unicast buffer (ps_tx_buf) of its own, and a single bc_buf buffer, for storing multicast and broadcast packets.

Figure 12-3.

Buffering packets in an AP

The AP is implemented as an ieee80211_if_ap object in mac80211. Each such ieee80211_if_ap object has a member called ps (an instance of ps_data), where power save data is stored. One of the members of the ps_data structure is the broadcast/multicast buffer, bc_buf.

In Figure 12-4 you can see a flow of PS-Poll packets that a client sends in order to retrieve packets from the AP unicast buffer, ps_tx_buf. Note that the AP sends all the packets with the IEEE80211_FCTL_MOREDATA flag, except for the last one. Thus, the client knows that it should keep on sending PS-Poll packets until the buffer is emptied. For the sake of simplicity, the ACK traffic is not included in this diagram, but it should be mentioned here that the packets should be acknowledged.

Figure 12-4.

Sending PS-Poll packets from a client to retrieve packets from the ps_tx_buf buffer within an AP

Note

Power management and power save mode are two different topics. Power management deals with handling machines that perform suspend (whether it is suspend to RAM or suspend to disk, aka hibernate, or, in some cases, both suspend to RAM and suspend to disk, aka hybrid suspend), and it is handled in net/mac80211/pm.c. In the drivers, power management is handled by the resume/suspend methods. Power save mode, on the other hand, deals with handling stations that enter sleep mode and wake up; it has nothing to do with suspend and hibernation.

This section described power save mode and the buffering mechanism. The next section discusses the management layer and the different tasks it handles.

## The Management Layer (MLME)

There are three components in the 802.11 management architecture:

* The Physical Layer Management Entity (PLME).

* The Station Management Entity (SME).

* The MAC Layer Management Entity (MLME).

### Scanning

There are two types of scanning: passive scanning and active scanning. Passive scanning means listening passively for beacons, without transmitting any packets for scanning. When performing passive scanning (the flags of the scan channel contain IEEE80211_CHAN_PASSIVE_SCAN), the station moves from channel to channel, trying to receive beacons.
Passive scanning is needed in some higher 802.11a frequency bands, because you're not allowed to transmit anything at all until you've heard an AP beacon. With active scanning, each station sends a Probe Request packet; this is a management packet with the Probe Request sub-type (IEEE80211_STYPE_PROBE_REQ). With active scanning too, the station moves from channel to channel, sending a Probe Request management packet on each channel (by calling the ieee80211_send_probe_req() method). The scan itself is initiated by calling the ieee80211_request_scan() method. Changing channels is done via a call to the ieee80211_hw_config() method, passing IEEE80211_CONF_CHANGE_CHANNEL as a parameter. Note that there is a one-to-one correspondence between the channel in which a station operates and the frequency in which it operates; the ieee80211_channel_to_frequency() method (net/wireless/util.c) returns the frequency in which a station operates, given its channel.

### Authentication

Authentication is done by calling the ieee80211_send_auth() method (net/mac80211/util.c). It sends a management frame with the authentication sub-type (IEEE80211_STYPE_AUTH). There are many authentication types; the original IEEE 802.11 spec talked about only two forms: open-system authentication and shared key authentication. The only authentication method mandated by the IEEE 802.11 spec is open-system authentication (WLAN_AUTH_OPEN). This is a very simple authentication algorithm; in fact, it is a null authentication algorithm. Any client that requests authentication with this algorithm will become authenticated. An example of another authentication algorithm is shared key authentication (WLAN_AUTH_SHARED_KEY). In shared key authentication, the station should authenticate using a Wired Equivalent Privacy (WEP) key.

### Association

In order to associate, a station sends a management frame with the association sub-type (IEEE80211_STYPE_ASSOC_REQ). Association is done by calling the ieee80211_send_assoc() method (net/mac80211/mlme.c).

### Reassociation

When a station moves between APs within an ESS, it is said to be roaming. The roaming station sends a reassociation request to the new AP by sending a management frame with the reassociation sub-type (IEEE80211_STYPE_REASSOC_REQ). Reassociation is done by calling the ieee80211_send_assoc() method; there are many similarities between association and reassociation, so this method handles both. In addition, with reassociation, the AP returns an AID (association id) to the client in case of success.

This section talked about the management layer (MLME) and some of the operations it supports, like scanning, authentication, association, and more. In the next section I describe some mac80211 implementation details that are important for understanding the wireless stack.

## Mac80211 Implementation

Mac80211 has an API for interfacing with the low-level device drivers. The implementation of mac80211 is complex and full of many small details. I cannot give an exhaustive description of the mac80211 API and implementation; I do discuss some important points that can give a good starting point to those who want to delve into the code. A fundamental structure of the mac80211 API is the ieee80211_hw struct (include/net/mac80211.h); it represents hardware information. The priv member of ieee80211_hw (a pointer to a private area) is of an opaque type (void *).
Most wireless device drivers define a private structure for this private area, like lbtf_private (the Marvell wireless driver) or iwl_priv (iwlwifi from Intel). Memory allocation and initialization of the ieee80211_hw struct are done by the ieee80211_alloc_hw() method. Here are some methods related to the ieee80211_hw struct:

* int ieee80211_register_hw(struct ieee80211_hw *hw): Called by wireless drivers for registering the specified ieee80211_hw object.

* void ieee80211_unregister_hw(struct ieee80211_hw *hw): Unregisters the specified 802.11 hardware device.

* struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops): Allocates an ieee80211_hw object and initializes it.

* ieee80211_rx_irqsafe(): This method is for receiving a packet. It is implemented in net/mac80211/rx.c and is called from low-level wireless drivers.

The ieee80211_ops object, which is passed to the ieee80211_alloc_hw() method as you saw earlier, consists of pointers to callbacks to the driver. Not all of these callbacks must be implemented by the drivers. The following is a short description of these methods:

* tx(): The transmit handler, called for each transmitted packet. It usually returns NETDEV_TX_OK (except under certain limited conditions).

* start(): Activates the hardware device and is called before the first hardware device is enabled. It turns on frame reception.

* stop(): Turns off frame reception and usually turns off the hardware.

* add_interface(): Called when a network device attached to the hardware is enabled.

* remove_interface(): Informs the driver that the interface is going down.

* config(): Handles configuration requests, such as hardware channel configuration.

* configure_filter(): Configures the device's Rx filter.

Figure 12-5 shows a block diagram of the architecture of the Linux wireless subsystem. You can see that the interface between the wireless device driver layer and the mac80211 layer is the ieee80211_ops object and its callbacks.

Figure 12-5.

Linux wireless architecture

Another important structure is the sta_info struct (net/mac80211/sta_info.h), which represents a station. Among the members of this structure are various statistics counters, various flags, debugfs entries, the ps_tx_buf array for buffering unicast packets, and more. Stations are organized in a hash table (sta_hash) and a list (sta_list). The important methods related to sta_info are as follows:

* int sta_info_insert(struct sta_info *sta): Adds a station.

* int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr): Removes a station (by calling the __sta_info_destroy() method).

* struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr): Fetches a station; the address of the station (its bssid) is passed as a parameter.

### Rx Path

The ieee80211_rx() function (net/mac80211/rx.c) is the main receive handler. The status of the received packet (ieee80211_rx_status) is passed by the wireless driver to mac80211, embedded in the SKB control buffer (cb). The IEEE80211_SKB_RXCB() macro is used to fetch this status. The flag field of the Rx status specifies, for example, whether the FCS check failed on the packet (RX_FLAG_FAILED_FCS_CRC). The various values possible for the flag field are presented in Table 12-1 in the "Quick Reference" section of this chapter.
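To illustrate the driver side of this hand-off, here is a sketch of a hypothetical driver Rx routine (mydrv_handle_rx() and its parameters are made up for the example; IEEE80211_SKB_RXCB() and ieee80211_rx_irqsafe() are the real mac80211 interfaces described in this section):

```c
/* Illustrative sketch only; mydrv_handle_rx() is hypothetical. The driver
 * fills an ieee80211_rx_status object, which lives in the SKB control
 * buffer (cb), and then hands the frame over to mac80211. */
#include <net/mac80211.h>

static void mydrv_handle_rx(struct ieee80211_hw *hw, struct sk_buff *skb,
			    u16 freq_mhz, s8 signal_dbm)
{
	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);

	memset(status, 0, sizeof(*status));
	status->band = IEEE80211_BAND_2GHZ;	/* band the frame was received on */
	status->freq = freq_mhz;		/* center frequency, in MHz */
	status->signal = signal_dbm;		/* signal strength reported by the hardware */
	/* on a failed checksum, a driver would also set:
	 * status->flag |= RX_FLAG_FAILED_FCS_CRC; */

	/* safe to call from interrupt context, unlike ieee80211_rx() */
	ieee80211_rx_irqsafe(hw, skb);
}
```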
In the ieee80211_rx() method, the ieee80211_rx_monitor() method is invoked to remove the FCS (checksum) and to remove a radiotap header (struct ieee80211_radiotap_header) that might have been added if the wireless interface is in monitor mode. (You use a network interface in monitor mode for sniffing, for example. Not all wireless network interfaces support monitor mode; see the section "Wireless Modes" later in this chapter.)

If you work with HT (802.11n), AMPDU reordering is performed if needed, by invoking the ieee80211_rx_reorder_ampdu() method. Then the __ieee80211_rx_handle_packet() method is called, which eventually calls the ieee80211_invoke_rx_handlers() method. Then various receive handlers are called, one by one (using a macro named CALL_RXH). The order of calling these handlers is important. Each handler checks whether it should handle the packet. If it decides it should not handle the packet, it returns RX_CONTINUE, and the next handler is tried. If it decides it should handle the packet, it returns RX_QUEUED.

There are certain cases when a handler decides to drop a packet; in these cases, it returns RX_DROP_MONITOR or RX_DROP_UNUSABLE. For example, if a PS-Poll packet arrives and the type of the receiver shows that it is not an AP, RX_DROP_UNUSABLE is returned. Another example: for a management frame, if the length of the SKB is less than the minimum (24 bytes), the packet is discarded and RX_DROP_MONITOR is returned; the same happens if the packet is not a management packet. Here is the code snippet from the ieee80211_rx_h_mgmt_check() method that implements this:

```c
ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
{
	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
	. . .
	if (rx->skb->len < 24)
		return RX_DROP_MONITOR;

	if (!ieee80211_is_mgmt(mgmt->frame_control))
		return RX_DROP_MONITOR;
	. . .
}
```
(net/mac80211/rx.c)

### Tx Path

The ieee80211_tx() method is the main handler for transmission (net/mac80211/tx.c). First it invokes the ieee80211_tx_prepare() method, which performs some checks and sets certain flags. Then it calls the invoke_tx_handlers() method, which calls, one by one, various transmit handlers (using a macro named CALL_TXH). If a transmit handler finds that it should do nothing with the packet, it returns TX_CONTINUE, and the next handler is tried. If it decides it should handle a certain packet, it returns TX_QUEUED, and if it decides it should drop the packet, it returns TX_DROP. The invoke_tx_handlers() method returns 0 upon success.
Let's take a short look at the implementation of the ieee80211_tx() method:

```c
static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
			 struct sk_buff *skb, bool txpending,
			 enum ieee80211_band band)
{
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_tx_data tx;
	ieee80211_tx_result res_prepare;
	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
	bool result = true;
	int led_len;
```

Perform a sanity check, and drop the SKB if its length is less than 10:

```c
	if (unlikely(skb->len < 10)) {
		dev_kfree_skb(skb);
		return true;
	}

	/* initialises tx */
	led_len = skb->len;
	res_prepare = ieee80211_tx_prepare(sdata, &tx, skb);

	if (unlikely(res_prepare == TX_DROP)) {
		ieee80211_free_txskb(&local->hw, skb);
		return true;
	} else if (unlikely(res_prepare == TX_QUEUED)) {
		return true;
	}
```

Invoke the Tx handlers; if everything is fine, continue by invoking the __ieee80211_tx() method:

```c
	. . .
	if (!invoke_tx_handlers(&tx))
		result = __ieee80211_tx(local, &tx.skbs, led_len,
					tx.sta, txpending);

	return result;
}
```
(net/mac80211/tx.c)

### Fragmentation

Fragmentation in 802.11 is done only for unicast packets. Each station is assigned a fragmentation threshold size (in bytes); packets bigger than this threshold are fragmented. You can lower the number of collisions by reducing the fragmentation threshold size, thereby making the packets smaller. You can inspect the fragmentation threshold of a station by running iwconfig or by inspecting the corresponding debugfs entry (see the "Mac80211 debugfs" section later in this chapter). You can set the fragmentation threshold with the iwconfig command; for example, you can set the fragmentation threshold to 512 bytes with:

```
iwconfig wlan0 frag 512
```

Each fragment is acknowledged. The More Frag field in the header of a fragment is set to 1 if there are more fragments to come. Each fragment has a fragment number (a subfield of the sequence control field in the 802.11 MAC header). Reassembly of the fragments on the receiver side is done according to the fragment numbers. Fragmentation on the transmitter side is done by the ieee80211_tx_h_fragment() method (net/mac80211/tx.c). Reassembly on the receiver side is done by the ieee80211_rx_h_defragment() method (net/mac80211/rx.c). Fragmentation is incompatible with aggregation (used for higher throughput), and given the high rates and thus short (in time) packets, it is very rarely used nowadays.

### Mac80211 debugfs

debugfs is a virtual filesystem devoted to debugging information; it enables exporting debugging information to userspace, and its entries are usually mounted under /sys/kernel/debug. For mac80211, debugfs handling is mostly in net/mac80211/debugfs.c. After mounting debugfs, various mac80211 statistics and information entries can be inspected. Mounting debugfs is performed like this:

```
mount -t debugfs none_debugs /sys/kernel/debug
```

Note

CONFIG_DEBUG_FS must be set when building the kernel to be able to mount and work with debugfs.

For example, let's say your phy is phy0; the following discusses some of the entries under /sys/kernel/debug/ieee80211/phy0:

* total_ps_buffered: This is the total number of packets (unicast and multicast/broadcast) that the AP buffered for the station. The total_ps_buffered counter is incremented by ieee80211_tx_h_unicast_ps_buf() for unicasts, and by ieee80211_tx_h_multicast_ps_buf() for multicasts or broadcasts.
* Under /sys/kernel/debug/ieee80211/phy0/statistics, you have various statistical information; for example:

  * frame_duplicate_count denotes the number of duplicate frames. This debugfs entry represents the duplicate frames counter, dot11FrameDuplicateCount, which is incremented by the ieee80211_rx_h_check() method.

  * transmitted_frame_count denotes the number of transmitted packets. This debugfs entry represents dot11TransmittedFrameCount; it is incremented by the ieee80211_tx_status() method.

  * retry_count denotes the number of retransmissions. This debugfs entry represents dot11RetryCount; it is also incremented by the ieee80211_tx_status() method.

  * fragmentation_threshold: The size of the fragmentation threshold, in bytes. See the "Fragmentation" section earlier.

* Under /sys/kernel/debug/ieee80211/phy0/netdev:wlan0, you have some entries that give information about the interface; for example, if the interface is in station mode, you will have aid for the association id of the station, assoc_tries for the number of times the station tried to perform association, bssid for the bssid of the station, and so on.

* Every station uses a rate control algorithm. Its name is exported by the following debugfs entry: /sys/kernel/debug/ieee80211/phy0/rc/name.

### Wireless Modes

You can set a wireless network interface to operate in one of several modes, depending on its intended use and the topology of the network in which it is deployed. In some cases, you can set the mode with the iwconfig command, and in some cases you must use a tool like hostapd for this. Note that not all devices support all modes. See www.linuxwireless.org/en/users/Drivers for a list of Linux drivers that support different modes. Alternatively, you can check which values the interface_modes field of the wiphy member (in the ieee80211_hw object) is initialized to in the driver code. The interface_modes are initialized to one or more modes of the nl80211_iftype enum, like NL80211_IFTYPE_STATION or NL80211_IFTYPE_ADHOC (see include/uapi/linux/nl80211.h). The following is a detailed description of these wireless modes:

* AP mode: In this mode, the device acts as an AP (NL80211_IFTYPE_AP). The AP maintains and manages a list of associated stations. The network (BSS) name is the MAC address of the AP (bssid). There is also a human-readable name for the BSS, called the SSID.

* Station infrastructure mode: A managed station in an infrastructure mode (NL80211_IFTYPE_STATION).

* Monitor mode: All incoming packets are handed over unfiltered in monitor mode (NL80211_IFTYPE_MONITOR). This is useful for sniffing. It is usually possible to transmit packets in monitor mode. This is termed packet injection; these packets are marked with a special flag (IEEE80211_TX_CTL_INJECTED).

* Ad Hoc (IBSS) mode: A station in an ad hoc (IBSS) network (NL80211_IFTYPE_ADHOC). In Ad Hoc mode, there is no AP device in the network.

* Wireless Distribution System (WDS) mode: A station in a WDS network (NL80211_IFTYPE_WDS).

* Mesh mode: A station in a Mesh network (NL80211_IFTYPE_MESH_POINT), discussed in the "Mesh Networking (802.11s)" section later in this chapter.

The next section discusses the ieee802.11n technology, which provides higher performance, and how it is implemented in the Linux wireless stack. You will also learn about block acknowledgment and packet aggregation in 802.11n, and how these techniques are used to improve performance.
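Before moving on, here is a short example of switching between these modes from userspace. Assuming the device supports monitor mode, something along these lines puts an interface into it:

```
ip link set wlan0 down
iw dev wlan0 set type monitor
ip link set wlan0 up
```

The same iw subcommand can set other interface types from the list above (for example, ibss), subject to what the driver advertises in interface_modes.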
## High Throughput (ieee802.11n)

A little after 802.11g was approved, a new task group was created in IEEE, called the High Throughput Task Group (TGn). IEEE 802.11n became a final spec at the end of 2009. The IEEE 802.11n protocol allows coexistence with legacy devices. Some vendors were already selling 802.11n pre-standard devices based on the 802.11n draft before the official approval. Broadcom set a precedent of releasing wireless interfaces based on a draft: in 2003, it released a chipset of a wireless device based on a draft of 802.11g. Following this precedent, as early as 2005 some vendors released products based on the 802.11n draft. For example, the Intel Santa Rosa platform came with Intel Next-Gen Wireless-N (Intel WiFi Link 5000 series) interfaces, which support 802.11n. Other Intel wireless network interfaces, like the 4965AGN, also supported 802.11n. Other vendors, including Atheros and Ralink, also released 802.11n draft-based wireless devices. The WiFi Alliance started certification of 802.11n draft devices in June 2007. A long list of vendors released products that comply with Wi-Fi CERTIFIED 802.11n draft 2.0.

802.11n can operate on the 2.4 GHz and/or 5 GHz bands, whereas 802.11g and 802.11b operate only in the 2.4 GHz radio frequency band, and 802.11a operates only in the 5 GHz radio frequency band. The 802.11n MIMO (Multiple Input, Multiple Output) technology increases the range and reliability of traffic over the wireless coverage area. MIMO technology uses multiple transmitter and receiver antennas on both APs and clients, to allow for simultaneous data streams. The result is increased range and increased throughput. With 802.11n you can achieve a theoretical PHY rate of up to 600 Mbps (actual throughput will be much lower due to medium access rules, and so on).

802.11n added many improvements to the 802.11 MAC layer. The best known is packet aggregation, which concatenates multiple packets of application data into a single transmission frame. A block acknowledgment (BA) mechanism was added (discussed in the next section). BA permits multiple packets to be acknowledged by a single packet instead of sending an ACK for each received packet, and the wait time between two consecutive packets is cut. This enables sending multiple data packets with the fixed overhead cost of a single packet. The BA protocol was introduced in the 802.11e amendment of 2005.

### Packet Aggregation

There are two types of packet aggregation:

* AMSDU: Aggregated MAC Service Data Unit

* AMPDU: Aggregated MAC Protocol Data Unit

Note that AMSDU is supported only on Rx, not on Tx, and is wholly independent of the Block Ack mechanism described in this section; so the discussion in this section pertains only to AMPDU.

There are two sides to a Block Ack session: the originator and the recipient. Each block session has a different Traffic Identifier (TID). The originator starts the block acknowledgment session by calling the ieee80211_start_tx_ba_session() method. This is typically done from a rate control algorithm method in the driver. For example, with the ath9k wireless driver, the ath_tx_status() function (drivers/net/wireless/ath/ath9k/rc.c), which is a rate control callback, invokes the ieee80211_start_tx_ba_session() method. The ieee80211_start_tx_ba_session() method sets the state to HT_ADDBA_REQUESTED_MSK and sends an ADDBA request packet by invoking the ieee80211_send_addba_request() method.
The call to ieee80211_send_addba_request() passes parameters for the session, such as the wanted reorder buffer size and the TID of the session. + +The reorder buffer size is limited to 64K (see the definition of ieee80211_max_ampdu_length_exp in include/linux/ieee80211.h). These parameters are part of the capability member (capab) in the struct addba_req. The response to the ADDBA request should be received within 1 Hz, which is one second in x86_64 machines (ADDBA_RESP_INTERVAL). If you do not get a response in time, the sta_addba_resp_timer_expired() method will stop the BA session by calling the ___ieee80211_stop_tx_ba_session() method. When the other side (the recipient) receives the ADDBA request, it first sends an ACK (every packet in ieee802.11 should be acknowledged, as mentioned before). Then it processes the ADDBA request by calling the ieee80211_process_addba_request() method; if everything is okay, it sets the aggregation state of this machine to operational (HT_AGG_STATE_OPERATIONAL) and sends an ADDBA response by calling the ieee80211_send_addba_resp() method. It also stops the response timer (the timer which has as its callback the sta_addba_resp_timer_expired() method) by calling del_timer_sync()on this timer. After a session is started, a data block containing multiple MPDU packets is sent. Consequently, the originator sends a Block Ack Request (BAR) packet by calling the ieee80211_send_bar() method. + +#### Block Ack Request (BAR) + +The BAR is a control packet with Block Ack Request sub-type (IEEE80211_STYPE_BACK_REQ). The BAR packet includes the SSN (start sequence number), which is the sequence number of the oldest MSDU in the block that should be acknowledged. The recipient receives the BAR and reorders the ampdu buffer accordingly, if needed. Figure 12-6 shows a BAR request. + +Figure 12-6. + +BAR request + +When sending a BAR, the type subfield in the frame control is control (IEEE80211_FTYPE_CTL), and the subtype subfield is Block Ack request (IEEE80211_STYPE_BACK_REQ). The BAR is represented by the ieee80211_bar struct: + +struct ieee80211_bar { + +__le16 frame_control; + +__le16 duration; + +__u8 ra[6]; + +__u8 ta[6]; + +__le16 control; + +__le16 start_seq_num; + +} __packed; + +(include/linux/ieee80211.h) + +The RA is the recipient address, and the TA is the transmitter (originator) address. The control field of the BAR request includes the TID. + +#### Block Ack + +There are two types of Block Ack: Immediate Block Ack and Delayed Block Ack. Figure 12-7 shows Immediate Block Ack. + +Figure 12-7. + +Immediate Block Ack + +The difference between Immediate Block Ack and Delayed Block Ack is that with Delayed Block Ack, the BAR request itself is answered first with an ACK, and then after some delay, with a BA (Block Ack). When using Delayed Block Ack, there is more time to process the BAR, and this is sometime needed when working with software based processing. Using Immediate Block Ack is better in terms of performance. The BA itself is also acknowledged. When the originator has no more data to send, it can terminate the Block Ack session by calling the ieee80211_send_delba() method; this function sends a DELBA request packet to the other side. The DELBA request is handled by the ieee80211_process_delba() method. The DELBA message, which causes a Block Ack session tear down, can be sent either from the originator or recipient of the Block Ack session. The AMPDU maximum length is 65535 octets. 
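+
+As a quick illustration of the originator side described above, here is a hedged sketch of a driver asking mac80211 to start a TX Block Ack session on TID 0 (my_try_start_agg() is a made-up name; a timeout argument of 0 requests a session with no inactivity timeout):
+
+#include <net/mac80211.h>
+
+/* Sketch: start TX aggregation for TID 0 on a station. On success,
+ * mac80211 sends the ADDBA request and later invokes the driver's
+ * ampdu_action() callback with IEEE80211_AMPDU_TX_START. */
+static void my_try_start_agg(struct ieee80211_sta *sta)
+{
+        int ret = ieee80211_start_tx_ba_session(sta, 0, 0);
+
+        if (ret)
+                pr_debug("BA session not started (err %d)\n", ret);
+}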
Note that packet aggregation is implemented only for APs and managed stations; packet aggregation for IBSS is not supported by the spec.
+
+## Mesh Networking (802.11s)
+
+The IEEE 802.11s protocol started as a Study Group of IEEE in September 2003, and became a Task Group named TGs in 2004. In 2006, two proposals, out of 15 (the "SEEMesh" and "Wi-Mesh" proposals), were merged into one, which resulted in draft D0.01. 802.11s was ratified in July 2011 and is now part of IEEE 802.11-2012. Mesh networks allow the creation of an 802.11 Basic Service Set over fully and partially connected Mesh topologies. This can be seen as an improvement over the 802.11 ad hoc network, which requires a fully connected Mesh topology. Figures 12-8 and 12-9 illustrate the difference between the two types of Mesh topologies. In a full Mesh, every node is connected directly to every other node.
+
+Figure 12-8. Full Mesh
+
+In a partially connected Mesh, nodes are connected to only some of the other nodes, but not to all of them. This topology is much more common in wireless Mesh networks. Figure 12-9 shows an example of a partial Mesh.
+
+Figure 12-9. Partial Mesh
+
+Wireless Mesh networks forward data packets over multiple wireless hops. Each Mesh node acts as a relay point/router for the other Mesh nodes. In kernel 2.6.26 (2008), support for the draft of wireless Mesh networking (802.11s) was added to the wireless network stack, thanks to the open80211s project. The open80211s project's goal was to create the first open implementation of 802.11s. The project got some sponsorship from the OLPC project and from some commercial companies. Luis Carlos Cobo, Javier Cardona, and other developers from Cozybit developed the Linux mac80211 Mesh code.
+
+Now that you have learned a bit about Mesh networking and Mesh network topologies, you are ready for the next section, which covers the HWMP routing protocol for Mesh networks.
+
+### HWMP Protocol
+
+The 802.11s protocol defines a default routing protocol called HWMP (Hybrid Wireless Mesh Protocol). The HWMP protocol works at Layer 2 and deals with MAC addresses, as opposed to, for example, the IPv4 routing protocol, which works at Layer 3 and deals with IP addresses. HWMP routing is based on two types of routing (hence it is called hybrid). The first is on-demand routing, and the second is proactive routing. The main difference between the two mechanisms has to do with the time at which path establishment is initiated (path is the name used for a route in Layer 2). In on-demand routing, a path to a destination is established by the protocol only after the protocol stack has received frames for such a destination. This minimizes the amount of management traffic required to maintain the Mesh network, at the expense of introducing additional latency in data traffic. Proactive routing can be used if a Mesh node is known to be the recipient of a lot of Mesh traffic. In that case, the node will periodically announce itself over the Mesh network and trigger path establishments to itself from all the Mesh nodes in the network. Both on-demand and proactive routing are implemented in the Linux kernel. There are four types of routing messages:
+
+ * PREQ (Path Request): This type of message is sent as a broadcast when you look for some destination that you do not yet have a path to. This PREQ message is propagated in the Mesh network until it gets to its destination. A lookup is performed on each station until the final destination is reached (by calling the mesh_path_lookup() method). If the lookup fails, the PREQ is forwarded (as a broadcast) to the other stations. The PREQ message is sent in a management packet; its sub-type is action (IEEE80211_STYPE_ACTION). It is handled by the hwmp_preq_frame_process() method.
+
+ * PREP (Path Reply): This type is a unicast packet that is sent as a reply to a PREQ message. This packet is sent along the reverse path. The PREP message is also sent in a management packet, and its sub-type is also the action sub-type (IEEE80211_STYPE_ACTION). It is handled by the hwmp_prep_frame_process() method. Both the PREQ and the PREP messages are sent by the mesh_path_sel_frame_tx() method.
+
+ * PERR (Path Error): If there is some failure on the way, a PERR is sent. A PERR message is transmitted by the mesh_path_error_tx() method.
+
+ * RANN (Root Announcement): The root Mesh point periodically broadcasts this frame. Mesh points that receive it send a unicast PREQ to the root via the Mesh point from which they received the RANN. In response, the root Mesh point sends a PREP response to each PREQ.
+
+Note
+
+The route takes into consideration a radio-aware metric (airtime metric). The airtime metric is calculated by the airtime_link_metric_get() method (based on the rate and other hardware parameters). Mesh points continuously monitor their links and update metric values with their neighbors.
+
+The station that sent the PREQ may try to send packets to the final destination while still not knowing the route to that destination; these packets are kept in a buffer of SKBs named frame_queue, which is a member of the mesh_path object (net/mac80211/mesh.h). In such a case, when a PREP finally arrives, the pending packets of this buffer are sent to the final destination (by calling the mesh_path_tx_pending() method). The maximum number of frames buffered per unresolved destination is 10 (MESH_FRAME_QUEUE_LEN). A short code sketch of this on-demand flow appears at the end of this section. The advantages of Mesh networking are as follows:
+
+ * Rapid deployment
+
+ * Minimal configuration, inexpensive
+
+ * Easy to deploy in hard-to-wire environments
+
+ * Connectivity while nodes are in motion
+
+ * Higher reliability: no single point of failure and the ability to heal itself
+
+The disadvantages are as follows:
+
+ * Many broadcasts limit network performance.
+
+ * Not all wireless drivers support Mesh mode at the moment.
+
+### Setting Up a Mesh Network
+
+There are two sets of userspace tools for managing wireless devices and networks in Linux. One is the older Wireless Tools for Linux, an open source project based on IOCTLs; examples of its command line utilities are iwconfig, iwlist, ifrename, and more. The newer tool is iw, based on generic netlink sockets (described in Chapter 2). There are some tasks that only iw can perform; for example, you can set a wireless device to work in Mesh mode only with the iw command.
+
+Example: setting a wireless network interface (wlan0) to work in Mesh mode can be done like this:
+
+iw wlan0 set type mesh
+
+Note
+
+Setting a wireless network interface (wlan0) to work in Mesh mode can also be done like this: iw wlan0 set type mp
+
+mp stands for Mesh Point. See "Adding interfaces with iw" in http://wireless.kernel.org/en/users/Documentation/iw
+
+Joining the Mesh is done by: iw wlan0 mesh join "my-mesh-ID"
+
+You can display statistics with the following commands:
+
+ * iw wlan0 station dump (station statistics)
+
+ * iw wlan0 mpath dump (the Mesh path table)
+
+I should also mention here the authsae and wpa_supplicant tools, which can be used to create secure Mesh networks and do not depend upon iw.
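+
+Before leaving the Mesh discussion, here is the promised sketch of the on-demand HWMP flow, loosely modeled on net/mac80211/mesh_hwmp.c; it is a simplified illustration (locking and PREQ generation are omitted, and my_mesh_tx() is a made-up name), not the actual kernel implementation:
+
+/* Sketch: resolve a next hop for an outgoing mesh frame. If no active
+ * path exists yet, park the frame on the path's frame_queue; it will be
+ * sent by mesh_path_tx_pending() once a PREP resolves the path. */
+static int my_mesh_tx(struct ieee80211_sub_if_data *sdata,
+                      struct sk_buff *skb, const u8 *dst)
+{
+        struct mesh_path *mpath;
+
+        mpath = mesh_path_lookup(sdata, dst);
+        if (mpath && (mpath->flags & MESH_PATH_ACTIVE))
+                return 0;       /* path known; the frame can be sent */
+
+        /* No resolved path yet: queue the frame (the queue is bounded
+         * by MESH_FRAME_QUEUE_LEN) and let HWMP issue a PREQ. */
+        if (mpath && skb_queue_len(&mpath->frame_queue) < MESH_FRAME_QUEUE_LEN)
+                skb_queue_tail(&mpath->frame_queue, skb);
+
+        return -ENOENT;
+}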
+
+## Linux Wireless Development Process
+
+Most development is done using the git distributed version control system, as with many other Linux subsystems. There are three main git trees; the bleeding edge is the wireless-testing tree. There are also the regular wireless tree and the wireless-next tree. The following are the links to the git repositories of the development trees:
+
+ * wireless-testing development tree:
+
+git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.git
+
+ * wireless development tree:
+
+git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git
+
+ * wireless-next development tree:
+
+git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next-2.6.git
+
+Patches are sent and discussed on the wireless mailing list: linux-wireless@vger.kernel.org. From time to time a pull request is sent to the kernel networking mailing list, netdev, mentioned in Chapter 1.
+
+As mentioned in the "Mac80211 subsystem" section earlier in this chapter, some wireless network interface vendors maintain their own development trees for their Linux drivers on their own sites. In some cases, the code they use does not use the mac80211 API; this is true, for example, of some Ralink and Realtek wireless device drivers. Since January 2006, the maintainer of the Linux wireless subsystem has been John W. Linville, who replaced Jeff Garzik. Johannes Berg has been the maintainer of mac80211 since October 2007. There have been some annual Linux wireless summits; the first took place in 2006 in Beaverton (OR). A very detailed wiki page is available at http://wireless.kernel.org/ . This web site includes a lot of important documentation; for example, a table specifies the modes each wireless network interface supports. There is also a lot of information in this wiki regarding many wireless device drivers, hardware, and various tools (such as CRDA, the central regulatory domain agent, hostapd, iw, and more).
+
+## Summary
+
+A lot of development has been done in the Linux wireless stack in recent years. The most significant change is the integration of the mac80211 stack and the porting of wireless drivers to use the mac80211 API, making the code much better organized. The situation is much better than before; many more wireless devices are supported in Linux. Mesh networking got a boost recently thanks to the open80211s project; it was integrated in the Linux 2.6.26 kernel. The future will probably see more drivers that support the new standard, IEEE802.11ac, a 5 GHz-only technology that can reach maximum throughputs well above a gigabit per second, and more drivers that support P2P.
+
+Chapter 13 discusses InfiniBand and RDMA in the Linux kernel. The "Quick Reference" section covers the top methods that are related to the topics discussed in this chapter, ordered by their context.
+
+## Quick Reference
+
+I conclude this chapter with a short list of important methods of the Linux wireless subsystem, some of which were mentioned in this chapter. Table 12-1 shows the various possible values for the flag member of the ieee80211_rx_status object.
+
+### Methods
+
+This section discusses the methods.
+
+#### void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn);
+
+This method sends a Block Ack Request.
+
+#### int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, u16 timeout);
+
+This method starts a Block Ack session by calling the wireless driver's ampdu_action() callback, passing IEEE80211_AMPDU_TX_START. As a result, the driver will later call the ieee80211_start_tx_ba_cb() callback or the ieee80211_start_tx_ba_cb_irqsafe() callback, which will start the aggregation session.
+
+#### int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid);
+
+This method stops a Block Ack session by calling the wireless driver's ampdu_action() callback, passing IEEE80211_AMPDU_TX_STOP. The driver must later call the ieee80211_stop_tx_ba_cb() callback or the ieee80211_stop_tx_ba_cb_irqsafe() callback.
+
+#### static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, u8 dialog_token, u16 start_seq_num, u16 agg_size, u16 timeout);
+
+This method sends an ADDBA message. An ADDBA message is a management action message.
+
+#### void ieee80211_process_addba_request(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len);
+
+This method handles an ADDBA message.
+
+#### static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid, u8 dialog_token, u16 status, u16 policy, u16 buf_size, u16 timeout);
+
+This method sends an ADDBA response. An ADDBA response is a management packet, with a sub-type of action (IEEE80211_STYPE_ACTION).
+
+#### static ieee80211_rx_result debug_noinline ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx);
+
+This method handles AMSDU aggregation (Rx path).
+
+#### void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len);
+
+This method handles a DELBA message.
+
+#### void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, u16 initiator, u16 reason_code);
+
+This method sends a DELBA message.
+
+#### void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb);
+
+This method receives a packet. The ieee80211_rx_irqsafe() method can be called in hardware interrupt context.
+
+#### static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, struct sk_buff_head *frames);
+
+This method handles the A-MPDU reorder buffer.
+
+#### static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_rx *tid_agg_rx, struct sk_buff_head *frames);
+
+This method also handles the A-MPDU reorder buffer.
+
+#### static ieee80211_rx_result debug_noinline ieee80211_rx_h_check(struct ieee80211_rx_data *rx);
+
+This method drops duplicate frames of a retransmission and increments dot11FrameDuplicateCount and the station's num_duplicates counter.
+
+#### void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int powersave);
+
+This method sends a special NULL data frame.
+
+#### void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata);
+
+This method sends a PS-Poll control packet to an AP.
+
+#### static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata);
+
+This method performs association or reassociation by sending a management packet with an association sub-type of IEEE80211_STYPE_ASSOC_REQ or IEEE80211_STYPE_REASSOC_REQ, respectively. The ieee80211_send_assoc() method is invoked from the ieee80211_do_assoc() method.
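+
+Since the ieee80211_rx_irqsafe() entry above is a central driver-facing API, here is a hedged sketch of how a driver's interrupt path might hand a received frame to mac80211 (the band and frequency values are placeholders, and my_handle_rx() is a made-up name):
+
+/* Sketch: attach minimal RX status through the skb control buffer,
+ * then pass the frame to mac80211 from hard-IRQ context. */
+static void my_handle_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+
+        memset(status, 0, sizeof(*status));
+        status->band = IEEE80211_BAND_2GHZ;
+        status->freq = 2412;            /* channel 1 */
+
+        ieee80211_rx_irqsafe(hw, skb);
+}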
+
+#### void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *da, const u8 *bssid, const u8 *key, u8 key_len, u8 key_idx, u32 tx_flags);
+
+This method performs authentication by sending a management packet with the authentication sub-type (IEEE80211_STYPE_AUTH).
+
+#### static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, u8 tim_len, u16 aid);
+
+This method checks whether tim[aid] is set; the aid is passed as a parameter, and it represents the association id of the station.
+
+#### int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req);
+
+This method starts active scanning.
+
+#### void mesh_path_tx_pending(struct mesh_path *mpath);
+
+This method sends pending packets from the frame_queue.
+
+#### struct mesh_path *mesh_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst);
+
+This method performs a lookup in the Mesh path table (routing table) of a Mesh point. The second parameter to the mesh_path_lookup() method is the hardware address of the destination. It returns NULL if there is no entry in the table; otherwise, it returns a pointer to the mesh path structure that was found.
+
+#### static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata);
+
+This method creates an IBSS.
+
+#### int ieee80211_hw_config(struct ieee80211_local *local, u32 changed);
+
+This method is called to apply various configuration changes; in most cases, it delegates the call to the driver's config() callback, if implemented. The second parameter specifies which action to take (for instance, IEEE80211_CONF_CHANGE_CHANNEL to change the channel, or IEEE80211_CONF_CHANGE_PS to change the power save mode of the driver).
+
+#### struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops);
+
+This method allocates a new 802.11 hardware device.
+
+#### int ieee80211_register_hw(struct ieee80211_hw *hw);
+
+This method registers an 802.11 hardware device.
+
+#### void ieee80211_unregister_hw(struct ieee80211_hw *hw);
+
+This method unregisters an 802.11 hardware device and frees its allocated resources.
+
+#### int sta_info_insert(struct sta_info *sta);
+
+This method adds a station to the hash table of stations and to the list of stations.
+
+#### int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr);
+
+This method removes a station and frees its resources.
+
+#### struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr);
+
+This method returns a pointer to a station by performing a lookup in the hash table of stations.
+
+#### void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 ratemask, bool directed, u32 tx_flags, struct ieee80211_channel *channel, bool scan);
+
+This method sends a probe request management packet.
+
+#### static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb);
+
+This method transmits an SKB.
+
+#### int ieee80211_channel_to_frequency(int chan, enum ieee80211_band band);
+
+This method returns the frequency on which a station operates, given its channel. There is a one-to-one correspondence between a channel and a frequency.
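+
+As a usage note for the station-table methods above, here is a hedged sketch of a lookup with sta_info_get(); mac80211 expects such lookups to be done under RCU protection (my_sta_exists() is a made-up name):
+
+/* Sketch: look up a station entry by its MAC address under RCU. */
+static bool my_sta_exists(struct ieee80211_sub_if_data *sdata, const u8 *addr)
+{
+        struct sta_info *sta;
+        bool found;
+
+        rcu_read_lock();
+        sta = sta_info_get(sdata, addr);
+        found = (sta != NULL);
+        rcu_read_unlock();
+
+        return found;
+}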
+
+#### static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, const u8 *orig_addr, __le32 orig_sn, u8 target_flags, const u8 *target, __le32 target_sn, const u8 *da, u8 hop_count, u8 ttl, __le32 lifetime, __le32 metric, __le32 preq_id, struct ieee80211_sub_if_data *sdata);
+
+This method sends a PREQ or PREP management packet.
+
+#### static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, const u8 *preq_elem, u32 metric);
+
+This method handles a PREQ message.
+
+#### struct ieee80211_rx_status *IEEE80211_SKB_RXCB(struct sk_buff *skb);
+
+This method returns the ieee80211_rx_status object associated with the control buffer (cb) of the specified SKB.
+
+#### static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool txpending, enum ieee80211_band band);
+
+This method is the main handler for transmission.
+
+### Table
+
+Table 12-1 shows the bits of the flag member (a 32-bit field) of the ieee80211_rx_status structure and the corresponding Linux symbols.
+
+Table 12-1. Rx Flags: Various Possible Values for the Flag Field of the ieee80211_rx_status Object
+
+Linux Symbol | Bit | Description
+---|---|---
+RX_FLAG_MMIC_ERROR | 0 | Michael MIC error was reported on this frame.
+RX_FLAG_DECRYPTED | 1 | This frame was decrypted in hardware.
+RX_FLAG_MMIC_STRIPPED | 3 | The Michael MIC is stripped off this frame; verification has been done by the hardware.
+RX_FLAG_IV_STRIPPED | 4 | The IV/ICV are stripped from this frame.
+RX_FLAG_FAILED_FCS_CRC | 5 | The FCS check failed on the frame.
+RX_FLAG_FAILED_PLCP_CRC | 6 | The PLCP check failed on the frame.
+RX_FLAG_MACTIME_START | 7 | The timestamp passed in the RX status is valid and contains the time the first symbol of the MPDU was received.
+RX_FLAG_SHORTPRE | 8 | Short preamble was used for this frame.
+RX_FLAG_HT | 9 | HT MCS was used, and rate_idx is the MCS index.
+RX_FLAG_40MHZ | 10 | HT40 (40 MHz) was used.
+RX_FLAG_SHORT_GI | 11 | Short guard interval was used.
+RX_FLAG_NO_SIGNAL_VAL | 12 | The signal strength value is not present.
+RX_FLAG_HT_GF | 13 | This frame was received in an HT-greenfield transmission.
+RX_FLAG_AMPDU_DETAILS | 14 | A-MPDU details are known; in particular, the reference number must be populated and be a distinct number for each A-MPDU.
+RX_FLAG_AMPDU_REPORT_ZEROLEN | 15 | Driver reports 0-length subframes.
+RX_FLAG_AMPDU_IS_ZEROLEN | 16 | This is a zero-length subframe, for monitoring purposes only.
+RX_FLAG_AMPDU_LAST_KNOWN | 17 | Last subframe is known; should be set on all subframes of a single A-MPDU.
+RX_FLAG_AMPDU_IS_LAST | 18 | This subframe is the last subframe of the A-MPDU.
+RX_FLAG_AMPDU_DELIM_CRC_ERROR | 19 | A delimiter CRC error has been detected on this subframe.
+RX_FLAG_AMPDU_DELIM_CRC_KNOWN | 20 | The delimiter CRC field is known (the CRC is stored in the ampdu_delimiter_crc field of the ieee80211_rx_status).
+RX_FLAG_MACTIME_END | 21 | The timestamp passed in the RX status is valid and contains the time the last symbol of the MPDU (including FCS) was received.
+RX_FLAG_VHT | 22 | VHT MCS was used, and rate_idx is the MCS index.
+RX_FLAG_80MHZ | 23 | 80 MHz was used.
+RX_FLAG_80P80MHZ | 24 | 80+80 MHz was used.
+RX_FLAG_160MHZ | 25 | 160 MHz was used.
+
+# 13. InfiniBand
+
+Abstract
+
+Chapter 12 dealt with the wireless subsystem and its implementation in Linux. In this chapter, I will discuss the InfiniBand subsystem and its implementation in Linux. Though the InfiniBand technology might be perceived as a very complex technology for those who are unfamiliar with it, the concepts behind it are surprisingly straightforward, as you will see in this chapter. I will start our discussion with Remote Direct Memory Access (RDMA), and discuss its main data structures and its API. I will give some examples illustrating how to work with RDMA, and conclude this chapter with a short discussion about using the RDMA API from the kernel level and userspace.
+
+> This chapter was written by Dotan Barak, an InfiniBand Expert. Dotan is a Senior Software Manager at Mellanox Technologies working on RDMA Technologies. Dotan has been working at Mellanox for more than 10 years in various roles, both as a developer and a manager. Additionally, Dotan maintains a blog about the RDMA technology: http://www.rdmamojo.com .
+
+## RDMA and InfiniBand—General
+
+Remote Direct Memory Access (RDMA) is the ability of one machine to access—that is, to read or write to—memory on a remote machine. There are three main network protocols that support RDMA: InfiniBand, RDMA over Converged Ethernet (RoCE), and the internet Wide Area RDMA Protocol (iWARP); all of them share the same API. InfiniBand is a completely new networking protocol, and its specifications can be found in the document "InfiniBand Architecture Specifications," which is maintained by the InfiniBand Trade Association (IBTA). RoCE allows you to have RDMA over an Ethernet network, and its specification can be found as an annex to the InfiniBand specifications. iWARP is a protocol that allows using RDMA over TCP/IP, and its specifications can be found in the document "An RDMA Protocol Specification," which is maintained by the RDMA Consortium. Verbs is the description of the API used to access RDMA from client code. The RDMA API implementation was introduced to the Linux kernel in version 2.6.11. At the beginning, it supported only InfiniBand; after several kernel versions, iWARP and RoCE support were added to it as well. When describing the API, I mention only one of them, but the text refers to all three. All of the definitions of this API can be found in include/rdma/ib_verbs.h. Here are some notes about the API and the implementation of the RDMA stack:
+
+ * Some of the functions are inline functions, and some of them aren't. Future implementations might change this behavior.
+
+ * Most of the APIs have the prefix "ib"; however, this API supports InfiniBand, iWARP, and RoCE.
+
+ * The header ib_verbs.h contains functions and structures to be used by:
+
+ * The RDMA stack itself
+
+ * Low-level drivers for RDMA devices
+
+ * Kernel modules that use the stack as consumers
+
+I will concentrate on functions and structures that are relevant only for kernel modules that use the stack as consumers (the third case). The following section discusses the RDMA stack organization in the kernel tree.
+
+### The RDMA Stack Organization
+
+Almost all of the kernel RDMA stack code is under drivers/infiniband in the kernel tree. The following are some of its important modules (this is not an exhaustive list, as I do not cover the entire RDMA stack in this chapter):
+
+ * CM: Communication manager (drivers/infiniband/core/cm.c)
+
+ * IPoIB: IP over InfiniBand (drivers/infiniband/ulp/ipoib/)
+
+ * iSER: iSCSI extension for RDMA (drivers/infiniband/ulp/iser/)
+
+ * RDS: Reliable Datagram Socket (net/rds/)
+
+ * SRP: SCSI RDMA protocol (drivers/infiniband/ulp/srp/)
+
+ * Hardware low-level drivers of different vendors (drivers/infiniband/hw)
+
+ * verbs: Kernel verbs (drivers/infiniband/core/verbs.c)
+
+ * uverbs: User verbs (drivers/infiniband/core/uverbs_*.c)
+
+ * MAD: Management datagram (drivers/infiniband/core/mad.c)
+
+Figure 13-1 shows the Linux InfiniBand stack architecture.
+
+Figure 13-1. Linux InfiniBand stack architecture
+
+In this section, I covered the RDMA stack organization and the kernel modules that are part of it in the Linux kernel.
+
+### RDMA Technology Advantages
+
+Here I will cover the advantages of the RDMA technology and explain the features that make it popular in many markets:
+
+ * Zero copy: The ability to directly write data to and read data from remote memory allows you to access remote buffers directly, without the need to copy the data between different software layers.
+
+ * Kernel bypass: Sending and receiving data from the same code context (that is, userspace or kernel level) saves context-switch time.
+
+ * CPU offload: The ability to send or receive data using dedicated hardware without any CPU intervention decreases the CPU usage on the remote side, because it doesn't perform any active operations.
+
+ * Low latency: RDMA technologies allow you to reach a very low latency for short messages. (In current hardware and on current servers, the latency for sending up to tens of bytes can be a couple of hundred nanoseconds.)
+
+ * High bandwidth: In an Ethernet device, the maximum bandwidth is limited by the technology (that is, 10 or 40 Gbits/sec). In InfiniBand, the same protocol and equipment can be used from 2.5 Gbits/sec up to 120 Gbits/sec. (In current hardware and on current servers, the bandwidth can be up to 56 Gbits/sec.)
+
+### InfiniBand Hardware Components
+
+As in other interconnect technologies, several hardware components are described in the InfiniBand spec. Some of them are packet endpoints (they generate packets and are the targets of packets), and some of them forward packets in the same subnet or between different subnets. Here I will cover the most common ones:
+
+ * Host Channel Adapter (HCA): The network adapter that can be placed at a host or at any other system (for example, a storage device). This component initiates packets or is the target of packets.
+
+ * Switch: A component that knows how to receive a packet from one port and send it to another port. If needed, it can duplicate multicast messages. (Broadcast isn't supported in InfiniBand.) Unlike in other technologies, every switch is a very simple device with forwarding tables that are configured by the Subnet Manager (SM), which is an entity that configures and manages the subnet (later in this section, I will discuss its role in more detail). The switch doesn't learn anything by itself or parse and analyze packets; it forwards packets only within the same subnet.
+
+ * Router: A component that connects several different InfiniBand subnets.
+
+A subnet is a set of HCAs, switches, and router ports that are connected together. In this section, I described the various hardware components in InfiniBand, and now I will discuss the addressing of the devices, systems, and ports in InfiniBand.
+
+### Addressing in InfiniBand
+
+Here are some rules about InfiniBand addressing and an example:
+
+ * In InfiniBand, the unique identifier of components is the Globally Unique Identifier (GUID), which is a 64-bit value that is unique in the world.
+
+ * Every node in the subnet has a Node GUID. This is the identifier of the node and a constant attribute of it.
+
+ * Every port in the subnet, including in HCAs and in switches, has a port GUID. This is the identifier of the port and a constant attribute of it.
+
+ * In systems that are made from several components, there can be a system GUID. All of the components in that system have the same system GUID.
+
+Here is an example that demonstrates all the aforementioned GUIDs: a big switch system that is composed of several switch chips. Every switch chip has a unique Node GUID. Every port in every switch has a unique port GUID. All of the chips in that system have the same system GUID.
+
+ * A Global IDentifier (GID) is used to identify an end port or a multicast group. Every port has at least one valid GID in the GID table, at index 0. It is based on the port GUID plus the identifier of the subnet that this port is part of.
+
+ * A Local IDentifier (LID) is a 16-bit value that is assigned to every subnet port by the Subnet Manager. A switch is an exception: its management port has the LID assignment, not each of its ports. Every port can be assigned one LID, or a contiguous range of LIDs in order to have several paths to this port. Each LID is unique at a specific point in time in the same subnet, and it is used by the switches when forwarding packets, to know which egress port to use. The unicast LID range is 0x001 to 0xbfff. The multicast LID range is 0xc000 to 0xfffe.
+
+### InfiniBand Features
+
+Here we will cover some of the InfiniBand protocol features:
+
+ * InfiniBand allows you to configure partitions of ports of HCAs, switches, and routers, and allows you to provide virtual isolation over the same physical subnet. Every Partition Key (P_Key) is a 16-bit value composed as follows: the 15 LSBs are the key value, and the MSB is the membership level, where 0 means limited membership and 1 means full membership. Every port has a P_Key table that is configured by the SM, and every Queue Pair (QP), the actual object in InfiniBand that sends and receives data, is associated with one P_Key index in this table. One QP can send packets to or receive packets from a remote QP only if, for the P_Keys that each of them is associated with, the following is true:
+
+ * The key value is equal.
+
+ * At least one of them has full membership.
+
+ * Queue Key (Q_Key): An Unreliable Datagram (UD) QP will get unicast or multicast messages from a remote UD QP only if the Q_Key of the message is equal to the Q_Key value of this UD QP.
+
+ * Virtual Lanes (VL): This is a mechanism for creating multiple virtual links over a single physical link. Every virtual lane represents an autonomous set of buffers for sent and received packets in each port. The number of supported VLs is an attribute of a port.
+
+ * Service Level (SL): InfiniBand supports up to 16 service levels. The protocol doesn't specify the policy of each level. In InfiniBand, QoS is implemented using the SL-to-VL mapping and the resources allotted for each VL.
+
+ * Failover: Connected QPs are QPs that can send packets to or receive packets from only one remote QP. InfiniBand allows defining a primary path and an alternate path for connected QPs. If there is a problem with the primary path, instead of reporting an error, the alternate path will be used automatically.
+
+In the next section, we will look at what packets in InfiniBand look like. This is very useful when you debug problems in InfiniBand.
+
+### InfiniBand Packets
+
+Every packet in InfiniBand is a combination of several headers and, in many cases, a payload, which is the data of the messages that the clients want to send. Messages that contain only an ACK, or messages with zero bytes (for example, if only immediate data is being sent), won't contain a payload. These headers describe where the packet was sent from, what the target of the packet is, the operation used, the information needed to separate the packets into messages, and enough information to detect packet loss errors.
+
+Figure 13-2 presents the InfiniBand packet headers.
+
+Figure 13-2. InfiniBand packet headers
+
+Here are the headers in InfiniBand:
+
+ * Local Routing Header (LRH): 8 bytes. Always present. It identifies the local source and destination ports of the packet. It also specifies the requested QoS attributes (SL and VL) of the message.
+
+ * Global Routing Header (GRH): 40 bytes. Optional. Present for multicast packets or packets that travel across multiple subnets. It describes the source and destination ports using GIDs. Its format is identical to the IPv6 header.
+
+ * Base Transport Header (BTH): 12 bytes. Always present. It specifies the source and destination QPs, the operation, the packet sequence number, and the partition.
+
+ * Extended Transport Header (ETH): From 4 to 28 bytes. Optional. An extra family of headers that might be present, depending on the class of the service and the operation used.
+
+ * Payload: Optional. The data that the client wants to send.
+
+ * Immediate data: 4 bytes. Optional. An out-of-band, 32-bit value that can be added to Send and RDMA Write operations.
+
+ * Invariant CRC (ICRC): 4 bytes. Always present. It covers all fields that should not be changed as the packet travels in the subnet.
+
+ * Variant CRC (VCRC): 2 bytes. Always present. It covers all of the fields of the packet.
+
+### Management Entities
+
+The SM is the entity in the subnet that is responsible for analyzing the subnet and configuring it. These are some of its missions:
+
+ * Discover the physical topology of the subnet.
+
+ * Assign the LIDs and other attributes—such as active MTU, active speeds, and more—to each port in the subnet.
+
+ * Configure the forwarding tables in the subnet switches.
+
+ * Detect any changes in the topology (for example, if new nodes were added to or removed from the subnet).
+
+ * Handle various errors in the subnet.
+
+The Subnet Manager is usually a software entity that can run in a switch (which is then called a managed switch) or in any node in the subnet.
+
+Several SMs can be running in a subnet, but only one of them will be active; the rest of them will be in standby mode. There is an internal protocol that performs master election and decides which SM will be active. If the active SM goes down, one of the standby SMs will become the active SM. Every port in the subnet has a Subnet Management Agent (SMA), which is an agent that knows how to receive management messages sent by the SM, handle them, and return a response. The Subnet Administrator (SA) is a service that is part of the SM. These are some of its missions:
+
+ * Provide information about the subnet—for example, information about how to get from one port to another (that is, a path query).
+
+ * Allow you to register to get notifications about events.
+
+ * Provide services for management of the subnet, such as joining or leaving a multicast group. Those services might cause the SM to (re)configure the subnet.
+
+The Communication Manager (CM) is an entity that is capable of running on each port, if the port supports it, to establish, maintain, and tear down QP connections.
+
+## RDMA Resources
+
+In the RDMA API, a lot of resources need to be created and handled before any data can be sent or received. All of the resources are in the scope of a specific RDMA device; those resources cannot be shared or used across more than one local device, even if there are multiple devices in the same machine. Figure 13-3 presents the RDMA resource creation hierarchy.
+
+Figure 13-3. RDMA resource creation hierarchy
+
+### RDMA Device
+
+The client needs to register with the RDMA stack in order to be notified about any RDMA device that is added to the system or removed from it. After the initial registration, the client is notified about all existing RDMA devices. A callback will be invoked for every RDMA device, and the client can start working with these devices in the following ways:
+
+ * Query the device for various attributes
+
+ * Modify the device attributes
+
+ * Create, work with, and destroy resources
+
+The ib_register_client() method registers a kernel client that wants to use the RDMA stack. The specified callbacks will be invoked for every InfiniBand device that currently exists in the system and for every device that is later added to or removed from (using hot-plug functionality) the system. The ib_unregister_client() method unregisters a kernel client that wants to stop using the RDMA stack. Usually, it is called when the driver is being unloaded. Here is sample code that shows how to register with the RDMA stack in a kernel client:
+
+static void my_add_one(struct ib_device *device)
+{
+        ...
+}
+
+static void my_remove_one(struct ib_device *device)
+{
+        ...
+}
+
+static struct ib_client my_client = {
+        .name   = "my RDMA module",
+        .add    = my_add_one,
+        .remove = my_remove_one
+};
+
+static int __init my_init_module(void)
+{
+        int ret;
+
+        ret = ib_register_client(&my_client);
+        if (ret) {
+                printk(KERN_ERR "Failed to register IB client\n");
+                return ret;
+        }
+
+        return 0;
+}
+
+static void __exit my_cleanup_module(void)
+{
+        ib_unregister_client(&my_client);
+}
+
+module_init(my_init_module);
+module_exit(my_cleanup_module);
+
+Following is a description of several more methods for handling an InfiniBand device:
+
+ * The ib_set_client_data() method sets a client context to be associated with an InfiniBand device.
+
+ * The ib_get_client_data() method returns the client context that was associated with an InfiniBand device using the ib_set_client_data() method.
+
+ * The ib_register_event_handler() method registers a callback to be called for every asynchronous event that occurs on the InfiniBand device. The callback structure must be initialized with the INIT_IB_EVENT_HANDLER macro.
+
+ * The ib_unregister_event_handler() method unregisters the event handler.
+
+ * The ib_query_device() method queries the InfiniBand device for its attributes. Those attributes are constant and won't change in subsequent calls of this method.
+
+ * The ib_query_port() method queries the InfiniBand device port for its attributes. Some of those attributes are constant, and some of them might change in subsequent calls of this method—for example, the port LID, state, and some other attributes.
+
+ * The rdma_port_get_link_layer() method returns the link layer of the device port.
+
+ * The ib_query_gid() method queries the InfiniBand device port's GID table at a specific index. The ib_find_gid() method returns the index of a specific GID value in a port's GID table.
+
+ * The ib_query_pkey() method queries the InfiniBand device port's P_Key table at a specific index. The ib_find_pkey() method returns the index of a specific P_Key value in a port's P_Key table.
+
+### Protection Domain (PD)
+
+A PD can be associated with several other RDMA resources—such as an SRQ, QP, AH, or MR—in order to provide a means of protection among them. RDMA resources that are associated with PDx cannot work with RDMA resources that were associated with PDy; trying to mix those resources will end with an error. Typically, every module will have one PD. However, if a specific module wants to increase its security, it will use one PD for each remote QP or service that it uses. Allocation and deallocation of a PD is done like this:
+
+ * The ib_alloc_pd() method allocates a PD. It takes as an argument the pointer to the device object that was returned when the driver callback was called after its registration.
+
+ * The ib_dealloc_pd() method deallocates a PD. It is usually called when the driver is being unloaded or when the resources that are associated with the PD are being destroyed.
+
+### Address Handle (AH)
+
+An AH is used in the Send Request of a UD QP to describe the path of the message from the local port to the remote port. The same AH can be used for several QPs if all of them send messages to the same remote port using the same attributes. Following is a description of four methods related to the AH:
+
+ * The ib_create_ah() method creates an AH. It takes as arguments a PD and the attributes for the AH. The attributes of the AH can be filled in directly or by calling the ib_init_ah_from_wc() method, which takes as parameters a received Work Completion (an ib_wc object) that includes the attributes of a successfully completed incoming message, and the port it was received on. Instead of calling the ib_init_ah_from_wc() method and then the ib_create_ah() method, one can call the ib_create_ah_from_wc() method.
+
+ * The ib_modify_ah() method modifies the attributes of an existing AH.
+
+ * The ib_query_ah() method queries for the attributes of an existing AH.
+
+ * The ib_destroy_ah() method destroys an AH. It is called when there isn't a need to send any further messages to the node that the AH describes the path to.
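+
+To make the shortcut mentioned above concrete, here is a hedged sketch of answering the sender of a received UD message; pd, wc, grh, and port_num are assumed to already exist in the surrounding code:
+
+/* Sketch: build an AH describing the reverse path directly from a
+ * received Work Completion, instead of filling the attributes by hand. */
+struct ib_ah *ah;
+
+ah = ib_create_ah_from_wc(pd, &wc, &grh, port_num);
+if (IS_ERR(ah)) {
+        printk(KERN_ERR "Failed to create an AH from a Work Completion\n");
+        return PTR_ERR(ah);
+}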
+
+### Memory Region (MR)
+
+Every memory buffer that is accessed by the RDMA device needs to be registered. During the registration process, the following tasks are performed on the memory buffer:
+
+ * The contiguous memory buffer is separated into memory pages.
+
+ * The virtual-to-physical translation is mapped.
+
+ * The memory page permissions are checked, to ensure that the requested permissions for the MR are supported by the pages.
+
+ * The memory pages are pinned, to prevent them from being swapped out. This keeps the virtual-to-physical mapping unchanged.
+
+After a successful memory registration is completed, the MR has two keys:
+
+ * Local key (lkey): A key for accessing this memory by local Work Requests.
+
+ * Remote key (rkey): A key for accessing this memory by a remote machine using RDMA operations.
+
+Those keys will be used in Work Requests when referring to those memory buffers. The same memory buffers can be registered several times, even with different permissions. The following is a description of some methods related to the MR:
+
+ * The ib_get_dma_mr() method returns a Memory Region for system memory that is usable for DMA. It takes a PD and the requested access permissions for the MR as arguments.
+
+ * The ib_dma_map_single() method maps a kernel virtual address, which was allocated by the kmalloc() method family, to a DMA address. This DMA address will be used to access local and remote memory. The ib_dma_mapping_error() method should be used to check whether the mapping was successful.
+
+ * The ib_dma_unmap_single() method unmaps a DMA mapping that was done using ib_dma_map_single(). It should be called when this memory isn't needed anymore.
+
+Note
+
+There are some more flavors of ib_dma_map_single() that allow the mapping of pages, mapping according to DMA attributes, mapping using a scatter/gather list, or mapping using a scatter/gather list with DMA attributes: ib_dma_map_page(), ib_dma_map_single_attrs(), ib_dma_map_sg(), and ib_dma_map_sg_attrs(). All of them have corresponding unmap functions.
+
+Before accessing DMA-mapped memory, the ib_dma_sync_single_for_cpu() method should be called if the DMA region is going to be accessed by the CPU, or the ib_dma_sync_single_for_device() method if the DMA region is going to be accessed by the InfiniBand device. Here are some more related methods:
+
+ * The ib_dma_alloc_coherent() method allocates a memory block that can be accessed by the CPU and maps it for DMA.
+
+ * The ib_dma_free_coherent() method frees a memory block that was allocated using ib_dma_alloc_coherent().
+
+ * The ib_reg_phys_mr() method takes a set of physical pages, registers them, and prepares a virtual address that can be accessed by an RDMA device. If you want to change the registration after it was created, you should call the ib_rereg_phys_mr() method.
+
+ * The ib_query_mr() method retrieves the attributes of a specific MR. Note that most low-level drivers do not implement this method.
+
+ * The ib_dereg_mr() method deregisters an MR.
+
+### Fast Memory Region (FMR) Pool
+
+Registration of a Memory Region is a "heavy" procedure that might take some time to complete, and the context that performs it might even sleep if required resources aren't available when it is called. This behavior might be problematic when performed in certain contexts—for example, in an interrupt handler. Working with an FMR pool allows you to work with FMRs whose registration is "lightweight" and can be performed in any context. The API of the FMR pool can be found in include/rdma/ib_fmr_pool.h.
+
+### Memory Window (MW)
+
+Enabling remote access to memory can be done in two ways:
+
+ * Register a memory buffer with remote permissions enabled.
+
+ * Register a Memory Region and then bind a Memory Window to it.
+
+Both of these ways will create a remote key (rkey) that can be used to access this memory with the specified permissions. However, if you wish to invalidate the rkey to prevent remote access to this memory, deregistering the Memory Region might be a heavy procedure. Working with a Memory Window on this Memory Region, and binding or unbinding it when needed, provides a "lightweight" procedure for enabling and disabling remote access to memory. Following is a description of three methods related to the MW:
+
+ * The ib_alloc_mw() method allocates a Memory Window. It takes a PD and the MW type as arguments.
+
+ * The ib_bind_mw() method binds a Memory Window to a specified Memory Region with a specific address, size, and remote permissions, by posting a special Work Request to a QP. It is called when you want to allow temporary remote access to its memory. A Work Completion in the Send Queue of the QP will be generated to describe the status of this operation. If ib_bind_mw() is called for a Memory Window that is already bound, whether to the same Memory Region or a different one, the previous binding will be invalidated.
+
+ * The ib_dealloc_mw() method deallocates the specified MW object.
+
+### Completion Queue (CQ)
+
+Every posted Work Request, to either the Send or Receive Queue, is considered outstanding until there is a corresponding Work Completion for it or for any Work Request that was posted after it. While the Work Request is outstanding, the content of the memory buffers that it points to is undetermined:
+
+ * If the RDMA device reads this memory and sends its content over the wire, the client cannot know if this buffer can be (re)used or released. If this is a reliable QP, a successful Work Completion means that the message was received by the remote side. If this is an unreliable QP, a successful Work Completion means that the message was sent.
+
+ * If the RDMA device writes a message to this memory, the client cannot know if this buffer contains the incoming message.
+
+A Work Completion specifies that the corresponding Work Request was completed and provides some information about it: its status, the opcode used, its size, and so on. A CQ is an object that contains the Work Completions. The client needs to poll the CQ in order to read the Work Completions that it holds. The CQ works on a first-in, first-out (FIFO) basis: the order in which Work Completions are de-queued by the client matches the order in which they were enqueued to the CQ by the RDMA device. The client can read the Work Completions in polling mode or request to get a notification when a new Work Completion is added to the CQ. A CQ cannot hold more Work Completions than its size. If more Work Completions than its capacity are added to it, a Work Completion with an error will be added, a CQ error asynchronous event will be generated, and all the Work Queues associated with it will get an error. Here are some methods related to the CQ:
+
+ * The ib_create_cq() method creates a CQ. It takes the following as its arguments: the pointer to the device object that was returned when the driver callback was called after its registration, and the attributes for the CQ, including its size and the callbacks that will be called when there is an asynchronous event on this CQ or a Work Completion is added to it.
+
+ * The ib_resize_cq() method changes the size of a CQ. The new number of entries cannot be less than the number of the Work Completions that currently populate the CQ.
+
+ * The ib_modify_cq() method changes the moderation parameters for a CQ. A Completion event will be generated if at least a specific number of Work Completions enter the CQ or a timeout expires. Using it might help reduce the number of interrupts generated by an RDMA device.
+
+ * The ib_peek_cq() method returns the number of available Work Completions in a CQ.
+
+ * The ib_req_notify_cq() method requests that a Completion event notification be generated when the next Work Completion, or the next Work Completion that includes a solicited event indication, is added to the CQ. If no Work Completion is added to the CQ after the ib_req_notify_cq() method was called, no Completion event notification will occur.
+
+ * The ib_req_ncomp_notif() method requests that a Completion event notification be created when a specific number of Work Completions exists in the CQ. Unlike the ib_req_notify_cq() method, when calling the ib_req_ncomp_notif() method, a Completion event notification will be generated even if the CQ currently holds this number of Work Completions.
+
+ * The ib_poll_cq() method polls for Work Completions from a CQ. It reads the Work Completions from the CQ in the order they were added to it and removes them from it.
+
+Here is an example of code that empties a CQ—that is, reads all the Work Completions from a CQ and checks their status:
+
+struct ib_wc wc;
+int num_comp = 0;
+
+while (ib_poll_cq(cq, 1, &wc) > 0) {
+        if (wc.status != IB_WC_SUCCESS) {
+                printk(KERN_ERR "The Work Completion[%d] has a bad status %d\n",
+                       num_comp, wc.status);
+                return -EINVAL;
+        }
+        num_comp++;
+}
+
+### eXtended Reliable Connected (XRC) Domain
+
+An XRC Domain is an object that is used to limit the XRC SRQs that an incoming message can target. An XRC domain can be associated with several other RDMA resources that work with XRC, such as an SRQ or a QP.
+
+### Shared Receive Queue (SRQ)
+
+An SRQ is a way for the RDMA architecture to be more scalable on the receive side. Instead of having a separate Receive Queue for every Queue Pair, there is a shared Receive Queue that all of the QPs are connected to. When they need to consume a Receive Request, they fetch it from the SRQ. Figure 13-4 presents QPs that are associated with an SRQ.
+
+Figure 13-4. QPs that are associated with an SRQ
+
+Consider what happens if you have N QPs, and each of them might receive a burst of M messages at a random time:
+
+ * Without an SRQ, you post N*M Receive Requests.
+
+ * With an SRQ, you post K*M (where K << N) Receive Requests.
+
+Unlike a QP, which doesn't have any mechanism to determine the number of outstanding Work Requests in it, with an SRQ you can set a watermark limit. When the number of Receive Requests drops below this limit, an SRQ limit asynchronous event will be created for this SRQ. The downside of using an SRQ is that you cannot predict which QP will consume each posted Receive Request from the SRQ, so the message size that each posted Receive Request can hold must be the maximum incoming message size that any of the QPs might get. This limitation can be handled by creating several SRQs, one for each different maximum message size, and associating them with the relevant QPs according to their expected message sizes.
+
+Here is a description of some methods related to the SRQ and an example:
+
+ * The ib_create_srq() method creates an SRQ. It takes a PD and the attributes for the SRQ.
+
+ * The ib_modify_srq() method modifies the attributes of the SRQ. It is used to set a new watermark value for the SRQ's limit event or to resize the SRQ, for devices that support it.
+
+Here is an example of setting the value of the watermark, to get an asynchronous event when the number of Receive Requests in the SRQ drops below 5:
+
+struct ib_srq_attr srq_attr;
+int ret;
+
+memset(&srq_attr, 0, sizeof(srq_attr));
+srq_attr.srq_limit = 5;
+
+ret = ib_modify_srq(srq, &srq_attr, IB_SRQ_LIMIT);
+if (ret) {
+        printk(KERN_ERR "Failed to set the SRQ's limit value\n");
+        return ret;
+}
+
+Following is a description of several more methods for handling an SRQ:
+
+ * The ib_query_srq() method queries for the current SRQ attributes. This method is usually used to check the content of the SRQ's limit value. The value 0 in the srq_limit member of the ib_srq_attr object means that there isn't any SRQ limit watermark set.
+
+ * The ib_destroy_srq() method destroys an SRQ.
+
+ * The ib_post_srq_recv() method takes a linked list of Receive Requests as an argument and adds them to a specified Shared Receive Queue for future processing.
+
+Here is an example of posting a single Receive Request to an SRQ. It saves an incoming message in a memory buffer, using its registered DMA address in a single gather entry:
+
+struct ib_recv_wr wr, *bad_wr;
+struct ib_sge sg;
+int ret;
+
+memset(&sg, 0, sizeof(sg));
+sg.addr = dma_addr;
+sg.length = len;
+sg.lkey = mr->lkey;
+
+memset(&wr, 0, sizeof(wr));
+wr.next = NULL;
+wr.wr_id = (uintptr_t)dma_addr;
+wr.sg_list = &sg;
+wr.num_sge = 1;
+
+ret = ib_post_srq_recv(srq, &wr, &bad_wr);
+if (ret) {
+        printk(KERN_ERR "Failed to post Receive Request to an SRQ\n");
+        return ret;
+}
+
+### Queue Pair (QP)
+
+The Queue Pair is the actual object used to send and receive data in InfiniBand. It has two separate Work Queues: a Send Queue and a Receive Queue. Every Work Queue has a specific number of Work Requests (WRs) that can be posted to it, a number of scatter/gather elements that are supported for each WR, and a CQ to which Work Requests whose processing has ended add a Work Completion. Those Work Queues can be created with similar or different attributes—for example, the number of WRs that can be posted to each Work Queue. The order within each Work Queue is guaranteed—that is, the processing of a Work Request in the Send Queue will start according to the order of Send Request submission, and the same behavior applies to the Receive Queue. However, there isn't any relation between them—that is, an outstanding Send Request can be processed even if it was posted after a Receive Request was posted to the Receive Queue. Figure 13-5 presents a QP.
+
+Figure 13-5. QP (Queue Pair)
+
+Upon creation, every QP has a number that is unique across the RDMA device at a specific point in time.
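+
+Before looking at the QP transport types, here is a hedged sketch of posting a single Send Request with one gather entry, in the same style as the SRQ example above (dma_addr, len, and mr are assumed to come from an earlier registration, and the QP is assumed to be in a state that allows sending):
+
+struct ib_send_wr wr, *bad_wr;
+struct ib_sge sg;
+int ret;
+
+memset(&sg, 0, sizeof(sg));
+sg.addr = dma_addr;
+sg.length = len;
+sg.lkey = mr->lkey;
+
+memset(&wr, 0, sizeof(wr));
+wr.wr_id = (uintptr_t)dma_addr;
+wr.sg_list = &sg;
+wr.num_sge = 1;
+wr.opcode = IB_WR_SEND;
+wr.send_flags = IB_SEND_SIGNALED;
+
+ret = ib_post_send(qp, &wr, &bad_wr);
+if (ret) {
+        printk(KERN_ERR "Failed to post a Send Request\n");
+        return ret;
+}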
#### QP Transport Types

There are several QP transport types supported in InfiniBand:

* Reliable Connected (RC): One RC QP is connected to a single remote RC QP, and reliability is guaranteed—that is, the arrival of all packets, in order and with the same content that they were sent with, is guaranteed. Every message is fragmented into packets of the path MTU size at the sender side and defragmented at the receiver side. This QP supports Send, RDMA Write, RDMA Read, and Atomic operations.

* Unreliable Connected (UC): One UC QP is connected to a single remote UC QP, and reliability isn't guaranteed. Also, if a packet in a message is lost, the whole message is lost. Every message is fragmented into packets of the path MTU size at the sender side and defragmented at the receiver side. This QP supports Send and RDMA Write operations.

* Unreliable Datagram (UD): One UD QP can send a unicast message to any UD QP in the subnet. Multicast messages are supported. Reliability isn't guaranteed. Every message is limited to one packet, with its size limited to the path MTU size. This QP supports only Send operations.

* eXtended Reliable Connected (XRC): Several QPs from the same node can send messages to a remote SRQ in a specific node. This is useful for decreasing the number of QPs between two nodes from the order of the number of CPU cores (that is, a QP per core in each process) to one QP. This QP supports all operations that are supported by an RC QP. This type is relevant only for userspace applications.

* Raw packet: Allows the client to build a complete packet, including the L2 headers, and send it as is. On the receiver side, no header will be stripped by the RDMA device.

* Raw IPv6/Raw Ethertype: QPs that allow sending raw packets that aren't interpreted by the IB device. Currently, neither of these types is supported by any RDMA device.

There are special QP transport types that are used for subnet management and special services:

* SMI/QP0: QP used for subnet management packets.

* GSI/QP1: QP used for general services packets.

The ib_create_qp() method creates a QP. It takes as arguments a PD and the requested attributes that this QP will be created with. Here is an example of creating an RC QP using a PD that was created earlier, with two different CQs: one for the Send Queue and one for the Receive Queue.

struct ib_qp_init_attr init_attr;
struct ib_qp *qp;

memset(&init_attr, 0, sizeof(init_attr));
init_attr.event_handler = my_qp_event;
init_attr.cap.max_send_wr = 2;
init_attr.cap.max_recv_wr = 2;
init_attr.cap.max_recv_sge = 1;
init_attr.cap.max_send_sge = 1;
init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
init_attr.qp_type = IB_QPT_RC;
init_attr.send_cq = send_cq;
init_attr.recv_cq = recv_cq;

qp = ib_create_qp(pd, &init_attr);
if (IS_ERR(qp)) {
        printk(KERN_ERR "Failed to create a QP\n");
        return PTR_ERR(qp);
}

#### QP State Machine

A QP has a state machine that defines what the QP is capable of doing in each state:

* Reset state: Each QP is generated in this state. In this state, no Send Requests or Receive Requests can be posted to it. All incoming messages are silently dropped.

* Initialized state: In this state, no Send Requests can be posted to it. However, Receive Requests can be posted, although they won't be processed. All incoming messages are silently dropped.
It is good practice to post Receive Requests to a QP in this state, before moving it to RTR (Ready To Receive). Doing this prevents a case where the remote QP sends messages that need to consume Receive Requests before any were posted.

* Ready To Receive (RTR) state: In this state, no Send Requests can be posted, but Receive Requests can be posted and processed. All incoming messages will be handled. The first incoming message that is received in this state generates the communication-established asynchronous event. A QP that only receives messages can stay in this state.

* Ready To Send (RTS) state: In this state, both Send Requests and Receive Requests can be posted and processed. All incoming messages will be handled. This is the common state for QPs.

* Send Queue Drained (SQD) state: In this state, the QP completes the processing of all the Send Requests whose processing has already started. Only when there aren't any messages left to send can you change some of the QP attributes. This state is separated into two internal states:

  * Draining: Messages are still being sent.

  * Drained: The sending of the messages has completed.

* Send Queue Error (SQE) state: The RDMA device automatically moves a QP to this state when there is an error in the Send Queue, for unreliable transport types. The Send Request that caused the error is completed with the error reason, and all of the consecutive Send Requests are flushed. The Receive Queue still works—that is, Receive Requests can be posted, and incoming messages are handled. The client can recover from this state and modify the QP state back to RTS.

* Error state: In this state, all of the outstanding Work Requests are flushed. The RDMA device moves the QP to this state if this is a reliable transport type and there was an error with a Send Request, or if there was an error in the Receive Queue, regardless of which transport type was used. All incoming messages are silently dropped.

A QP can be transitioned by ib_modify_qp() from any state to the Reset state and to the Error state. Moving the QP to the Error state flushes all of the outstanding Work Requests. Moving the QP to the Reset state clears all previously configured attributes and removes all of the outstanding Work Requests, as well as the Work Completions of this QP in the Completion Queues that this QP works with. Figure 13-6 presents the QP state machine diagram.

Figure 13-6. QP state machine

The ib_modify_qp() method modifies the attributes of a QP. It takes as arguments the QP to modify and the attributes of the QP that will be modified. The state machine of the QP can be changed according to the diagram shown in Figure 13-6. Every QP transport type requires different attributes to be set in each QP state transition.

Here is an example of modifying a newly created RC QP to the RTS state, in which it can send and receive packets. The local attributes are the outgoing port, the SL to use, and the starting Packet Serial Number for the Send Queue. The needed remote attributes are the Receive PSN, the QP number, and the LID of the port that it uses.
struct ib_qp_attr attr = {
        .qp_state = IB_QPS_INIT,
        .pkey_index = 0,
        .port_num = port,
        .qp_access_flags = 0
};

ret = ib_modify_qp(qp, &attr,
                   IB_QP_STATE |
                   IB_QP_PKEY_INDEX |
                   IB_QP_PORT |
                   IB_QP_ACCESS_FLAGS);
if (ret) {
        printk(KERN_ERR "Failed to modify QP to INIT state\n");
        return ret;
}

attr.qp_state = IB_QPS_RTR;
attr.path_mtu = mtu;
attr.dest_qp_num = remote->qpn;
attr.rq_psn = remote->psn;
attr.max_dest_rd_atomic = 1;
attr.min_rnr_timer = 12;
attr.ah_attr.is_global = 0;
attr.ah_attr.dlid = remote->lid;
attr.ah_attr.sl = sl;
attr.ah_attr.src_path_bits = 0;
attr.ah_attr.port_num = port;

ret = ib_modify_qp(qp, &attr,
                   IB_QP_STATE |
                   IB_QP_AV |
                   IB_QP_PATH_MTU |
                   IB_QP_DEST_QPN |
                   IB_QP_RQ_PSN |
                   IB_QP_MAX_DEST_RD_ATOMIC |
                   IB_QP_MIN_RNR_TIMER);
if (ret) {
        printk(KERN_ERR "Failed to modify QP to RTR state\n");
        return ret;
}

attr.qp_state = IB_QPS_RTS;
attr.timeout = 14;
attr.retry_cnt = 7;
attr.rnr_retry = 6;
attr.sq_psn = my_psn;
attr.max_rd_atomic = 1;

ret = ib_modify_qp(qp, &attr,
                   IB_QP_STATE |
                   IB_QP_TIMEOUT |
                   IB_QP_RETRY_CNT |
                   IB_QP_RNR_RETRY |
                   IB_QP_SQ_PSN |
                   IB_QP_MAX_QP_RD_ATOMIC);
if (ret) {
        printk(KERN_ERR "Failed to modify QP to RTS state\n");
        return ret;
}

Following is a description of several more methods for handling a QP:

* The ib_query_qp() method queries for the current QP attributes. Some of the attributes are constant (the values that the client specified), and some of them can change (for example, the state).

* The ib_destroy_qp() method destroys a QP. It is called when the QP isn't needed anymore.

### Work Request Processing

Every Work Request posted to either the Send Queue or the Receive Queue is considered outstanding until a Work Completion is polled, from the CQ associated with that Work Queue, for this Work Request or for a Work Request in the same Work Queue that was posted after it. Every outstanding Work Request in the Receive Queue ends with a Work Completion. The Work Request processing flow in a Work Queue follows the diagram shown in Figure 13-7.

Figure 13-7. Work Request processing flow

In the Send Queue, you can choose (when creating a QP) whether you want every Send Request to end with a Work Completion or whether you want to select which Send Requests will end with Work Completions—that is, selective signaling. However, if an unsignaled Send Request encounters an error, a Work Completion with bad status will nevertheless be generated for it.

While a Work Request is outstanding, you cannot (re)use or free the resources that were specified when posting it. For example:

* When posting a Send Request for a UD QP, the AH cannot be freed.

* When posting a Receive Request, the memory buffers that were referred to in its scatter/gather (s/g) list cannot be read, because it is unknown whether the RDMA device has already written the data to them.

"Fencing" is the ability to prevent the processing of a specific Send Request until the processing of the previous RDMA Read and Atomic operations ends. Adding the Fence indication to a Send Request can be useful, for example, when using RDMA Read from a remote address and sending the data, or part of it, in the same Send Queue. Without fencing, the send operation might start before the data is retrieved and available in local memory.
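In the kernel API, the fence indication is expressed as a Send Request flag. As a minimal sketch, reusing the style of the Send Request examples in this chapter, a fenced Send Request differs only in its send_flags member:

/* Wait for prior RDMA Read and Atomic operations on this Send
 * Queue to complete before processing this Send Request. */
wr.opcode = IB_WR_SEND;
wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE;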
When posting a Send Request to a UC or RC QP, the path to the target is known, because it was provided when moving the QP to the RTR state. However, when posting a Send Request to a UD QP, you need to add an AH to describe the path to the target (or targets) of this message. If there is an error related to the Send Queue, and this is an Unreliable transport type, the Send Queue moves to the Error state (that is, the SQE state), but the Receive Queue is still fully functional. The client can recover from this state and change the QP state back to RTS. If there is an error related to the Receive Queue, the QP is moved to the Error state, because this is an unrecoverable error. When a Work Queue is moved to the Error state, the Work Request that caused the error ends with a status that indicates the nature of the error, and the rest of the Work Requests in this Queue are flushed with an error.

### Supported Operations in the RDMA Architecture

There are several operation types supported in InfiniBand:

* Send: Send a message over the wire. The remote side needs to have a Receive Request available, and the message will be written in its buffers.

* Send with Immediate: Send a message over the wire with an extra 32 bits of out-of-band data. The remote side needs to have a Receive Request available, and the message will be written in its buffers. The immediate data will be available in the Work Completion of the receiver.

* RDMA Write: Send a message over the wire, and write it to a remote address (a posting sketch appears after this list).

* RDMA Write with Immediate: Send a message over the wire, and write it to a remote address. The remote side needs to have a Receive Request available. The immediate data will be available in the Work Completion of the receiver. This operation can be seen as RDMA Write + Send with Immediate with a zero-byte message.

* RDMA Read: Read a remote address, and fill the local buffer with its content.

* Compare and Swap: Compare the content of a remote address with valueX; if they are equal, replace its content with valueY. All of this is performed atomically. The original remote memory content is sent and saved locally.

* Fetch and Add: Add a value to the content of a remote address atomically. The original remote memory content is sent and saved locally.

* Masked Compare and Swap: Compare the part of the content of a remote address selected by maskX with valueX; if they are equal, replace the part of its content selected by the bits in maskY with valueY. All of this is performed atomically. The original remote memory content is sent and saved locally.

* Masked Fetch and Add: Add a value to the content of a remote address atomically, changing only the bits that are specified in the mask. The original remote memory content is sent and saved locally.

* Bind Memory Window: Bind a Memory Window to a specific Memory Region.

* Fast registration: Register a Fast Memory Region using a Work Request.

* Local invalidate: Invalidate a Fast Memory Region using a Work Request. If anyone uses its old lkey/rkey, it will be considered an error. It can be combined with Send/RDMA Read; in such a case, first the send/read will be performed, and only then will this Fast Memory Region be invalidated.
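To make the RDMA Write operation from the list above concrete, here is a minimal sketch of posting one. It assumes that remote_addr and rkey, which describe the target memory at the peer, were exchanged out of band (for example, over a Send/Receive handshake), and that dma_addr, len, and mr are a local DMA-mapped buffer and its MR, as in the earlier examples:

struct ib_send_wr wr, *bad_wr;
struct ib_sge sg;
int ret;

memset(&sg, 0, sizeof(sg));
sg.addr = dma_addr;
sg.length = len;
sg.lkey = mr->lkey;

memset(&wr, 0, sizeof(wr));
wr.wr_id = (uintptr_t)dma_addr;
wr.sg_list = &sg;
wr.num_sge = 1;
wr.opcode = IB_WR_RDMA_WRITE;
wr.send_flags = IB_SEND_SIGNALED;
/* The target of the write on the remote side: */
wr.wr.rdma.remote_addr = remote_addr;
wr.wr.rdma.rkey = rkey;

ret = ib_post_send(qp, &wr, &bad_wr);
if (ret) {
        printk(KERN_ERR "Failed to post RDMA Write Send Request\n");
        return ret;
}

Because a plain RDMA Write does not consume a Receive Request on the remote side, the peer is not notified that the data arrived; RDMA Write with Immediate can be used when such a notification is needed.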
For operations that consume a Receive Request, the Receive Request specifies where the incoming message will be saved. The total size of the memory buffers specified in its scatter list must be equal to or greater than the size of the incoming message.

For a UD QP, because the origin of the message is unknown in advance (same subnet or another subnet, unicast or multicast message), an extra 40 bytes, which is the GRH header size, must be added to the Receive Request buffers. The first 40 bytes will be filled with the GRH of the message, if one is available. This GRH information describes how to send a message back to the sender. The message itself starts at offset 40 in the memory buffers that were described in the scatter list.

The ib_post_recv() method takes a linked list of Receive Requests and adds them to the Receive Queue of a specific QP for future processing. Here is an example of posting a single Receive Request for a QP. It saves an incoming message in a memory buffer using its registered DMA address in a single gather entry. qp is a pointer to a QP that was created using ib_create_qp(). The memory buffer is a block that was allocated using kmalloc() and mapped for DMA using ib_dma_map_single(). The lkey used is from the MR that was registered using ib_get_dma_mr().

struct ib_recv_wr wr, *bad_wr;
struct ib_sge sg;
int ret;

memset(&sg, 0, sizeof(sg));
sg.addr = dma_addr;
sg.length = len;
sg.lkey = mr->lkey;

memset(&wr, 0, sizeof(wr));
wr.next = NULL;
wr.wr_id = (uintptr_t)dma_addr;
wr.sg_list = &sg;
wr.num_sge = 1;

ret = ib_post_recv(qp, &wr, &bad_wr);
if (ret) {
        printk(KERN_ERR "Failed to post Receive Request to a QP\n");
        return ret;
}

The ib_post_send() method takes as an argument a linked list of Send Requests and adds them to the Send Queue of a specific QP for future processing. Here is an example of posting a single Send Request of a Send operation for a QP. It sends the content of a memory buffer using its registered DMA address in a single gather entry.

struct ib_sge sg;
struct ib_send_wr wr, *bad_wr;
int ret;

memset(&sg, 0, sizeof(sg));
sg.addr = dma_addr;
sg.length = len;
sg.lkey = mr->lkey;

memset(&wr, 0, sizeof(wr));
wr.next = NULL;
wr.wr_id = (uintptr_t)dma_addr;
wr.sg_list = &sg;
wr.num_sge = 1;
wr.opcode = IB_WR_SEND;
wr.send_flags = IB_SEND_SIGNALED;

ret = ib_post_send(qp, &wr, &bad_wr);
if (ret) {
        printk(KERN_ERR "Failed to post Send Request to a QP\n");
        return ret;
}

#### Work Completion Status

Every Work Completion can end successfully or with an error. If it ends successfully, the operation was finished and the data was sent according to the transport type's reliability level. If the Work Completion contains an error, the content of the memory buffers is unknown. There can be many reasons for a Work Request status that indicates an error: a protection violation, a bad address, and so on. Violation errors won't trigger any retransmission. However, there are two special retry flows that are worth mentioning. Both of them are performed automatically by the RDMA device, which retransmits packets until the problem is solved or until it exceeds the number of retransmissions. If the issue is solved, the client code won't be aware that it ever happened, besides a temporary performance hiccup. These flows are relevant only for Reliable transport types.

##### Retry Flow

If the receiver side didn't return any ACK or NACK to the sender side within the expected timeout, the sender might send the message again, according to the timeout and the retry count attributes that were configured in the QP attributes.
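Note that the timeout attribute used in the earlier ib_modify_qp() example is an exponent, not a number of seconds: the local ACK timeout is 4.096 usec * 2^timeout, so the value 14 used there corresponds to roughly 67 ms per attempt, and with retry_cnt set to 7 the sender gives up after about half a second of retries.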
There might be several reasons for having such a problem:

* The attributes of the remote QP, or the path to it, aren't correct.

* The remote QP state didn't get to (at least) the RTR state.

* The remote QP state moved to the Error state.

* The message itself was dropped on the way from the sender to the receiver (for example, because of a CRC error).

* The ACK or NACK of the message was dropped on the way from the receiver to the sender (for example, because of a CRC error).

Figure 13-8 presents a retry flow that overcomes a packet drop.

Figure 13-8. A retry flow (on reliable transport types)

If eventually the ACK/NACK is received by the sender QP successfully, it will continue to send the rest of the messages. If any message in the future has this problem too, the retry flow will be performed again for this message as well, without any history that this was done before. If, even after retrying several times, the receiver side still doesn't respond, a Work Completion with a Retry Error will be generated on the sender side.

##### Receiver Not Ready (RNR) Flow

If the receiver side got a message that needs to consume a Receive Request from the Receive Queue, but there isn't any outstanding Receive Request, the receiver sends back to the sender an RNR NACK. After a while, according to the time that was specified in the RNR NACK, the sender will try to send the message again.

If eventually the receiver side posts a Receive Request in time, and the incoming message consumes it, an ACK will be sent to the sender side to indicate that the message was saved successfully. If any message in the future has this problem too, the RNR retry flow will be performed again for this message as well, without any history that this was done before. If, even after retrying several times, the receiver side still didn't post a Receive Request and an RNR NACK was sent to the sender for each sent message, a Work Completion with an RNR Retry Error will be generated on the sender side. Figure 13-9 presents the RNR retry flow, which overcomes a missing Receive Request on the receiver side.

Figure 13-9. RNR retry flow (on reliable transport types)

In this section, I covered the Work Request status and some of the bad flows that can happen to a message. In the next section, I will discuss multicast groups.

### Multicast Groups

Multicast groups are a means to send a message from one UD QP to many UD QPs. Every UD QP that wants to get this message needs to be attached to the multicast group. When a device gets a multicast packet, it duplicates it to all of the QPs that are attached to that group. Following is a description of the two methods related to multicast groups:

* The ib_attach_mcast() method attaches a UD QP to a multicast group within an InfiniBand device. It accepts the QP to be attached and the multicast group attributes.

* The ib_detach_mcast() method detaches a UD QP from a multicast group.

### Difference Between the Userspace and the Kernel-Level RDMA API

The userspace and the kernel-level RDMA stack APIs are quite similar, because they cover the same technology and need to be able to provide the same functionality. When userspace calls a method of the control path from the RDMA API, it performs a context switch to the kernel level, to protect privileged resources and to synchronize objects that need to be synchronized (for example, the same QP number cannot be assigned to more than one QP at the same time).
However, there are some differences between the userspace and the kernel-level RDMA API and functionality:

* The prefix of all the APIs in the kernel level is "ib_", while in userspace the prefix is "ibv_".

* There are enumerations and macros that exist only in the kernel-level RDMA API.

* There are QP types that are available only in the kernel (for example, the SMI and GSI QPs).

* There are privileged operations that can be performed only in the kernel level—for example, registration of physical memory, registration of an MR using a WR, and FMRs.

* Some functionality isn't available in the userspace RDMA API—for example, the request for a notification after N Work Completions.

* The kernel API is asynchronous. There are callbacks that are called when there is an asynchronous event or a Completion event. In userspace, everything is synchronous, and the user needs to explicitly check whether there is an asynchronous event or a Completion event in its running context (that is, a thread).

* XRC isn't relevant for kernel-level clients.

* There are new features that were introduced at the kernel level but are not available (yet) in userspace.

The userspace API is supplied by the userspace library "libibverbs." And although the RDMA functionality available at the user level is somewhat less than at the kernel level, it is enough to enjoy the benefits of the InfiniBand technology.

## Summary

You have learned in this chapter about the advantages of the InfiniBand technology. I reviewed the RDMA stack organization. I discussed the resource-creation hierarchy and all of the important objects and their API, which is needed in order to write client code that uses InfiniBand. You also saw some examples that use this API. The next chapter will deal with advanced topics like network namespaces and the Bluetooth subsystem.

## Quick Reference

I will conclude this chapter with a short list of important methods of the RDMA API. Some of them were mentioned in this chapter.

### Methods

Here are the methods.

#### int ib_register_client(struct ib_client *client);

Register a kernel client that wants to use the RDMA stack.

#### void ib_unregister_client(struct ib_client *client);

Unregister a kernel client that wants to stop using the RDMA stack.

#### void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data);

Set a client context to be associated with an InfiniBand device.

#### void *ib_get_client_data(struct ib_device *device, struct ib_client *client);

Read the client context that was associated with an InfiniBand device.

#### int ib_register_event_handler(struct ib_event_handler *event_handler);

Register a callback to be called for every asynchronous event that occurs to the InfiniBand device.

#### int ib_unregister_event_handler(struct ib_event_handler *event_handler);

Unregister a callback that was registered to be called for every asynchronous event that occurs to the InfiniBand device.

#### int ib_query_device(struct ib_device *device, struct ib_device_attr *device_attr);

Query an InfiniBand device for its attributes.

#### int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr);

Query an InfiniBand device port for its attributes.

#### enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num);

Query for the link layer of the InfiniBand device's port.
#### int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid);

Query for the GID in a specific index in the InfiniBand device's port GID table.

#### int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey);

Query for the P_Key in a specific index in the InfiniBand device's port P_Key table.

#### int ib_find_gid(struct ib_device *device, union ib_gid *gid, u8 *port_num, u16 *index);

Find the index of a specific GID value in the InfiniBand device's port GID table.

#### int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index);

Find the index of a specific P_Key value in the InfiniBand device's port P_Key table.

#### struct ib_pd *ib_alloc_pd(struct ib_device *device);

Allocate a PD to be used later to create other InfiniBand resources.

#### int ib_dealloc_pd(struct ib_pd *pd);

Deallocate a PD.

#### struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);

Create an AH that will be used when posting a Send Request in a UD QP.

#### int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, struct ib_grh *grh, struct ib_ah_attr *ah_attr);

Initialize the attributes of an AH from a Work Completion of a received message and a GRH buffer. Those AH attributes can be used when calling the ib_create_ah() method.

#### struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, struct ib_grh *grh, u8 port_num);

Create an AH from a Work Completion of a received message and a GRH buffer.

#### int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);

Modify the attributes of an existing AH.

#### int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);

Query the attributes of an existing AH.

#### int ib_destroy_ah(struct ib_ah *ah);

Destroy an AH.

#### struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags);

Return an MR for system memory that is usable for DMA.

#### static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr);

Check whether the DMA address points to an invalid address—that is, check whether the DMA mapping operation failed.

#### static inline u64 ib_dma_map_single(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction);

Map a kernel virtual address to a DMA address.

#### static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction);

Unmap a DMA mapping of a virtual address.

#### static inline u64 ib_dma_map_single_attrs(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs);

Map a kernel virtual address to a DMA address according to DMA attributes.

#### static inline void ib_dma_unmap_single_attrs(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs);

Unmap a DMA mapping of a virtual address that was mapped according to DMA attributes.

#### static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction);

Map a physical page to a DMA address.

#### static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction);

Unmap a DMA mapping of a physical page.
#### static inline int ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);

Map a scatter/gather list to a DMA address.

#### static inline void ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);

Unmap a DMA mapping of a scatter/gather list.

#### static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs);

Map a scatter/gather list to a DMA address according to DMA attributes.

#### static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs);

Unmap a DMA mapping of a scatter/gather list according to DMA attributes.

#### static inline u64 ib_sg_dma_address(struct ib_device *dev, struct scatterlist *sg);

Return the address attribute of a scatter/gather entry.

#### static inline unsigned int ib_sg_dma_len(struct ib_device *dev, struct scatterlist *sg);

Return the length attribute of a scatter/gather entry.

#### static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir);

Transfer a DMA region ownership to the CPU. It should be called before the CPU accesses a DMA mapped region whose ownership was previously transferred to the device.

#### static inline void ib_dma_sync_single_for_device(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir);

Transfer a DMA region ownership to the device. It should be called before the device accesses a DMA mapped region whose ownership was previously transferred to the CPU.

#### static inline void *ib_dma_alloc_coherent(struct ib_device *dev, size_t size, u64 *dma_handle, gfp_t flag);

Allocate a memory block that can be accessed by the CPU, and map it for DMA.

#### static inline void ib_dma_free_coherent(struct ib_device *dev, size_t size, void *cpu_addr, u64 dma_handle);

Free a memory block that was allocated using ib_dma_alloc_coherent().

#### struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, struct ib_phys_buf *phys_buf_array, int num_phys_buf, int mr_access_flags, u64 *iova_start);

Take a physical page list, and prepare it for being accessed by the InfiniBand device.

#### int ib_rereg_phys_mr(struct ib_mr *mr, int mr_rereg_mask, struct ib_pd *pd, struct ib_phys_buf *phys_buf_array, int num_phys_buf, int mr_access_flags, u64 *iova_start);

Change the attributes of an MR.

#### int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);

Query for the attributes of an MR.

#### int ib_dereg_mr(struct ib_mr *mr);

Deregister an MR.

#### struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);

Allocate an MW. This MW will be used to allow remote access to an MR.

#### static inline int ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind);

Bind an MW to an MR to allow remote access to local memory with specific permissions.

#### int ib_dealloc_mw(struct ib_mw *mw);

Deallocate an MW.

#### struct ib_cq *ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, int cqe, int comp_vector);

Create a CQ. This CQ will be used to indicate the status of ended Work Requests for Send or Receive Queues.
#### int ib_resize_cq(struct ib_cq *cq, int cqe);

Change the number of entries in a CQ.

#### int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);

Modify the moderation attributes of a CQ. This method is used to decrease the number of interrupts of an InfiniBand device.

#### int ib_peek_cq(struct ib_cq *cq, int wc_cnt);

Return the number of available Work Completions in a CQ.

#### static inline int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);

Request that a Completion notification event be generated when the next Work Completion is added to the CQ.

#### static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt);

Request that a Completion notification event be generated when there is a specific number of Work Completions in a CQ.

#### static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);

Read and remove one or more Work Completions from a CQ. They are read in the order they were added to the CQ.

#### struct ib_srq *ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr);

Create an SRQ that will be used as a shared Receive Queue for several QPs.

#### int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask);

Modify the attributes of an SRQ.

#### int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);

Query for the attributes of an SRQ. The SRQ limit value might change between subsequent calls to this method.

#### int ib_destroy_srq(struct ib_srq *srq);

Destroy an SRQ.

#### struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr);

Create a QP. Every new QP is assigned a QP number that isn't in use by another QP at the same time.

#### int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask);

Modify the attributes of a QP, which include the Send and Receive Queue attributes and the QP state.

#### int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);

Query for the attributes of a QP. Some of the attributes might change between subsequent calls to this method.

#### int ib_destroy_qp(struct ib_qp *qp);

Destroy a QP.

#### static inline int ib_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr);

Add a linked list of Receive Requests to an SRQ.

#### static inline int ib_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr);

Add a linked list of Receive Requests to the Receive Queue of a QP.

#### static inline int ib_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, struct ib_send_wr **bad_send_wr);

Add a linked list of Send Requests to the Send Queue of a QP.

#### int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);

Attach a UD QP to a multicast group.

#### int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);

Detach a UD QP from a multicast group.

# 14. Advanced Topics

Abstract

Chapter 13 dealt with the InfiniBand subsystem and its implementation in Linux. This chapter deals with several advanced topics and some topics that didn't fit logically into other chapters.
The chapter starts with a discussion of network namespaces, a type of lightweight process virtualization mechanism that was added to Linux in recent years. I will discuss the namespaces implementation in general and network namespaces in particular. You will learn that only two new system calls are needed in order to implement namespaces. You will also see several examples of how simple it is to create and manage network namespaces with the ip command of iproute2, how simple it is to move a network device from one network namespace to another, and how to attach a specified process to a specified network namespace. The cgroups subsystem also provides a resource management solution, which is different from namespaces. I will describe the cgroups subsystem and its two network modules, net_prio and cls_cgroup, and give two examples of using these cgroup network modules.

Later on in this chapter, you will learn about Busy Poll Sockets and how to tune them. The Busy Poll Sockets feature provides an interesting performance optimization technique for sockets that need low latency and are willing to pay the cost of higher CPU utilization. The Busy Poll Sockets feature is available from kernel 3.11. I will also cover the Bluetooth subsystem, the IEEE 802.15.4 subsystem, and the Near Field Communication (NFC) subsystem; these three subsystems typically work in short-range networks, and the development of new features for these subsystems is progressing at a rapid pace. I will also discuss Notification Chains, which is an important mechanism that you may encounter while developing or debugging kernel networking code, and the PCI subsystem, as many network devices are PCI devices. I will not delve deep into the PCI subsystem details, as this book is not about device drivers. I will conclude the chapter with three short sections: one about the teaming network driver (which is the new kernel link aggregation solution), one about the Point-to-Point over Ethernet (PPPoE) protocol, and finally one about Android.

## Network Namespaces

This section covers Linux namespaces: what they are for and how they are implemented. It includes an in-depth discussion of network namespaces, with some examples that demonstrate their usage. Linux namespaces are essentially a virtualization solution. Operating system virtualization was implemented in mainframes many years before solutions like Xen or KVM hit the market.
Likewise, the idea behind Linux namespaces, which are a form of process virtualization, is not new at all; it was tried in the Plan 9 operating system (see this article from 1992: "The Use of Name Spaces in Plan 9", www.cs.bell-labs.com/sys/doc/names.html ).

Namespaces are a form of lightweight process virtualization that provides resource isolation. As opposed to virtualization solutions like KVM or Xen, with namespaces you do not create additional instances of the operating system on the same host, but use only a single operating system instance. I should mention in this context that the Solaris operating system has a virtualization solution named Solaris Zones, which also uses a single operating system instance, but its scheme of resource partitioning is somewhat different from that of Linux namespaces (for example, in Solaris Zones there is a global zone, which is the primary zone and has more capabilities). In the FreeBSD operating system, there is a mechanism called jails, which also provides resource partitioning without running more than one instance of the kernel.

The main idea of Linux namespaces is to partition resources among groups of processes to enable a process (or several processes) to have a different view of the system than processes in other groups. This feature is used, for example, to provide resource isolation in the Linux containers project ( http://lxc.sourceforge.net/ ). The Linux containers project also uses another resource management mechanism, provided by the cgroups subsystem, which will be described later in this chapter. With containers, you can run different Linux distributions on the same host using one instance of the operating system. Namespaces are also needed for the checkpoint/restore feature, which is used in high performance computing (HPC). For example, it is used in CRIU ( http://criu.org/Main_Page ), a software tool of OpenVZ ( http://openvz.org/Main_Page ), which implements checkpoint/restore functionality for Linux processes mostly in userspace, though in a very few places CRIU kernel patches were merged. I should mention that there were some projects to implement checkpoint/restore in the kernel, but these projects were not accepted in mainline because they were too complex. Take, for example, the CKPT project: https://ckpt.wiki.kernel.org/index.php/Main_Page . The checkpoint/restore feature (sometimes referred to as checkpoint/restart) enables stopping and saving several processes on a filesystem, and at a later time restoring those processes (possibly on a different host) from the filesystem, resuming their execution from where they were stopped. Without namespaces, checkpoint/restore has very limited use cases; in particular, live migration is only possible with them. Another use case for network namespaces is when you need to set up an environment that simulates different network stacks, for testing, debugging, and so on. For readers who want to learn more about checkpoint/restart, I suggest reading the article "Virtual Servers and Checkpoint/Restart in Mainstream Linux," by Sukadev Bhattiprolu, Eric W. Biederman, Serge Hallyn, and Daniel Lezcano.

Mount namespaces were the first type of Linux namespaces to be merged, in 2002, for kernel 2.4.19. User namespaces were the last to be implemented, in kernel 3.8, for almost all filesystem types. It could be that additional namespaces will be developed, as is discussed later in this section.
For creating any namespace other than the user namespace, you must have the CAP_SYS_ADMIN capability; trying to create one without it will result in an -EPERM error ("Operation not permitted"). Many developers took part in the development of namespaces, among them Eric W. Biederman, Pavel Emelyanov, Al Viro, Cyrill Gorcunov, Andrew Vagin, and more.

After getting some background about process virtualization and Linux namespaces and how they are used, you are now ready to dive into the gory implementation details.

### Namespaces Implementation

As of this writing, six namespaces are implemented in the Linux kernel. Here is a description of the main additions and changes that were needed in order to implement namespaces in the Linux kernel and to support namespaces in userspace packages:

* A structure called nsproxy (namespace proxy) was added. This structure contains pointers to five of the six namespaces that are implemented. There is no pointer to the user namespace in the nsproxy structure; however, all the other five namespace objects contain a pointer to the user namespace object that owns them, and in each of these five namespaces, the user namespace pointer is called user_ns. The user namespace is a special case; it is a member of the credentials structure (cred), called user_ns. The cred structure represents the security context of a process. Each process descriptor (task_struct) contains two cred objects, for the effective and objective process descriptor credentials. I will not delve into all the details and nuances of the user namespaces implementation, since this is not in the scope of this book. An nsproxy object is created by the create_nsproxy() method, and it is released by the free_nsproxy() method. A pointer to an nsproxy object, which is also called nsproxy, was added to the process descriptor (a process descriptor is represented by the task_struct structure, include/linux/sched.h). Let's take a look at the nsproxy structure, as it's quite short and should be quite self-explanatory:

struct nsproxy {
        atomic_t count;
        struct uts_namespace *uts_ns;
        struct ipc_namespace *ipc_ns;
        struct mnt_namespace *mnt_ns;
        struct pid_namespace *pid_ns;
        struct net *net_ns;
};

(include/linux/nsproxy.h)

* You can see in the nsproxy structure five pointers to namespaces (there is no user namespace pointer). Using the nsproxy object in the process descriptor (the task_struct object) instead of five namespace objects is an optimization. When performing fork(), a new child is likely to live in the same set of namespaces as its parent, so instead of five reference counter increments (one per namespace), only one reference counter increment happens (of the nsproxy object). The nsproxy count member is a reference counter, which is initialized to 1 when the nsproxy object is created by the create_nsproxy() method; it is decremented by the put_nsproxy() method and incremented by the get_nsproxy() method. Note that the pid_ns member of the nsproxy object was renamed pid_ns_for_children in kernel 3.11.

* A new system call, unshare(), was added. This system call gets a single parameter that is a bitmask of CLONE* flags.
When the flags argument consists of one or more namespace CLONE_NEW* flags, the unshare() system call performs the following steps:

  * First, it creates a new namespace (or several namespaces) according to the specified flags. This is done by calling the unshare_nsproxy_namespaces() method, which in turn creates a new nsproxy object and one or more namespaces by calling the create_new_namespaces() method. The type of the new namespace (or namespaces) is determined according to the specified CLONE_NEW* flags. The create_new_namespaces() method returns a new nsproxy object that contains the newly created namespace (or namespaces).

  * Then it attaches the calling process to that newly created nsproxy object by calling the switch_task_namespaces() method.

* When CLONE_NEWPID is the flag of the unshare() system call, it works differently than with the other flags; it is an implicit argument to fork(): only the child task will live in a new PID namespace, not the one calling the unshare() system call. Other CLONE_NEW* flags immediately put the calling process into a new namespace.

* The six CLONE_NEW* flags, which were added to support the creation of namespaces, are described later in this section. The implementation of the unshare() system call is in kernel/fork.c.

* A new system call, setns(), was added. It attaches the calling thread to an existing namespace. Its prototype is int setns(int fd, int nstype); the parameters are:

  * fd: A file descriptor that refers to a namespace. Such file descriptors are obtained by opening links from the /proc/<pid>/ns/ directory.

  * nstype: An optional parameter. When it is one of the CLONE_NEW* namespace flags, the specified file descriptor must refer to a namespace that matches the type of the specified CLONE_NEW* flag. When nstype is not set (its value is 0), the fd argument can refer to a namespace of any type. If nstype does not correspond to the namespace type associated with the specified fd, a value of -EINVAL is returned.

You can find the implementation of the setns() system call in kernel/nsproxy.c.

* The following six new clone flags were added in order to support namespaces:

  * CLONE_NEWNS (for mount namespaces)

  * CLONE_NEWUTS (for UTS namespaces)

  * CLONE_NEWIPC (for IPC namespaces)

  * CLONE_NEWPID (for PID namespaces)

  * CLONE_NEWNET (for network namespaces)

  * CLONE_NEWUSER (for user namespaces)

* The clone() system call is traditionally used to create a new process. It was adjusted to support these new flags so that it creates a new process attached to a new namespace (or namespaces). You will encounter usage of the CLONE_NEWNET flag, for creating a new network namespace, in some of the examples later in this chapter.

* Each of the six subsystems that have namespace support implements a unique namespace of its own. For example, the mount namespace is represented by a structure called mnt_namespace, and the network namespace is represented by a structure called net, which is discussed later in this section. I will mention the other namespaces later in this chapter.

* For namespace creation, a method named create_new_namespaces() was added (kernel/nsproxy.c). This method gets as its first parameter a CLONE_NEW* flag or a bitmask of CLONE_NEW* flags.
It first creates an nsproxy object by calling the create_nsproxy() method, and then it associates namespaces according to the specified flags; since the flags can be a bitmask of flags, the create_new_namespaces() method can associate more than one namespace. Let's take a look at the create_new_namespaces() method:

static struct nsproxy *create_new_namespaces(unsigned long flags,
        struct task_struct *tsk, struct user_namespace *user_ns,
        struct fs_struct *new_fs)
{
        struct nsproxy *new_nsp;
        int err;

Allocate an nsproxy object and initialize its reference counter to 1:

        new_nsp = create_nsproxy();
        if (!new_nsp)
                return ERR_PTR(-ENOMEM);
        . . .

After successfully creating an nsproxy object, we should create namespaces according to the specified flags, or associate an existing namespace with the new nsproxy object we created. We start by calling copy_mnt_ns(), for the mount namespace, and then we call copy_utsname(), for the UTS namespace. I will briefly describe the copy_utsname() method here, because the UTS namespace is discussed in the "UTS Namespaces Implementation" section later in this chapter. If CLONE_NEWUTS is not set in the flags passed to the copy_utsname() method, the copy_utsname() method does not create a new UTS namespace; it returns the UTS namespace that was passed as its last parameter, tsk->nsproxy->uts_ns. If CLONE_NEWUTS is set, the copy_utsname() method clones the specified UTS namespace by calling the clone_uts_ns() method. The clone_uts_ns() method, in turn, allocates a new UTS namespace object, copies the new_utsname object of the specified UTS namespace (tsk->nsproxy->uts_ns) into the new_utsname object of the newly created UTS namespace object, and returns the newly created UTS namespace. You will learn more about the new_utsname structure in the "UTS Namespaces Implementation" section later in this chapter:

        new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
        if (IS_ERR(new_nsp->uts_ns)) {
                err = PTR_ERR(new_nsp->uts_ns);
                goto out_uts;
        }
        . . .

After handling the UTS namespace, we continue by calling the copy_ipcs() method to handle the IPC namespace, copy_pid_ns() to handle the PID namespace, and copy_net_ns() to handle the network namespace. Note that there is no call to a copy_user_ns() method, as nsproxy does not contain a pointer to the user namespace, as was mentioned earlier. I will briefly describe the copy_net_ns() method here. If CLONE_NEWNET is not set in the flags passed to the create_new_namespaces() method, the copy_net_ns() method returns the network namespace that was passed as its third parameter, tsk->nsproxy->net_ns, much like copy_utsname() did, as you saw earlier in this section.
If CLONE_NEWNET is set, the copy_net_ns() method allocates a new network namespace by calling the net_alloc() method, initializes it by calling the setup_net() method, and adds it to the global list of all network namespaces, net_namespace_list:

        new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
        if (IS_ERR(new_nsp->net_ns)) {
                err = PTR_ERR(new_nsp->net_ns);
                goto out_net;
        }

        return new_nsp;
}

* Note that the setns() system call, which does not create a new namespace but only attaches the calling thread to a specified namespace, also calls create_new_namespaces(), but it passes 0 as the first parameter; this implies that only an nsproxy is created by calling the create_nsproxy() method, and no new namespace is created; instead, the calling thread is associated with an existing namespace that is identified by the specified fd argument of the setns() system call. Later in the setns() system call implementation, the switch_task_namespaces() method is invoked, and it assigns the newly created nsproxy to the calling thread (see kernel/nsproxy.c).

* A method named exit_task_namespaces() was added in kernel/nsproxy.c. It is called when a process is terminated, by the do_exit() method (kernel/exit.c). The exit_task_namespaces() method gets the process descriptor (a task_struct object) as its single parameter. In fact, the only thing it does is call the switch_task_namespaces() method, passing the specified process descriptor and a NULL nsproxy object as arguments. The switch_task_namespaces() method, in turn, nullifies the nsproxy object of the process descriptor of the process that is being terminated. If no other process uses that nsproxy, it is freed.

* A method named get_net_ns_by_fd() was added. This method gets a file descriptor as its single parameter and returns the network namespace associated with the inode that corresponds to the specified file descriptor. For readers who are not familiar with filesystems and with inode semantics, I suggest reading the "Inode Objects" section of Chapter 12, "The Virtual Filesystem," in Understanding the Linux Kernel by Daniel P. Bovet and Marco Cesati (O'Reilly, 2005).

* A method named get_net_ns_by_pid() was added. This method gets a PID number as its single argument, and it returns the network namespace object to which this process is attached.

* Six entries were added under /proc/<pid>/ns, one for each namespace. These files, when opened, can be fed into the setns() system call. You can use ls -al or readlink to display the unique proc inode number that is associated with each namespace. This unique proc inode is created by the proc_alloc_inum() method when the namespace is created and is freed by the proc_free_inum() method when the namespace is released. See, for example, the create_pid_namespace() method in kernel/pid_namespace.c. In the following example, the number in square brackets on the right is the unique proc inode number of each namespace:

ls -al /proc/1/ns/
total 0
dr-x--x--x 2 root root 0 Nov 3 13:32 .
dr-xr-xr-x 8 root root 0 Nov 3 12:17 ..
lrwxrwxrwx 1 root root 0 Nov 3 13:32 ipc -> ipc:[4026531839]
lrwxrwxrwx 1 root root 0 Nov 3 13:32 mnt -> mnt:[4026531840]
lrwxrwxrwx 1 root root 0 Nov 3 13:32 net -> net:[4026531956]
lrwxrwxrwx 1 root root 0 Nov 3 13:32 pid -> pid:[4026531836]
lrwxrwxrwx 1 root root 0 Nov 3 13:32 user -> user:[4026531837]
lrwxrwxrwx 1 root root 0 Nov 3 13:32 uts -> uts:[4026531838]

* A namespace can stay alive if either one of the following conditions is met:

  * A file descriptor for the namespace file under /proc/<pid>/ns/ is held open.

  * The namespace proc file is bind mounted somewhere else; for example, for the PID namespace, by: mount --bind /proc/self/ns/pid /some/filesystem/path

* For each of the six namespaces, a proc namespace operations object (an instance of the proc_ns_operations structure) is defined. This object consists of callbacks, such as inum, which returns the unique proc inode number associated with the namespace, and install, for namespace installation (in the install callback, namespace-specific actions are performed, such as attaching the specific namespace object to the nsproxy object, and more; the install callback is invoked by the setns() system call). The proc_ns_operations structure is defined in include/linux/proc_fs.h. Following is the list of the six proc_ns_operations objects:

  * utsns_operations for the UTS namespace (kernel/utsname.c)

  * ipcns_operations for the IPC namespace (ipc/namespace.c)

  * mntns_operations for mount namespaces (fs/namespace.c)

  * pidns_operations for PID namespaces (kernel/pid_namespace.c)

  * userns_operations for the user namespace (kernel/user_namespace.c)

  * netns_operations for the network namespace (net/core/net_namespace.c)

* For each namespace, except the mount namespace, there is an initial namespace:

  * init_uts_ns: For the UTS namespace (init/version.c).

  * init_ipc_ns: For the IPC namespace (ipc/msgutil.c).

  * init_pid_ns: For the PID namespace (kernel/pid.c).

  * init_net: For the network namespace (net/core/net_namespace.c).

  * init_user_ns: For the user namespace (kernel/user.c).

* An initial, default nsproxy object is defined: it is called init_nsproxy, and it contains pointers to the five initial namespaces; they are all initialized to the corresponding initial namespace, except for the mount namespace, which is initialized to NULL:

struct nsproxy init_nsproxy = {
        .count = ATOMIC_INIT(1),
        .uts_ns = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
        .ipc_ns = &init_ipc_ns,
#endif
        .mnt_ns = NULL,
        .pid_ns = &init_pid_ns,
#ifdef CONFIG_NET
        .net_ns = &init_net,
#endif
};

(kernel/nsproxy.c)

* A method named task_nsproxy() was added; it gets as its single parameter a process descriptor (a task_struct object), and it returns the nsproxy associated with the specified task_struct object. See include/linux/nsproxy.h.

These are the six namespaces available in the Linux kernel as of this writing:

* Mount namespaces: A mount namespace allows a process to see its own view of the filesystem and of its mount points. Mounting a filesystem in one mount namespace does not propagate to the other mount namespaces. Mount namespaces are created by setting the CLONE_NEWNS flag when calling the clone() or unshare() system calls. In order to implement mount namespaces, a structure called mnt_namespace was added (fs/mount.h), and nsproxy holds a pointer to an mnt_namespace object called mnt_ns. Mount namespaces are available from kernel 2.4.19.
These are the six namespaces available in the Linux kernel as of this writing:

 * Mount namespaces: The mount namespace allows a process to have its own view of the filesystem and of its mount points. Mounting a filesystem in one mount namespace does not propagate to the other mount namespaces. Mount namespaces are created by setting the CLONE_NEWNS flag when calling the clone() or unshare() system calls. In order to implement mount namespaces, a structure called mnt_namespace was added (fs/mount.h), and nsproxy holds a pointer to an mnt_namespace object called mnt_ns. Mount namespaces have been available since kernel 2.4.19. Mount namespaces are implemented primarily in fs/namespace.c. When creating a new mount namespace, the following rules apply:

 * All previous mounts are visible in the new mount namespace.

 * Mounts/unmounts in the new mount namespace are invisible to the rest of the system.

 * Mounts/unmounts in the global mount namespace are visible in the new mount namespace.

 * Mount namespaces use a VFS enhancement called shared subtrees, which was introduced in the Linux 2.6.15 kernel; the shared subtrees feature introduced new flags: MS_PRIVATE, MS_SHARED, MS_SLAVE and MS_UNBINDABLE. (See http://lwn.net/Articles/159077/ and Documentation/filesystems/sharedsubtree.txt.) I will not discuss the internals of the mount namespaces implementation. For readers who want to learn more about mount namespaces usage, I suggest reading the following article: "Applying Mount Namespaces," by Serge E. Hallyn and Ram Pai ( http://www.ibm.com/developerworks/linux/library/l-mount-namespaces/index.html ).

 * PID namespaces: PID namespaces provide the ability for different processes in different PID namespaces to have the same PID. This feature is a building block for Linux containers. It is important for checkpoint/restore of a process, because a process checkpointed on one host can be restored on a different host even if a process with the same PID exists on that host. When creating the first process in a new PID namespace, its PID is 1. The behavior of this process is somewhat like the behavior of the init process. This means that when a process dies, all its orphaned children are reparented to the process with PID 1 (child reaping). Sending a SIGKILL signal to a process with PID 1 does not kill the process, regardless of the namespace from which the SIGKILL signal was sent, whether the initial PID namespace or any other PID namespace. But killing the init of one PID namespace from its parent namespace does work; in this case, all of the tasks living in the former namespace are killed, and the PID namespace is stopped. PID namespaces are created by setting the CLONE_NEWPID flag when calling the clone() or unshare() system calls. In order to implement PID namespaces, a structure called pid_namespace was added (include/linux/pid_namespace.h), and nsproxy holds a pointer to a pid_namespace object called pid_ns. For PID namespaces support, CONFIG_PID_NS should be set. PID namespaces have been available since kernel 2.6.24. PID namespaces are implemented primarily in kernel/pid_namespace.c.

 * Network namespaces: The network namespace allows creating what appears to be multiple instances of the kernel network stack. Network namespaces are created by setting the CLONE_NEWNET flag when calling the clone() or unshare() system calls. In order to implement network namespaces, a structure called net was added (include/net/net_namespace.h), and nsproxy holds a pointer to a net object called net_ns. For network namespaces support, CONFIG_NET_NS should be set. I will discuss network namespaces later in this section. Network namespaces have been available since kernel 2.6.29. Network namespaces are implemented primarily in net/core/net_namespace.c.

 * IPC namespaces: The IPC namespace allows a process to have its own System V IPC resources and POSIX message queue resources. IPC namespaces are created by setting the CLONE_NEWIPC flag when calling the clone() or unshare() system calls.
In order to implement IPC namespaces, a structure called ipc_namespace was added (include/linux/ipc_namespace.h), and nsproxy holds a pointer to an ipc_namespace object called ipc_ns. For IPC namespaces support, CONFIG_IPC_NS should be set. Support for System V IPC resources has been available in IPC namespaces since kernel 2.6.19. Support for POSIX message queue resources in IPC namespaces was added later, in kernel 2.6.30. IPC namespaces are implemented primarily in ipc/namespace.c.

 * UTS namespaces: The UTS namespace provides the ability for different UTS namespaces to have a different host name or domain name (or other information returned by the uname() system call). UTS namespaces are created by setting the CLONE_NEWUTS flag when calling the clone() or unshare() system calls. The UTS namespace implementation is the simplest among the six namespaces. In order to implement the UTS namespace, a structure called uts_namespace was added (include/linux/utsname.h), and nsproxy holds a pointer to a uts_namespace object called uts_ns. For UTS namespaces support, CONFIG_UTS_NS should be set. UTS namespaces have been available since kernel 2.6.19. UTS namespaces are implemented primarily in kernel/utsname.c.

 * User namespaces: The user namespace allows mapping of user and group IDs. This mapping is done by writing to two procfs entries that were added for supporting user namespaces: /proc/<pid>/uid_map and /proc/<pid>/gid_map. A process attached to a user namespace can have a different set of capabilities than on the host. User namespaces are created by setting the CLONE_NEWUSER flag when calling the clone() or unshare() system calls. In order to implement user namespaces, a structure called user_namespace was added (include/linux/user_namespace.h). The user_namespace object contains a pointer to the user namespace object that created it (parent). As opposed to the other five namespaces, nsproxy does not hold a pointer to a user_namespace object. I will not delve into more implementation details of user namespaces, as it is probably the most complex namespace, and it is beyond the scope of this book. For user namespaces support, CONFIG_USER_NS should be set. User namespaces have been available since kernel 3.8 for almost all filesystem types. User namespaces are implemented primarily in kernel/user_namespace.c.

Support for namespaces was added in four userspace packages:

 * In util-linux:

 * The unshare utility can create any of the six namespaces; available since version 2.17.

 * The nsenter utility (which is in fact a light wrapper around the setns() system call); available since version 2.23.

 * In iproute2, management of network namespaces is done with the ip netns command, and you will see several examples of this later in this chapter. Moreover, you can move a network interface to a different network namespace with the ip link command, as you will see in the "Moving a Network Interface to a Different Network Namespace" section later in this chapter.

 * In ethtool, support was added for finding out whether the NETIF_F_NETNS_LOCAL feature is set for a specified network interface. When the NETIF_F_NETNS_LOCAL feature is set, it indicates that the network interface is local to that network namespace, and you cannot move it to a different network namespace. The NETIF_F_NETNS_LOCAL feature is discussed later in this section.
 * In the wireless iw package, an option was added to enable moving a wireless interface to a different namespace.

Note

In a presentation at the Ottawa Linux Symposium (OLS) in 2006, "Multiple Instances of the Global Linux Namespaces," Eric W. Biederman (one of the main developers of Linux namespaces) mentioned ten namespaces; the other four namespaces that he mentioned in this presentation and that are not implemented yet are: the device namespace, the security namespace, the security keys namespace, and the time namespace. (See https://www.kernel.org/doc/ols/2006/ols2006v1-pages-101-112.pdf .) For more information about namespaces, I suggest reading the series of six articles about them by Michael Kerrisk ( https://lwn.net/Articles/531114/ ). Mobile OS virtualization projects triggered a development effort to support device namespaces; for more information about device namespaces, which are not yet part of the kernel, see "Device Namespaces" by Jake Edge ( http://lwn.net/Articles/564854/ ) and also ( http://lwn.net/Articles/564977/ ). There has also been some work on implementing a new syslog namespace (see the article "Stepping Closer to Practical Containers: 'syslog' namespaces," http://lwn.net/Articles/527342/ ).

The following three system calls can be used with namespaces:

 * clone(): Creates a new process attached to a new namespace (or namespaces). The type of the namespace is specified by a CLONE_NEW* flag that is passed as a parameter. Note that you can also use a bitmask of several CLONE_NEW* flags. The implementation of the clone() system call is in kernel/fork.c.

 * unshare(): Discussed earlier in this section.

 * setns(): Discussed earlier in this section.
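As a minimal illustration of the clone() usage just described (a sketch, not from the kernel tree; CAP_SYS_ADMIN is required), the following program creates a child process in a new network namespace; inside it, only the loopback device is visible:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define STACK_SIZE (1024 * 1024)
static char child_stack[STACK_SIZE];

static int child_fn(void *arg)
{
        /* Runs in a fresh network namespace: only the loopback
           device exists here, and it is down by default */
        return system("ip link show");
}

int main(void)
{
        /* CLONE_NEW* flags may be OR'ed together, e.g.
           CLONE_NEWNET | CLONE_NEWUTS, to create several namespaces at once */
        pid_t pid = clone(child_fn, child_stack + STACK_SIZE,
                          CLONE_NEWNET | SIGCHLD, NULL);
        if (pid == -1) {
                perror("clone");
                exit(EXIT_FAILURE);
        }
        waitpid(pid, NULL, 0);
        return 0;
}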
Note

Namespaces do not have names inside the kernel that userspace processes could use to refer to them. If namespaces had names, these would have to be kept globally, in yet another special namespace; this would complicate the implementation and could raise problems in checkpoint/restore, for example. Instead, userspace processes should open namespace files under /proc/<pid>/ns/, and the resulting file descriptors can be used to refer to a specific namespace and to keep that namespace alive. Namespaces are identified by a unique proc inode number, generated when they are created and freed when they are released. Each of the six namespace structures contains an integer member called proc_inum, which is the namespace's unique proc inode number and is assigned by calling the proc_alloc_inum() method. Each of the six namespaces also has a proc_ns_operations object, which includes namespace-specific callbacks; one of these callbacks, called inum, returns the proc_inum of the associated namespace (for the definition of the proc_ns_operations structure, refer to include/linux/proc_fs.h).

Before discussing network namespaces, let's describe how the simplest namespace, the UTS namespace, is implemented. This is a good starting point for understanding the other, more complex namespaces.

### UTS Namespaces Implementation

In order to implement UTS namespaces, a struct called uts_namespace was added:

struct uts_namespace {
struct kref kref;
struct new_utsname name;
struct user_namespace *user_ns;
unsigned int proc_inum;
};
(include/linux/utsname.h)

Here is a short description of the members of the uts_namespace structure:

 * kref: A reference counter. It is a generic kernel reference counter, incremented by the kref_get() method and decremented by the kref_put() method. Besides the UTS namespace, the PID namespace also has a kref object as its reference counter; the other four namespaces all use an atomic counter for reference counting. For more information about the kref API, see Documentation/kref.txt.

 * name: A new_utsname object, which contains fields like domainname and nodename (discussed shortly).

 * user_ns: The user namespace associated with the UTS namespace.

 * proc_inum: The unique proc inode number of the UTS namespace.

The nsproxy structure contains a pointer to the uts_namespace:

struct nsproxy {
. . .
struct uts_namespace *uts_ns;
. . .
};
(include/linux/nsproxy.h)

As you saw earlier, the uts_namespace object contains an instance of the new_utsname structure. Let's take a look at the new_utsname structure, which is the essence of the UTS namespace:

struct new_utsname {
char sysname[__NEW_UTS_LEN + 1];
char nodename[__NEW_UTS_LEN + 1];
char release[__NEW_UTS_LEN + 1];
char version[__NEW_UTS_LEN + 1];
char machine[__NEW_UTS_LEN + 1];
char domainname[__NEW_UTS_LEN + 1];
};
(include/uapi/linux/utsname.h)

The nodename member of the new_utsname is the host name, and domainname is the domain name. A method named utsname() was added; this method simply returns the new_utsname object associated with the process that currently runs (current):

static inline struct new_utsname *utsname(void)
{
return &current->nsproxy->uts_ns->name;
}
(include/linux/utsname.h)

Now, the new gethostname() system call implementation is the following:

SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
{
int i, errno;
struct new_utsname *u;
if (len < 0)
return -EINVAL;
down_read(&uts_sem);

Invoke the utsname() method, which accesses the new_utsname object of the UTS namespace associated with the current process:

u = utsname();
i = 1 + strlen(u->nodename);
if (i > len)
i = len;
errno = 0;

Copy the nodename of the new_utsname object that the utsname() method returned to userspace:

if (copy_to_user(name, u->nodename, i))
errno = -EFAULT;
up_read(&uts_sem);
return errno;
}
(kernel/sys.c)

You can find a similar approach in the sethostname() and uname() system calls, which are also defined in kernel/sys.c. I should note that the UTS namespaces implementation also handles UTS procfs entries. There are only two writable UTS procfs entries, /proc/sys/kernel/domainname and /proc/sys/kernel/hostname (which means you can change them from userspace). Other UTS procfs entries, like /proc/sys/kernel/ostype and /proc/sys/kernel/osrelease, are not writable. If you look at the table of UTS procfs entries, uts_kern_table (kernel/utsname_sysctl.c), you will see that some entries, like ostype and osrelease, have a mode of "0444", which means they are not writable, and only two of them, hostname and domainname, have a mode of "0644", which means they are writable. Reading and writing the UTS procfs entries is handled by the proc_do_uts_string() method. Readers who want to learn more about how UTS procfs entries are handled should look into the proc_do_uts_string() method and the get_uts() method; both are in kernel/utsname_sysctl.c.
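The following minimal userspace sketch (assuming CAP_SYS_ADMIN) shows the UTS namespace in action: after unshare(CLONE_NEWUTS), sethostname() changes only the nodename field of the new namespace's new_utsname object, leaving the hostname of the original namespace untouched:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        char name[64];
        /* Detach from the parent's UTS namespace; from now on,
           current->nsproxy->uts_ns points to a new uts_namespace object */
        if (unshare(CLONE_NEWUTS) < 0) {
                perror("unshare");
                exit(EXIT_FAILURE);
        }
        /* Updates the nodename of the new new_utsname object only */
        if (sethostname("ns-test", 7) < 0) {
                perror("sethostname");
                exit(EXIT_FAILURE);
        }
        /* Reads it back via the per-namespace object, as in the
           gethostname() implementation shown above */
        if (gethostname(name, sizeof(name)) < 0) {
                perror("gethostname");
                exit(EXIT_FAILURE);
        }
        printf("hostname inside the new UTS namespace: %s\n", name);
        return 0;
}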
Now that you have learned how the simplest namespace, the UTS namespace, is implemented, it is time to learn about network namespaces and their implementation.

### Network Namespaces Implementation

A network namespace is logically another copy of the network stack, with its own network devices, routing tables, neighbouring tables, netfilter tables, network sockets, network procfs entries, network sysfs entries, and other network resources. A practical feature of network namespaces is that network applications running in a given namespace (let's say ns1) will first look for configuration files under /etc/netns/ns1, and only afterward under /etc. So, for example, if you created a namespace called ns1 and you have created /etc/netns/ns1/hosts, every userspace application that tries to access the hosts file will first access /etc/netns/ns1/hosts and only then (if the entry being looked for does not exist) will it read /etc/hosts. This feature is implemented using bind mounts and is available only for network namespaces created with the ip netns add command.

#### The Network Namespace Object (struct net)

Let's turn now to the definition of the net structure, which is the fundamental data structure that represents a network namespace:

struct net {
. . .
struct user_namespace *user_ns; /* Owning user namespace */
unsigned int proc_inum;
struct proc_dir_entry *proc_net;
struct proc_dir_entry *proc_net_stat;
. . .
struct list_head dev_base_head;
struct hlist_head *dev_name_head;
struct hlist_head *dev_index_head;
. . .
int ifindex;
. . .
struct net_device *loopback_dev; /* The loopback */
. . .
atomic_t count; /* To decided when the network
* namespace should be shut down.
*/
struct netns_ipv4 ipv4;
#if IS_ENABLED(CONFIG_IPV6)
struct netns_ipv6 ipv6;
#endif
#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
struct netns_sctp sctp;
#endif
. . .
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct netns_ct ct;
#endif
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
struct netns_nf_frag nf_frag;
#endif
. . .
struct net_generic __rcu *gen;
#ifdef CONFIG_XFRM
struct netns_xfrm xfrm;
#endif
. . .
};
(include/net/net_namespace.h)

Here is a short description of several members of the net structure:

 * user_ns represents the user namespace that created the network namespace; it owns the network namespace and all its resources. It is assigned in the setup_net() method. For the initial network namespace object (init_net), the user namespace that created it is the initial user namespace, init_user_ns.

 * proc_inum is the unique proc inode number associated with the network namespace. This unique proc inode is created by the proc_alloc_inum() method, which also assigns proc_inum to be the proc inode number. The proc_alloc_inum() method is invoked by the network namespace initialization method, net_ns_net_init(), and the inode is freed by calling the proc_free_inum() method in the network namespace cleanup method, net_ns_net_exit().

 * proc_net represents the network namespace procfs entry (/proc/net), as each network namespace maintains its own procfs entry.

 * proc_net_stat represents the network namespace procfs statistics entry (/proc/net/stat), as each network namespace maintains its own procfs statistics entry.

 * dev_base_head points to a linked list of all network devices.

 * dev_name_head points to a hashtable of network devices, where the key is the network device name.

 * dev_index_head points to a hashtable of network devices, where the key is the network device index.
 * ifindex is the last device index assigned inside a network namespace. Indices are virtualized in network namespaces; this means that the loopback device always has an index of 1 in every network namespace, and other network devices may have coinciding indices when living in different network namespaces.

 * loopback_dev is the loopback device. Every new network namespace is created with only one network device, the loopback device. The loopback_dev object of a network namespace is assigned in the loopback_net_init() method, drivers/net/loopback.c. You cannot move the loopback device from one network namespace to another.

 * count is the network namespace reference counter. It is initialized to 1 when the network namespace is created by the setup_net() method. It is incremented by the get_net() method and decremented by the put_net() method. If the count reference counter reaches 0 in the put_net() method, the __put_net() method is called. The __put_net() method, in turn, adds the network namespace to a global list of network namespaces to be removed, cleanup_list, and later removes it. (A short sketch of this reference-counting usage appears after this list.)

 * ipv4 (an instance of the netns_ipv4 structure) is for the IPv4 subsystem. The netns_ipv4 structure contains IPv4-specific fields that differ between namespaces. For example, in Chapter 6 you saw that the multicast routing table of a specified network namespace called net is stored in net->ipv4.mrt. I will discuss the netns_ipv4 structure later in this section.

 * ipv6 (an instance of the netns_ipv6 structure) is for the IPv6 subsystem.

 * sctp (an instance of the netns_sctp structure) is for SCTP sockets.

 * ct (an instance of the netns_ct structure, which is discussed in Chapter 9) is for the netfilter connection tracking subsystem.

 * gen (an instance of the net_generic structure, defined in include/net/netns/generic.h) is a set of generic pointers to structures describing the network namespace context of optional subsystems. For example, the sit module (Simple Internet Transition, an IPv6 tunnel, implemented in net/ipv6/sit.c) puts its private data on struct net using this mechanism. It was introduced so as not to flood struct net with a pointer for every single network subsystem that wants a per-network-namespace context.

 * xfrm (an instance of the netns_xfrm structure, which is mentioned several times in Chapter 10) is for the IPsec subsystem.
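To illustrate the count reference counter mentioned in the list above, here is a short kernel-style sketch (the context structure and helper names are invented for illustration) of the common pattern of holding a reference to a network namespace while work on it is deferred:

#include <linux/netdevice.h>
#include <net/net_namespace.h>

/* A hypothetical context that outlives the code path that created it */
struct foo_deferred_ctx {
        struct net *net;
};

static void foo_ctx_init(struct foo_deferred_ctx *ctx, struct net_device *dev)
{
        /* dev_net() returns the namespace the device lives in (dev->nd_net);
           get_net() increments net->count so the namespace cannot be
           freed while the deferred work still references it */
        ctx->net = get_net(dev_net(dev));
}

static void foo_ctx_release(struct foo_deferred_ctx *ctx)
{
        /* put_net() decrements net->count; when it drops to 0,
           __put_net() queues the namespace on cleanup_list for removal */
        put_net(ctx->net);
        ctx->net = NULL;
}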
Let's take a look at the IPv4-specific namespace, the netns_ipv4 structure:

struct netns_ipv4 {
. . .
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rules_ops *rules_ops;
bool fib_has_custom_rules;
struct fib_table *fib_local;
struct fib_table *fib_main;
struct fib_table *fib_default;
#endif
. . .
struct hlist_head *fib_table_hash;
struct sock *fibnl;
struct sock **icmp_sk;
. . .
#ifdef CONFIG_NETFILTER
struct xt_table *iptable_filter;
struct xt_table *iptable_mangle;
struct xt_table *iptable_raw;
struct xt_table *arptable_filter;
#ifdef CONFIG_SECURITY
struct xt_table *iptable_security;
#endif
struct xt_table *nat_table;
#endif
int sysctl_icmp_echo_ignore_all;
int sysctl_icmp_echo_ignore_broadcasts;
int sysctl_icmp_ignore_bogus_error_responses;
int sysctl_icmp_ratelimit;
int sysctl_icmp_ratemask;
int sysctl_icmp_errors_use_inbound_ifaddr;
int sysctl_tcp_ecn;
kgid_t sysctl_ping_group_range[2];
long sysctl_tcp_mem[3];
atomic_t dev_addr_genid;
#ifdef CONFIG_IP_MROUTE
#ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES
struct mr_table *mrt;
#else
struct list_head mr_tables;
struct fib_rules_ops *mr_rules_ops;
#endif
#endif
};
(net/netns/ipv4.h)

You can see in the netns_ipv4 structure many IPv4-specific tables and variables, like the routing tables, the netfilter tables, the multicast routing tables, and more.

#### Network Namespaces Implementation: Other Data Structures

In order to support network namespaces, a member called nd_net, which is a pointer to a network namespace, was added to the network device object (struct net_device). Setting the network namespace of a network device is done by calling the dev_net_set() method, and getting the network namespace associated with a network device is done by calling the dev_net() method. Note that a network device can belong to only a single network namespace at any given moment. The nd_net member is typically set when a network device is registered or when a network device is moved to a different network namespace. For example, when registering a VLAN device, both of the methods just mentioned are used:

static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
{
struct net_device *new_dev;

The network namespace to be assigned to the new VLAN device is the network namespace associated with the real device, which is passed as a parameter to the register_vlan_device() method; we get this namespace by calling dev_net(real_dev):

struct net *net = dev_net(real_dev);
. . .
new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, vlan_setup);
if (new_dev == NULL)
return -ENOBUFS;

Set the network namespace by calling the dev_net_set() method:

dev_net_set(new_dev, net);
. . .
}

A member called sk_net, a pointer to a network namespace, was added to struct sock, which represents a socket. Setting the network namespace of a sock object is done by calling the sock_net_set() method, and getting the network namespace associated with a sock object is done by calling the sock_net() method. As with the nd_net member, a sock object can belong to only a single network namespace at any given moment.

When the system boots, a default network namespace, init_net, is created. After boot, all physical network devices and all sockets belong to that initial namespace, as does the network loopback device.

Some network devices and some network subsystems should have network-namespace-specific data. In order to enable this, a structure named pernet_operations was added; this structure includes init and exit callbacks:

struct pernet_operations {
. . .
int (*init)(struct net *net);
void (*exit)(struct net *net);
. . .
int *id;
size_t size;
};
(include/net/net_namespace.h)
Network devices that need network-namespace-specific data should define a pernet_operations object with init() and exit() callbacks for device-specific initialization and cleanup, respectively, and call the register_pernet_device() method in their module initialization and the unregister_pernet_device() method when the module is removed, passing the pernet_operations object as a single parameter in both cases. For example, the PPPoE module exports information about PPPoE sessions via a procfs entry, /proc/net/pppoe. The information exported by this procfs entry depends on the network namespace to which the PPPoE device belongs (since different PPPoE devices can belong to different network namespaces). So the PPPoE module defines a pernet_operations object called pppoe_net_ops:

static struct pernet_operations pppoe_net_ops = {
.init = pppoe_init_net,
.exit = pppoe_exit_net,
.id = &pppoe_net_id,
.size = sizeof(struct pppoe_net),
};
(net/ppp/pppoe.c)

The init callback, pppoe_init_net(), only creates the PPPoE procfs entry, /proc/net/pppoe, by calling the proc_create() method:

static __net_init int pppoe_init_net(struct net *net)
{
struct pppoe_net *pn = pppoe_pernet(net);
struct proc_dir_entry *pde;
rwlock_init(&pn->hash_lock);
pde = proc_create("pppoe", S_IRUGO, net->proc_net, &pppoe_seq_fops);
#ifdef CONFIG_PROC_FS
if (!pde)
return -ENOMEM;
#endif
return 0;
}
(net/ppp/pppoe.c)

And the exit callback, pppoe_exit_net(), only removes the PPPoE procfs entry, /proc/net/pppoe, by calling the remove_proc_entry() method:

static __net_exit void pppoe_exit_net(struct net *net)
{
remove_proc_entry("pppoe", net->proc_net);
}
(net/ppp/pppoe.c)

Network subsystems that need network-namespace-specific data should call register_pernet_subsys() when the subsystem is initialized and unregister_pernet_subsys() when the subsystem is removed. You can find an example in net/ipv4/route.c, and there are many other call sites of these methods throughout the networking stack. The network namespace module itself also defines a net_ns_ops object and registers it in the boot phase:

static struct pernet_operations __net_initdata net_ns_ops = {
.init = net_ns_net_init,
.exit = net_ns_net_exit,
};

static int __init net_ns_init(void)
{
. . .
register_pernet_subsys(&net_ns_ops);
. . .
}
(net/core/net_namespace.c)

Each time a new network namespace is created, the init callback (net_ns_net_init) is called, and each time a network namespace is removed, the exit callback (net_ns_net_exit) is called. All that the net_ns_net_init() method does is allocate a unique proc inode for the newly created namespace by calling the proc_alloc_inum() method; the newly created unique proc inode number is assigned to net->proc_inum:

static __net_init int net_ns_net_init(struct net *net)
{
return proc_alloc_inum(&net->proc_inum);
}

And all that the net_ns_net_exit() method does is remove that unique proc inode by calling the proc_free_inum() method:

static __net_exit void net_ns_net_exit(struct net *net)
{
proc_free_inum(net->proc_inum);
}
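To make the pernet_operations pattern concrete, here is a minimal, hypothetical subsystem sketch (the foo_* names are invented for illustration); it uses the id and size members so the core allocates per-namespace private data, reachable through the gen member of struct net via net_generic():

#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/* Hypothetical per-namespace private data */
struct foo_net {
        int pkt_count;
};

static int foo_net_id;

static int __net_init foo_init_net(struct net *net)
{
        /* net_generic() returns the slot of net->gen that the core
           allocated for us (size bytes, indexed by *id) */
        struct foo_net *fn = net_generic(net, foo_net_id);
        fn->pkt_count = 0;
        return 0;
}

static void __net_exit foo_exit_net(struct net *net)
{
        /* Nothing to release here; the slot itself is freed by the core */
}

static struct pernet_operations foo_net_ops = {
        .init = foo_init_net,
        .exit = foo_exit_net,
        .id = &foo_net_id,
        .size = sizeof(struct foo_net),
};

static int __init foo_module_init(void)
{
        /* The init callback is invoked for every existing namespace
           and for every namespace created afterward */
        return register_pernet_subsys(&foo_net_ops);
}

static void __exit foo_module_exit(void)
{
        unregister_pernet_subsys(&foo_net_ops);
}

module_init(foo_module_init);
module_exit(foo_module_exit);
MODULE_LICENSE("GPL");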
When you create a new network namespace, it has only the network loopback device. The most common ways to create a network namespace are:

 * By a userspace application that creates a network namespace with the clone() or unshare() system call, setting the CLONE_NEWNET flag in both cases.

 * Using the ip netns command of iproute2 (you will shortly see examples).

 * Using the unshare utility of util-linux, with the --net flag.

### Network Namespaces Management

Next you will see some examples of using the ip netns command of the iproute2 package to perform actions such as creating a network namespace, deleting a network namespace, showing all the network namespaces, and more.

 * Creating a network namespace named ns1 is done by:

 * ip netns add ns1

 * Running this command first triggers the creation of a file called /var/run/netns/ns1, and then the creation of the network namespace by the unshare() system call, passing it the CLONE_NEWNET flag. Then /var/run/netns/ns1 is attached to the network namespace (/proc/self/ns/net) by a bind mount (calling the mount() system call with MS_BIND). Note that network namespaces can be nested, which means that from within ns1 you can also create a new network namespace, and so on.

 * Deleting a network namespace named ns1 is done by:

 * ip netns del ns1

 * Note that this does not delete a network namespace if one or more processes are attached to it. If there are no such processes, the /var/run/netns/ns1 file is deleted. Note also that when deleting a namespace, all its network devices are moved to the initial, default network namespace, init_net, except for network namespace local devices, which are network devices whose NETIF_F_NETNS_LOCAL feature is set; such network devices are deleted. See more in the "Moving a Network Interface to a Different Network Namespace" section later in this chapter and in Appendix A.

 * Showing all the network namespaces in the system that were added by ip netns add is done by:

 * ip netns list

 * In fact, running ip netns list simply shows the names of the files under /var/run/netns. Note that network namespaces not added by ip netns add are not displayed by ip netns list, because creating such network namespaces did not trigger the creation of any file under /var/run/netns. So, for example, a network namespace created by unshare --net bash will not appear when running ip netns list.

 * Monitoring creation and removal of network namespaces is done by:

 * ip netns monitor

 * After running ip netns monitor, when you add a new namespace by ip netns add ns2 you will see the message "add ns2" on screen, and after you delete that namespace by ip netns delete ns2 you will see the message "delete ns2" on screen. Note that adding and removing network namespaces other than by running ip netns add and ip netns delete, respectively, does not trigger any messages from ip netns monitor. The ip netns monitor command is implemented by setting an inotify watch on /var/run/netns (a minimal sketch of this technique appears after this list). Note that if you run ip netns monitor before at least one network namespace has been added with ip netns add, you will get the following error: inotify_add_watch failed: No such file or directory. The reason is that trying to set a watch on /var/run/netns, which does not exist yet, fails. See man inotify_init and man inotify_add_watch.

 * Starting a shell in a specified namespace (ns1 in this example) is done by:

 * ip netns exec ns1 bash

 * Note that with ip netns exec you can run any command in a specified network namespace. For example, the following command will display all network interfaces in the network namespace called ns1:

 * ip netns exec ns1 ifconfig -a
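As mentioned in the list above, ip netns monitor is implemented with an inotify watch on /var/run/netns. Here is a minimal sketch of the same technique (for brevity it reports only the first event of each read, and it assumes the directory already exists):

#include <limits.h>
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
        char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
                __attribute__((aligned(__alignof__(struct inotify_event))));
        int fd = inotify_init();
        if (fd < 0) {
                perror("inotify_init");
                return 1;
        }
        /* Namespace files appear and disappear under /var/run/netns
           when `ip netns add` and `ip netns delete` run */
        if (inotify_add_watch(fd, "/var/run/netns", IN_CREATE | IN_DELETE) < 0) {
                perror("inotify_add_watch");
                return 1;
        }
        for (;;) {
                struct inotify_event *ev = (struct inotify_event *)buf;
                if (read(fd, buf, sizeof(buf)) <= 0)
                        break;
                printf("%s %s\n", (ev->mask & IN_CREATE) ? "add" : "delete", ev->name);
        }
        return 0;
}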
In recent versions of iproute2 (since version 3.8), you have these two additional helpful commands:

 * Showing the network namespace associated with a specified PID is done by:

 * ip netns identify #pid

 * This is implemented by reading /proc/<pid>/ns/net and iterating over the files under /var/run/netns to find a match (using the stat() system call); a simplified sketch of this technique follows.

 * Showing the PID of a process (or list of processes) attached to a network namespace called ns1 is done by:

 * ip netns pids ns1

 * This is implemented by reading /var/run/netns/ns1, and then iterating over the /proc/<pid> entries to find a matching /proc/<pid>/ns/net entry (using the stat() system call).

Note

For more information about the various ip netns command options, see man ip netns.
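Here is a simplified sketch of the ip netns identify technique just described: the inode number of /proc/<pid>/ns/net (the namespace's proc_inum) is compared, using stat(), against the inodes of the files under /var/run/netns:

#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
        char path[PATH_MAX];
        struct stat target, cand;
        struct dirent *de;
        DIR *dir;
        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
        snprintf(path, sizeof(path), "/proc/%s/ns/net", argv[1]);
        if (stat(path, &target) < 0) {
                perror("stat");
                return 1;
        }
        dir = opendir("/var/run/netns");
        if (!dir) {
                perror("opendir");
                return 1;
        }
        while ((de = readdir(dir)) != NULL) {
                if (de->d_name[0] == '.')
                        continue;
                snprintf(path, sizeof(path), "/var/run/netns/%s", de->d_name);
                /* Two files refer to the same namespace iff their
                   inode numbers and devices match */
                if (stat(path, &cand) == 0 &&
                    cand.st_ino == target.st_ino && cand.st_dev == target.st_dev)
                        printf("%s\n", de->d_name);
        }
        closedir(dir);
        return 0;
}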
#### Moving a Network Interface to a Different Network Namespace

Moving a network interface to a network namespace named ns1 can be done with the ip command, for example by: ip link set eth0 netns ns1. As part of implementing network namespaces, a new feature named NETIF_F_NETNS_LOCAL was added to the features of the net_device object. (The net_device structure represents a network interface; for more information about the net_device structure and its features, see Appendix A.) You can find out whether the NETIF_F_NETNS_LOCAL feature is set for a specified network device by looking at the netns-local flag in the output of ethtool -k eth0 or in the output of ethtool --show-features eth0 (both commands are equivalent). Note that you cannot set the NETIF_F_NETNS_LOCAL feature with ethtool. This feature, when set, denotes that the network device is a network namespace local device. For example, the loopback, bridge, VXLAN, and PPP devices are network namespace local devices. Trying to move a network device whose NETIF_F_NETNS_LOCAL feature is set to a different namespace fails with an error of -EINVAL, as you will shortly see in the following code snippet. The dev_change_net_namespace() method is invoked when trying to move a network interface to a different network namespace, for example by: ip link set eth0 netns ns1. Let's take a look at the dev_change_net_namespace() method:

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
int err;
ASSERT_RTNL();
/* Don't allow namespace local devices to be moved. */
err = -EINVAL;

Return -EINVAL in case the device is a namespace local device (the NETIF_F_NETNS_LOCAL flag is set in the features of the net_device object):

if (dev->features & NETIF_F_NETNS_LOCAL)
goto out;
. . .

Actually switch the network namespace by setting nd_net of the net_device object to the new specified namespace:

dev_net_set(dev, net);
. . .
out:
return err;
}
(net/core/dev.c)

Note

You can also move a network interface to a network namespace named ns1 by specifying a PID of a process that is attached to that namespace, without specifying the namespace name explicitly. For example, if you know that a process whose PID is <pid> is attached to ns1, running ip link set eth1 netns <pid> will move eth1 to the ns1 namespace. Implementation details: getting the network namespace object when specifying one of the PIDs of its attached processes is implemented by the get_net_ns_by_pid() method, whereas getting the network namespace object when specifying the network namespace name is implemented by the get_net_ns_by_fd() method; both methods are in net/core/net_namespace.c. In order to move a wireless network interface to a different network namespace, you should use the iw command. For example, if you want to move wlan0 to a network namespace and you know that a process whose PID is <pid> is attached to that namespace, you can run iw phy phy0 set netns <pid> to move it to that network namespace. For the implementation details, refer to the nl80211_wiphy_netns() method in net/wireless/nl80211.c.

#### Communicating Between Two Network Namespaces

I will end the network namespaces section with a short example of how two network namespaces can communicate with each other. This can be done either by using Unix sockets or by using the Virtual Ethernet (VETH) network driver to create a pair of virtual network devices and moving one of them to another network namespace. For example, first create two namespaces, ns1 and ns2:

ip netns add ns1
ip netns add ns2

Start a shell in ns1:

ip netns exec ns1 bash

Create a virtual Ethernet device (its type is veth):

ip link add name if_one type veth peer name if_one_peer

Move if_one_peer to ns2:

ip link set dev if_one_peer netns ns2

You can now set addresses on if_one and on if_one_peer as usual, with the ifconfig command or with the ip command, and send packets from one network namespace to the other.

Note

Network namespaces are not mandatory for a kernel image. By default, network namespaces are enabled (CONFIG_NET_NS is set) in most distributions. However, you can build and boot a kernel with network namespaces disabled.

I have discussed in this section what namespaces are, and in particular what network namespaces are. I mentioned some of the major changes that were required in order to implement namespaces in general, like adding six new CLONE_NEW* flags, adding two new system calls, adding an nsproxy object to the process descriptor, and more. I also described the implementation of UTS namespaces, the simplest among all namespaces, and the implementation of network namespaces. Several examples were given showing how simple it is to manipulate network namespaces with the ip netns command of the iproute2 package. Next I will describe the cgroups subsystem, which provides another form of resource management, and two network modules that belong to it.

## Cgroups

The cgroups subsystem is a project started by Paul Menage, Rohit Seth, and other Google developers in 2006. It was initially called "process containers," but was later renamed "Control Groups." It provides resource management and resource accounting for groups of processes. It has been part of the mainline kernel since kernel 2.6.24, and it is used in several projects: for example, by systemd (a service manager that replaced SysV init scripts; used, for example, by Fedora and openSUSE), by the Linux Containers project, which was mentioned earlier in this chapter, by Google containers ( https://github.com/google/lmctfy/ ), by libvirt ( http://libvirt.org/cgroups.html ), and more. The cgroups kernel implementation lies mostly in paths that are not performance-critical. The cgroups subsystem implements a new Virtual File System (VFS) type named "cgroup".
All cgroups actions are performed via filesystem operations, like creating cgroup directories in a cgroup filesystem, writing to or reading from entries in these directories, mounting cgroup filesystems, and so on. There is a library called libcgroup (a.k.a. libcg), which provides a set of userspace utilities for cgroups management: for example, cgcreate to create a new cgroup, cgdelete to delete a cgroup, cgexec to run a task in a specified control group, and more. In fact, these utilities work by invoking the cgroup filesystem operations from the libcg library. The libcg library is likely to see reduced usage in the future, because it doesn't provide any coordination among multiple parties trying to use the cgroup controllers. It could be that in the future all the cgroup file operations will be performed by a library or by a daemon and not directly. The cgroups subsystem, as currently implemented, needs some form of coordination, because there is only a single controller for each resource type; when multiple actors modify it, this necessarily leads to conflicts. The cgroups controllers can be used by many projects, like libvirt, systemd, lxc, and more, simultaneously. When working only via cgroups filesystem operations, and when all the projects try to impose their own policy through cgroups at too low a level, without knowing about each other, they may accidentally step on each other. If each instead talked to a daemon, for example, such clashes would be avoided. For more information about libcg, see http://libcg.sourceforge.net/ .

As opposed to namespaces, no new system calls were added for implementing the cgroup subsystem. As with namespaces, cgroups can be nested. There were code additions in the boot phase, mainly for the initialization of the cgroups subsystem, and in various subsystems, like the memory subsystem or the security subsystem. Following is a short, partial list of tasks that you can perform with cgroups:

 * Assign a set of CPUs to a set of processes, with the cpusets cgroup controller. You can also control the NUMA node memory is allocated from with the cpusets cgroup controller.

 * Manipulate the out of memory (OOM) killer operation or create a process with a limited amount of memory with the memory cgroup controller (memcg). You will see an example later in this chapter.

 * Assign permissions to devices under /dev, with the devices cgroup. You will see an example of using the devices cgroup later, in the "Cgroup Devices Controller: A Simple Example" section.

 * Assign priority to traffic (see the section "The net_prio Module" later in this chapter).

 * Freeze processes with the freezer cgroup.

 * Report CPU resource usage of the tasks of a cgroup with the cpuacct cgroup. Note that there is also the cpu controller, which can provision CPU cycles either by priority or by absolute bandwidth, and provides the same or a superset of statistics.

 * Tag network traffic with a class identifier (classid); see the section "The cls_cgroup Classifier" later in this chapter.

Next I will briefly describe some of the changes that were made to support cgroups.

### Cgroups Implementation

The cgroup subsystem is very complex. Here are several implementation details about the cgroup subsystem that should give you a good starting point to delve into its internals:

 * A new structure called cgroup_subsys was added (include/linux/cgroup.h). It represents a cgroup subsystem (also known as a cgroup controller).
The following cgroup subsystems are implemented:

 * mem_cgroup_subsys: mm/memcontrol.c

 * blkio_subsys: block/blk-cgroup.c

 * cpuset_subsys: kernel/cpuset.c

 * devices_subsys: security/device_cgroup.c

 * freezer_subsys: kernel/cgroup_freezer.c

 * net_cls_subsys: net/sched/cls_cgroup.c

 * net_prio_subsys: net/core/netprio_cgroup.c

 * perf_subsys: kernel/events/core.c

 * cpu_cgroup_subsys: kernel/sched/core.c

 * cpuacct_subsys: kernel/sched/core.c

 * hugetlb_subsys: mm/hugetlb_cgroup.c

 * A new structure called cgroup was added; it represents a control group (linux/cgroup.h).

 * A new virtual filesystem was added; this was done by defining the cgroup_fs_type object and a cgroup_ops object (an instance of super_operations):

static struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
};
static const struct super_operations cgroup_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = cgroup_show_options,
.remount_fs = cgroup_remount,
};
(kernel/cgroup.c)

It is registered, like any other filesystem, with the register_filesystem() method in the cgroup_init() method; see kernel/cgroup.c.

 * The sysfs entry /sys/fs/cgroup is created by default when the cgroup subsystem is initialized; this is done by calling kobject_create_and_add("cgroup", fs_kobj) in the cgroup_init() method. Note that cgroup controllers can also be mounted on other directories.

 * There is a global array of cgroup_subsys objects named subsys, defined in kernel/cgroup.c (note that from kernel 3.11, the array name was changed from subsys to cgroup_subsys). There are CGROUP_SUBSYS_COUNT elements in this array. A procfs entry called /proc/cgroups is exported by the cgroup subsystem. You can display the elements of the global subsys array in two ways:

 * By running cat /proc/cgroups.

 * By the lssubsys utility of libcgroup-tools.

 * Creating a new cgroup always entails generating these four control files under that cgroup's directory:

 * notify_on_release: Its initial value is inherited from its parent. It represents a boolean variable, and its usage is related to the release_agent topmost-only control file, which is explained shortly.

 * cgroup.event_control: This file enables getting notifications from a cgroup, using the eventfd() system call. See man 2 eventfd and fs/eventfd.c.

 * tasks: A list of the PIDs that are attached to this cgroup. Attaching a process to a cgroup is done by writing the value of its PID to the tasks control file (a minimal sketch follows this list); this is handled by the cgroup_attach_task() method, kernel/cgroup.c. Displaying the cgroups to which a process is attached is done by cat /proc/<pid>/cgroup. This is handled in the kernel by the proc_cgroup_show() method, in kernel/cgroup.c.

 * cgroup.procs: A list of the thread group IDs that are attached to this cgroup. The tasks entry allows attaching threads of the same process to different cgroup controllers, whereas cgroup.procs has process-level granularity (all threads of a single process are moved together and belong to the same cgroup).
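As a minimal sketch of the tasks attachment mechanism noted above (the cgroup path is illustrative and assumes a devices cgroup named "0", like the one created in the example later in this section):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        /* Writing a PID to the tasks control file attaches that process
           to the cgroup; in the kernel this is handled by the
           cgroup_attach_task() method */
        FILE *f = fopen("/sys/fs/cgroup/devices/0/tasks", "w");
        if (!f) {
                perror("fopen");
                exit(EXIT_FAILURE);
        }
        fprintf(f, "%d\n", getpid());
        fclose(f);
        return 0;
}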
In addition to these four control files, a control file named release_agent is created for the topmost cgroup root object only. The value of this file is the path of an executable that will be executed when the last process of a cgroup terminates; the notify_on_release flag mentioned earlier should be set for the release_agent feature to be enabled. The release_agent can be assigned as a cgroup mount option; this is the case, for example, in systemd in Fedora. The release_agent mechanism is based on a user-mode helper: the call_usermodehelper() method is invoked, and a new userspace process is created, each time the release_agent is activated, which is costly in terms of performance. See "The past, present, and future of control groups," lwn.net/Articles/574317/. For the release_agent implementation details, see the cgroup_release_agent() method in kernel/cgroup.c.

 * Apart from these four default control files and the release_agent topmost-only control file, each subsystem can create its own specific control files. This is done by defining an array of cftype (Control File type) objects and assigning this array to the base_cftypes member of the cgroup_subsys object. For example, for the memory cgroup controller, we have this definition for the usage_in_bytes control file:

static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
.read = mem_cgroup_read,
.register_event = mem_cgroup_usage_register_event,
.unregister_event = mem_cgroup_usage_unregister_event,
},
. . .
struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
. . .
.base_cftypes = mem_cgroup_files,
};
(mm/memcontrol.c)

 * A member called cgroups, which is a pointer to a css_set object, was added to the process descriptor, task_struct. The css_set object contains an array of pointers to cgroup_subsys_state objects (one such pointer for each cgroup subsystem). The process descriptor itself (task_struct) does not contain a direct pointer to the cgroup subsystems it is associated with, but this can be determined from this array of cgroup_subsys_state pointers.

Two cgroups networking modules were added; they are discussed later in this section:

 * net_prio (net/core/netprio_cgroup.c).

 * cls_cgroup (net/sched/cls_cgroup.c).

Note

The cgroup subsystem is still in its early days and is likely to see a fair amount of development in its features and interface.

Next you will see a short example that illustrates how the devices cgroup controller can be used to change the write permission of a device file.

### Cgroup Devices Controller: A Simple Example

Let's look at a simple example of using the devices cgroup. Running the following command will create a devices cgroup:

mkdir /sys/fs/cgroup/devices/0

Three control files will be created under /sys/fs/cgroup/devices/0:

 * devices.deny: Devices for which access is denied.

 * devices.allow: Devices for which access is allowed.

 * devices.list: Available devices.

Each entry in these control files consists of four fields:

 * type: Possible values are 'a' for all, 'c' for a char device, and 'b' for a block device.

 * The device major number.

 * The device minor number.

 * Access permission: 'r' is permission to read, 'w' is permission to write, and 'm' is permission to perform mknod.
By default, when creating a new devices cgroup, it has all the permissions:

cat /sys/fs/cgroup/devices/0/devices.list
a *:* rwm

The following command adds the current shell to the devices cgroup that you created earlier:

echo $$ > /sys/fs/cgroup/devices/0/tasks

The following command denies access to all devices:

echo a > /sys/fs/cgroup/devices/0/devices.deny
echo "test" > /dev/null
-bash: /dev/null: Operation not permitted

The following command restores the access permissions for all devices:

echo a > /sys/fs/cgroup/devices/0/devices.allow

Running the following command, which previously failed, will now succeed:

echo "test" > /dev/null

### Cgroup Memory Controller: A Simple Example

You can disable the out of memory (OOM) killer as follows, for example:

mkdir /sys/fs/cgroup/memory/0
echo $$ > /sys/fs/cgroup/memory/0/tasks
echo 1 > /sys/fs/cgroup/memory/0/memory.oom_control

Now, if you run some memory-hogging userspace program, the OOM killer will not be invoked. The OOM killer can be re-enabled by:

echo 0 > /sys/fs/cgroup/memory/0/memory.oom_control

You can use the eventfd() system call to get notifications in a userspace application about a change in the status of a cgroup (a minimal sketch appears after the following note). See man 2 eventfd.

Note

You can limit the memory available to the processes in a cgroup to 20M, for example, by:

echo 20M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
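Here is a minimal sketch of the eventfd()-based notification mechanism just mentioned, applied to OOM events of the memory cgroup created above: the eventfd's fd and the fd of memory.oom_control are registered together in cgroup.event_control, and a read() on the eventfd then blocks until an OOM event occurs:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
        char line[64];
        uint64_t counter;
        int efd = eventfd(0, 0);
        int ofd = open("/sys/fs/cgroup/memory/0/memory.oom_control", O_RDONLY);
        int cfd = open("/sys/fs/cgroup/memory/0/cgroup.event_control", O_WRONLY);
        if (efd < 0 || ofd < 0 || cfd < 0) {
                perror("eventfd/open");
                exit(EXIT_FAILURE);
        }
        /* Writing "<eventfd fd> <control file fd>" arms the notification */
        snprintf(line, sizeof(line), "%d %d", efd, ofd);
        if (write(cfd, line, strlen(line)) < 0) {
                perror("write");
                exit(EXIT_FAILURE);
        }
        /* Blocks until an OOM event occurs in the cgroup */
        if (read(efd, &counter, sizeof(counter)) == sizeof(counter))
                printf("OOM events in cgroup: %llu\n", (unsigned long long)counter);
        return 0;
}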
### The net_prio Module

The network priority control group (net_prio) provides an interface for setting the priority of network traffic that is generated by various userspace applications. Usually this can be done by setting the SO_PRIORITY socket option, which sets the priority of the SKB, but using this socket option is not always desirable. To support the net_prio module, an object called priomap, an instance of the netprio_map structure, was added to the net_device object. Let's take a look at the netprio_map structure:

struct netprio_map {
struct rcu_head rcu;
u32 priomap_len;
u32 priomap[];
};
(include/net/netprio_cgroup.h)

The priomap array is populated via the net_prio cgroup entries, as you will see shortly. The net_prio module exports two entries to cgroup sysfs: net_prio.ifpriomap and net_prio.prioidx. The net_prio.ifpriomap entry is used to set the priomap object of a specified network device, as you will see in the example immediately following. In the Tx path, the dev_queue_xmit() method invokes the skb_update_prio() method to set skb->priority according to the priomap associated with the outgoing network device (skb->dev). The net_prio.prioidx entry is read-only and shows the id of the cgroup. The net_prio module is a good example of how simple it is to develop a cgroup kernel module, in less than 400 lines of code. The net_prio module was developed by Neil Horman and is available from kernel 3.3. For more information, see Documentation/cgroups/net_prio.txt. The following is an example of how to use the network priority cgroup module (note that you must load the netprio_cgroup.ko kernel module in case CONFIG_NETPRIO_CGROUP is built as a module and not as a built-in):

mkdir /sys/fs/cgroup/net_prio
mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
mkdir /sys/fs/cgroup/net_prio/0
echo "eth1 4" > /sys/fs/cgroup/net_prio/0/net_prio.ifpriomap

This sequence of commands sets any traffic originating from processes belonging to the netprio "0" group and outgoing on interface eth1 to have a priority of 4. The last command triggers writing an entry to the priomap field of the net_device object.

Note

In order to work with net_prio, CONFIG_NETPRIO_CGROUP should be set.

### The cls_cgroup Classifier

The cls_cgroup classifier provides an interface for tagging network packets with a class identifier (classid). You can use it in conjunction with the tc tool to assign different priorities to packets from different cgroups, as the example you will soon see demonstrates. The cls_cgroup module exports one entry to cgroup sysfs, net_cls.classid. The control group classifier (cls_cgroup) was merged in kernel 2.6.29 and was developed by Thomas Graf. Like the net_prio module discussed in the previous section, this cgroup kernel module is also less than 400 lines of code, which proves again that adding a cgroup controller as a kernel module is not a heavy task. Here is an example of using the control group classifier (note that you must load the cls_cgroup.ko kernel module in case CONFIG_NET_CLS_CGROUP is built as a module and not as a built-in):

mkdir /sys/fs/cgroup/net_cls
mount -t cgroup -onet_cls none /sys/fs/cgroup/net_cls
mkdir /sys/fs/cgroup/net_cls/0
echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid

The last command assigns classid 10:1 to group 0. The iproute2 package contains a utility named tc for managing traffic control settings. You can use the tc tool with this classid, for example:

tc qdisc add dev eth0 root handle 10: htb
tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup

For more information, see Documentation/cgroups/net_cls.txt (available only from kernel 3.10).

Note

In order to work with cls_cgroup, CONFIG_NET_CLS_CGROUP should be set.

I will conclude the discussion of the cgroup subsystem with a short section about mounting cgroups.

### Mounting cgroup Subsystems

Mounting a cgroup subsystem can also be done at mount points other than /sys/fs/cgroup, which is created by default. For example, you can mount the memory controller on /mycgroup/mymemtest with the following sequence:

mkdir -p /mycgroup/mymemtest
mount -t cgroup -o memory mymemtest /mycgroup/mymemtest

Here are some of the mount options available when mounting cgroup subsystems:

 * all: Mount all cgroup controllers.

 * none: Do not mount any controller.

 * release_agent: A path to an executable that will be executed when the last process of a cgroup terminates. systemd uses the release_agent cgroup mount option.

 * noprefix: Avoid the prefix in control files. Each cgroup controller has its own prefix for its own control files; for example, the cpuset controller entry mem_exclusive appears as cpuset.mem_exclusive. The noprefix mount option avoids adding the controller prefix. For example:

mkdir /cgroup
mount -t tmpfs xxx /cgroup/
mount -t cgroup -o noprefix,cpuset xxx /cgroup/
ls /cgroup/
cgroup.clone_children mem_hardwall mems
cgroup.event_control memory_migrate notify_on_release
cgroup.procs memory_pressure release_agent
cpu_exclusive memory_pressure_enabled sched_load_balance
cpus memory_spread_page sched_relax_domain_level
mem_exclusive memory_spread_slab tasks

Note

Readers who want to delve into how the parsing of the cgroups mount options is implemented should look into the parse_cgroupfs_options() method, kernel/cgroup.c.
For more information about cgroups, see the following resources:

 * Documentation/cgroups

 * cgroups mailing list: cgroups@vger.kernel.org

 * cgroups mailing list archives: http://news.gmane.org/gmane.linux.kernel.cgroups

 * git repository: git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git

Note

Linux namespaces and cgroups are orthogonal and are not related technically. You can build a kernel with namespaces support and without cgroups support, and vice versa. In the past there were experiments with a cgroups namespace subsystem, called "ns", but the code was eventually removed.

You have seen what cgroups are and learned about their two network modules, net_prio and cls_cgroup. You also saw short examples demonstrating how the devices, memory, and networking cgroup controllers can be used. The Busy Poll Sockets feature, which was added in kernel 3.11, provides lower latency for sockets. Let's take a look at how it is implemented, and how it is configured and used.

## Busy Poll Sockets

Traditionally, when the socket queue runs dry, the networking stack sleeps, waiting for the driver to put more data on the socket queue (or returns immediately, in the case of a non-blocking operation). This causes additional latency due to interrupts and context switches. For socket applications that need the lowest possible latency and are willing to pay the cost of higher CPU utilization, Linux added a capability for Busy Polling on Sockets in kernel 3.11 (in the beginning this technique was called Low Latency Sockets Poll, but it was renamed Busy Poll Sockets following a suggestion by Linus Torvalds). Busy Polling takes a more aggressive approach toward moving data to the application. When the application asks for more data and there is none in the socket queue, the networking stack actively calls into the device driver. The driver checks for newly arrived data and pushes it through the network layer (L3) to the socket. The driver may find data for other sockets and will push that data as well. When the poll call returns to the networking stack, the socket code checks whether new data is pending on the socket receive queue.

For a network driver to support busy polling, it should supply its busy polling method and set it as the ndo_busy_poll callback of the net_device_ops object. This ndo_busy_poll callback should move the packets into the network stack; see, for example, the ixgbe_low_latency_recv() method, drivers/net/ethernet/intel/ixgbe/ixgbe_main.c. The ndo_busy_poll callback should return the number of packets that were moved to the stack, or 0 if there were no such packets, and LL_FLUSH_FAILED or LL_FLUSH_BUSY in case of a problem. An unmodified driver that does not fill in the ndo_busy_poll callback will continue to work as usual and will not be busy polled.

An important component of providing low latency is busy polling itself: sometimes, when the driver polling routine returns with no data, more data is just arriving and narrowly misses being returned to the networking stack. This is where busy polling comes into play: the networking stack polls the driver for a configurable period of time, so new packets can be picked up as soon as they arrive.

The active and busy polling of the device driver can provide latency very close to that of the hardware.
Busy polling can be used for a large number of sockets at the same time, but it will not yield the best results, since busy polling on some sockets will slow down other sockets that use the same CPU core. Figure 14-1 contrasts the traditional receive flow with that of a socket that has been enabled for Busy Polling.

Figure 14-1.

Traditional receive flow versus Busy Poll Sockets receive flow

1. Application checks for receive. 1. Application checks for receive.

2. No immediate receive – thus block. 2. Check device driver for pending packet (poll starts).

3. Packet received. 3. Meanwhile, packet received to NIC.

4. Driver passes packet to the protocol layer. 4. Driver processes pending packet.

5. Protocol/socket wakes application. 5. Driver passes to the protocol layer.

- Bypass context switch and interrupt.

6. Application receives data through sockets. 6. Application receives data through sockets.

Repeat. Repeat.

### Enabling Globally

Busy Polling on Sockets can be turned on globally for all sockets via procfs parameters, or it can be turned on for individual sockets by setting the SO_BUSY_POLL socket option. For global enabling, there are two parameters: net.core.busy_poll and net.core.busy_read, which are exported to procfs as /proc/sys/net/core/busy_poll and /proc/sys/net/core/busy_read, respectively. Both are zero by default, which means that Busy Polling is off. Setting these values enables Busy Polling globally. A value of 50 will usually yield good results, but some experimentation might help find a better value for some applications.

 * busy_read controls the time limit when busy polling on blocking read operations. For a non-blocking read, if busy polling is enabled for the socket, the stack code polls just once before returning control to the user.

 * busy_poll controls how long select and poll will busy poll while waiting for new events on any of the sockets that are enabled for Busy Polling. Only sockets with the busy read socket option enabled are busy polled.

For more information, see Documentation/sysctl/net.txt.

### Enabling Per Socket

A better way to enable Busy Polling is to modify the application to use the SO_BUSY_POLL socket option, which sets the sk_ll_usec field of the socket object (an instance of the sock structure). By using this socket option, an application can specify which sockets are busy polled, so CPU utilization is increased only for those sockets. Sockets from other applications and services will continue to use the traditional receive path. The recommended starting value for SO_BUSY_POLL is 50. The net.core.busy_read sysctl value must be set to 0, and the net.core.busy_poll sysctl value should be set as described in Documentation/sysctl/net.txt (a minimal code sketch appears after the tuning list below).

### Tuning and Configuration

Here are several ways in which you can tune and configure Busy Poll Sockets:

 * The interrupt coalescing (ethtool -C setting for rx-usecs) on the network device should be on the order of 100 to lower the interrupt rate. This limits the number of context switches caused by interrupts.

 * Disabling GRO and LRO by using ethtool -K on the network device may avoid out-of-order packets on the receive queue. This should only be an issue when mixed bulk and low-latency traffic arrive on the same queue. Generally, keeping GRO and LRO enabled gives the best results.

 * Application threads and the network device IRQs should be bound to separate CPU cores. Both sets of cores should be on the same CPU NUMA node as the network device. When the application and the IRQ run on the same core, there is a small penalty. If interrupt coalescing is set to a low value, this penalty can be very large.

 * For the lowest latency, it may help to turn off the I/O Memory Management Unit (IOMMU) support. This may already be disabled by default on some systems.
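Here is the minimal sketch of the per-socket option described above, assuming a kernel built with busy poll support; the fallback definition of SO_BUSY_POLL (46, from the kernel's asm-generic/socket.h) is only needed with older libc headers:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef SO_BUSY_POLL
#define SO_BUSY_POLL 46   /* from asm-generic/socket.h */
#endif

int main(void)
{
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        int usecs = 50;   /* recommended starting value, in microseconds */

        if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL,
                                 &usecs, sizeof(usecs)) < 0)
                perror("SO_BUSY_POLL");

        /* ... bind()/recv() as usual; blocking reads on fd may now busy poll ... */
        return 0;
}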
### Performance

Many applications that use Busy Poll Sockets should show reduced latency and jitter, as well as improved transactions per second. However, overloading the system with too many busy-polling sockets can hurt performance as CPU contention increases. The parameters net.core.busy_poll and net.core.busy_read and the SO_BUSY_POLL socket option are all tunable. Experimenting with these values may give better results for various applications.

I will now start a discussion of three wireless subsystems, which typically serve short-range, low-power devices: the Bluetooth subsystem, IEEE 802.15.4, and NFC. There is growing interest in these three subsystems, as new and exciting features are added quite steadily. I will start the discussion with the Bluetooth subsystem.

## The Linux Bluetooth Subsystem

The Bluetooth protocol is one of the major transport protocols, mainly for small and embedded devices. Bluetooth network interfaces are included nowadays in almost every new laptop or tablet, in every mobile phone, and in many electronic gadgets. The Bluetooth protocol was created by the mobile vendor Ericsson in 1994. In the beginning, it was intended to be a cable replacement for point-to-point connections. Later, it evolved to enable wireless Personal Area Networks (PANs). Bluetooth operates in the 2.4 GHz Industrial, Scientific and Medical (ISM) radio-frequency band, which is license-free for low-power transmissions. The Bluetooth specifications are formalized by the Bluetooth Special Interest Group (SIG), which was founded in 1998; see https://www.bluetooth.org . The SIG is responsible for the development of the Bluetooth specification and for the qualification process that helps ensure interoperability between Bluetooth devices from different vendors. The Bluetooth core specification is freely available. There have been several Bluetooth specifications over the years; I will mention the most recent:

 * Bluetooth v2.0 + Enhanced Data Rate (EDR) from 2004.

 * Bluetooth v2.1 + EDR from 2007; included an improvement of the pairing process with secure simple pairing (SSP).

 * Bluetooth v3.0 + HS (High Speed) from 2009; the main new feature is AMP (Alternate MAC/PHY), the addition of 802.11 as a high-speed transport.

 * Bluetooth v4.0 + BLE (Bluetooth Low Energy, formerly known as WiBree) from 2010.

There is a variety of uses for the Bluetooth protocol, like file transfer, audio streaming, health-care devices, networking, and more. Bluetooth is designed for short-distance data exchange, in a range that typically extends up to 10 meters. There are three classes of Bluetooth devices, with the following ranges:

 * Class 1 – about 100 m.

 * Class 2 – about 10 m.

 * Class 3 – about 1 m.

The Linux Bluetooth protocol stack is called BlueZ. Originally it was a project started by Qualcomm. It was officially integrated into kernel 2.4.6 (2001). Figure 14-2 shows the Bluetooth stack.

Figure 14-2.

Bluetooth stack
Note: In the layer above L2CAP there can be other Bluetooth protocols that are not discussed in this chapter, like the Audio/Video Distribution Transport Protocol (AVDTP), the Hands-Free Profile (HFP), the Audio/Video Control Transport Protocol (AVCTP), and more.

 * The lower three layers (the Radio layer, the Link Controller, and the Link Management Protocol) are implemented in hardware or firmware.

 * The Host Controller Interface (HCI) specifies how the host interacts and communicates with a local Bluetooth device (the controller). I will discuss it in the "HCI Layer" section, later in this chapter.

 * L2CAP (Logical Link Control and Adaptation Protocol) provides the ability to transmit and receive packets from other Bluetooth devices. An application can use the L2CAP protocol as a message-based, unreliable data-delivery transport protocol, similarly to the UDP protocol. Access to the L2CAP protocol from userspace is done by the BSD sockets API, which was discussed in Chapter 11. Note that in L2CAP, packets are always delivered in the order they were sent, as opposed to UDP. Figure 14-2 shows three protocols that are located on top of L2CAP (there are other protocols on top of L2CAP that are not discussed in this chapter, as mentioned earlier).

 * BNEP: Bluetooth Network Encapsulation Protocol. I will present an example of using the BNEP protocol later in this chapter.

 * RFCOMM: The Radio Frequency Communications (RFCOMM) protocol is a reliable, streams-based protocol. RFCOMM allows operation over only 30 ports. RFCOMM is used for emulating communication over a serial port and for sending unframed data.

 * SDP: Service Discovery Protocol. Enables an application to register a description and a port number in the SDP server it runs. Clients can perform a lookup in the SDP server by providing the description.

 * The SCO (Synchronous Connection-Oriented) layer: used for sending audio; I do not delve into its details in this chapter, as it falls outside the scope of this book.

 * Bluetooth profiles are definitions of possible applications; they specify general behaviors that Bluetooth-enabled devices use to communicate with other Bluetooth devices. There are many Bluetooth profiles, and I will mention some of the most commonly used ones:

 * File Transfer Profile (FTP): Manipulates and transfers objects (files and folders) in an object store (file system) of another system.

 * Health Device Profile (HDP): Handles medical data.

 * Human Interface Device Profile (HID): A wrapper of USB HID (Human Interface Device) that provides support for devices like mice and keyboards.

 * Object Push Profile (OPP): For pushing objects.

 * Personal Area Networking Profile (PAN): Provides networking over a Bluetooth link; you will see an example of it in the BNEP section later in this chapter.

 * Headset Profile (HSP): Provides support for Bluetooth headsets, which are used with mobile phones.

The seven layers in this diagram are roughly parallel to the seven layers of the OSI model: the Radio (RF) layer is parallel to the Physical layer, the Link Controller is parallel to the Data Link layer, the Link Management Protocol is parallel to the Network layer, and so on. The Linux Bluetooth subsystem consists of several ingredients:

 * Bluetooth Core

 * HCI device and connection manager, scheduler; files: net/bluetooth/hci*.c, net/bluetooth/mgmt.c.

 * Bluetooth Address Family sockets; file: net/bluetooth/af_bluetooth.c.

 * SCO audio links; file: net/bluetooth/sco.c.
 * L2CAP (Logical Link Control and Adaptation Protocol); files: net/bluetooth/l2cap*.c.

 * SMP (Security Manager Protocol) on LE (Low Energy) links; file: net/bluetooth/smp.c.

 * AMP manager – Alternate MAC/PHY management; file: net/bluetooth/a2mp.c.

 * HCI device drivers (interface to the hardware); files: drivers/bluetooth/*. Includes vendor-specific drivers as well as generic drivers, like the generic Bluetooth USB driver, btusb.

 * RFCOMM module (RFCOMM protocol); files: net/bluetooth/rfcomm/*.

 * BNEP module (Bluetooth Network Encapsulation Protocol); files: net/bluetooth/bnep/*.

 * CMTP module (CAPI Message Transport Protocol), used by the ISDN protocol. CMTP is in fact obsolete; files: net/bluetooth/cmtp/*.

 * HIDP module (Human Interface Device Protocol); files: net/bluetooth/hidp/*.

I briefly discussed the Bluetooth protocol, the architecture of the Bluetooth stack, the Linux Bluetooth subsystem tree, and Bluetooth profiles. In the next section I will describe the HCI layer, which is the first layer above the LMP (see Figure 14-2 earlier in this section).

### HCI Layer

I will start the discussion of the HCI layer by describing the HCI device, which represents a Bluetooth controller. Later in this section I will describe the interface between the HCI layer and the layer below it, the Link Controller layer, and the interface between HCI and the layers above it, L2CAP and SCO.

#### HCI Device

A Bluetooth device is represented by struct hci_dev. This structure is quite big (over 100 members) and is shown here only partially:

struct hci_dev {
        char            name[8];
        unsigned long   flags;
        __u8            bus;
        bdaddr_t        bdaddr;
        __u8            dev_type;
        . . .
        struct work_struct rx_work;
        struct work_struct cmd_work;
        . . .
        struct sk_buff_head rx_q;
        struct sk_buff_head raw_q;
        struct sk_buff_head cmd_q;
        . . .
        int (*open)(struct hci_dev *hdev);
        int (*close)(struct hci_dev *hdev);
        int (*flush)(struct hci_dev *hdev);
        int (*send)(struct sk_buff *skb);
        void (*notify)(struct hci_dev *hdev, unsigned int evt);
        int (*ioctl)(struct hci_dev *hdev, unsigned int cmd, unsigned long arg);
};

(include/net/bluetooth/hci_core.h)

Here is a description of some of the important members of the hci_dev structure:

 * flags: Represents the state of the device, like HCI_UP or HCI_INIT.

 * bus: The bus associated with the device, like USB (HCI_USB), UART (HCI_UART), PCI (HCI_PCI), etc. (see include/net/bluetooth/hci.h).

 * bdaddr: Each HCI device has a unique 48-bit address. It is exported to sysfs by /sys/class/bluetooth/<HCI device name>/address.

 * dev_type: There are two types of Bluetooth devices:

 * Basic Rate devices (HCI_BREDR).

 * Alternate MAC and PHY devices (HCI_AMP).

 * rx_work: Handles receiving packets that are kept in the rx_q queue of the HCI device, by the hci_rx_work() callback.

 * cmd_work: Handles sending command packets that are kept in the cmd_q queue of the HCI device, by the hci_cmd_work() callback.

 * rx_q: Receive queue of SKBs. SKBs are added to the rx_q by calling the skb_queue_tail() method when receiving an SKB, in the hci_recv_frame() method.

 * raw_q: SKBs are added to the raw_q by calling the skb_queue_tail() method in the hci_sock_sendmsg() method.

 * cmd_q: Command queue. SKBs are added to the cmd_q by calling the skb_queue_tail() method in the hci_sock_sendmsg() method.
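To make these fields more concrete, here is a hedged sketch of how a driver might allocate and register an hci_dev, loosely following the generic btusb driver; the my_* callbacks are hypothetical placeholders:

static int my_register_hdev(void)
{
        struct hci_dev *hdev = hci_alloc_dev();

        if (!hdev)
                return -ENOMEM;

        hdev->bus   = HCI_USB;
        hdev->open  = my_open;   /* registration fails if open() is missing */
        hdev->close = my_close;  /* ... or if close() is missing */
        hdev->flush = my_flush;
        hdev->send  = my_send;   /* hands HCI frames to the hardware */

        if (hci_register_dev(hdev) < 0) {
                hci_free_dev(hdev);
                return -ENODEV;
        }
        return 0;
}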
As noted, the hci_dev callbacks (like open(), close(), send(), etc.) are typically assigned in the probe() method of a Bluetooth device driver (for example, refer to the generic USB Bluetooth driver, drivers/bluetooth/btusb.c).

The HCI layer exports methods for registering/unregistering an HCI device (the hci_register_dev() and hci_unregister_dev() methods, respectively). Both methods get an hci_dev object as their single parameter. The registration will fail if the open() or close() callback of the specified hci_dev object is not defined.

There are five types of HCI packets:

 * HCI_COMMAND_PKT: Commands sent from the host to the Bluetooth device.

 * HCI_ACLDATA_PKT: Asynchronous data that is sent to or received from a Bluetooth device. ACL stands for Asynchronous Connection-oriented Link.

 * HCI_SCODATA_PKT: Synchronous data that is sent to or received from a Bluetooth device (usually audio). SCO stands for Synchronous Connection-Oriented.

 * HCI_EVENT_PKT: Sent when an event (such as connection establishment) occurs.

 * HCI_VENDOR_PKT: Used in some Bluetooth device drivers for vendor-specific needs.

#### HCI and the Layer Below It (Link Controller)

The HCI communicates with the Link Controller layer below it by:

 * Sending data packets (HCI_ACLDATA_PKT or HCI_SCODATA_PKT) by calling the hci_send_frame() method, which delegates the call to the send() callback of the hci_dev object. The hci_send_frame() method gets an SKB as its single parameter.

 * Sending command packets (HCI_COMMAND_PKT) by calling the hci_send_cmd() method; for example, sending a scan command.

 * Receiving data packets, by calling the hci_acldata_packet() method or the hci_scodata_packet() method.

 * Receiving event packets, by calling the hci_event_packet() method. Handling HCI commands is asynchronous, so some time after sending a command packet (HCI_COMMAND_PKT), a single event or several events are received as a response by the HCI rx_work work queue (the hci_rx_work() method). There are more than 45 different events (see HCI_EV_* in include/net/bluetooth/hci.h). For example, when performing a scan for nearby Bluetooth devices using the command-line hcitool, by hcitool scan, a command packet (HCI_OP_INQUIRY) is sent. As a result, three event packets are returned asynchronously to be handled by the hci_event_packet() method: HCI_EV_CMD_STATUS, HCI_EV_EXTENDED_INQUIRY_RESULT, and HCI_EV_INQUIRY_COMPLETE.

#### HCI and the Layers Above It (L2CAP/SCO)

Let's take a look at the methods by which the HCI layer communicates with the layers above it, the L2CAP layer and the SCO layer:

 * HCI communicates with the L2CAP layer above it when receiving data packets by calling the hci_acldata_packet() method, which invokes the l2cap_recv_acldata() method of the L2CAP protocol.

 * HCI communicates with the SCO layer above it when receiving SCO packets by calling the hci_scodata_packet() method, which invokes the sco_recv_scodata() method of the SCO protocol.

### HCI Connection

An HCI connection is represented by the hci_conn structure:

struct hci_conn {
        struct list_head list;
        atomic_t         refcnt;
        bdaddr_t         dst;
        . . .
        __u8             type;
};

(include/net/bluetooth/hci_core.h)

The following is a description of some of the members of the hci_conn structure:

 * refcnt: A reference counter.

 * dst: The Bluetooth destination address.

 * type: Represents the type of the connection:

 * SCO_LINK for an SCO connection.
 * ACL_LINK for an ACL connection.

 * ESCO_LINK for an Extended Synchronous connection.

 * LE_LINK: represents an LE (Low Energy) connection; added in kernel 2.6.39 to support Bluetooth v4.0, which introduced the LE feature.

 * AMP_LINK: added in kernel 3.6 to support Bluetooth AMP controllers.

An HCI connection is created by calling the hci_connect() method. There are three types of connections: SCO, ACL, and LE.

### L2CAP

In order to provide several data streams, L2CAP uses channels, which are represented by the l2cap_chan structure (include/net/bluetooth/l2cap.h). There is a global linked list of channels, named chan_list. Access to this list is serialized by a global read-write lock, chan_list_lock.

The l2cap_recv_acldata() method, which I mentioned in the "HCI and the Layers Above It (L2CAP/SCO)" section earlier in this chapter, is called when HCI passes data packets to the L2CAP layer. The l2cap_recv_acldata() method first performs some sanity checks and drops the packet if something is wrong; then it invokes the l2cap_recv_frame() method in case a complete packet was received. Each received packet starts with an L2CAP header:

struct l2cap_hdr {
        __le16 len;
        __le16 cid;
} __attribute__ ((packed));

(include/net/bluetooth/l2cap.h)

The l2cap_recv_frame() method checks the channel ID of the received packet by inspecting the cid field of the l2cap_hdr object. In case it is an L2CAP command (the cid is 0x0001), the l2cap_sig_channel() method is invoked to handle it. For example, when another Bluetooth device wants to connect to our device, an L2CAP_CONN_REQ request is received on the L2CAP signal channel, which will be handled by the l2cap_connect_req() method, net/bluetooth/l2cap_core.c. In the l2cap_connect_req() method, an L2CAP channel is created by calling the l2cap_chan_create() method, via pchan->ops->new_connection(). The L2CAP channel state is set to BT_OPEN, and the configuration state is set to CONF_NOT_COMPLETE. This means that the channel should be configured in order to work with it.

### BNEP

The BNEP protocol enables IP over Bluetooth, which in practical terms means running TCP/IP applications on top of L2CAP Bluetooth channels. You can also run TCP/IP applications with PPP over Bluetooth RFCOMM, but networking over a serial PPP link is less efficient. The BNEP protocol uses the PAN profile. I will show a short example of using the BNEP protocol to set up IP over Bluetooth, and subsequently I will describe the kernel methods that implement such communication. Delving into the details of BNEP is beyond the scope of this book. If you want to learn more, see the BNEP spec, which can be found at http://grouper.ieee.org/groups/802/15/Bluetooth/BNEP.pdf . A very simple way to create a PAN is by running:

 * On the server side:

 * pand --listen --role=NAP

 * Note: NAP stands for Network Access Point.

 * On the client side:

 * pand --connect btAddressOfTheServer

On both endpoints, a virtual interface (bnep0) is created. Afterward, you can assign IP addresses to bnep0 on both endpoints with the ifconfig command (or with the ip command), just like with Ethernet devices, and you will have a network connection over Bluetooth between these endpoints. See more in http://bluez.sourceforge.net/contrib/HOWTO-PAN .

The pand --listen command creates an L2CAP server socket and calls the accept() system call, whereas pand --connect btAddressOfTheServer creates an L2CAP client socket and calls the connect() system call.
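Here is a minimal, hedged sketch of what such an L2CAP client socket looks like in userspace, using the BlueZ headers (link with -lbluetooth); the peer address and PSM are placeholders:

#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>
#include <bluetooth/l2cap.h>

int main(void)
{
        struct sockaddr_l2 addr;
        int fd = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&addr, 0, sizeof(addr));
        addr.l2_family = AF_BLUETOOTH;
        addr.l2_psm    = htobs(0x1001);               /* example PSM */
        str2ba("00:11:22:33:44:55", &addr.l2_bdaddr); /* peer address */

        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                perror("connect");
        else
                write(fd, "hello", 5);

        close(fd);
        return 0;
}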
When the connect request is received on the server side, it sends a BNEPCONNADD IOCTL, which is handled in the kernel by the bnep_add_connection() method (net/bluetooth/bnep/core.c), which performs the following tasks:

 * Creates a BNEP session (a bnep_session object).

 * Adds the BNEP session object to the BNEP session list (bnep_session_list) by calling the __bnep_link_session() method.

 * Creates a network device named bnepX (for the first BNEP device X is 0, for the second X is 1, and so on).

 * Registers the network device by calling the register_netdev() method.

 * Creates a kernel thread named "kbnepd btDeviceName". This kernel thread runs the bnep_session() method, which contains an endless loop to receive or transmit packets. This endless loop terminates only when a userspace application sends a BNEPCONNDEL IOCTL, which calls the bnep_del_connection() method to set the terminate flag of the BNEP session, or when the state of the socket changes and it is no longer connected.

 * The bnep_session() method invokes the bnep_rx_frame() method to receive incoming packets and pass them to the network stack, and it invokes the bnep_tx_frame() method to send outgoing packets.

### Receiving Bluetooth Packets: Diagram

Figure 14-3 shows the path of a received Bluetooth ACL packet (as opposed to SCO, which is for handling audio and is handled differently). The first layer where the packet is handled is the HCI layer, by the hci_acldata_packet() method. It then proceeds to the higher L2CAP layer by calling the l2cap_recv_acldata() method.

Figure 14-3.

Receiving an ACL packet

The l2cap_recv_acldata() method calls the l2cap_recv_frame() method, which fetches the L2CAP header (the l2cap_hdr object described earlier) from the SKB. An action is then taken according to the channel ID of the L2CAP header.

### L2CAP Extended Features

Support for L2CAP Extended Features (also called eL2CAP) was added in kernel 2.6.36. These extended features include:

 * Enhanced Retransmission Mode (ERTM), a reliable protocol with error and flow control.

 * Streaming Mode (SM), an unreliable protocol for streaming purposes.

 * Frame Check Sequence (FCS), a checksum for each received packet.

 * Segmentation and Reassembly (SAR) of L2CAP packets, which makes retransmission easier.

Some of these extensions were required for new profiles, like the Bluetooth Health Device Profile (HDP). Note that these features were available before as well, but they were considered experimental and were disabled by default, so you had to set CONFIG_BT_L2CAP_EXT_FEATURES to enable them.

### Bluetooth Tools

Accessing the kernel from userspace is done with sockets, with minor changes: instead of AF_INET sockets, we use AF_BLUETOOTH sockets. Here is a short description of some important and useful Bluetooth tools:

 * hciconfig: A tool for configuring Bluetooth devices. It displays information such as the interface type (BR/EDR or AMP), its Bluetooth address, its flags, and more. The hciconfig tool works by opening a raw HCI socket (BTPROTO_HCI) and sending IOCTLs; for example, in order to bring an HCI device up or down, an HCIDEVUP or HCIDEVDOWN IOCTL is sent, respectively. These IOCTLs are handled in the kernel by the hci_sock_ioctl() method, net/bluetooth/hci_sock.c. A minimal sketch of this technique appears after this list.

 * hcitool: A tool for configuring Bluetooth connections and sending special commands to Bluetooth devices. For example, hcitool scan will scan for nearby Bluetooth devices.

 * hcidump: Dumps raw HCI data coming from and going to a Bluetooth device.

 * l2ping: Sends an L2CAP echo request and receives the answer.

 * btmon: A friendlier version of hcidump.

 * bluetoothctl: A friendlier version of hciconfig/hcitool.
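As a small illustration of the technique hciconfig uses (see the first bullet above), here is a hedged sketch that opens a raw HCI socket and brings hci0 up; it normally requires root privileges (CAP_NET_ADMIN):

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>
#include <bluetooth/hci.h>

int main(void)
{
        int ctl = socket(AF_BLUETOOTH, SOCK_RAW, BTPROTO_HCI);

        if (ctl < 0) {
                perror("socket");
                return 1;
        }

        /* 0 is the device index (hci0); handled by hci_sock_ioctl() */
        if (ioctl(ctl, HCIDEVUP, 0) < 0)
                perror("HCIDEVUP");

        close(ctl);
        return 0;
}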
You can find more information about the Linux Bluetooth subsystem at:

 * Linux BlueZ, the official Linux Bluetooth website: http://www.bluez.org .

 * Linux Bluetooth mailing list: linux-bluetooth@vger.kernel.org.

 * Linux Bluetooth mailing list archives: http://www.spinics.net/lists/linux-bluetooth/ . Note that this mailing list is for Bluetooth kernel patches as well as Bluetooth userspace patches.

 * IRC channels on freenode.net:

 * #bluez (development-related topics)

 * #bluez-users (non-development-related topics)

In this section I described the Linux Bluetooth subsystem, focusing on its networking aspects. You learned about the layers of the Bluetooth stack and how they are implemented in the Linux kernel. You also learned about important Bluetooth kernel structures like the HCI device and the HCI connection. Next, I will describe the second wireless subsystem, the IEEE 802.15.4 subsystem, and its implementation.

## IEEE 802.15.4 and 6LoWPAN

The IEEE 802.15.4 standard (IEEE Std 802.15.4-2011) specifies the Medium Access Control (MAC) layer and the Physical (PHY) layer for Low-Rate Wireless Personal Area Networks (LR-WPANs). It is intended for low-cost, low-power-consumption devices in short-range networks. Several bands are supported, among which the most common are the 2.4 GHz ISM band, 915 MHz, and 868 MHz. IEEE 802.15.4 devices can be used, for example, in wireless sensor networks (WSNs), security systems, industry automation systems, and more. It was designed to organize networks of sensors, switches, automation devices, etc. The maximum allowed bit rate is 250 kb/s. The standard also supports a 1000 kb/s bit rate for the 2.4 GHz band, but it is less common. The typical personal operating space is around 10 m. The IEEE 802.15.4 standard is maintained by the IEEE 802.15 working group ( http://www.ieee802.org/15/ ). Several protocols sit on top of IEEE 802.15.4; the best known are ZigBee and 6LoWPAN.

The ZigBee Alliance (ZA) has published non-GPL specifications for IEEE 802.15.4, but also the ZigBee IP (Z-IP) open standard ( http://www.zigbee.org/Specifications/ZigBeeIP/Overview.aspx ). It is based on Internet protocols such as IPv6, TCP, UDP, 6LoWPAN, and more. Using the IPv6 protocol for IEEE 802.15.4 is a good option because the huge IPv6 address space makes it possible to assign a unique routable address to each IPv6 node. The IPv6 header is simpler than the IPv4 header, and processing its extension headers is simpler than processing IPv4 header options. Using IPv6 with LR-WPANs is termed IPv6 over Low-power Wireless Personal Area Networks (6LoWPAN). IPv6 cannot be used on an LR-WPAN as-is and therefore requires an adaptation layer, as will be explained later in this section. There are five RFCs related to 6LoWPAN:

 * RFC 4944: "Transmission of IPv6 Packets over IEEE 802.15.4 Networks."

 * RFC 4919: "IPv6 over Low-Power Wireless Personal Area Networks (6LoWPANs): Overview, Assumptions, Problem Statement, and Goals."

 * RFC 6282: "Compression Format for IPv6 Datagrams over IEEE 802.15.4-Based Networks." This RFC introduced a new encoding format, the LOWPAN_IPHC Encoding Format, instead of LOWPAN_HC1 and LOWPAN_HC2.
 * RFC 6775: "Neighbor Discovery Optimization for IPv6 over Low-Power Wireless Personal Area Networks (6LoWPANs)."

 * RFC 6550: "RPL: IPv6 Routing Protocol for Low-Power and Lossy Networks."

The main challenges in implementing 6LoWPAN are:

 * Different packet sizes: IPv6 has an MTU of 1280, whereas IEEE 802.15.4 has an MTU of 127 (IEEE802154_MTU). In order to support packets larger than 127 bytes, an adaptation layer between IPv6 and IEEE 802.15.4 must be defined. This adaptation layer is responsible for the transparent fragmentation/defragmentation of IPv6 packets.

 * Different addresses: An IPv6 address is 128 bits, whereas IEEE 802.15.4 addresses are 64-bit IEEE extended addresses (IEEE802154_ADDR_LONG) or, after association and after a PAN id is assigned, 16-bit short addresses (IEEE802154_ADDR_SHORT), which are unique within that PAN. The main challenge is that compression mechanisms are needed to reduce the size of a 6LoWPAN packet, which is largely made up of the IPv6 addresses. 6LoWPAN can, for example, leverage the fact that IEEE 802.15.4 supports 16-bit short addresses to avoid the need for a 64-bit IID.

 * Multicast is not supported natively in IEEE 802.15.4, whereas IPv6 uses multicast for ICMPv6 and for protocols that rely on ICMPv6, like the Neighbour Discovery protocol.

IEEE 802.15.4 defines four types of frames:

 * Beacon frames (IEEE802154_FC_TYPE_BEACON)

 * MAC command frames (IEEE802154_FC_TYPE_MAC_CMD)

 * Acknowledgement frames (IEEE802154_FC_TYPE_ACK)

 * Data frames (IEEE802154_FC_TYPE_DATA)

IPv6 packets must be carried over the fourth type, data frames. Acknowledgment of data packets is not mandatory, although it is recommended. As with 802.11, there are device drivers that implement most parts of the protocol by themselves (HardMAC device drivers) and device drivers that handle most of the protocol in software (SoftMAC device drivers). There are three types of nodes in 6LoWPAN:

 * 6LoWPAN Node (6LN): Either a host or a router.

 * 6LoWPAN Router (6LR): Can send and receive Router Advertisement (RA) and Router Solicitation (RS) messages, as well as forward and route IPv6 packets. These nodes are more complex than simple 6LoWPAN nodes and may need more memory and processing capacity.

 * 6LoWPAN Border Router (6LBR): A border router located at the junction of separate 6LoWPAN networks or between a 6LoWPAN network and another IP network. The 6LBR is responsible for forwarding between the IP network and the 6LoWPAN network and for the IPv6 configuration of the 6LoWPAN nodes. A 6LBR requires much more memory and processing capacity than a 6LN. 6LBRs share context for the nodes in the LoWPAN and keep track of nodes registered with 6LoWPAN-ND and RPL. A 6LBR is generally always on, in contrast to 6LNs, which sleep most of the time. Figure 14-4 shows a simple setup with a 6LBR, which connects an IP network to a Wireless Sensor Network based on 6LoWPAN.

Figure 14-4.

A 6LBR connecting an IP network to a WSN that runs over 6LoWPAN

### Neighbor Discovery Optimization

There are two reasons we should have optimizations and extensions for the IPv6 Neighbor Discovery protocol:

 * The IEEE 802.15.4 link layer does not have multicast support, although it supports broadcast (it uses the 0xFFFF short address for message broadcasting).

 * The Neighbor Discovery protocol is designed for sufficiently powered devices, whereas IEEE 802.15.4 devices can sleep in order to preserve energy; moreover, they operate in a lossy network environment, as the RFC puts it.
RFC 6775, which deals with Neighbor Discovery optimization, added new optimizations such as:

 * Host-initiated refresh of Router Advertisement information. In IPv6, routers usually send Router Advertisements periodically. This feature removes the need for periodic or unsolicited Router Advertisements sent from routers to hosts.

 * EUI-64-based IPv6 addresses are considered globally unique. When such addresses are used, DAD (Duplicate Address Detection) is not needed.

 * Three options were added:

 * Address Registration Option (ARO): The ARO option (33) can be part of a unicast NS message that a host sends as part of NUD (Neighbor Unreachability Detection) to determine that it can still reach a default router. When a host has a non-link-local address, it periodically sends NS messages to its default routers with the ARO option in order to register its address. Unregistration is done by sending an NS with an ARO containing a lifetime of 0.

 * 6LoWPAN Context Option (6CO): The 6CO option (34) carries prefix information for LoWPAN header compression, and is similar to the Prefix Information Option (PIO) specified in RFC 4861.

 * Authoritative Border Router Option (ABRO): The ABRO option (35) enables disseminating prefixes and context information across a route-over topology.

 * Two new DAD messages were added:

 * Duplicate Address Request (DAR), with a new ICMPv6 type of 157.

 * Duplicate Address Confirmation (DAC), with a new ICMPv6 type of 158.

### Linux Kernel 6LoWPAN

The basic 6LoWPAN implementation was integrated into the v3.2 Linux kernel. It was contributed by the Embedded Systems Open Platform Group, from Siemens Corporate Technology. It has three layers:

 * Network layer – net/ieee802154 (includes the 6lowpan module, raw IEEE 802.15.4 sockets, the netlink interface, and more).

 * MAC layer – net/mac802154. Implements a partial MAC layer for SoftMAC device drivers.

 * PHY layer – drivers/net/ieee802154, the IEEE 802.15.4 device drivers.

 * Two 802.15.4 devices are currently supported:

 * The AT86RF230/231 transceiver driver.

 * The Microchip MRF24J40.

 * There is also the Fakelb driver (IEEE 802.15.4 loopback interface).

 * These two devices, as well as many other 802.15.4 transceivers, are connected via SPI. There is also a serial driver, although it is not included in the mainline kernel and is still experimental. There are devices like atusb, which are based on an AT86RF231 BN, that are not in mainline as of this writing.

#### 6LoWPAN Initialization

In the lowpan_init_module() method, initialization of 6LoWPAN netlink sockets is done by calling the lowpan_netlink_init() method, and a protocol handler is registered for 6LoWPAN packets by calling the dev_add_pack() method:

. . .
static struct packet_type lowpan_packet_type = {
        .type = __constant_htons(ETH_P_IEEE802154),
        .func = lowpan_rcv,
};
. . .

static int __init lowpan_init_module(void)
{
        . . .
        dev_add_pack(&lowpan_packet_type);
        . . .
}

(net/ieee802154/6lowpan.c)

The lowpan_rcv() method is the main Rx handler for 6LoWPAN packets, which have an ethertype of 0x00F6 (ETH_P_IEEE802154). It handles two cases:

 * Reception of uncompressed packets (dispatch type is IPv6).

 * Reception of compressed packets.

A virtual link is used to perform the translation between 6LoWPAN and IPv6 packets. One endpoint of this virtual link speaks IPv6 and has an MTU of 1280; this is the 6LoWPAN interface.
The other endpoint speaks 6LoWPAN and has an MTU of 127; this is the WPAN interface. Compressed 6LoWPAN packets are processed by the lowpan_process_data() method, which calls the lowpan_uncompress_addr() method to uncompress addresses and the lowpan_uncompress_udp_header() method to uncompress the UDP header, according to the IPHC header. The uncompressed IPv6 packet is then delivered to the 6LoWPAN interface by the lowpan_skb_deliver() method (net/ieee802154/6lowpan.c).

Figure 14-5 shows the 6LoWPAN adaptation layer.

Figure 14-5.

6LoWPAN adaptation layer

Figure 14-6 shows the path of a packet from the PHY layer (the driver) via the MAC layer to the 6LoWPAN adaptation layer.

Figure 14-6.

Receiving a packet

I will not delve into the details of the device driver implementations, as this is out of our scope. I will mention that each device driver should create an ieee802154_dev object by calling the ieee802154_alloc_device() method, passing an ieee802154_ops object as a parameter. Every driver should define some ieee802154_ops object callbacks, like xmit, start, stop, and more. This applies to SoftMAC drivers only.

I will mention here that an Internet-Draft was submitted for applying 6LoWPAN technology over Bluetooth Low Energy devices (these devices are part of the Bluetooth 4.0 specification, as was mentioned earlier in this chapter). See "Transmission of IPv6 Packets over Bluetooth Low Energy," http://tools.ietf.org/html/draft-ietf-6lowpan-btle-12 .

Note

Contiki is an open source operating system implementing the Internet of Things (IoT) concept; some patches of the Linux IEEE 802.15.4 6LoWPAN code are derived from it, like the UDP header compression and decompression. It implements 6LoWPAN and RPL. It was developed by Adam Dunkels. See http://www.contiki-os.org/ .

For additional resources about 6LoWPAN and 802.15.4:

 * Books:

 * "6LoWPAN: The Wireless Embedded Internet," by Zach Shelby and Carsten Bormann, Wiley, 2009.

 * "Interconnecting Smart Objects with IP: The Next Internet," by Jean-Philippe Vasseur and Adam Dunkels (the Contiki developer), Morgan Kaufmann, 2010.

 * An article about IPv6 Neighbor Discovery Optimization: http://www.internetsociety.org/articles/ipv6-neighbor-discovery-optimization .

The lowpan-tools package is a set of utilities to manage the Linux LoWPAN stack. See http://sourceforge.net/projects/linux-zigbee/files/linux-zigbee-sources/0.3/ .

Note

The IEEE 802.15.4 subsystem does not maintain a git repository of its own (though in the past there was one). Patches are sent to the netdev mailing list; some of the developers first send their patches to the linux-zigbee developer mailing list to get feedback: https://lists.sourceforge.net/lists/listinfo/linux-zigbee-devel

In this section I described the IEEE 802.15.4 standard and the 6LoWPAN protocol, together with the challenges 6LoWPAN poses for integration into the Linux kernel, like adding the new Neighbor Discovery messages. In the next section I will describe the third wireless subsystem, which is intended for the shortest range among the three wireless subsystems described in this chapter: the Near Field Communication (NFC) subsystem.

## Near Field Communication (NFC)

Near Field Communication is a very short range wireless technology (less than two inches) designed to transfer small amounts of data over a very low latency link at up to 424 kb/s. NFC payloads range from very simple URLs or raw text to more complex out-of-band data used to trigger connection handover.
Through its very short range and latency, NFC implements a tap-and-share concept by linking proximity to an immediate action triggered by the NFC data payload. Touch an NFC tag with your NFC-enabled mobile phone, and this will, for example, immediately fire up a web browser.

NFC runs on the 13.56 MHz band and is based on the Radio Frequency ID (RFID) ISO 14443 and FeliCa standards. The NFC Forum ( http://www.nfc-forum.org/ ) is a consortium responsible for standardizing the technology through a set of specifications, ranging from the NFC Digital layer up to high-level service definitions like the NFC Connection Handover or the Personal Health Device Communication (PHDC) ones. All adopted NFC Forum specifications are available free of charge; see http://www.nfc-forum.org/specs/ .

At the heart of the NFC Forum specifications is the NFC Data Exchange Format (NDEF) definition. It defines the NFC data structure used to exchange NFC payloads from NFC tags or between NFC peers. All NDEFs contain one or more NDEF Records that embed the actual payload. The NDEF record header contains metadata that allows applications to build the semantic link between the NFC payload and an action to trigger on the reader side.

### NFC Tags

NFC tags are cheap, mostly static, battery-less data containers. They're typically made of an inductive antenna connected to a very small amount of flash memory, packaged in many different form factors (labels, key rings, stickers, etc.). As per the NFC Forum definitions, NFC tags are passive devices, i.e., they're unable to generate any radio field. Instead, they're powered by the RF fields that active NFC devices initiate. The NFC Forum defines four different tag types, each of them carrying a strong RFID and smart card legacy:

 * Type 1 specifications derive from the Innovision/Broadcom Topaz and Jewel card specifications. They can expose from 96 bytes up to 2 KB of data at 106 kb/s.

 * Type 2 tags are based on the NXP Mifare Ultralight specifications. They're very similar to Type 1 tags.

 * Type 3 tags are built on top of the non-secure parts of Sony FeliCa tags. They're more expensive than Type 1 and 2 tags, but can carry up to 1 MB at 212 or 424 kb/s.

 * Type 4 specifications are based on NXP DESFire cards; they support up to 32 KB and three transmission speeds: 106, 212, or 424 kb/s.

### NFC Devices

As opposed to NFC tags, NFC devices can generate their own magnetic field to initiate NFC communications. NFC-enabled mobile phones and NFC readers are the most common kinds of NFC devices. They support a larger feature set than NFC tags. They can read from or write to NFC tags, but they can also pretend to be a card and be seen as a simple NFC tag by any reader. One of the key advantages of the NFC technology over RFID is the possibility of having two NFC devices talk to each other in an NFC-specific peer-to-peer mode. The link between two NFC devices is kept alive as long as the two devices are in magnetic range. In practice, this means two NFC devices can maintain a peer-to-peer link while they physically touch each other. This introduces a whole new range of mobile use cases where one can exchange data, context, or credentials by touching someone else's NFC device.

### Communication and Operation Modes

The NFC Forum defines two communication modes and three operation modes. An active NFC communication is established when two NFC devices can talk to one another by alternately generating the magnetic field.
This implies that both devices have their own power supply, as they don't rely on any inductively generated power. Active communications can only be established in NFC peer-to-peer mode. On the other hand, in a passive NFC communication only one NFC device generates the radio field, and the other device replies by using that field.

There are three NFC operation modes:

 * Reader/Writer: An NFC device (e.g., an NFC-enabled mobile phone) reads from or writes to an NFC tag.

 * Peer-to-peer: Two NFC devices establish a Logical Link Control Protocol (LLCP) link over which several NFC services can be multiplexed: the Simple NDEF Exchange Protocol (SNEP) for exchanging NDEF-formatted data, Connection Handover for initiating a carrier (Bluetooth or WiFi) handover, or any proprietary protocol.

 * Card Emulation: An NFC device replies to a reader poll by pretending to be an NFC tag. Payment and transaction issuers rely on this mode to implement contactless payments on top of NFC. In card emulation mode, payment applets running on a trusted execution environment (also known as a "secure element") take control of the NFC radio and expose themselves as a legacy payment card that can be read by an NFC-enabled point-of-sale terminal.

### Host-Controller Interfaces

Communication between hardware controllers and host stacks must follow a precisely defined interface: the host-controller interface (HCI). The NFC hardware ecosystem is quite fragmented in that regard, as most of the initial NFC controllers implement an ETSI-specified HCI originally designed for communication between SIM cards and contactless front-ends (see http://www.etsi.org/deliver/etsi_ts/102600_102699/102622/07.00.00_60/ts_102622v070000p.pdf ). This HCI was not tailored for NFC-specific use cases, and so each and every manufacturer defined a large number of proprietary extensions to support their features. The NFC Forum tries to address that situation by defining its own, much more NFC-oriented interface, the NFC Controller Interface (NCI). The industry trend clearly shows that manufacturers are abandoning the ETSI HCI in favor of NCI, building a more standardized hardware ecosystem.

### Linux NFC Support

Unlike the Android operating system NFC stack, which is described later in this section, the standard Linux one is partly implemented by the kernel itself. Since the 3.1 Linux kernel release, Linux-based applications will find an NFC-specific socket domain, along with a generic netlink family for NFC (see http://git.kernel.org/?p=linux/kernel/git/sameo/nfc-next.git;a=shortlog;h=refs/heads/master ). The NFC generic netlink family is intended to be an NFC out-of-band channel for controlling and monitoring NFC adapters. The NFC socket domain supports two families:

 * Raw sockets for sending NFC frames that will arrive unmodified at the drivers.

 * LLCP sockets for implementing NFC peer-to-peer services.

The hardware abstraction is implemented in NFC kernel drivers that register against various parts of the stack, mostly depending on the host-controller interface used by the controllers they support. As a consequence, Linux applications can work on top of a hardware-agnostic and fully POSIX-compatible NFC kernel API. The Linux NFC stack is split between kernel and userspace. The kernel NFC sockets allow userspace applications to implement NFC tag support by sending tag-type-specific commands through the raw protocol. NFC peer-to-peer protocols (SNEP, Connection Handover, PHDC, etc.) can be implemented by transmitting their specific payloads through NFC sockets as well. Finally, card emulation mode is built on top of the secure element parts of the kernel NFC netlink API. The Linux NFC daemon, neard, sits on top of the kernel and implements all three NFC modes, regardless of the NFC controller physically wired to the host platform (see https://01.org/linux-nfc/ ).
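Before looking at the socket semantics in more detail below, here is a rough, hedged sketch of an LLCP client socket connecting to the SNEP service of a detected peer; the device and target indexes are placeholders that would normally be obtained from the netlink discovery events, and the exact sockaddr layout should be checked against your kernel's linux/nfc.h:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/nfc.h>

#ifndef AF_NFC
#define AF_NFC 39   /* from linux/socket.h */
#endif

int main(void)
{
        struct sockaddr_nfc_llcp addr;
        int fd = socket(AF_NFC, SOCK_STREAM, NFC_SOCKPROTO_LLCP);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&addr, 0, sizeof(addr));
        addr.sa_family    = AF_NFC;
        addr.dev_idx      = 0;                  /* first NFC controller */
        addr.target_idx   = 1;                  /* peer found by a poll */
        addr.nfc_protocol = NFC_PROTO_NFC_DEP;  /* peer-to-peer protocol */
        strcpy(addr.service_name, "urn:nfc:sn:snep");
        addr.service_name_len = strlen("urn:nfc:sn:snep");

        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                perror("connect");
        return 0;
}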
Figure 14-7 shows an overview of the NFC system.

Figure 14-7.

NFC overview

#### NFC Sockets

NFC sockets are of two kinds: raw and LLCP. Raw NFC sockets were designed with reader mode support in mind, as they provide a way to transmit tag-specific commands and receive the tag replies back. The neard daemon uses NFC raw sockets to implement support for all four tag types, in both reader and writer modes. LLCP sockets implement the NFC peer-to-peer logical link control protocol, on top of which neard implements all the NFC Forum specified peer-to-peer services (SNEP, Connection Handover, and PHDC).

Depending on the selected protocol, the NFC socket semantics differ.

##### Raw Sockets

 * connect: Select and enable a detected NFC tag.

 * bind: Not supported.

 * send/recv: Send and receive raw NFC payloads. The NFC core implementation does not modify those payloads.

##### LLCP Sockets

 * connect: Connect to a specific LLCP service on a detected peer device, like the SNEP or Connection Handover services.

 * bind: Link a device to a specific LLCP service. The service will be exported through the LLCP service name lookup (SNL) protocol for any NFC peer device to attempt a connection to it.

 * send/recv: Transmit LLCP service payloads to and from an NFC peer device. The kernel will handle the LLCP-specific link-layer encapsulation and fragmentation.

 * LLCP transport can be connected or connectionless, which is handled through the standard UNIX SOCK_STREAM and SOCK_DGRAM socket types. NFC LLCP sockets also support the SOCK_RAW type for monitoring and sniffing purposes.

#### NFC Netlink API

The NFC generic netlink API is designed to implement out-of-band NFC-specific operations. It also handles any discoverable secure element from an NFC controller. Through NFC netlink commands, you can:

 * List all available NFC controllers.

 * Power NFC controllers up and down.

 * Start (and stop) NFC polls for discovering NFC tags and devices.

 * Enable NFC peer-to-peer (a.k.a. LLCP) links between the local controller and remote NFC peers.

 * Send LLCP service name lookup requests, in order to discover the available LLCP services on a remote peer.

 * Enable and disable NFC discoverable secure elements (typically SIM-card-based or embedded secure elements).

 * Send ISO 7816 frames to enabled secure elements.

 * Trigger NFC controller firmware downloads.

The netlink API is not only about sending synchronous commands from NFC applications, but also about receiving asynchronous NFC-related events. Applications listening for broadcast NFC events on an NFC netlink socket will get notified about:

 * Detected NFC tags and devices

 * Discovered secure elements

 * Secure element transaction status

 * LLCP service name lookup replies

The entire netlink API (both commands and events), along with the socket one, is exported through the kernel headers and installed at /usr/include/linux/nfc.h on standard Linux distributions.

#### NFC Initialization

NFC initialization is done by the nfc_init() method:

static int __init nfc_init(void)
{
        int rc;
        . . .
Register the generic netlink NFC family and the NFC notifier callback, the nfc_genl_rcv_nl_event() method:

        rc = nfc_genl_init();
        if (rc)
                goto err_genl;

        /* the first generation must not be 0 */
        nfc_devlist_generation = 1;

Initialize NFC raw sockets:

        rc = rawsock_init();
        if (rc)
                goto err_rawsock;

Initialize NFC LLCP sockets:

        rc = nfc_llcp_init();
        if (rc)
                goto err_llcp_sock;

Initialize the AF_NFC protocol:

        rc = af_nfc_init();
        if (rc)
                goto err_af_nfc;

        return 0;
        . . .
}

(net/nfc/core.c)

#### Drivers API

As explained earlier, most NFC controllers nowadays use either HCI or NCI as their host-controller interface. Others define a proprietary interface over USB, like most PC-compatible NFC readers, for example. There are also some "soft" NFC controllers that expect the host platform to implement the NFC Forum Digital layer and talk to an analog-only capable firmware. In order to support this variety of hardware controllers, the NFC kernel implements the NFC NCI, HCI, and Digital layers. Depending on the NFC hardware they intend to support, device driver developers will need to register at module probing time against one of these stacks, or directly against the NFC core implementation for purely proprietary protocols. When registering, they typically provide a stack operations implementation, which is the actual hardware abstraction layer between NFC kernel drivers and the core parts of the NFC stack. The NFC driver registration APIs and operation prototypes are defined in the kernel include/net/nfc/ directory.

Figure 14-8 shows a block diagram of the NFC Linux architecture.

Figure 14-8.

NFC Linux Kernel Architecture. (Note that the NFC Digital layer is not in kernel 3.9. It is to be integrated into kernel 3.13.)

The hierarchy shown in this figure can be understood better by looking into the implementation details of the registration of NFC device drivers directly against the NFC core and against the HCI and NCI layers:

 * Registration directly against the NFC core is typically done in the driver's probe() callback. The registration is done using these steps:

 * Create an nfc_dev object by calling the nfc_allocate_device() method.

 * Call the nfc_register_device() method, passing the nfc_dev object created in the previous step as its single parameter.

 * See drivers/nfc/pn533.c.

 * Registration against the HCI layer is also typically done in the probe() callback of the driver; in the case of the pn544 and microread NFC device drivers, which are the only HCI drivers in kernel 3.9, this probe() method is invoked by the I2C subsystem. The registration is done using these steps:

 * Create an nfc_hci_dev object by calling the nfc_hci_allocate_device() method. The nfc_hci_dev structure is defined in include/net/nfc/hci.h.

 * Call the nfc_hci_register_device() method, passing the nfc_hci_dev object created in the previous step as its single parameter. The nfc_hci_register_device() method in turn performs a registration against the NFC core by calling the nfc_register_device() method.

 * See drivers/nfc/pn544/pn544.c and drivers/nfc/microread/microread.c.

 * Registration against the NCI layer is also typically done in the probe() callback of the driver, for example in the nfcwilink driver. The registration is done using these steps:

 * Create an nci_dev object by calling the nci_allocate_device() method. The nci_dev structure is defined in include/net/nfc/nci_core.h.
 * Call the nci_register_device() method, passing the nci_dev object created in the previous step as its single parameter. The nci_register_device() method in turn performs a registration against the NFC core by calling the nfc_register_device() method, similarly to what you saw earlier in this section for registration against the HCI layer.

 * See drivers/nfc/nfcwilink.c.

When working directly against the NFC core, the driver must define five callbacks in the nfc_ops object (this object is passed as the first parameter of the nfc_allocate_device() method):

 * start_poll: Set the driver to work in polling mode.

 * stop_poll: Stop polling.

 * activate_target: Activate a chosen target.

 * deactivate_target: Deactivate a chosen target.

 * im_transceive: Transceive operation.

When working with HCI, the hci_nfc_ops object, which is an instance of nfc_ops, defines these five callbacks, and when allocating an HCI object with the nfc_hci_allocate_device() method, the nfc_allocate_device() method is invoked with this hci_nfc_ops object as its first parameter. With NCI there is something quite similar, with the nci_nfc_ops object; see net/nfc/nci/core.c.

### Userspace Architecture

neard ( http://git.kernel.org/?p=network/nfc/neard.git;a=summary ) is the Linux NFC daemon that runs on top of the kernel NFC APIs. It is a single-threaded, GLib-based process that implements the higher layers of the NFC peer-to-peer stack, along with the commands specific to the four tag types for reading from and writing to NFC tags. The NDEF Push Protocol (NPP), SNEP, PHDC, and Connection Handover specifications are implemented through neard plugins. One of neard's main design goals is to provide a small, simple, and uniform NFC API for Linux-based applications willing to provide high-level NFC services. This is achieved through a small D-Bus API that abstracts tag and device interfaces and methods, hiding the NFC complexity away from application developers. This API is compatible with the freedesktop D-Bus ObjectManager API and provides the following interfaces:

 * org.neard.Adapter: For detecting new NFC controllers, turning them on and off, and starting NFC polls.

 * org.neard.Device, org.neard.Tag: For representing detected NFC devices and tags. Calling the Device.Push method will send NDEFs to the peer device, while Tag.Write will write them to the selected tag.

 * org.neard.Record: Represents human-readable and understandable NDEF record payloads and properties. Registering agents against the org.neard.NDEFAgent interface will give applications access to the raw NDEF payloads.

You can find more information about the neard userspace daemon here: http://git.kernel.org/cgit/network/nfc/neard.git/tree/doc .

### NFC on Android

Initial NFC support was added to the Android operating system in December 2010, with the official 2.3 (Gingerbread) release. Android 2.3 only supported the reader/writer mode, but things have improved significantly since then, and the latest Android releases (Jelly Bean 4.3) come with fully featured NFC support. For more information, see the Android NFC page: http://developer.android.com/guide/topics/connectivity/nfc/index.html . Following the classic Android architecture, a Java-specific NFC API is available for applications to provide NFC services and operations. It is left to integrators to implement these APIs through native hardware abstraction layers (HAL).
Google ships a Broadcom NFC HAL that currently only supports Broadcom NFC hardware. Here again, it is left to Android OEMs and integrators to either adapt the Broadcom NFC HAL to their selected NFC chipset or implement their own HAL. It is important to note that since the Broadcom stack implements the NFC Controller Interface (NCI) specification, it is relatively easy to adapt it to support any NCI-compatible NFC controller. The Android NFC architecture is what one could call a userspace NFC stack: in fact, the entire NFC implementation is done in userspace through the HAL. NFC frames are then pushed down to the NFC controller through a kernel driver stub. The driver simply encapsulates those frames into buffers that are ready to be sent over the physical link (e.g., I2C, SPI, UART) between the host platform and the NFC controller.

Note

Pull requests of the nfc-next git tree are sent to the wireless-next tree (apart from the NFC subsystem, the Bluetooth subsystem and the mac80211 subsystem pull requests are also handled by the wireless maintainer). From the wireless-next tree, pull requests are sent to the net-next tree, and from there to Linus's tree. The nfc-next tree is available at: git://git.kernel.org/pub/scm/linux/kernel/git/sameo/nfc-next.git

There is also an nfc-fixes git repository, which contains urgent and critical fixes for the current release (-rc*). The nfc-fixes tree is available at: git://git.kernel.org/pub/scm/linux/kernel/git/sameo/nfc-fixes.git/

NFC mailing list: linux-nfc@lists.01.org.

NFC mailing list archives: https://lists.01.org/pipermail/linux-nfc/ .

In this section you learned what NFC is in general, about the Linux NFC subsystem implementation, and about the Android NFC subsystem implementation. In the next section I will discuss the notification chains mechanism, which is an important mechanism for informing network devices about various events.

## Notification Chains

The state of network devices can change dynamically; from time to time, the user/administrator can register/unregister network devices, change their MAC address, change their MTU, etc. The network stack and other subsystems and modules should be able to be notified about these events and handle them properly. The network notification chains provide a mechanism for handling such events, and I will describe their API and the possible network events they handle in this section. For a full list of the events, see Table 14-1 later in this section. Every subsystem and every module can register itself with notification chains. This is done by defining a notifier_block and registering it. The core methods for notification chain registration and unregistration are the notifier_chain_register() and notifier_chain_unregister() methods, respectively. Generation of notification events is done by calling the notifier_call_chain() method. These three methods are not used directly (they are not exported; see kernel/notifier.c), and they do not use any locking mechanism. The following methods are wrappers around notifier_chain_register(), all of them implemented in kernel/notifier.c:

 * atomic_notifier_chain_register()

 * blocking_notifier_chain_register()

 * raw_notifier_chain_register()

 * srcu_notifier_chain_register()

 * register_die_notifier()

Table 14-1.
Table 14-1. Network Device Events

Event | Meaning
---|---
NETDEV_UP | Device up event
NETDEV_DOWN | Device down event
NETDEV_REBOOT | Detected a hardware crash and restarted the device
NETDEV_CHANGE | Device state change
NETDEV_REGISTER | Device registration event
NETDEV_UNREGISTER | Device unregistration event
NETDEV_CHANGEMTU | Device MTU changed
NETDEV_CHANGEADDR | Device MAC address changed
NETDEV_GOING_DOWN | Device is going down
NETDEV_CHANGENAME | Device has changed its name
NETDEV_FEAT_CHANGE | Device features changed
NETDEV_BONDING_FAILOVER | Bonding failover event
NETDEV_PRE_UP | This event enables vetoing a change of the device state to UP; for example, in cfg80211, denying interfaces to be set UP if the device is known to be rfkill'ed. See cfg80211_netdev_notifier_call().
NETDEV_PRE_TYPE_CHANGE | The device is about to change its type. This is a generalization of the NETDEV_BONDING_OLDTYPE flag, which was replaced by NETDEV_PRE_TYPE_CHANGE.
NETDEV_POST_TYPE_CHANGE | The device changed its type. This is a generalization of the NETDEV_BONDING_NEWTYPE flag, which was replaced by NETDEV_POST_TYPE_CHANGE.
NETDEV_POST_INIT | This event is generated during device registration (register_netdevice()), before creating the network device kobjects by netdev_register_kobject(); used in cfg80211 (net/wireless/core.c).
NETDEV_UNREGISTER_FINAL | An event which is generated to finalize the device unregistration.
NETDEV_RELEASE | The last slave of a bond is released (when working with netconsole over bonding). (This flag was also once used for bridges, in br_if.c.)
NETDEV_NOTIFY_PEERS | Notify network peers event (i.e., a device wants to inform the rest of the network about some sort of reconfiguration, such as a failover event or a virtual machine migration).
NETDEV_JOIN | The device added a slave. Used, for example, in the bonding driver, in the bond_enslave() method, where we add a slave; see drivers/net/bonding/bond_main.c.

There are also corresponding wrapper methods for unregistering notification chains and for generating notification events for each of these wrappers. For example, for a notification chain registered with the atomic_notifier_chain_register() method, the atomic_notifier_chain_unregister() method unregisters the chain, and the __atomic_notifier_call_chain() method generates notification events. Each of these wrappers also has a corresponding macro to define a notification chain; for the atomic_notifier_chain_register() wrapper it is the ATOMIC_NOTIFIER_HEAD macro (include/linux/notifier.h).

After registering a notifier_block object, whenever one of the events shown in Table 14-1 occurs, the callback specified in the notifier_block is invoked. The fundamental data structure of notification chains is the notifier_block structure; let's take a look:

```c
struct notifier_block {
	int (*notifier_call)(struct notifier_block *, unsigned long, void *);
	struct notifier_block __rcu *next;
	int priority;
};
```

(include/linux/notifier.h)

 * notifier_call: The callback to be invoked.

 * priority: Callbacks of notifier_block objects with higher priority are performed first.

There are many chains in the networking subsystem and in other subsystems. Let's mention some of the important ones:

 * netdev_chain: Registered by the register_netdevice_notifier() method and unregistered by the unregister_netdevice_notifier() method (net/core/dev.c).
 * inet6addr_chain: Registered by the register_inet6addr_notifier() method and unregistered by the unregister_inet6addr_notifier() method. Notifications are generated by the inet6addr_notifier_call_chain() method (net/ipv6/addrconf_core.c).

 * netevent_notif_chain: Registered by the register_netevent_notifier() method and unregistered by the unregister_netevent_notifier() method. Notifications are generated by the call_netevent_notifiers() method (net/core/netevent.c).

 * inetaddr_chain: Registered by the register_inetaddr_notifier() method and unregistered by the unregister_inetaddr_notifier() method. Notifications are generated by calling the blocking_notifier_call_chain() method.

Let's take a look at an example of using the netdev_chain. You saw earlier that with netdev_chain, registration is done with the register_netdevice_notifier() method, which is a wrapper around the raw_notifier_chain_register() method. Following is an example of registering a callback named br_device_event. First, a notifier_block object is defined, and then it is registered by calling the register_netdevice_notifier() method:

```c
struct notifier_block br_device_notifier = {
	.notifier_call = br_device_event
};
```

(net/bridge/br_notify.c)

```c
static int __init br_init(void)
{
	...
	register_netdevice_notifier(&br_device_notifier);
	...
}
```

(net/bridge/br.c)

Notifications of the netdev_chain are generated by invoking the call_netdevice_notifiers() method. The first parameter of this method is the event. The call_netdevice_notifiers() method is in fact a wrapper around raw_notifier_call_chain().

So, when a network notification is generated, all the callbacks which were registered are invoked; in this example, the br_device_event() callback will be called, regardless of which network event occurred, and the callback will decide how to handle the notification, or perhaps ignore it. Let's take a look at the callback method, br_device_event():

```c
static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
	struct net_bridge_port *p;
	struct net_bridge *br;
	bool changed_addr;
	int err;
	. . .
```

The second parameter of the br_device_event() method is the event (all the events are defined in include/linux/netdevice.h):

```c
	switch (event) {
	case NETDEV_CHANGEMTU:
		dev_set_mtu(br->dev, br_min_mtu(br));
		break;
	. . .
	}
```

Note

Registration of notification chains is not limited to the networking subsystem. Thus, for example, the clockevents subsystem defines a chain called clockevents_chain and registers it by calling the raw_notifier_chain_register() method, and the hung_task module defines a chain named panic_notifier_list and registers it by calling the atomic_notifier_chain_register() method.

Besides the notifications discussed in this section, there is another type of notifications, named RTNetlink notifications; these notifications are sent with the rtmsg_ifinfo() method. This type of notifications was discussed in Chapter 2, which dealt with Netlink Sockets.

The event types supported for networking are those listed in Table 14-1 earlier in this section; they are defined in include/linux/netdevice.h.

We have now covered notification events, a mechanism that enables network devices to get notifications about events such as change of MTU, change of MAC address, and more; a minimal usage sketch follows.
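The following sketch ties the pieces together: a module that registers its own callback on the netdev_chain. The my_* names are hypothetical, and the sketch assumes the convention of this kernel era (also used by br_device_event() above) that the notifier's third argument points directly to the net_device:

```c
#include <linux/module.h>
#include <linux/netdevice.h>

/* Log MTU changes of any network device. */
static int my_netdev_event(struct notifier_block *unused,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_CHANGEMTU)
		pr_info("%s: MTU changed to %u\n", dev->name, dev->mtu);

	return NOTIFY_DONE;
}

static struct notifier_block my_notifier = {
	.notifier_call = my_netdev_event,
};

static int __init my_init(void)
{
	return register_netdevice_notifier(&my_notifier);
}

static void __exit my_exit(void)
{
	unregister_netdevice_notifier(&my_notifier);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
```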
The next section briefly discusses the PCI subsystem, describing some of its main data structures.

## The PCI Subsystem

Many network interface cards are Peripheral Component Interconnect (PCI) devices and should work in conjunction with the Linux PCI subsystem. Not all network interfaces are PCI devices; there are many embedded devices where the network interface is not on a PCI bus. The initialization and handling of these devices is done in a different way, and the following discussion is not relevant for these non-PCI devices. Newer PCI devices are PCI Express (PCIe or PCIE) devices; the standard was created in 2004. They have a serial interface instead of a parallel interface, and as a result they have higher maximum system bus throughput. Each PCI device has a read-only configuration space; it is at least 256 bytes. The extended configuration space, available on PCI-X 2.0 and PCI Express buses, is 4096 bytes. You can read the PCI configuration space and the extended PCI configuration space with lspci (the lspci utility belongs to the pciutils package):

 * lspci -xxx: Shows a hexadecimal dump of the PCI configuration space.

 * lspci -xxxx: Shows a hexadecimal dump of the extended PCI configuration space.

The Linux PCI API provides three methods for reading the configuration space, handling 8-, 16-, and 32-bit granularity:

 * static inline int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val)

 * static inline int pci_read_config_word(const struct pci_dev *dev, int where, u16 *val)

 * static inline int pci_read_config_dword(const struct pci_dev *dev, int where, u32 *val)

There are also three methods for writing the configuration space; likewise, 8-, 16-, and 32-bit granularities are handled:

 * static inline int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val)

 * static inline int pci_write_config_word(const struct pci_dev *dev, int where, u16 val)

 * static inline int pci_write_config_dword(const struct pci_dev *dev, int where, u32 val)

Every PCI manufacturer assigns values to at least the vendor, device, and class fields in the configuration space of the PCI device. A PCI device is identified by the Linux PCI subsystem by a pci_device_id object. The pci_device_id struct is defined in include/linux/mod_devicetable.h:

```c
struct pci_device_id {
	__u32 vendor, device;		/* Vendor and device ID or PCI_ANY_ID */
	__u32 subvendor, subdevice;	/* Subsystem IDs or PCI_ANY_ID */
	__u32 class, class_mask;	/* (class, subclass, prog-if) triplet */
	kernel_ulong_t driver_data;	/* Data private to the driver */
};
```

The vendor, device, and class fields in pci_device_id identify a PCI device; most drivers do not need to specify the class, as vendor/device is normally sufficient. A short sketch of reading the configuration space follows.
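As a quick illustration of the configuration space accessors, here is a minimal, hedged sketch; dump_pci_ids() is a hypothetical helper, and PCI_VENDOR_ID and PCI_DEVICE_ID are the standard register offsets from pci_regs.h:

```c
#include <linux/pci.h>
#include <linux/printk.h>

/* Hypothetical helper: read the vendor and device IDs from the
 * configuration space of a PCI device (pdev is assumed valid). */
static void dump_pci_ids(struct pci_dev *pdev)
{
	u16 vendor, device;

	pci_read_config_word(pdev, PCI_VENDOR_ID, &vendor);
	pci_read_config_word(pdev, PCI_DEVICE_ID, &device);
	pr_info("PCI device %04x:%04x\n", vendor, device);
}
```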
Each PCI device driver declares a pci_driver object. Let's take a look at the pci_driver structure:

```c
struct pci_driver {
	. . .
	const char *name;
	const struct pci_device_id *id_table;	/* must be non-NULL for probe to be called */
	int (*probe)(struct pci_dev *dev, const struct pci_device_id *id);	/* New device inserted */
	void (*remove)(struct pci_dev *dev);	/* Device removed (NULL if not a hot-plug capable driver) */
	int (*suspend)(struct pci_dev *dev, pm_message_t state);	/* Device suspended */
	. . .
	int (*resume)(struct pci_dev *dev);	/* Device woken up */
	. . .
};
```

(include/linux/pci.h)

Here are short descriptions of the members of the pci_driver structure:

 * name: Name of the PCI driver.

 * id_table: An array of pci_device_id objects that the driver supports. Initializing id_table is usually done with the DEFINE_PCI_DEVICE_TABLE macro.

 * probe: A method for device initialization.

 * remove: A method for freeing the device. The remove() method usually frees all the resources that were assigned in the probe() method.

 * suspend: A power management callback that puts the device into a low power state, for devices that support power management.

 * resume: A power management callback that wakes the device from a low power state, for devices that support power management.

A PCI device is represented by struct pci_dev. It is a large structure; let's take a look at some of its members (they are self-explanatory):

```c
struct pci_dev {
	. . .
	unsigned short vendor;
	unsigned short device;
	unsigned short subsystem_vendor;
	unsigned short subsystem_device;
	. . .
	struct pci_driver *driver;	/* which driver has allocated this device */
	. . .
	pci_power_t current_state;	/* Current operating state. In ACPI-speak,
					   this is D0-D3, D0 being fully functional,
					   and D3 being off. */
	struct device dev;		/* Generic device interface */
	int cfg_size;			/* Size of configuration space */
	unsigned int irq;
};
```

(include/linux/pci.h)

Registering a PCI network device against the PCI subsystem is done by defining a pci_driver object and calling the pci_register_driver() macro, which gets a pci_driver object as its single argument. In order to initialize the PCI device before it is used, a driver should call the pci_enable_device() method. This method wakes up the device if it was suspended and allocates the required I/O resources and memory resources. Unregistering the PCI driver is done by the pci_unregister_driver() method. Usually the pci_register_driver() macro is called in the driver module_init() method, and the pci_unregister_driver() method is called in the driver module_exit() method. Each driver should call the request_irq() method, specifying the IRQ handler, when the device is brought up, and call free_irq() when the device is brought down. A skeleton tying these steps together is sketched below, after the following notes.

Allocation and freeing of DMA (Direct Memory Access) memory is usually done with dma_alloc_coherent()/dma_free_coherent() when working with an uncached memory buffer. With dma_alloc_coherent() we don't need to worry about cache coherency, as the mappings created by this method are cache-coherent. See, for example, e1000_alloc_ring_dma() in drivers/net/ethernet/intel/e1000e/netdev.c. The Linux DMA API is described in Documentation/DMA-API.txt.

Note

Single Root I/O Virtualization (SR-IOV) is a PCI feature that makes one physical device appear as several virtual devices. The SR-IOV specification was created by the PCI SIG. See http://www.pcisig.com/specifications/iov/single_root/ . For more information see Documentation/PCI/pci-iov-howto.txt.

More information about PCI can be found in the third edition of "Linux Device Drivers" by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman, which is available (under Creative Commons License) at this URL: http://lwn.net/Kernel/LDD3/ .
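Here is a minimal, hedged skeleton of a PCI network driver registration; the my_nic_* names and the vendor/device IDs are hypothetical, and a real probe() would also map BARs, allocate and register a net_device, set up DMA rings, and so on:

```c
#include <linux/module.h>
#include <linux/pci.h>

/* Hypothetical vendor/device IDs, for illustration only. */
static DEFINE_PCI_DEVICE_TABLE(my_nic_tbl) = {
	{ PCI_DEVICE(0x1234, 0x5678) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, my_nic_tbl);

static int my_nic_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int err;

	/* Wake the device and allocate its I/O and memory resources. */
	err = pci_enable_device(pdev);
	if (err)
		return err;

	/* Map BARs, allocate the net_device, call request_irq(), etc. */
	return 0;
}

static void my_nic_remove(struct pci_dev *pdev)
{
	/* Call free_irq(), unmap BARs, free the net_device, etc. */
	pci_disable_device(pdev);
}

static struct pci_driver my_nic_driver = {
	.name     = "my_nic",
	.id_table = my_nic_tbl,
	.probe    = my_nic_probe,
	.remove   = my_nic_remove,
};

static int __init my_nic_init(void)
{
	return pci_register_driver(&my_nic_driver);
}

static void __exit my_nic_exit(void)
{
	pci_unregister_driver(&my_nic_driver);
}

module_init(my_nic_init);
module_exit(my_nic_exit);
MODULE_LICENSE("GPL");
```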
### Wake-On-LAN (WOL)

Wake-On-LAN is a standard that allows a device that has been soft-powered-down to be powered up or awakened by a network packet. Wake-On-LAN is disabled by default. Some network device drivers let the sysadmin enable the Wake-On-LAN feature, usually by running the ethtool command from userspace. In order to support this, the network device driver should define a set_wol() callback in the ethtool_ops object; see, for example, the RealTek 8139cp driver (drivers/net/ethernet/realtek/8139cp.c). Running ethtool shows whether the network device supports Wake-On-LAN. ethtool also lets the sysadmin define which packets should wake the device; for example, ethtool -s eth1 wol g will enable Wake-On-LAN for MagicPacket frames (MagicPacket is a standard of AMD). You can use the ether-wake utility of the net-tools package to send Wake-On-LAN MagicPacket frames. A sketch of the corresponding ethtool_ops callbacks follows.
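The following is a minimal, hedged sketch of how a driver might wire up Wake-On-LAN in its ethtool_ops; the my_* names are hypothetical, and a real driver would program its hardware registers and report the currently enabled modes:

```c
#include <linux/ethtool.h>
#include <linux/netdevice.h>

/* Report MagicPacket as the only supported wake-up mode. */
static void my_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
{
	wol->supported = WAKE_MAGIC;
	wol->wolopts = 0;	/* a real driver reports the enabled modes here */
}

/* Accept "ethtool -s ethX wol g" and reject everything else. */
static int my_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
{
	if (wol->wolopts & ~WAKE_MAGIC)
		return -EOPNOTSUPP;

	/* Program the hardware to wake on MagicPacket frames here. */
	return 0;
}

static const struct ethtool_ops my_ethtool_ops = {
	.get_wol = my_get_wol,
	.set_wol = my_set_wol,
};
```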
## Teaming Network Device

The virtual teaming network device driver is intended to be a replacement for the bonding network device (drivers/net/bonding). The bonding network device provides a link aggregation solution (also known as "link bundling" or "trunking"); see Documentation/networking/bonding.txt. The bonding driver is implemented fully in the kernel and is known to be very large and prone to problems. The teaming network driver, as opposed to the bonding network driver, is controlled by userspace. The userspace daemon is called teamd, and it communicates with the kernel teaming driver through a library named libteam. The libteam library is based on generic netlink sockets (see Chapter 2).

There are four modes for the teaming driver:

 * loadbalance: Used in Link Aggregation Control Protocol (LACP), which is part of the 802.3ad standard.

 * net/team/team_mode_loadbalance.c

 * activebackup: Only one port is active at a given time. This port can transmit and receive SKBs. The other ports are backup ports. A userspace application can specify which port to use as the active port.

 * net/team/team_mode_activebackup.c

 * broadcast: All packets are sent by all ports.

 * net/team/team_mode_broadcast.c

 * roundrobin: Selection of ports is done by a round robin algorithm. No interaction with userspace is needed for this mode.

 * net/team/team_mode_roundrobin.c

Note

The teaming network driver resides under drivers/net/team and is developed by Jiri Pirko.

For more information see http://libteam.org/ .

libteam site: https://github.com/jpirko/libteam .

This concludes our brief overview of the teaming driver. Many readers use PPPoE services when surfing the Internet; the following short section covers the PPPoE protocol.

## The PPPoE Protocol

PPPoE is a specification for connecting multiple clients to a remote site. PPPoE is typically used by DSL providers to handle IP addresses and authenticate users. The PPPoE protocol provides the ability to use PPP encapsulation for Ethernet packets. The PPPoE protocol is specified in RFC 2516 from 1999, and the PPP protocol is specified in RFC 1661 from 1994. There are two stages in PPPoE:

 * PPPoE discovery stage. The discovery is done in a client-server session. The server is called an Access Concentrator, and there can be more than one. These Access Concentrators are often deployed by an Internet Service Provider (ISP). These are the four steps in the discovery stage:

 * The PPPoE Active Discovery Initiation (PADI). A broadcast packet is sent from a host. The code in the PPPoE header is 0x09 (PADI_CODE), and the session id (sid) in the PPPoE header must be 0.

 * The PPPoE Active Discovery Offer (PADO). An Access Concentrator replies to a PADI request with a PADO reply. The destination address is the address of the host that sent the PADI. The code in the PPPoE header is 0x07 (PADO_CODE). The session id (sid) in the PPPoE header must again be 0.

 * PPPoE Active Discovery Request (PADR). A host sends a PADR packet to an Access Concentrator after it receives a PADO reply. The code in the PPPoE header is 0x19 (PADR_CODE). The session id (sid) in the PPPoE header must again be 0.

 * PPPoE Active Discovery Session-confirmation (PADS). When the Access Concentrator gets a PADR request, it generates a unique session id and sends a PADS packet as a reply. The code in the PPPoE header is 0x65 (PADS_CODE). The session id (sid) in the PPPoE header is the session id that it generated. The destination of the packet is the IP address of the host that sent the PADR request.

 * A session is terminated by sending a PPPoE Active Discovery Terminate (PADT) packet. The code in the PPPoE header is 0xa7 (PADT_CODE). A PADT can be sent either by an Access Concentrator or a host, and it can be sent at any time after the session was established. The destination address is a unicast address. The ethertype of the Ethernet header of all five discovery packets (PADI, PADO, PADR, PADS, and PADT) is 0x8863 (ETH_P_PPP_DISC).

 * PPPoE Session stage. Once the PPPoE discovery stage has completed successfully, packets are sent using PPP encapsulation, which means adding a PPP header of two bytes. Using PPP enables registration and authentication using PPP subprotocols like the Password Authentication Protocol (PAP) or the Challenge Handshake Authentication Protocol (CHAP), and also the PPP subprotocol called the Link Control Protocol (LCP), which is responsible for establishing and testing the data-link connection. The ethertype of the Ethernet header is 0x8864 (ETH_P_PPP_SES).

Every PPPoE packet starts with a 6-byte PPPoE header, and you should learn about the PPPoE header in order to better understand the PPPoE protocol.

## PPPoE Header

I will start by showing the PPPoE header definition in the Linux kernel:

```c
struct pppoe_hdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8 ver : 4;
	__u8 type : 4;
#elif defined(__BIG_ENDIAN_BITFIELD)
	__u8 type : 4;
	__u8 ver : 4;
#else
#error "Please fix <asm/byteorder.h>"
#endif
	__u8 code;
	__be16 sid;
	__be16 length;
	struct pppoe_tag tag[0];
} __packed;
```

(include/uapi/linux/if_pppox.h)

The following is a description of the members of the pppoe_hdr structure:

 * ver: The ver field is a 4-bit field, and it must be set to 0x1, according to section 4 in RFC 2516.

 * type: The type field is a 4-bit field, and it must also be set to 0x1, according to section 4 in RFC 2516.

 * code: The code field is an 8-bit field, and it can be one of the constants mentioned earlier: PADI_CODE, PADO_CODE, PADR_CODE, PADS_CODE, and PADT_CODE.

 * sid: Session ID (16-bit).

 * length: The length is a 16-bit field, and it represents the length of the PPPoE payload, without the length of the PPPoE header or the length of the Ethernet header.

 * tag[0]: The PPPoE payload can contain zero or more tags, in a type-length-value (TLV) format. A tag consists of 3 fields:

 * TAG_TYPE: 16-bit (for example, AC-Name, Service-Name, Generic-Error, and more).

 * TAG_LENGTH: 16-bit.

 * TAG_VALUE: Variable in length.

 * Appendix A of RFC 2516 lists the various TAG_TYPEs and TAG_VALUEs.

Figure 14-9 shows the PPPoE header.

Figure 14-9. PPPoE header
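As a small illustration of these fields, here is a hedged userspace sketch that fills the fixed part of a PADI discovery header; fill_padi_header() is a hypothetical helper, and tags are omitted for brevity:

```c
#include <string.h>
#include <arpa/inet.h>
#include <linux/if_pppox.h>

/* Hypothetical helper: fill the fixed 6-byte PPPoE header of a
 * tagless PADI packet, following section 4 of RFC 2516. */
static void fill_padi_header(struct pppoe_hdr *ph)
{
	memset(ph, 0, sizeof(*ph));
	ph->ver = 1;		/* must be 0x1 */
	ph->type = 1;		/* must be 0x1 */
	ph->code = PADI_CODE;	/* 0x09 */
	ph->sid = 0;		/* sid must be 0 during discovery */
	ph->length = htons(0);	/* payload length, excluding headers */
}
```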
### PPPoE Initialization

PPPoE initialization is done by the pppoe_init() method (drivers/net/ppp/pppoe.c). Two PPPoE protocol handlers are registered, one for PPPoE discovery packets and one for PPPoE session packets. Let's take a look at the PPPoE protocol handler registration:

```c
static struct packet_type pppoes_ptype __read_mostly = {
	.type = cpu_to_be16(ETH_P_PPP_SES),
	.func = pppoe_rcv,
};

static struct packet_type pppoed_ptype __read_mostly = {
	.type = cpu_to_be16(ETH_P_PPP_DISC),
	.func = pppoe_disc_rcv,
};

static int __init pppoe_init(void)
{
	int err;

	dev_add_pack(&pppoes_ptype);
	dev_add_pack(&pppoed_ptype);
	. . .
	return 0;
}
```

The dev_add_pack() method is the generic method for registering protocol handlers, which you encountered in previous chapters. The protocol handlers registered by the pppoe_init() method are:

 * The pppoe_disc_rcv() method is the handler for PPPoE discovery packets.

 * The pppoe_rcv() method is the handler for PPPoE session packets.

The PPPoE module exports an entry to procfs, /proc/net/pppoe. This entry shows the session id, the MAC address, and the device of each current PPPoE session. Running cat /proc/net/pppoe is handled by the pppoe_seq_show() method. A notifier chain is registered by the pppoe_init() method by calling register_netdevice_notifier(&pppoe_notifier).

#### PPPoX Sockets

PPPoX sockets are represented by the pppox_sock structure (include/linux/if_pppox.h) and are implemented in drivers/net/ppp/pppox.c. These sockets implement a generic PPP encapsulation socket family. Apart from PPPoE, they are also used by the Layer 2 Tunneling Protocol (L2TP) over PPP. PPPoX sockets are registered by calling register_pppox_proto(PX_PROTO_OE, &pppoe_proto) in the pppoe_init() method. Let's take a look at the definition of the pppox_sock structure:

```c
struct pppox_sock {
	/* struct sock must be the first member of pppox_sock */
	struct sock sk;
	struct ppp_channel chan;
	struct pppox_sock *next;	/* for hash table */
	union {
		struct pppoe_opt pppoe;
		struct pptp_opt pptp;
	} proto;
	__be16 num;
};
```

(include/linux/if_pppox.h)

When the PPPoX socket is used by PPPoE, the pppoe_opt member of the proto union of the pppox_sock object is used. The pppoe_opt structure includes a member called pa, which is an instance of the pppoe_addr structure. The pppoe_addr structure represents the parameters of the PPPoE session: the session id, the remote MAC address of the peer, and the name of the network device that is used:

```c
struct pppoe_addr {
	sid_t sid;			/* Session identifier */
	unsigned char remote[ETH_ALEN];	/* Remote address */
	char dev[IFNAMSIZ];		/* Local device to use */
};
```

(include/uapi/linux/if_pppox.h)

Note

Access to the pa member of the pppoe_opt structure, which is embedded in the proto union, is done in most cases in the PPPoE module using the pppoe_pa macro:

```c
#define pppoe_pa proto.pppoe.pa
```

(include/linux/if_pppox.h)

A userspace sketch showing how these addresses are used when opening a PPPoE session appears below.
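The following is a minimal, hedged userspace sketch of opening a PPPoE session socket; open_pppoe_session() is a hypothetical helper, and a real client, such as pppd's rp-pppoe plugin described in the next section, obtains the session id and the Access Concentrator's MAC address during the discovery stage:

```c
#include <string.h>
#include <sys/socket.h>
#include <linux/if_pppox.h>

/* Hypothetical helper: bind an AF_PPPOX socket to an established
 * PPPoE session. The connect() call below is handled in the kernel
 * by the pppoe_connect() method. */
static int open_pppoe_session(const char *ifname, __be16 sid,
			      const unsigned char *remote_mac)
{
	struct sockaddr_pppox sp;
	int fd;

	fd = socket(AF_PPPOX, SOCK_STREAM, PX_PROTO_OE);
	if (fd < 0)
		return -1;

	memset(&sp, 0, sizeof(sp));
	sp.sa_family = AF_PPPOX;
	sp.sa_protocol = PX_PROTO_OE;
	sp.sa_addr.pppoe.sid = sid;		/* learned during discovery */
	memcpy(sp.sa_addr.pppoe.remote, remote_mac, ETH_ALEN);
	strncpy(sp.sa_addr.pppoe.dev, ifname, IFNAMSIZ - 1);

	if (connect(fd, (struct sockaddr *)&sp, sizeof(sp)) < 0)
		return -1;

	return fd;
}
```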
### Sending and Receiving Packets with PPPoE

Once the discovery stage is completed, the PPP protocol must be used in order to enable traffic between the two peers, as was mentioned earlier. When starting a PPP connection by running, for example, pppd eth0 (see the example later in this section), the userspace pppd daemon creates a PPPoE socket by calling socket(AF_PPPOX, SOCK_STREAM, PX_PROTO_OE); this is done in the rp-pppoe plugin of the pppd daemon, in the PPPOEConnectDevice() method of pppd/plugins/rp-pppoe/plugin.c. This socket() system call is handled by the pppoe_create() method of the PPPoE kernel module. Releasing the socket after the PPPoE session has completed is done by the pppoe_release() method of the PPPoE kernel module. Let's take a look at the pppoe_create() method:

```c
static const struct proto_ops pppoe_ops = {
	.family		= AF_PPPOX,
	.owner		= THIS_MODULE,
	.release	= pppoe_release,
	.bind		= sock_no_bind,
	.connect	= pppoe_connect,
	. . .
	.sendmsg	= pppoe_sendmsg,
	.recvmsg	= pppoe_recvmsg,
	. . .
	.ioctl		= pppox_ioctl,
};

static int pppoe_create(struct net *net, struct socket *sock)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);

	sock->state = SS_UNCONNECTED;
	sock->ops   = &pppoe_ops;

	sk->sk_backlog_rcv = pppoe_rcv_core;
	sk->sk_state	   = PPPOX_NONE;
	sk->sk_type	   = SOCK_STREAM;
	sk->sk_family	   = PF_PPPOX;
	sk->sk_protocol	   = PX_PROTO_OE;

	return 0;
}
```

(drivers/net/ppp/pppoe.c)

By defining pppoe_ops we set the callbacks for this socket, so calling the connect() system call from userspace on an AF_PPPOX socket will be handled by the pppoe_connect() method of the PPPoE module in the kernel. After creating a PPPoE socket, the PPPOEConnectDevice() method calls connect(). Let's take a look at the pppoe_connect() method:

```c
static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr,
			 int sockaddr_len, int flags)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pppox *sp = (struct sockaddr_pppox *)uservaddr;
	struct pppox_sock *po = pppox_sk(sk);
	struct net_device *dev = NULL;
	struct pppoe_net *pn;
	struct net *net = NULL;
	int error;

	lock_sock(sk);

	error = -EINVAL;
	if (sp->sa_protocol != PX_PROTO_OE)
		goto end;

	/* Check for already bound sockets */
	error = -EBUSY;
```

The stage_session() method returns true when the session id is not 0 (as mentioned earlier, the session id is 0 only in the discovery stage). In case the socket is connected and is in the session stage, the socket is already bound, so we exit:

```c
	if ((sk->sk_state & PPPOX_CONNECTED) &&
	    stage_session(sp->sa_addr.pppoe.sid))
		goto end;
```

Reaching here means that the socket is not connected (its sk_state is not PPPOX_CONNECTED) and we need to register a PPP channel:

```c
	. . .
	/* Re-bind in session stage only */
	if (stage_session(sp->sa_addr.pppoe.sid)) {
		error = -ENODEV;
		net = sock_net(sk);
		dev = dev_get_by_name(net, sp->sa_addr.pppoe.dev);
		if (!dev)
			goto err_put;

		po->pppoe_dev = dev;
		po->pppoe_ifindex = dev->ifindex;
		pn = pppoe_pernet(net);
```

The network device must be up:

```c
		if (!(dev->flags & IFF_UP)) {
			goto err_put;
		}

		memcpy(&po->pppoe_pa,
		       &sp->sa_addr.pppoe,
		       sizeof(struct pppoe_addr));

		write_lock_bh(&pn->hash_lock);
```

The __set_item() method inserts the pppox_sock object, po, into the PPPoE socket hashtable; the hash key is generated from the session id and the remote peer MAC address by the hash_item() method. The remote peer MAC address is po->pppoe_pa.remote.
If there is an entry in the hash table with the same session id, the same remote MAC address, and the same ifindex of the network device, the __set_item() method will return an error of -EALREADY:

```c
		error = __set_item(pn, po);
		write_unlock_bh(&pn->hash_lock);
		if (error < 0)
			goto err_put;
```

po->chan is a ppp_channel object; see the pppox_sock structure definition earlier. Before registering it with the ppp_register_net_channel() method, some of its members should be initialized:

```c
		po->chan.hdrlen = (sizeof(struct pppoe_hdr) +
				   dev->hard_header_len);
		po->chan.mtu = dev->mtu - sizeof(struct pppoe_hdr);
		po->chan.private = sk;
		po->chan.ops = &pppoe_chan_ops;

		error = ppp_register_net_channel(dev_net(dev), &po->chan);
		if (error) {
```

The delete_item() method deletes a pppox_sock object from the PPPoE socket hashtable:

```c
			delete_item(pn, po->pppoe_pa.sid,
				    po->pppoe_pa.remote, po->pppoe_ifindex);
			goto err_put;
		}
```

Set the socket state to be connected:

```c
		sk->sk_state = PPPOX_CONNECTED;
	}

	po->num = sp->sa_addr.pppoe.sid;

end:
	release_sock(sk);
	return error;
err_put:
	if (po->pppoe_dev) {
		dev_put(po->pppoe_dev);
		po->pppoe_dev = NULL;
	}
	goto end;
}
```

Registering a PPP channel allows us to use PPP services: PPPoE session packets are processed by calling the generic PPP method, ppp_input(), from the pppoe_rcv_core() method, and transmission of PPPoE session packets is done with the generic ppp_start_xmit() method.

RP-PPPoE is an open source project which provides a PPPoE client and a PPPoE server for Linux: http://www.roaringpenguin.com/products/pppoe . A simple example of running a PPPoE server is:

pppoe-server -I p3p1 -R 192.168.3.101 -L 192.168.3.210 -N 200

The options used in this example are:

 * -I: The interface name (p3p1)

 * -L: Set the local IP address (192.168.3.210)

 * -R: Set the starting remote IP address (192.168.3.101)

 * -N: Max number of concurrent PPPoE sessions (200 in this case)

For other options, see man 8 pppoe-server.

Clients on the same LAN can create a PPPoE connection to this server with the pppd daemon, using the rp-pppoe plugin.

Android's popularity as a mobile operating system for smartphones and tablets is growing steadily. I will conclude the book with a short section about Android, briefly discussing the Android development model and showing four examples of Android networking.

## Android

In recent years, the Android operating system has proved to be a very reliable and successful mobile OS. The Android operating system is based on a Linux kernel, with changes by Google developers. Android runs on hundreds of types of mobile devices, which are mostly based on the ARM processor. (I should mention that there is a project porting Android to Intel x86 processors, http://www.android-x86.org/ .) The first generation of Google TV devices is based on x86 processors by Intel, but the second generation of Google TV devices is based on ARM. Originally Android was developed by "Android Inc.", a company that was founded in California in 2003 by Andy Rubin and others. Google bought this company in 2005. The Open Handset Alliance (OHA), a consortium of over 80 companies, announced Android in 2007. Android is an open source operating system, and its source code is released under the Apache License. Unlike Linux, most of the development is done by Google employees behind closed doors.
As opposed to Linux, there is no public mailing list where developers send and discuss patches. One can, however, send patches to the public Gerrit (see http://source.android.com/source/submit-patches.html ), but it is up to Google alone to decide whether or not they will be included in the Android tree.

Google developers have contributed a lot to the Linux kernel. You learned earlier in this chapter that the cgroup subsystem was started by Google developers. I will also mention two Linux kernel networking patches, the Receive Packet Steering (RPS) patch and the Receive Flow Steering (RFS) patch, by Tom Herbert from Google (see http://lwn.net/Articles/362339/ and http://lwn.net/Articles/382428/ ), which were integrated into kernel 2.6.35. When working with multicore platforms, RPS and RFS let you steer packets to a specific CPU according to the hash of the payload. There are many other examples of contributions from Google to the Linux kernel, and it seems that in the future you will encounter many more important contributions to the Linux kernel from Google. One can find a lot of code from the Android kernel in the staging tree of the Linux kernel. However, it is difficult to say whether the Android kernel will be merged fully into the Linux kernel; probably a very large part of it will find its way into the Linux kernel. For more information about mainlining Android, see this wiki: http://elinux.org/Android_Mainlining_Project . In the past there were many obstacles in the way, as Google implemented unique mechanisms, like wakelocks, alternative power management, its own IPC (called Binder) based on a Lightweight Remote Procedure Call (RPC), the Android shared memory driver (Ashmem), the Low Memory Killer, and more. In fact, the kernel community rejected the Google power management wakelocks patches in 2010. But since then, some of these features were merged and the situation changed. (See "Autosleep and Wake Locks," https://lwn.net/Articles/479841/ , and "The LPC Android microconference", https://lwn.net/Articles/570406/ .) Linaro ( www.linaro.org/ ) is a non-profit organization that was established in 2010 by leading companies such as ARM, Freescale, IBM, Samsung, ST-Ericsson, and Texas Instruments (TI). Its engineering teams develop the Linux ARM kernel and also optimizations for the GCC toolchain. Linaro teams are doing an amazing job of coordinating and pushing/tweaking changes upstream. Delving into the details of the Android kernel implementation and mainlining is beyond the scope of this book.

### Android Networking

The main networking issue with Android is, however, not due to the Linux kernel but to Android userspace. Android relies heavily on the HAL even for networking, as well as for the system framework. Originally (i.e., up to 4.2), there was no Ethernet support at all at the framework level. If drivers are compiled in the kernel, the TCP/IP stack still allows basic Ethernet connectivity for Android Debug Bridge (ADB) debugging, but that's all. Starting with 4.0, the Android-x86 project fork added an early implementation (badly designed but somehow working) of Ethernet at the framework level. Starting with 4.2, official upstream sources support Ethernet, but there is no way to actually configure it (it detects Ethernet plug in/out, and if a DHCP server is there, it provides an IP address to the interface). Applications can actually make use of this interface through the framework, but mostly no one does this.
If you require real Ethernet support (i.e., being able to configure your interface, configure it statically or via DHCP, set a proxy, and ensure that all apps are using the interface), then a lot of hacks are still required (see www.slideshare.net/gxben/abs-2013-dive-into-android-networking-adding-ethernet-connectivity ). In all cases, only one interface is supported at a time (eth0 only, even if you have both eth0 and eth1), so don't expect your device to act as a router of any kind. I will show here four short examples of how Android networking differs from Linux kernel networking:

 * Security privileges and networking: Android added a security feature (named "paranoid network") to the Linux kernel, which restricts access to some networking features depending on the group of the calling process. As opposed to the standard Linux kernel, where any application can open a socket and transmit/receive with it, in Android access to network resources is filtered by GID (group ID). The network security part will probably be very difficult to merge into the mainline kernel, as it includes many features that are unique to Android. For more information about Android network security, see http://elinux.org/Android_Security#Paranoid_network-ing .

 * Bluetooth: Bluedroid is a Bluetooth stack based on code that was developed by Broadcom. It replaced the BlueZ-based stack in Android 4.2. Support for Bluetooth Low Energy (BLE, or Bluetooth LE) devices, also known as Bluetooth Smart and Smart Ready devices, was introduced in Android 4.3 (API Level 18), in July 2013. Prior to this, the Android Open Source Project (AOSP) did not have support for BLE devices, but there were some vendors who provided an API to BLE.

 * Netfilter: There is an interesting project from Google that provides better network statistics on Android. This is implemented by xt_qtaguid, a netfilter module, which enables userspace applications to tag their sockets. This project required some changes in the Linux kernel netfilter subsystem. Patches with these changes were also sent to the Linux Kernel Mailing List (LKML); see http://lwn.net/Articles/517358/ . For details, see "Android netfilter changes" http://www.linuxplumbersconf.org/2013/ocw/sessions/1491 .

 * NFC: As was described in the Near Field Communication (NFC) section earlier in this chapter, the Android NFC architecture is a userspace NFC stack: the implementation is done in userspace through the HAL, which is supplied by Broadcom or by Android OEMs.

### Android Internals: Resources

Although there are many resources about developing applications for Android (books, mailing lists, forums, courses, etc.), there are very few resources about the internals of Android. For readers who are interested in learning more, I suggest these resources:

 * The book Embedded Android: Porting, Extending, and Customizing, by Karim Yaghmour (O'Reilly Media, 2013)

 * Slides: Android System Development by Maxime Ripard, Alexandre Belloni (over 400 slides); http://free-electrons.com/doc/training/android/ .

 * Slides: Android Platform Anatomy by Benjamin Zores (59 slides); http://www.slideshare.net/gxben/droidcon-2013-france-android-platform-anatomy .

 * Slides: Jelly Bean Device Porting by Benjamin Zores (127 slides); http://www.slideshare.net/gxben/as-2013-jelly-bean-device-porting-walkthrough .

 * Website: http://developer.android.com/index.html .
 * Android platform internals forum archives: http://news.gmane.org/gmane.comp.handhelds.android.platform

 * Once a year, an Android Builders Summit (ABS) is held. The first ABS was held in 2011 in San Francisco. It is recommended to read the slides, watch the videos, or attend.

 * XDA Developers Conference: http://xda-devcon.com/ ; slides and videos at http://xda-devcon.com/presentations/

 * Slides: Android Internals, Marko Gargenta: http://www.scandevconf.se/db/Marakana-Android-Internals.pdf

Note

Android git repositories are available at https://android.googlesource.com/

Note that Android uses repo, a special Python-based tool for managing hundreds of git repositories, which makes working with git easier.

## Summary

I have dealt in this chapter with namespaces in Linux, focusing on network namespaces. I also described the cgroups subsystem and its implementation; furthermore, I described its two network modules, net_prio and cls_cgroup. The Linux Bluetooth subsystem and its implementation, the IEEE 802.15.4 Linux subsystem and 6LoWPAN, and the NFC subsystem were all covered. The optimization achieved by Low Latency Sockets Poll was also discussed in this chapter, along with the Notification Chains mechanism, which is widely used in the kernel networking stack (and you will encounter it when browsing the source code). Another topic that was briefly discussed was the PCI subsystem, in order to give some background about PCI devices, as many network devices are PCI devices. The chapter concluded with three short sections about the network teaming driver (which is intended to replace the bonding driver), the PPPoE implementation, and Android.

Although we've come to the end of the book, there is much more to learn about Linux Kernel networking, as it is a vast ocean of details, and it is progressing dynamically and at a fast pace. New features and new patches are added constantly. I hope you enjoyed the book and that you learned a thing or two!

## Quick Reference

I will conclude with a list of methods and macros that were mentioned in this chapter.

### Methods

The following list contains the prototypes and descriptions of several methods covered in this chapter.

#### void switch_task_namespaces(struct task_struct *p, struct nsproxy *new);

This method assigns the specified nsproxy object to the specified process descriptor (task_struct object).

#### struct nsproxy *create_nsproxy(void);

This method allocates an nsproxy object and initializes its reference counter to 1.

#### void free_nsproxy(struct nsproxy *ns);

This method releases the resources of the specified nsproxy object.

#### struct net *dev_net(const struct net_device *dev);

This method returns the network namespace object (nd_net) associated with the specified network device.

#### void dev_net_set(struct net_device *dev, struct net *net);

This method associates the specified network namespace with the specified network device by setting the nd_net member of the net_device object.

#### void sock_net_set(struct sock *sk, struct net *net);

This method associates the specified network namespace with the specified sock object.

#### struct net *sock_net(const struct sock *sk);

This method returns the network namespace object (sk_net) associated with the specified sock object.
#### int net_eq(const struct net *net1, const struct net *net2);

This method returns 1 if the first specified network namespace pointer equals the second specified network namespace pointer and 0 otherwise.

#### struct net *net_alloc(void);

This method allocates a network namespace. It is invoked from the copy_net_ns() method.

#### struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net);

This method creates a new network namespace if the CLONE_NEWNET flag is set in its first parameter, flags. It creates the new network namespace by first calling the net_alloc() method to allocate it, then initializes it by calling the setup_net() method, and finally adds it to the global list of all namespaces, net_namespace_list. In case the CLONE_NEWNET flag is not set in its first parameter, flags, there is no need to create a new namespace, and the specified old network namespace, old_net, is returned. Note that this description of the copy_net_ns() method refers to the case when CONFIG_NET_NS is set. When CONFIG_NET_NS is not set, there is a second implementation of copy_net_ns(), which merely verifies that CLONE_NEWNET is not set in the specified flags (returning an error if it is) and returns the specified old network namespace (old_net); see include/net/net_namespace.h.

#### int setup_net(struct net *net, struct user_namespace *user_ns);

This method initializes the specified network namespace object. It assigns the network namespace user_ns member to be the specified user_ns, initializes the reference counter (count) of the specified network namespace to 1, and performs more initializations. It is invoked from the copy_net_ns() method and from the net_ns_init() method.

#### int proc_alloc_inum(unsigned int *inum);

This method allocates a proc inode and sets *inum to be the generated proc inode number (an integer between 0xf0000000 and 0xffffffff). It returns 0 on success.

#### struct nsproxy *task_nsproxy(struct task_struct *tsk);

This method returns the nsproxy object which is attached to the specified process descriptor (tsk).

#### struct new_utsname *utsname(void);

This method returns the new_utsname object which is associated with the process that currently runs (current).

#### struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns);

This method creates a new UTS namespace object by calling the create_uts_ns() method, and copies the new_utsname object of the specified old_ns UTS namespace into the new_utsname of the newly created UTS namespace.

#### struct uts_namespace *copy_utsname(unsigned long flags, struct user_namespace *user_ns, struct uts_namespace *old_ns);

This method creates a new UTS namespace if the CLONE_NEWUTS flag is set in its first parameter, flags. It creates the new UTS namespace by calling the clone_uts_ns() method, and returns the newly created UTS namespace. In case the CLONE_NEWUTS flag is not set in its first parameter, there is no need to create a new namespace, and the specified old UTS namespace (old_ns) is returned.
#### int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat);

This method changes the network namespace of the specified network device to be the specified network namespace. It returns 0 on success or -errno on failure. Callers must hold the rtnl semaphore. If the NETIF_F_NETNS_LOCAL flag is set in the features of the network device, an error of -EINVAL is returned.

#### void put_net(struct net *net);

This method decrements the reference counter of the specified network namespace. In case it reaches zero, it calls the __put_net() method to free its resources.

#### struct net *get_net(struct net *net);

This method returns the specified network namespace object after incrementing its reference counter.

#### void get_nsproxy(struct nsproxy *ns);

This method increments the reference counter of the specified nsproxy object.

#### struct net *get_net_ns_by_pid(pid_t pid);

This method gets a process id (PID) as an argument and returns the network namespace object to which this process is attached.

#### struct net *get_net_ns_by_fd(int fd);

This method gets a file descriptor as an argument and returns the network namespace associated with the inode that corresponds to the specified file descriptor.

#### struct pid_namespace *ns_of_pid(struct pid *pid);

This method returns the PID namespace in which the specified pid was created.

#### void put_nsproxy(struct nsproxy *ns);

This method decrements the reference counter of the specified nsproxy object; in case it reaches 0, the specified nsproxy is freed by calling the free_nsproxy() method.

#### int register_pernet_device(struct pernet_operations *ops);

This method registers a network namespace device.

#### void unregister_pernet_device(struct pernet_operations *ops);

This method unregisters a network namespace device.

#### int register_pernet_subsys(struct pernet_operations *ops);

This method registers a network namespace subsystem.

#### void unregister_pernet_subsys(struct pernet_operations *ops);

This method unregisters a network namespace subsystem.

#### static int register_vlan_device(struct net_device *real_dev, u16 vlan_id);

This method registers a VLAN device associated with the specified physical device (real_dev).

#### void cgroup_release_agent(struct work_struct *work);

This method is called when a cgroup is released. It creates a userspace process by invoking the call_usermodehelper() method.

#### int call_usermodehelper(char *path, char **argv, char **envp, int wait);

This method prepares and starts a userspace application.

#### int bacmp(bdaddr_t *ba1, bdaddr_t *ba2);

This method compares two Bluetooth addresses. It returns 0 if they are equal.

#### void bacpy(bdaddr_t *dst, bdaddr_t *src);

This method copies the specified source Bluetooth address (src) to the specified destination Bluetooth address (dst).

#### int hci_send_frame(struct sk_buff *skb);

This method is the main Bluetooth method for transmitting SKBs (commands and data).

#### int hci_register_dev(struct hci_dev *hdev);

This method registers the specified HCI device. It is invoked from Bluetooth device drivers. If the open() or close() callbacks of the specified hci_dev object are not defined, the method will fail and return -EINVAL. This method sets the HCI_SETUP flag in the dev_flags member of the specified HCI device; it also creates a sysfs entry for the device.
#### void hci_unregister_dev(struct hci_dev *hdev);

This method unregisters the specified HCI device. It is invoked from Bluetooth device drivers. It sets the HCI_UNREGISTER flag in the dev_flags member of the specified HCI device; it also removes the sysfs entry of the device.

#### void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb);

This method handles events that are received from the HCI layer by the hci_rx_work() method.

#### int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);

This method is the main Rx handler for 6LoWPAN packets. 6LoWPAN packets have an ethertype of 0x00F6.

#### void pci_unregister_driver(struct pci_driver *dev);

This method unregisters a PCI driver. It is usually called in the network driver module_exit() method.

#### int pci_enable_device(struct pci_dev *dev);

This method initializes the PCI device before it is used by the driver.

#### int request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev);

This method registers the specified handler as the interrupt service routine for the specified irq.

#### void free_irq(unsigned int irq, void *dev_id);

This method frees an interrupt which was allocated with the request_irq() method.

#### int nfc_init(void);

This method performs initialization of the NFC subsystem by registering the generic netlink NFC family, initializing NFC Raw sockets and NFC LLCP sockets, and initializing the AF_NFC protocol.

#### int nfc_register_device(struct nfc_dev *dev);

This method registers an NFC device (an nfc_dev object) against the NFC core.

#### int nfc_hci_register_device(struct nfc_hci_dev *hdev);

This method registers an NFC HCI device (an nfc_hci_dev object) against the NFC HCI layer.

#### int nci_register_device(struct nci_dev *ndev);

This method registers an NFC NCI device (an nci_dev object) against the NFC NCI layer.

#### static int __init pppoe_init(void);

This method initializes the PPPoE layer (PPPoE protocol handlers, the sockets used by PPPoE, the network notification handler, the PPPoE procfs entry, and more).

#### struct pppoe_hdr *pppoe_hdr(const struct sk_buff *skb);

This method returns the PPPoE header associated with the specified skb.

#### static int pppoe_create(struct net *net, struct socket *sock);

This method creates a PPPoE socket. It returns 0 on success or -ENOMEM if allocation of a socket by the sk_alloc() method failed.

#### int __set_item(struct pppoe_net *pn, struct pppox_sock *po);

This method inserts the specified pppox_sock object into the PPPoE socket hashtable. The hash key is calculated according to the session id and the remote peer MAC address by the hash_item() method.

#### void delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex);

This method removes the PPPoE socket hashtable entry which has the specified session id, the specified MAC address, and the specified network interface index (ifindex).

#### bool stage_session(__be16 sid);

This method returns true when the specified session id is not 0.

#### int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n);

This method registers the specified notifier_block object (n) to the specified notifier chain (nl). Note that this method is not used directly; there are several wrappers around it.
#### int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n);

This method unregisters the specified notifier_block object (n) from the specified notifier chain (nl). Note that this method is also not used directly; there are several wrappers around it.

#### int register_netdevice_notifier(struct notifier_block *nb);

This method registers the specified notifier_block object to netdev_chain by calling the raw_notifier_chain_register() method.

#### int unregister_netdevice_notifier(struct notifier_block *nb);

This method unregisters the specified notifier_block object from netdev_chain by calling the raw_notifier_chain_unregister() method.

#### int register_inet6addr_notifier(struct notifier_block *nb);

This method registers the specified notifier_block object to inet6addr_chain by calling the atomic_notifier_chain_register() method.

#### int unregister_inet6addr_notifier(struct notifier_block *nb);

This method unregisters the specified notifier_block object from inet6addr_chain by calling the atomic_notifier_chain_unregister() method.

#### int register_netevent_notifier(struct notifier_block *nb);

This method registers the specified notifier_block object to netevent_notif_chain by calling the atomic_notifier_chain_register() method.

#### int unregister_netevent_notifier(struct notifier_block *nb);

This method unregisters the specified notifier_block object from netevent_notif_chain by calling the atomic_notifier_chain_unregister() method.

#### int __kprobes notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v, int nr_to_call, int *nr_calls);

This method is for generating notification events. Note that this method is also not used directly; there are several wrappers around it.

#### int call_netdevice_notifiers(unsigned long val, struct net_device *dev);

This method is for generating notification events on the netdev_chain, by calling the raw_notifier_call_chain() method.

#### int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v);

This method is for generating notification events; eventually, after using a locking mechanism, it invokes the notifier_call_chain() method.

#### int __atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v, int nr_to_call, int *nr_calls);

This method is for generating notification events. Eventually, after using a locking mechanism, it invokes the notifier_call_chain() method.

### Macros

Here you'll find a description of the macro that was covered in this chapter.

#### pci_register_driver()

This macro registers a PCI driver in the PCI subsystem. It gets a pci_driver object as a parameter. It is usually called in the network driver module_init() method.

# Linux API
In this appendix I cover the two most fundamental data structures in the Linux Kernel Networking stack: the sk_buff and the net_device. This is reference material that can help when reading the rest of this book, as you will probably encounter these two structures in almost every chapter. Becoming familiar with and learning about these two data structures is essential for understanding the Linux Kernel Networking stack. Subsequently, there is a section about remote DMA (RDMA), which is further reference material for Chapter 13. It describes in detail the main methods and the main data structures that are used by RDMA. This appendix is a good place to return to, especially when looking for definitions of the basic terms.

## The sk_buff Structure

The sk_buff structure represents a packet. SKB stands for socket buffer. A packet can be generated by a local socket on the local machine, created by a userspace application; the packet can be sent outside, or to another socket on the same machine. A packet can also be created by a kernel socket. You can receive a physical frame from a network device (Layer 2), attach it to an sk_buff, and pass it on to Layer 3. When the packet destination is your local machine, it will continue to Layer 4. If the packet is not for your machine, it will be forwarded according to your routing table rules, if your machine supports forwarding. If the packet is damaged for any reason, it will be dropped. The sk_buff is a very large structure; I mention most of its members in this section. The sk_buff structure is defined in include/linux/skbuff.h. Here is a description of most of its members:

 * ktime_t tstamp

Timestamp of the arrival of the packet. Timestamps are stored in the SKB as offsets to a base timestamp. Note: do not confuse the tstamp of the SKB with hardware timestamping, which is implemented with the hwtstamps member of skb_shared_info. I describe the skb_shared_info object later in this appendix.

Helper methods:

 * skb_get_ktime(const struct sk_buff *skb): Returns the tstamp of the specified skb.

 * skb_get_timestamp(const struct sk_buff *skb, struct timeval *stamp): Converts the offset back to a struct timeval.

 * net_timestamp_set(struct sk_buff *skb): Sets the timestamp for the specified skb. The timestamp calculation is done with the ktime_get_real() method, which returns the time in ktime_t format.

 * net_enable_timestamp(): This method should be called to enable SKB timestamping.

 * net_disable_timestamp(): This method should be called to disable SKB timestamping.

 * struct sock *sk

The socket that owns the SKB, for locally generated traffic and for traffic that is destined for the local host. For packets that are being forwarded, sk is NULL. Usually when talking about sockets you deal with sockets which are created by calling the socket() system call from userspace. It should be mentioned that there are also kernel sockets, which are created by calling the sock_create_kern() method. See, for example, vxlan_init_net() in the VXLAN driver, drivers/net/vxlan.c.

Helper method:

 * skb_orphan(struct sk_buff *skb): If the specified skb has a destructor, call this destructor; set the sock object (sk) of the specified skb to NULL, and set the destructor of the specified skb to NULL.
* struct net_device *dev

The dev member is a net_device object that represents the network interface device associated with the SKB; you will sometimes encounter the term NIC (Network Interface Card) for such a network device. It can be the network device on which the packet arrives, or the network device on which the packet will be sent. The net_device structure will be discussed in depth in the next section.

* char cb[48]

This is the control buffer. It is free to use by any layer. This is an opaque area used to store private information. For example, the TCP protocol uses it for the TCP control buffer:

#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))

(include/net/tcp.h)

The Bluetooth protocol also uses the control block:

#define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb))

(include/net/bluetooth/bluetooth.h)

* unsigned long _skb_refdst

The destination entry (dst_entry) address. The dst_entry struct represents the routing entry for a given destination. For each packet, incoming or outgoing, you perform a lookup in the routing tables. Sometimes this lookup is called a FIB lookup. The result of this lookup determines how you should handle the packet: for example, whether it should be forwarded, and if so, on which interface it should be transmitted; or whether it should be dropped, whether an ICMP error message should be sent, and so on. The dst_entry object has a reference counter (the __refcnt field). There are cases when you use this reference count, and there are cases when you do not use it. The dst_entry object and the lookup in the FIB are discussed in more detail in Chapter 4.

Helper methods:

* skb_dst_set(struct sk_buff *skb, struct dst_entry *dst): Sets the skb dst, assuming a reference was taken on dst and should be released by the dst_release() method (which is invoked by the skb_dst_drop() method).

* skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst): Sets the skb dst, assuming a reference was not taken on dst. In this case, the skb_dst_drop() method will not call the dst_release() method for the dst.

Note

The SKB might have a dst_entry pointer attached to it; it can be reference counted or not. The low order bit of _skb_refdst is set if the reference counter was not taken.

* struct sec_path *sp

The security path pointer. It includes an array of IPsec XFRM transformation states (xfrm_state objects). IPsec (IP Security) is a Layer 3 protocol that is used mostly in VPNs. It is mandatory in IPv6 and optional in IPv4. Linux, like many other operating systems, implements IPsec both for IPv4 and IPv6. The sec_path structure is defined in include/net/xfrm.h. See more in Chapter 10, which deals with the IPsec subsystem.

Helper method:

* struct sec_path *skb_sec_path(struct sk_buff *skb): Returns the sec_path object (sp) associated with the specified skb.

* unsigned int len

The total number of packet bytes.

* unsigned int data_len

The data length. This field is used only when the packet has nonlinear data (paged data).

Helper method:

* skb_is_nonlinear(const struct sk_buff *skb): Returns true when the data_len of the specified skb is larger than 0.

* __u16 mac_len

The length of the MAC (Layer 2) header.

* __wsum csum

The checksum.

* __u32 priority

The queuing priority of the packet. In the Tx path, the priority of the SKB is set according to the socket priority (the sk_priority field of the socket).
The socket priority in turn can be set by calling the setsockopt() system call with the SO_PRIORITY socket option. Using the net_prio cgroup kernel module, you can define a rule that will set the priority for the SKB; see the description of the netprio_map field later in this section, and also Documentation/cgroups/net_prio.txt. For forwarded packets, the priority is set according to the TOS (Type Of Service) field in the IP header. There is a table named ip_tos2prio, which consists of 16 elements. The mapping from TOS to priority is done by the rt_tos2priority() method, according to the TOS field of the IP header; see the ip_forward() method in net/ipv4/ip_forward.c and the ip_tos2prio definition in include/net/route.h.

* __u8 local_df:1

Allow local fragmentation flag. If the value of the pmtudisc field of the socket which sends the packet is IP_PMTUDISC_DONT or IP_PMTUDISC_WANT, local_df is set to 1; if the value of the pmtudisc field of the socket is IP_PMTUDISC_DO or IP_PMTUDISC_PROBE, local_df is set to 0. See the implementation of the __ip_make_skb() method in net/ipv4/ip_output.c. Only when the packet's local_df is 0 do you set the IP header "don't fragment" flag, IP_DF; see the ip_queue_xmit() method in net/ipv4/ip_output.c:

...
if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
	iph->frag_off = htons(IP_DF);
else
	iph->frag_off = 0;
...

The frag_off field in the IP header is a 16-bit field, which represents the offset and the flags of the fragment. The 13 leftmost (MSB) bits are the offset (the offset unit is 8 bytes), and the 3 rightmost (LSB) bits are the flags. The flags can be IP_MF (there are more fragments), IP_DF (do not fragment), IP_CE (for congestion), or IP_OFFSET (offset part).

The reason behind this is that there are cases when you do not want to allow IP fragmentation. For example, in Path MTU Discovery (PMTUD), you set the DF (don't fragment) flag of the IP header; thus, you don't fragment the outgoing packets. Any network device along the path whose MTU is smaller than the packet will drop it and send back an ICMP packet ("Fragmentation Needed"). Getting these ICMP "Fragmentation Needed" packets is required in order to determine the Path MTU. See more in Chapter 3. From userspace, setting IP_PMTUDISC_DO is done, for example, thus (the following code snippet is taken from the source code of the tracepath utility from the iputils package; the tracepath utility finds the path MTU):

...
int on = IP_PMTUDISC_DO;
setsockopt(fd, SOL_IP, IP_MTU_DISCOVER, &on, sizeof(on));
...

* __u8 cloned:1

When the packet is cloned with the __skb_clone() method, this field is set to 1 in both the cloned packet and the primary packet. Cloning an SKB means creating a private copy of the sk_buff struct; the data block is shared between the clone and the primary SKB.

* __u8 ip_summed:2

Indicator of the packet checksum status; can be one of these values:

* CHECKSUM_NONE: When the device driver does not support hardware checksumming, it sets the ip_summed field to be CHECKSUM_NONE. This is an indication that checksumming should be done in software.

* CHECKSUM_UNNECESSARY: No need for any checksumming.

* CHECKSUM_COMPLETE: Calculation of the checksum was completed by the hardware, for incoming packets.

* CHECKSUM_PARTIAL: A partial checksum was computed for outgoing packets; the hardware should complete the checksum calculation. CHECKSUM_COMPLETE and CHECKSUM_PARTIAL replace the CHECKSUM_HW flag, which is now deprecated.
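For a concrete feel of how ip_summed is used, here is a hedged sketch (the function and parameter names are hypothetical) of the decision a driver's Rx path typically makes based on what its hardware reports:

#include <linux/types.h>
#include <linux/skbuff.h>

/* Hypothetical Rx-path fragment, for illustration only. */
static void rx_set_checksum(struct sk_buff *skb, bool hw_csum_ok)
{
	if (hw_csum_ok)
		skb->ip_summed = CHECKSUM_UNNECESSARY;	/* hardware verified it */
	else
		skb->ip_summed = CHECKSUM_NONE;	/* the stack must checksum in software */
}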
* __u8 nohdr:1

Payload reference only; the header must not be modified. There are cases when the owner of the SKB no longer needs to access the header at all. In such cases, you can call the skb_header_release() method, which sets the nohdr field of the SKB; this indicates that the header of this SKB should not be modified.

* __u8 nfctinfo:3

Connection Tracking info. Connection Tracking allows the kernel to keep track of all logical network connections or sessions. NAT relies on Connection Tracking information for its translations. The value of the nfctinfo field corresponds to the ip_conntrack_info enum values. So, for example, when a new connection is starting to be tracked, the value of nfctinfo is IP_CT_NEW. When the connection is established, the value of nfctinfo is IP_CT_ESTABLISHED. The value of nfctinfo can change to IP_CT_RELATED when the packet is related to an existing connection—for example, when the traffic is part of some FTP session or SIP session, and so on. For a full list of ip_conntrack_info enum values, see include/uapi/linux/netfilter/nf_conntrack_common.h. The nfctinfo field of the SKB is set in the resolve_normal_ct() method, net/netfilter/nf_conntrack_core.c. This method performs a Connection Tracking lookup, and if there is a miss, it creates a new Connection Tracking entry. Connection Tracking is discussed in depth in Chapter 9, which deals with the netfilter subsystem.

* __u8 pkt_type:3

For Ethernet, the packet type depends on the destination MAC address in the Ethernet header, and is determined by the eth_type_trans() method:

* PACKET_BROADCAST for broadcast

* PACKET_MULTICAST for multicast

* PACKET_HOST if the destination MAC address is the MAC address of the device which was passed as a parameter

* PACKET_OTHERHOST if these conditions are not met

See the definition of the packet types in include/uapi/linux/if_packet.h.

* __u8 ipvs_property:1

This flag indicates whether the SKB is owned by ipvs (IP Virtual Server), which is a kernel-based transport layer load-balancing solution. This field is set to 1 in the transmit methods of ipvs (net/netfilter/ipvs/ip_vs_xmit.c).

* __u8 peeked:1

This packet has already been seen, so statistics have been gathered for it—don't gather them again.

* __u8 nf_trace:1

The netfilter packet trace flag. This flag is set by the xt_TRACE netfilter module, which is used to mark packets for tracing (net/netfilter/xt_TRACE.c).

Helper method:

* nf_reset_trace(struct sk_buff *skb): Sets the nf_trace of the specified skb to 0.

* __be16 protocol

The protocol field is initialized in the Rx path by the eth_type_trans() method to be ETH_P_IP when working with Ethernet and IP.

* void (*destructor)(struct sk_buff *skb)

A callback that is invoked when freeing the SKB by calling the kfree_skb() method.

* struct nf_conntrack *nfct

The associated Connection Tracking object, if it exists. The nfct field, like the nfctinfo field, is set in the resolve_normal_ct() method. The Connection Tracking layer is discussed in depth in Chapter 9, which deals with the netfilter subsystem.

* int skb_iif

The ifindex of the network device on which the packet arrived.

* __u32 rxhash

The rxhash of the SKB is calculated in the receive path, according to the source and destination addresses of the IP header and the ports from the transport header. A value of zero indicates that the hash is not valid.
The rxhash is used to ensure that packets belonging to the same flow will be handled by the same CPU when working with Symmetrical Multiprocessing (SMP). This decreases the number of cache misses and improves network performance. The rxhash is part of the Receive Packet Steering (RPS) feature, which was contributed by Google developers (Tom Herbert and others). The RPS feature gives a performance improvement in SMP environments. See more in Documentation/networking/scaling.txt.

* __be16 vlan_proto

The VLAN protocol used—usually it is the 802.1q protocol. Recently, support for the 802.1ad protocol (also known as Stacked VLAN) was added.

The following is an example of creating 802.1ad and 802.1q VLAN devices in userspace, using the ip command of the iproute2 package:

ip link add link eth0 eth0.1000 type vlan proto 802.1ad id 1000
ip link add link eth0.1000 eth0.1000.1000 type vlan proto 802.1q id 100

Note: this feature is supported in kernel 3.10 and higher.

* __u16 vlan_tci

The VLAN tag control information (2 bytes), composed of ID and priority.

Helper method:

* vlan_tx_tag_present(__skb): This macro checks whether the VLAN_TAG_PRESENT flag is set in the vlan_tci field of the specified __skb.

* __u16 queue_mapping

Queue mapping for multiqueue devices.

Helper methods:

* skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping): Sets the specified queue_mapping for the specified skb.

* skb_get_queue_mapping(const struct sk_buff *skb): Returns the queue_mapping of the specified skb.

* __u8 pfmemalloc

Set when the SKB was allocated from PFMEMALLOC reserves.

Helper method:

* skb_pfmemalloc(): Returns true if the SKB was allocated from PFMEMALLOC reserves.

* __u8 ooo_okay:1

The ooo_okay flag is set to avoid ooo (out of order) packets.

* __u8 l4_rxhash:1

A flag that is set when a canonical 4-tuple hash over the transport ports is used. See the __skb_get_rxhash() method in net/core/flow_dissector.c.

* __u8 no_fcs:1

A flag that is set when you request the NIC to treat the last 4 bytes as the Ethernet Frame Check Sequence (FCS).

* __u8 encapsulation:1

The encapsulation field denotes that the SKB is used for encapsulation. It is used, for example, in the VXLAN driver. VXLAN is a standard protocol to transfer Layer 2 Ethernet packets over a UDP kernel socket. It can be used as a solution when there are firewalls that block tunnels and allow, for example, only TCP or UDP traffic. The VXLAN driver uses UDP encapsulation and sets the SKB encapsulation to 1 in the vxlan_init_net() method. The ip_gre module and the ipip tunnel module also use encapsulation and set the SKB encapsulation to 1.

* __u32 secmark

Security mark field. The secmark field is set by an iptables SECMARK target, which labels packets with any valid security context. For example:

iptables -t mangle -A INPUT -p tcp --dport 80 -j SECMARK --selctx system_u:object_r:httpd_packet_t:s0
iptables -t mangle -A OUTPUT -p tcp --sport 80 -j SECMARK --selctx system_u:object_r:httpd_packet_t:s0

In the preceding rules, you are statically labeling packets arriving at and leaving from port 80 as httpd_packet_t. See net/netfilter/xt_SECMARK.c.

Helper methods:

* void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from): Sets the value of the secmark field of the first specified SKB (to) to be equal to the value of the secmark field of the second specified SKB (from).
* void skb_init_secmark(struct sk_buff *skb): Initializes the secmark of the specified skb to be 0.

The next three fields (mark, dropcount, and reserved_tailroom) appear in a union.

* __u32 mark

This field enables identifying the SKB by marking it.

You can set the mark field of the SKB, for example, with the iptables MARK target in an iptables PREROUTING rule with the mangle table:

* iptables -A PREROUTING -t mangle -i eth1 -j MARK --set-mark 0x1234

This rule will assign the value of 0x1234 to the SKB mark field of every packet incoming on eth1 before performing a routing lookup. You can also run an iptables rule that will check the mark field of every SKB to match a specified value and act upon it. Netfilter targets and iptables are discussed in Chapter 9, which deals with the netfilter subsystem.

* __u32 dropcount

The dropcount counter represents the number of dropped packets (sk_drops) of the sk_receive_queue of the assigned sock object (sk). See the sock_queue_rcv_skb() method in net/core/sock.c.

* __u32 reserved_tailroom: Used in the sk_stream_alloc_skb() method.

* sk_buff_data_t transport_header

The transport layer (L4) header.

Helper methods:

* skb_transport_header(const struct sk_buff *skb): Returns the transport header of the specified skb.

* skb_transport_header_was_set(const struct sk_buff *skb): Returns 1 if the transport_header of the specified skb is set.

* sk_buff_data_t network_header

The network layer (L3) header.

Helper method:

* skb_network_header(const struct sk_buff *skb): Returns the network header of the specified skb.

* sk_buff_data_t mac_header

The link layer (L2) header.

Helper methods:

* skb_mac_header(const struct sk_buff *skb): Returns the MAC header of the specified skb.

* skb_mac_header_was_set(const struct sk_buff *skb): Returns 1 if the mac_header of the specified skb was set.

* sk_buff_data_t tail

The tail of the data.

* sk_buff_data_t end

The end of the buffer. The tail cannot exceed end.

* unsigned char *head

The head of the buffer.

* unsigned char *data

The data head. The data block is allocated separately from the sk_buff allocation.

See in __alloc_skb(), net/core/skbuff.c:

data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);

Helper methods:

* skb_headroom(const struct sk_buff *skb): This method returns the headroom, which is the number of bytes of free space at the head of the specified skb (skb->data – skb->head). See Figure A-1.

* skb_tailroom(const struct sk_buff *skb): This method returns the tailroom, which is the number of bytes of free space at the tail of the specified skb (skb->end – skb->tail). See Figure A-1.

Figure A-1 shows the headroom and the tailroom of an SKB.

Figure A-1. Headroom and tailroom of an SKB

The following are some methods for handling buffers:

* skb_put(struct sk_buff *skb, unsigned int len): Adds data to a buffer; this method adds len bytes to the buffer of the specified skb and increments the length of the specified skb by the specified len.

* skb_push(struct sk_buff *skb, unsigned int len): Adds data to the start of a buffer; this method decrements the data pointer of the specified skb by the specified len and increments the length of the specified skb by the specified len.

* skb_pull(struct sk_buff *skb, unsigned int len): Removes data from the start of a buffer; this method increments the data pointer of the specified skb by the specified len and decrements the length of the specified skb by the specified len.

* skb_reserve(struct sk_buff *skb, int len): Increases the headroom of an empty skb by reducing the tail.
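To see how these methods combine in practice, here is a hedged sketch (build_rx_skb() is a hypothetical helper) of the classic pattern for preparing a freshly allocated SKB: reserve headroom first, then append the payload:

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

/* Hypothetical helper, for illustration only. */
static struct sk_buff *build_rx_skb(struct net_device *dev,
				    const void *payload, unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);

	if (!skb)
		return NULL;
	/* Reserve headroom (this also aligns the IP header); it moves
	 * skb->data and skb->tail forward without changing skb->len. */
	skb_reserve(skb, NET_IP_ALIGN);
	/* skb_put() grows the data area at the tail by len bytes. */
	memcpy(skb_put(skb, len), payload, len);
	return skb;
}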
After describing some methods for handling buffers, I continue with listing the members of the sk_buff structure:

* unsigned int truesize

The total memory allocated for the SKB (including the SKB structure itself and the size of the allocated data block).

* atomic_t users

A reference counter, initialized to 1; incremented by the skb_get() method and decremented by the kfree_skb() or consume_skb() methods. The kfree_skb() method decrements the usage counter; if it reaches 0, the method frees the SKB—otherwise, the method returns without freeing it.

Helper methods:

* skb_get(struct sk_buff *skb): Increments the users reference counter by 1.

* skb_shared(const struct sk_buff *skb): Returns true if the number of users is not 1.

* skb_share_check(struct sk_buff *skb, gfp_t pri): If the buffer is not shared, the original buffer is returned. If the buffer is shared, the buffer is cloned, and the old copy drops a reference. A new clone with a single reference is returned. When being called from interrupt context or with spinlocks held, the pri parameter (priority) must be GFP_ATOMIC. If memory allocation fails, NULL is returned.

* consume_skb(struct sk_buff *skb): Decrements the users reference counter and frees the SKB if the users reference counter is zero.

### struct skb_shared_info

The skb_shared_info struct is located at the end of the data block (skb_end_pointer(SKB)). It consists of only a few fields. Let's take a look at it:

struct skb_shared_info {
	unsigned char nr_frags;
	__u8 tx_flags;
	unsigned short gso_size;
	unsigned short gso_segs;
	unsigned short gso_type;
	struct sk_buff *frag_list;
	struct skb_shared_hwtstamps hwtstamps;
	__be32 ip6_frag_id;
	atomic_t dataref;
	void *destructor_arg;
	skb_frag_t frags[MAX_SKB_FRAGS];
};

The following is a description of some of the important members of the skb_shared_info structure:

* nr_frags: Represents the number of elements in the frags array.

* tx_flags can be:

* SKBTX_HW_TSTAMP: Generate a hardware time stamp.

* SKBTX_SW_TSTAMP: Generate a software time stamp.

* SKBTX_IN_PROGRESS: Device driver is going to provide a hardware timestamp.

* SKBTX_DEV_ZEROCOPY: Device driver supports Tx zero-copy buffers.

* SKBTX_WIFI_STATUS: Generate WiFi status information.

* SKBTX_SHARED_FRAG: Indication that at least one fragment might be overwritten.

* When working with fragmentation, there are cases when you work with a list of sk_buffs (frag_list), and there are cases when you work with the frags array. It depends mostly on whether the Scatter/Gather mode is set.

Helper methods:

* skb_is_gso(const struct sk_buff *skb): Returns true if the gso_size of the skb_shared_info associated with the specified skb is not 0.

* skb_is_gso_v6(const struct sk_buff *skb): Returns true if the gso_type of the skb_shared_info associated with the skb is SKB_GSO_TCPV6.

* skb_shinfo(skb): A macro that returns the skb_shared_info associated with the specified skb.

* skb_has_frag_list(const struct sk_buff *skb): Returns true if the frag_list of the skb_shared_info of the specified skb is not NULL.

* dataref: A reference counter of the skb_shared_info struct. It is set to 1 in the __alloc_skb() method, which allocates the SKB and initializes its skb_shared_info.
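As a small illustration of accessing skb_shared_info, here is a hedged sketch (frag_bytes() is a hypothetical helper) that walks the frags array of a nonlinear SKB:

#include <linux/skbuff.h>

/* Hypothetical helper: count the bytes held in paged fragments. */
static unsigned int frag_bytes(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	unsigned int i, total = 0;

	for (i = 0; i < shinfo->nr_frags; i++)
		total += skb_frag_size(&shinfo->frags[i]);
	/* With no frag_list, this equals skb->data_len. */
	return total;
}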
## The net_device Structure

The net_device struct represents the network device. It can be a physical device, like an Ethernet device, or it can be a software device, like a bridge device or a VLAN device. As with the sk_buff structure, I will list its important members. The net_device struct is defined in include/linux/netdevice.h:

* char name[IFNAMSIZ]

The name of the network device. This is the name that you see with the ifconfig or ip commands (for example, eth0, eth1, and so on). The maximum length of the interface name is 16 characters. In newer distributions with biosdevname support, the naming scheme corresponds to the physical location of the network device: PCI network devices are named p<slot>p<port> according to the chassis labels (for example, p2p1), and embedded ports (on-motherboard interfaces) are named em<port>—for example, em1, em2, and so on. There is a special suffix for SR-IOV devices and Network Partitioning (NPAR)–enabled devices. Biosdevname is developed by Dell: http://linux.dell.com/biosdevname. See also this white paper: http://linux.dell.com/files/whitepapers/consistent_network_device_naming_in_linux.pdf.

Helper method:

* dev_valid_name(const char *name): Checks the validity of the specified network device name. A network device name must obey certain restrictions in order to enable creating corresponding sysfs entries. For example, it cannot be "." or ".."; its length should not exceed 16 characters. Changing the interface name can be done like this, for example: ip link set p2p1 name <newName>. So, for example, ip link set p2p1 name a12345678901234567 will fail with this message: Error: argument "a12345678901234567" is wrong: "name" too long. The reason is that you tried to set a device name that is longer than 16 characters. And running ip link set p2p1 name . will fail with RTNETLINK answers: Invalid argument, since you tried to set the device name to be ".", which is an invalid value. See dev_valid_name() in net/core/dev.c.

* struct hlist_node name_hlist

This is a hash table of network devices, indexed by the network device name. A lookup in this hash table is performed by dev_get_by_name(). Insertion into this hash table is performed by the list_netdevice() method, and removal from this hash table is done with the unlist_netdevice() method.

* char *ifalias

SNMP alias interface name. Its length can be up to 256 (IFALIASZ).

You can create an alias to a network device using this command line:

ip link set <deviceName> alias myalias

The ifalias name is exported via sysfs by /sys/class/net/<deviceName>/ifalias.

Helper method:

* dev_set_alias(struct net_device *dev, const char *alias, size_t len): Sets the specified alias to the specified network device. The specified len parameter is the number of bytes of the specified alias to be copied; if the specified len is greater than 256 (IFALIASZ), the method will fail with -EINVAL.

* unsigned int irq

The Interrupt Request (IRQ) number of the device. The network driver should call request_irq() to register itself with this IRQ number. Typically this is done in the probe() callback of the network device driver. The prototype of the request_irq() method is: int request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev). The first argument is the IRQ number. The specified handler is the Interrupt Service Routine (ISR).
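For illustration, here is a hedged sketch (my_isr() and my_probe_fragment() are hypothetical) of registering a shared interrupt handler from a driver's probe() routine:

#include <linux/interrupt.h>
#include <linux/netdevice.h>

/* Hypothetical ISR, for illustration only. */
static irqreturn_t my_isr(int irq, void *dev_id)
{
	/* Acknowledge the hardware, schedule NAPI, and so on. */
	return IRQ_HANDLED;
}

/* Hypothetical probe() fragment. */
static int my_probe_fragment(struct net_device *dev)
{
	/* IRQF_SHARED allows the IRQ line to be shared with other
	 * devices; dev is passed back to the ISR as dev_id. */
	return request_irq(dev->irq, my_isr, IRQF_SHARED, dev->name, dev);
}

The matching free_irq() call belongs in the teardown path, as the text explains next.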
The network driver should call the free_irq() method when it no longer uses this irq. In many cases, this irq is shared (the request_irq() method is called with the IRQF_SHARED flag). You can view the number of interrupts that occurred on each core by running cat /proc/interrupts. You can set the SMP affinity of the irq by echo <irqMask> > /proc/irq/<irq number>/smp_affinity.

In an SMP machine, setting the SMP affinity of interrupts means setting which cores are allowed to handle the interrupt. Some PCI network interfaces use Message Signaled Interrupts (MSIs). PCI MSI interrupts are never shared, so the IRQF_SHARED flag is not set when calling the request_irq() method in these network drivers. See more info in Documentation/PCI/MSI-HOWTO.txt.

* unsigned long state

A flag that can be one of these values:

* __LINK_STATE_START: This flag is set when the device is brought up, by the dev_open() method, and is cleared when the device is brought down.

* __LINK_STATE_PRESENT: This flag is set in device registration, by the register_netdevice() method, and is cleared in the netif_device_detach() method.

* __LINK_STATE_NOCARRIER: This flag shows whether the device detected loss of carrier. It is set by the netif_carrier_off() method and cleared by the netif_carrier_on() method. It is exported by sysfs via /sys/class/net/<deviceName>/carrier.

* __LINK_STATE_LINKWATCH_PENDING: This flag is set by the linkwatch_fire_event() method and cleared by the linkwatch_do_dev() method.

* __LINK_STATE_DORMANT: The dormant state indicates that the interface is not able to pass packets (that is, it is not "up"); however, this is a "pending" state, waiting for some external event. See section 3.1.12, "New states for IfOperStatus," in RFC 2863, "The Interfaces Group MIB."

The state flag can be set with the generic set_bit() method.

Helper methods:

* netif_running(const struct net_device *dev): Returns true if the __LINK_STATE_START flag of the state field of the specified device is set.

* netif_device_present(struct net_device *dev): Returns true if the __LINK_STATE_PRESENT flag of the state field of the specified device is set.

* netif_carrier_ok(const struct net_device *dev): Returns true if the __LINK_STATE_NOCARRIER flag of the state field of the specified device is not set.

These three methods are defined in include/linux/netdevice.h.

* netdev_features_t features

The set of currently active device features. These features should be changed only by the network core or in error paths of the ndo_set_features() callback. Network driver developers are responsible for setting the initial set of the device features. Sometimes they can use a wrong combination of features. The network core fixes this by removing an offending feature in the netdev_fix_features() method, which is invoked when the network interface is registered (in the register_netdevice() method); a proper message is also written to the kernel log.

I will mention some net_device features here and discuss them. For the full list of net_device features, look in include/linux/netdev_features.h.

* NETIF_F_IP_CSUM means that the network device can checksum L4 IPv4 TCP/UDP packets.

* NETIF_F_IPV6_CSUM means that the network device can checksum L4 IPv6 TCP/UDP packets.
* NETIF_F_HW_CSUM means that the device can checksum in hardware all L4 packets. You cannot activate NETIF_F_HW_CSUM together with NETIF_F_IP_CSUM, or together with NETIF_F_IPV6_CSUM, because that would cause duplicate checksumming.

If the driver features set includes both the NETIF_F_HW_CSUM and NETIF_F_IP_CSUM features, then you will get a kernel message saying "mixed HW and IP checksum settings." In such a case, the netdev_fix_features() method removes the NETIF_F_IP_CSUM feature. If the driver features set includes both the NETIF_F_HW_CSUM and NETIF_F_IPV6_CSUM features, you again get the same message as in the previous case; this time, the NETIF_F_IPV6_CSUM feature is the one that is removed by the netdev_fix_features() method. In order for a device to support TSO (TCP Segmentation Offload), it also needs to support Scatter/Gather and TCP checksumming; this means that both the NETIF_F_SG and NETIF_F_IP_CSUM features must be set. If the driver features set does not include the NETIF_F_SG feature, then you will get a kernel message saying "Dropping TSO features since no SG feature," and the NETIF_F_ALL_TSO feature will be removed. If the driver features set includes neither NETIF_F_IP_CSUM nor NETIF_F_HW_CSUM, then you will get a kernel message saying "Dropping TSO features since no CSUM feature," and the NETIF_F_TSO feature will be removed.

Note

In recent kernels, if the CONFIG_DYNAMIC_DEBUG kernel config item is set, you might need to explicitly enable printing of some messages, via the <debugfs>/dynamic_debug/control interface (debugfs is usually mounted at /sys/kernel/debug). See Documentation/dynamic-debug-howto.txt.

* NETIF_F_LLTX is the LockLess TX flag and is considered deprecated. When it is set, you don't use the generic Tx lock (this is why it is called LockLess TX). See the following macro (HARD_TX_LOCK) from net/core/dev.c:

#define HARD_TX_LOCK(dev, txq, cpu) { \
	if ((dev->features & NETIF_F_LLTX) == 0) { \
		__netif_tx_lock(txq, cpu); \
	} \
}

NETIF_F_LLTX is used in tunnel drivers like VXLAN and VETH, and in the IP over IP (IPIP) tunneling driver. For example, in the IPIP tunnel module, you set the NETIF_F_LLTX flag in the ipip_tunnel_setup() method (net/ipv4/ipip.c).

The NETIF_F_LLTX flag is also used in a few drivers that have implemented their own Tx lock, like the cxgb network driver.

In drivers/net/ethernet/chelsio/cxgb/cxgb2.c, you have:

static int __devinit init_one(struct pci_dev *pdev,
			      const struct pci_device_id *ent)
{
	...
	netdev->features |= NETIF_F_SG | NETIF_F_IP_CSUM |
			    NETIF_F_RXCSUM | NETIF_F_LLTX;
	...
}

* NETIF_F_GRO is used to indicate that the device supports GRO (Generic Receive Offload). With GRO, incoming packets are merged at reception time. The GRO feature improves network performance. GRO replaced LRO (Large Receive Offload), which was limited to TCP/IPv4. This flag is checked in the beginning of the dev_gro_receive() method; devices that do not have this flag set will not perform the GRO handling part in this method. A driver that wants to use GRO should call the napi_gro_receive() method in the Rx path of the driver. You can enable/disable GRO with ethtool, by ethtool -K <deviceName> gro on / ethtool -K <deviceName> gro off, respectively. You can check whether GRO is set by running ethtool -k <deviceName> and looking at the gro field.

* NETIF_F_GSO is set to indicate that the device supports Generic Segmentation Offload (GSO). GSO is a generalization of a previous solution called TSO (TCP Segmentation Offload), which dealt only with TCP in IPv4.
GSO can also handle IPv6, UDP, and other protocols. GSO is a performance optimization, based on traversing the networking stack once instead of many times, for big packets. So the idea is to avoid segmentation in Layer 4 and to defer segmentation as much as possible. The sysadmin can enable/disable GSO with ethtool, by ethtool -K <deviceName> gso on / ethtool -K <deviceName> gso off, respectively. You can check whether GSO is set by running ethtool -k <deviceName> and looking at the gso field. To work with GSO, you should work in Scatter/Gather mode; the NETIF_F_SG flag must be set.

* NETIF_F_NETNS_LOCAL is set for network namespace local devices. These are network devices that are not allowed to move between network namespaces. The loopback, VXLAN, and PPP network devices are examples of namespace local devices. All these devices have the NETIF_F_NETNS_LOCAL flag set. A sysadmin can check whether an interface has the NETIF_F_NETNS_LOCAL flag set or not by ethtool -k <deviceName>. This feature is fixed and cannot be changed by ethtool. Trying to move a network device of this type to a different namespace results in an error (-EINVAL). For details, look in the dev_change_net_namespace() method (net/core/dev.c). When deleting a network namespace, devices that do not have the NETIF_F_NETNS_LOCAL flag set are moved to the default initial network namespace (init_net). Network namespace local devices that have the NETIF_F_NETNS_LOCAL flag set are not moved to the default initial network namespace (init_net), but are deleted.

* NETIF_F_HW_VLAN_CTAG_RX is for use by devices that support VLAN Rx hardware acceleration. It was formerly called NETIF_F_HW_VLAN_RX and was renamed in kernel 3.10, when support for 802.1ad was added. "CTAG" was added to indicate that these devices differ from "STAG" devices (service provider tagging). A device driver that sets this feature must also define the ndo_vlan_rx_add_vid() and ndo_vlan_rx_kill_vid() callbacks. Failure to do so will prevent device registration and result in a "Buggy VLAN acceleration in driver" kernel error message.

* NETIF_F_HW_VLAN_CTAG_TX is for use by devices that support VLAN Tx hardware acceleration. It was formerly called NETIF_F_HW_VLAN_TX and was renamed in kernel 3.10, when support for 802.1ad was added.

* NETIF_F_VLAN_CHALLENGED is set for devices that can't handle VLAN packets. Setting this feature avoids registration of a VLAN device on top of them. Let's take a look at the VLAN registration method:

static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
{
	int err;
	...
	err = vlan_check_real_dev(real_dev, vlan_id);

The first thing the vlan_check_real_dev() method does is to check the network device features and return an error if the NETIF_F_VLAN_CHALLENGED feature is set:

int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id)
{
	const char *name = real_dev->name;

	if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
		pr_info("VLANs not supported on %s\n", name);
		return -EOPNOTSUPP;
	}
	...
}

For example, some types of Intel e100 network device drivers set the NETIF_F_VLAN_CHALLENGED feature (see e100_probe() in drivers/net/ethernet/intel/e100.c).

You can check whether NETIF_F_VLAN_CHALLENGED is set by running ethtool -k <deviceName> and looking at the vlan-challenged field. This is a fixed value that you cannot change with the ethtool command.

* NETIF_F_SG is set when the network interface supports Scatter/Gather IO.
You can enable and disable Scatter/Gather with ethtool, by ethtool -K <deviceName> sg on / ethtool -K <deviceName> sg off, respectively. You can check whether Scatter/Gather is set by running ethtool -k <deviceName> and looking at the sg field.

* NETIF_F_HIGHDMA is set if the device can perform access by DMA to high memory. The practical implication of setting this feature is that the ndo_start_xmit() callback of the net_device_ops object can manage SKBs which have frags elements in high memory. You can check whether NETIF_F_HIGHDMA is set by running ethtool -k <deviceName> and looking at the highdma field. This is a fixed value that you cannot change with the ethtool command.

* netdev_features_t hw_features

The set of features that are changeable. This means that their state may possibly be changed (enabled or disabled) for a particular device by a user's request. This set should be initialized in the ndo_init() callback and not changed later.

* netdev_features_t wanted_features

The set of features that were requested by the user. A user may request to change various offloading features—for example, by running ethtool -K eth1 rx on. This generates a feature change event notification (NETDEV_FEAT_CHANGE), which is sent by the netdev_features_change() method.

* netdev_features_t vlan_features

The set of features whose state is inherited by child VLAN devices. For example, let's look at the rtl_init_one() method, which is the probe callback of the r8169 network device driver (see Chapter 14):

int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	...
	dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM |
			     NETIF_F_TSO | NETIF_F_HIGHDMA;
	...
}

(drivers/net/ethernet/realtek/r8169.c)

This initialization means that all child VLAN devices will have these features. For example, let's say that your eth0 device is an r8169 device, and you add a VLAN device thus: vconfig add eth0 100. Then, in the initialization in the VLAN module, there is this code related to vlan_features:

static int vlan_dev_init(struct net_device *dev)
{
	...
	dev->features |= real_dev->vlan_features | NETIF_F_LLTX;
	...
}

(net/8021q/vlan_dev.c)

This means that it sets the features of the VLAN child device to be the vlan_features of the real device (which is eth0 in this case), which were set according to what you saw earlier in the rtl_init_one() method.

* netdev_features_t hw_enc_features

The mask of features inherited by encapsulating devices. This field indicates what encapsulation offloads the hardware is capable of doing, and drivers will need to set them appropriately. For more info about the network device features, see Documentation/networking/netdev-features.txt.

* int ifindex

The ifindex (interface index) is a unique device identifier. This index is incremented by 1 each time you create a new network device, by the dev_new_index() method. The first network device you create, which is almost always the loopback device, has an ifindex of 1. Cyclic integer overflow is handled by the method that handles assignment of the ifindex number. The ifindex is exported by sysfs via /sys/class/net/<deviceName>/ifindex.

* struct net_device_stats stats

The statistics struct, which was left as a legacy, includes fields like the number of rx_packets or the number of tx_packets. New device drivers use the rtnl_link_stats64 struct (defined in include/uapi/linux/if_link.h) instead of the net_device_stats struct. Most of the network drivers implement the ndo_get_stats64() callback of net_device_ops (or the ndo_get_stats() callback of net_device_ops, when working with the older API). The statistics are exported via /sys/class/net/<deviceName>/statistics.
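Here is a hedged, minimal sketch of an ndo_get_stats64() callback (the driver private struct my_priv and its counters are hypothetical), using the 3.x-era signature that returns the storage pointer:

#include <linux/netdevice.h>

/* Hypothetical driver private data, for illustration only. */
struct my_priv {
	u64 rx_packets;
	u64 tx_packets;
};

static struct rtnl_link_stats64 *
my_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
	struct my_priv *priv = netdev_priv(dev);

	/* Copy the driver's 64-bit counters into the storage
	 * supplied by the core. */
	stats->rx_packets = priv->rx_packets;
	stats->tx_packets = priv->tx_packets;
	return stats;
}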
Some drivers implement the get_ethtool_stats() callback. These drivers show statistics by ethtool -S <deviceName>. See, for example, the rtl8169_get_ethtool_stats() method in drivers/net/ethernet/realtek/r8169.c.

* atomic_long_t rx_dropped

A counter of the number of packets that were dropped in the Rx path by the core network stack. This counter should not be used by drivers. Do not confuse the rx_dropped field of the net_device with the dropped field of the softnet_data struct. The softnet_data struct represents a per-CPU object. They are not equivalent, because the rx_dropped of the net_device might be incremented in several methods, whereas the dropped counter of softnet_data is incremented only by the enqueue_to_backlog() method (net/core/dev.c). The dropped counter of softnet_data is exported by /proc/net/softnet_stat. In /proc/net/softnet_stat you have one line per CPU. The first column is the total packets counter, and the second one is the dropped packets counter.

For example:

cat /proc/net/softnet_stat

00000076 00000001 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000005 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000

You see here one line per CPU (you have two CPUs); for the first CPU, you see 118 total packets (hex 0x76), where one packet is dropped. For the second CPU, you see 5 total packets and 0 dropped.

* struct net_device_ops *netdev_ops

The netdev_ops structure includes pointers for several callback methods that you want to define if you want to override the default behavior. Here are some callbacks of netdev_ops:

* The ndo_init() callback is called when a network device is registered.

* The ndo_uninit() callback is called when the network device is unregistered or when the registration fails.

* The ndo_open() callback handles the change of device state, when a network device state is being changed from down to up.

* The ndo_stop() callback is called when a network device state is being changed to down.

* The ndo_validate_addr() callback is called to check whether the MAC address is valid. Many network drivers set the generic eth_validate_addr() method to be the ndo_validate_addr() callback. The generic eth_validate_addr() method returns true if the MAC address is not a multicast address and is not all zeroes.

* The ndo_set_mac_address() callback sets the MAC address. Many network drivers set the generic eth_mac_addr() method to be the ndo_set_mac_address() callback of struct net_device_ops for setting their MAC address—for example, the VETH driver (drivers/net/veth.c) or the VXLAN driver (drivers/net/vxlan.c).

* The ndo_start_xmit() callback handles packet transmission. It cannot be NULL.

* The ndo_select_queue() callback is used to select a Tx queue, when working with multiqueues. If the ndo_select_queue() callback is not set, then __netdev_pick_tx() is called. See the implementation of the netdev_pick_tx() method in net/core/flow_dissector.c.

* The ndo_change_mtu() callback handles modifying the MTU. It should check that the specified MTU is not less than 68, which is the minimum MTU. In many cases, network drivers set the ndo_change_mtu() callback to be the generic eth_change_mtu() method. The eth_change_mtu() method should be overridden if jumbo frames are supported.

* The ndo_do_ioctl() callback is called when getting an IOCTL request which is not handled by the generic interface code.

* The ndo_tx_timeout() callback is called when the transmitter was idle for quite a while (for watchdog usage).

* The ndo_add_slave() callback is called to set a specified network device as a slave of another network device. It is used, for example, in the team network driver and in the bonding network driver.

* The ndo_del_slave() callback is called to remove a previously enslaved network device.

* The ndo_set_features() callback is called to update the configuration of a network device with new features.

* The ndo_vlan_rx_add_vid() callback is called when registering a VLAN id, if the network device supports VLAN filtering (the NETIF_F_HW_VLAN_FILTER flag is set in the device features).

* The ndo_vlan_rx_kill_vid() callback is called when unregistering a VLAN id, if the network device supports VLAN filtering (the NETIF_F_HW_VLAN_FILTER flag is set in the device features).

Note

From kernel 3.10, the NETIF_F_HW_VLAN_FILTER flag was renamed to NETIF_F_HW_VLAN_CTAG_FILTER.

* There are also several callbacks for handling SR-IOV devices, for example, ndo_set_vf_mac() and ndo_set_vf_vlan().

Before kernel 2.6.29, there was a callback named set_multicast_list() for the addition of multicast addresses; it was replaced by the ndo_set_rx_mode() callback, which is invoked from the dev_set_rx_mode() method (net/core/dev.c). The ndo_set_rx_mode() callback is called primarily whenever the unicast or multicast address lists or the network interface flags are updated.
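To show how a driver wires up these callbacks, here is a hedged, minimal sketch (the my_* names are hypothetical); real drivers fill in many more callbacks:

#include <linux/netdevice.h>
#include <linux/etherdevice.h>

/* Hypothetical callbacks, for illustration only. */
static int my_open(struct net_device *dev)
{
	return 0;	/* bring the hardware up here */
}

static int my_stop(struct net_device *dev)
{
	return 0;	/* bring the hardware down here */
}

static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* hand skb to the hardware Tx queue here */
	return NETDEV_TX_OK;
}

static const struct net_device_ops my_netdev_ops = {
	.ndo_open		= my_open,
	.ndo_stop		= my_stop,
	.ndo_start_xmit		= my_start_xmit,	/* must not be NULL */
	.ndo_validate_addr	= eth_validate_addr,	/* generic helper */
	.ndo_set_mac_address	= eth_mac_addr,		/* generic helper */
	.ndo_change_mtu		= eth_change_mtu,	/* generic helper */
};

/* Typically assigned in the probe() or setup() routine:
 * dev->netdev_ops = &my_netdev_ops;
 */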
* struct ethtool_ops *ethtool_ops

The ethtool_ops structure includes pointers for several callbacks for handling offloads, getting and setting various device settings, reading registers, getting statistics, reading the RX flow hash indirection table, WakeOnLAN parameters, and many more. If the network driver does not initialize the ethtool_ops object, the networking core provides a default empty ethtool_ops object named default_ethtool_ops. The management of ethtool_ops is done in net/core/ethtool.c.

Helper method:

* SET_ETHTOOL_OPS(netdev, ops): A macro that sets the specified ethtool_ops for the specified net_device.

You can view the offload parameters of a network interface device by running ethtool -k <deviceName>. You can set some offload parameters of a network interface device by running ethtool -K <deviceName> <offloadParameter> on/off. See man 8 ethtool.

* const struct header_ops *header_ops

The header_ops struct includes callbacks for creating the Layer 2 header, parsing it, rebuilding it, and more. For Ethernet it is eth_header_ops, defined in net/ethernet/eth.c.

* unsigned int flags

The interface flags of the network device that you can see from userspace. Here are some flags (for a full list, see include/uapi/linux/if.h):

* The IFF_UP flag is set when the interface state is changed from down to up.

* IFF_PROMISC is set when the interface is in promiscuous mode (receives all packets). When running sniffers like wireshark or tcpdump, the network interface is in promiscuous mode.

* IFF_LOOPBACK is set for the loopback device.

* IFF_NOARP is set for devices that do not use the ARP protocol. IFF_NOARP is set, for example, in tunnel devices (see, for example, the ipip_tunnel_setup() method, net/ipv4/ipip.c).

* IFF_POINTOPOINT is set for PPP devices. See, for example, the ppp_setup() method, drivers/net/ppp/ppp_generic.c.
* IFF_MASTER is set for master devices. See, for example, for bonding devices, the bond_setup() method in drivers/net/bonding/bond_main.c.

* The IFF_LIVE_ADDR_CHANGE flag indicates that the device supports hardware address modification while it is running. See the eth_mac_addr() method in net/ethernet/eth.c.

* The IFF_UNICAST_FLT flag is set when the network driver handles unicast address filtering.

* IFF_BONDING is set for a bonding master device or a bonding slave device. The bonding driver provides a method for aggregating multiple network interfaces into a single logical interface.

* IFF_TEAM_PORT is set for a device used as a team port. The teaming driver is a load-balancing network software driver intended to replace the bonding driver.

* IFF_MACVLAN_PORT is set for a device used as a macvlan port.

* IFF_EBRIDGE is set for an Ethernet bridging device.

The flags field is exported by sysfs via /sys/class/net/<deviceName>/flags.

Some of these flags can be set by userspace tools. For example, ifconfig <deviceName> -arp will set the IFF_NOARP network interface flag, and ifconfig <deviceName> arp will clear the IFF_NOARP flag. Note that you can do the same with the iproute2 ip command: ip link set dev <deviceName> arp on and ip link set dev <deviceName> arp off.

* unsigned int priv_flags

The interface flags, which are invisible from userspace. For example, IFF_EBRIDGE for a bridge interface, IFF_BONDING for a bonding interface, or IFF_SUPP_NOFCS for an interface that supports sending a custom FCS.

Helper methods:

* netif_supports_nofcs(): Returns true if IFF_SUPP_NOFCS is set in the priv_flags of the specified device.

* is_vlan_dev(struct net_device *dev): Returns 1 if the IFF_802_1Q_VLAN flag is set in the priv_flags of the specified network device.

* unsigned short gflags

Global flags (kept as legacy).

* unsigned short padded

How much padding is added by the alloc_netdev() method.

* unsigned char operstate

RFC 2863 operstate.

* unsigned char link_mode

Mapping policy to operstate.

* unsigned int mtu

The network interface MTU (Maximum Transmission Unit) value: the maximum size of a frame that the device can handle. RFC 791 sets 68 as a minimum MTU. Each protocol has an MTU of its own. The default MTU for Ethernet is 1,500 bytes; it is set in the ether_setup() method, net/ethernet/eth.c. Ethernet packets with sizes larger than 1,500 bytes, up to 9,000 bytes, are called jumbo frames. The network interface MTU is exported by sysfs via /sys/class/net/<deviceName>/mtu.

Helper method:

* dev_set_mtu(struct net_device *dev, int new_mtu): Changes the MTU of the specified device to the new value specified by the new_mtu parameter.

The sysadmin can change the MTU of a network interface to 1,400, for example, in one of the following ways:

ifconfig <deviceName> mtu 1400
ip link set <deviceName> mtu 1400
echo 1400 > /sys/class/net/<deviceName>/mtu

Many drivers implement the ndo_change_mtu() callback to change the MTU in order to perform driver-specific needed actions (like resetting the network card).

* unsigned short type

The network interface hardware type. For example, for Ethernet it is ARPHRD_ETHER, and it is set in ether_setup() in net/ethernet/eth.c. For a PPP interface, it is ARPHRD_PPP, and it is set in the ppp_setup() method in drivers/net/ppp/ppp_generic.c. The type is exported by sysfs via /sys/class/net/<deviceName>/type.

* unsigned short hard_header_len

The hardware header length. Ethernet headers, for example, consist of a MAC source address, a MAC destination address, and a type. The MAC source and destination addresses are 6 bytes each, and the type is 2 bytes, so the Ethernet header length is 14 bytes. The Ethernet header length is set to 14 (ETH_HLEN) in the ether_setup() method, net/ethernet/eth.c. The ether_setup() method is responsible for initializing some Ethernet device defaults, like the hard header length, Tx queue length, MTU, type, and more.
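Since ether_setup() keeps coming up, here is a hedged sketch (my_dev_setup() and the "myeth%d" name template are hypothetical; the three-argument alloc_netdev() form shown matches kernels of this book's era) of how a driver might build an Ethernet-style device around it:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

/* Hypothetical setup callback, for illustration only. */
static void my_dev_setup(struct net_device *dev)
{
	/* Ethernet defaults: type, MTU, hard_header_len, broadcast
	 * address, Tx queue length, and so on. */
	ether_setup(dev);
	dev->tx_queue_len = 500;	/* override a default if needed */
}

/* Allocation (error handling omitted); 0 means no private data:
 * struct net_device *dev = alloc_netdev(0, "myeth%d", my_dev_setup);
 */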
* unsigned char perm_addr[MAX_ADDR_LEN]

The permanent hardware address (MAC address) of the device.

* unsigned char addr_assign_type

The hardware address assignment type; can be one of the following:

* NET_ADDR_PERM

* NET_ADDR_RANDOM

* NET_ADDR_STOLEN

* NET_ADDR_SET

By default, the MAC address is permanent (NET_ADDR_PERM). If the MAC address was generated with a helper method named eth_hw_addr_random(), the type of the MAC address is NET_ADDR_RANDOM. The type of the MAC address is stored in the addr_assign_type member of the net_device. Also, when changing the MAC address of the device with eth_mac_addr(), you reset the addr_assign_type with ~NET_ADDR_RANDOM (if it was marked as NET_ADDR_RANDOM before). When a network device is registered (by the register_netdevice() method), if the addr_assign_type equals NET_ADDR_PERM, dev->perm_addr is set to be dev->dev_addr. When you set a MAC address, you set the addr_assign_type to be NET_ADDR_SET. This indicates that the MAC address of a device has been set by the dev_set_mac_address() method. The addr_assign_type is exported by sysfs via /sys/class/net/<deviceName>/addr_assign_type.

* unsigned char addr_len

The hardware address length in octets. For Ethernet addresses, it is 6 (ETH_ALEN) bytes and is set in the ether_setup() method. The addr_len is exported by sysfs via /sys/class/net/<deviceName>/addr_len.

* unsigned char neigh_priv_len

Used in the neigh_alloc() method, net/core/neighbour.c; neigh_priv_len is initialized only in the ATM code (atm/clip.c).

* struct netdev_hw_addr_list uc

Unicast MAC addresses list, initialized by the dev_uc_init() method. There are three types of packets in Ethernet: unicast, multicast, and broadcast. Unicast is destined for one machine, multicast is destined for a group of machines, and broadcast is destined for all the machines in the LAN.

Helper methods:

* netdev_uc_empty(dev): Returns 1 if the unicast list of the specified device is empty (its count field is 0).

* dev_uc_flush(struct net_device *dev): Flushes the unicast addresses of the specified network device and zeroes the count field.

* struct netdev_hw_addr_list mc

Multicast MAC addresses list, initialized by the dev_mc_init() method.

Helper methods:

* netdev_mc_empty(dev): Returns 1 if the multicast list of the specified device is empty (its count field is 0).

* dev_mc_flush(struct net_device *dev): Flushes the multicast addresses of the specified network device and zeroes the count field.

* unsigned int promiscuity

A counter of the times a network interface card is told to work in promiscuous mode. With promiscuous mode, packets with a MAC destination address that is different from the interface MAC address are not rejected. The promiscuity counter is used, for example, to enable more than one sniffing client; when opening several sniffing clients (like wireshark), this counter is incremented by 1 for each client you open, and closing a client decrements the promiscuity counter. When the last instance of a sniffing client is closed, promiscuity will be set to 0, and the device will exit from working in promiscuous mode.
It is used also in the bridging subsystem, as the bridge interface needs to work in promiscuous mode. So when adding a bridge interface, the network interface card is set to work in promiscuous mode. See the call to the dev_set_promiscuity() method in br_add_if(), net/bridge/br_if.c.

Helper method:

* dev_set_promiscuity(struct net_device *dev, int inc): Increments/decrements the promiscuity counter of the specified network device according to the specified increment. The dev_set_promiscuity() method can get a positive increment or a negative increment parameter. As long as the promiscuity counter remains above zero, the interface remains in promiscuous mode. Once it reaches zero, the device reverts to normal filtering operation. Because promiscuity is an integer, the dev_set_promiscuity() method takes cyclic integer overflow into account: it handles the case when the promiscuity counter is incremented after it has already reached the maximum positive value an unsigned integer can hold.

* unsigned int allmulti

The allmulti counter of the network device enables or disables the allmulticast mode. When selected, all multicast packets on the network will be received by the interface. You can set a network device to work in allmulticast mode by ifconfig eth0 allmulti. You disable the allmulti flag by ifconfig eth0 -allmulti.

Enabling/disabling the allmulticast mode can also be performed with the ip command:

ip link set p2p1 allmulticast on
ip link set p2p1 allmulticast off

You can also see the allmulticast state by inspecting the flags that are shown by the ip command:

ip addr show

flags=4610<BROADCAST,ALLMULTI,MULTICAST> mtu 1500

Helper method:

* dev_set_allmulti(struct net_device *dev, int inc): Increments/decrements the allmulti counter of the specified network device according to the specified increment (which can be a positive or a negative integer). The dev_set_allmulti() method also sets the IFF_ALLMULTI flag of the network device when setting the allmulticast mode and removes this flag when disabling the allmulticast mode.

The next three fields are protocol-specific pointers:

* struct in_device __rcu *ip_ptr

This pointer is assigned to a pointer to struct in_device, which represents IPv4-specific data, in inetdev_init(), net/ipv4/devinet.c.

* struct inet6_dev __rcu *ip6_ptr

This pointer is assigned to a pointer to struct inet6_dev, which represents IPv6-specific data, in ipv6_add_dev(), net/ipv6/addrconf.c.

* struct wireless_dev *ieee80211_ptr

This is a pointer for the wireless device, assigned in the ieee80211_if_add() method, net/mac80211/iface.c.

* unsigned long last_rx

Time of the last Rx. It should not be set by network device drivers, unless really needed. It is used, for example, in the bonding driver code.

* struct list_head dev_list

The global list of network devices. Insertion into the list is done with the list_netdevice() method, when the network device is registered. Removal from the list is done with the unlist_netdevice() method, when the network device is unregistered.

* struct list_head napi_list

NAPI stands for New API, a technique by which the network driver works in polling mode, and not in interrupt-driven mode, when it is under high traffic. Using NAPI under high traffic has been proven to improve performance.
When working with NAPI, instead of getting an interrupt for each received packet, the network stack buffers the packets, and from time to time the kernel triggers the poll method that the driver registered with the netif_napi_add() method. The driver nonetheless starts out in interrupt-driven mode: when there is an interrupt for the first received packet, you reach the interrupt service routine (ISR), which is the method that was registered with request_irq(). Then the driver disables interrupts and notifies NAPI to take control, usually by calling the __napi_schedule() method from the ISR. See, for example, the cpsw_interrupt() method in drivers/net/ethernet/ti/cpsw.c.

When the traffic is low, the network driver switches back to interrupt-driven mode. Nowadays, most network drivers work with NAPI. The napi_list object is the list of napi_struct objects; the netif_napi_add() method adds napi_struct objects to this list, and the netif_napi_del() method deletes napi_struct objects from this list. When calling the netif_napi_add() method, the driver should specify its polling method and a weight parameter. The weight is a limit on the number of packets the driver will pass to the stack in each polling cycle. It is recommended to use a weight of 64. If a driver attempts to call netif_napi_add() with a weight higher than 64 (NAPI_POLL_WEIGHT), there is a kernel error message. NAPI_POLL_WEIGHT is defined in include/linux/netdevice.h.

The network driver should call napi_enable() to enable NAPI scheduling. Usually this is done in the ndo_open() callback of the net_device_ops object. The network driver should call napi_disable() to disable NAPI scheduling. Usually this is done in the ndo_stop() callback of net_device_ops. NAPI is implemented using softirqs. The softirq handler is the net_rx_action() method, and it is registered by calling open_softirq(NET_RX_SOFTIRQ, net_rx_action) in the net_dev_init() method in net/core/dev.c. The net_rx_action() method invokes the poll method of the network driver which was registered with NAPI. The maximum number of packets (taken from all interfaces which are registered for polling) in one polling cycle (NAPI poll) is by default 300. It is the netdev_budget variable, defined in net/core/dev.c, and it can be modified via a procfs entry, /proc/sys/net/core/netdev_budget. In the past, you could change the weight per device by writing values to a procfs entry, but currently the /sys/class/net/<deviceName>/weight sysfs entry is removed; see Documentation/sysctl/net.txt. I should also mention the napi_complete() method: when a network driver wants to return to interrupt-driven mode, it should call the napi_complete() method to remove itself from the polling list.

* struct list_head unreg_list

The list of unregistered network devices. Devices are added to this list when they are unregistered.

* unsigned char *dev_addr

The MAC address of the network interface. Sometimes you want to assign a random MAC address. You do that by calling the eth_hw_addr_random() method, which also sets the addr_assign_type to NET_ADDR_RANDOM.

The dev_addr field is exported by sysfs via /sys/class/net/<deviceName>/address.

You can change dev_addr with userspace tools like ifconfig or the ip command of iproute2.

Helper methods: many times you invoke the following helper methods on Ethernet addresses in general, and on the dev_addr field of a network device in particular:

* is_zero_ether_addr(const u8 *addr): Returns true if the address is all zeroes.

* is_multicast_ether_addr(const u8 *addr): Returns true if the address is a multicast address. By definition, the broadcast address is also a multicast address.

* is_valid_ether_addr(const u8 *addr): Returns true if the specified MAC address is not 00:00:00:00:00:00, is not a multicast address, and is not a broadcast address (FF:FF:FF:FF:FF:FF).
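Putting two of these helpers together, here is a hedged sketch (fixup_mac_address() is hypothetical) of a pattern many drivers follow when the hardware does not provide a usable MAC address:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

/* Hypothetical fragment, for illustration only. */
static void fixup_mac_address(struct net_device *dev)
{
	/* Reject all-zero, multicast, and broadcast addresses. */
	if (!is_valid_ether_addr(dev->dev_addr))
		/* Generate a random MAC; this also sets
		 * addr_assign_type to NET_ADDR_RANDOM. */
		eth_hw_addr_random(dev);
}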
Helper methods: Many times you invoke the following helper methods on Ethernet addresses in general and on the dev_addr field of a network device in particular:

  * is_zero_ether_addr(const u8 *addr): Returns true if the address is all zeroes.

  * is_multicast_ether_addr(const u8 *addr): Returns true if the address is a multicast address. By definition the broadcast address is also a multicast address.

  * is_valid_ether_addr(const u8 *addr): Returns true if the specified MAC address is not 00:00:00:00:00:00, is not a multicast address, and is not a broadcast address (FF:FF:FF:FF:FF:FF).

  * struct netdev_hw_addr_list dev_addrs

The list of device hardware addresses.

  * unsigned char broadcast[MAX_ADDR_LEN]

The hardware broadcast address. For Ethernet devices, the broadcast address is initialized to all 0xFF bytes (FF:FF:FF:FF:FF:FF) in the ether_setup() method, net/ethernet/eth.c. The broadcast address is exported by sysfs via /sys/class/net/<device>/broadcast.

  * struct kset *queues_kset

A kset is a group of kobjects of a specific type, belonging to a specific subsystem.

The kobject structure is the basic type of the device model. A Tx queue is represented by struct netdev_queue, and an Rx queue is represented by struct netdev_rx_queue. Each of them holds a kobject pointer. The queues_kset object is a group of all kobjects of the Tx queues and Rx queues. Each Rx queue has the sysfs entry /sys/class/net/<device>/queues/rx-<n>, and each Tx queue has the sysfs entry /sys/class/net/<device>/queues/tx-<n>. These entries are added with the rx_queue_add_kobject() method and the netdev_queue_add_kobject() method respectively, in net/core/net-sysfs.c. For more information about the kobject and the device model, see Documentation/kobject.txt.

  * struct netdev_rx_queue *_rx

An array of Rx queues (netdev_rx_queue objects), initialized by the netif_alloc_rx_queues() method. The Rx queue to be used is determined in the get_rps_cpu() method. See more info about RPS in the description of the rxhash field in the previous sk_buff section.

  * unsigned int num_rx_queues

The number of Rx queues allocated in the register_netdev() method.

  * unsigned int real_num_rx_queues

The number of Rx queues currently active in the device.

Helper method:

  * netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq): Sets the actual number of Rx queues used for the specified device according to the specified number of Rx queues. The relevant sysfs entries (/sys/class/net/<device>/queues/*) are updated (only in the case that the state of the device is NETREG_REGISTERED or NETREG_UNREGISTERING). Note that alloc_netdev_mq() initializes num_rx_queues, real_num_rx_queues, num_tx_queues, and real_num_tx_queues to the same value. One can set the number of Tx queues and Rx queues by using ip link when adding a device. For example, if you want to create a VLAN device with 6 Tx queues and 7 Rx queues, you can run this command:

ip link add link p2p1 name p2p1.1 numtxqueues 6 numrxqueues 7 type vlan id 8

  * rx_handler_func_t __rcu *rx_handler

Helper methods:

  * netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data)

The rx_handler callback is set by calling the netdev_rx_handler_register() method. It is used, for example, in bonding, team, openvswitch, macvlan, and bridge devices.

  * netdev_rx_handler_unregister(struct net_device *dev): Unregisters a receive handler for the specified network device.
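To make the rx_handler flow concrete, the following is a minimal sketch of registering a receive handler, in the spirit of what the bridge and macvlan drivers do. The mydev_* names are hypothetical, and a real handler would perform actual steering work with its context:

```c
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static rx_handler_result_t mydev_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	/* The context that was passed to netdev_rx_handler_register(): */
	void *ctx = rcu_dereference(skb->dev->rx_handler_data);

	(void)ctx;
	/* RX_HANDLER_PASS lets the stack continue processing the packet
	 * normally; a bridge, for example, returns RX_HANDLER_CONSUMED
	 * after taking over the skb and forwarding it itself. */
	return RX_HANDLER_PASS;
}

static int mydev_attach(struct net_device *dev, void *ctx)
{
	int err;

	/* netdev_rx_handler_register() must be called with the RTNL held. */
	rtnl_lock();
	err = netdev_rx_handler_register(dev, mydev_handle_frame, ctx);
	rtnl_unlock();
	return err;
}
```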
  * void __rcu *rx_handler_data

The rx_handler_data field is also set by the netdev_rx_handler_register() method, when a non-NULL value is passed to it.

  * struct netdev_queue __rcu *ingress_queue

Helper method:

  * struct netdev_queue *dev_ingress_queue(struct net_device *dev): Returns the ingress_queue of the specified net_device (include/linux/rtnetlink.h).

  * struct netdev_queue *_tx

An array of Tx queues (netdev_queue objects), initialized by the netif_alloc_netdev_queues() method.

Helper method:

  * netdev_get_tx_queue(const struct net_device *dev, unsigned int index): Returns the Tx queue (netdev_queue object), an element of the _tx array of the specified network device at the specified index.

  * unsigned int num_tx_queues

The number of Tx queues, allocated by the alloc_netdev_mq() method.

  * unsigned int real_num_tx_queues

The number of Tx queues currently active in the device.

Helper method:

  * netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq): Sets the actual number of Tx queues used.

  * struct Qdisc *qdisc

Each device maintains a queue of packets to be transmitted, managed by a qdisc. The Qdisc (Queuing Disciplines) layer implements the Linux kernel traffic management. The default qdisc is pfifo_fast. You can set a different qdisc using tc, the traffic control tool of the iproute2 package. You can view the qdisc of your network device by using the ip command:

ip addr show

For example, running

ip addr show eth1

can give:

2: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP qlen 1000

link/ether 00:e0:4c:53:44:58 brd ff:ff:ff:ff:ff:ff

inet 192.168.2.200/24 brd 192.168.2.255 scope global eth1

inet6 fe80::2e0:4cff:fe53:4458/64 scope link

valid_lft forever preferred_lft forever

In this example, you can see that a qdisc of pfifo_fast is used, which is the default.

  * unsigned long tx_queue_len

The maximum number of allowed packets per queue. Each hardware layer has its own tx_queue_len default. For Ethernet devices, tx_queue_len is set to 1,000 by default (see the ether_setup() method). For FDDI, tx_queue_len is set to 100 by default (see the fddi_setup() method in net/802/fddi.c).

The tx_queue_len field is set to 0 for virtual devices, such as the VLAN device, because the actual transmission of packets is done by the real device on which these virtual devices are based. You can set the Tx queue length of a device by using the command ifconfig (where this option is called txqueuelen) or by using the ip command (in ip link output it is shown as qlen), in this way, for example:

ifconfig p2p1 txqueuelen 900

ip link set txqueuelen 950 dev p2p1

The Tx queue length is exported via the following sysfs entry: /sys/class/net/<device>/tx_queue_len.

  * unsigned long trans_start

The time (in jiffies) of the last transmission.

  * int watchdog_timeo

The watchdog is a timer that invokes a callback when the network interface has been idle and has not performed any transmission within some specified timeout interval. Usually the driver defines a watchdog callback which resets the network interface in such a case. The ndo_tx_timeout() callback of net_device_ops serves as the watchdog callback. The watchdog_timeo field represents the timeout that is used by the watchdog. See the dev_watchdog() method, net/sched/sch_generic.c.

  * int __percpu *pcpu_refcnt

The per-CPU network device reference counter.

Helper methods:

  * dev_put(struct net_device *dev): Decrements the reference count.
  * dev_hold(struct net_device *dev): Increments the reference count.

  * struct hlist_node index_hlist

This is a node in a hash table of network devices, which is indexed by the network device index (the ifindex field). A lookup in this table is performed by the dev_get_by_index() method. Insertion into this table is performed by the list_netdevice() method, and removal from this list is done with the unlist_netdevice() method.

  * enum {...} reg_state

An enum that represents the various registration states of the network device.

Possible values:

  * NETREG_UNINITIALIZED: When the device memory is allocated, in the alloc_netdev_mqs() method.

  * NETREG_REGISTERED: When the net_device is registered, in the register_netdevice() method.

  * NETREG_UNREGISTERING: When unregistering a device, in the rollback_registered_many() method.

  * NETREG_UNREGISTERED: The network device is unregistered, but it is not freed yet.

  * NETREG_RELEASED: The network device is in the last stage of freeing the allocated memory of the network device, in the free_netdev() method.

  * NETREG_DUMMY: Used in the dummy device, in the init_dummy_netdev() method. See drivers/net/dummy.c.

  * bool dismantle

A Boolean flag that shows that the device is in the dismantle phase, which means that it is going to be freed.

  * enum {...} rtnl_link_state

This is an enum that can have two values that represent the two phases of creating a new link:

  * RTNL_LINK_INITIALIZING: The ongoing state, when creating the link is still not finished.

  * RTNL_LINK_INITIALIZED: The final state, when the work is finished.

See the rtnl_newlink() method in net/core/rtnetlink.c.

  * void (*destructor)(struct net_device *dev)

This destructor callback is called when unregistering a network device, in the netdev_run_todo() method. It enables network devices to perform additional tasks that need to be done for unregistering. For example, the loopback device destructor callback, loopback_dev_free(), calls free_percpu() for freeing its statistics object and then calls free_netdev(). Likewise, the team device destructor callback, team_destructor(), also calls free_percpu() for freeing its statistics object and then calls free_netdev(). And there are many other network device drivers that define a destructor callback.

  * struct net *nd_net

The network namespace this network device is inside. Network namespaces support was added in the 2.6.29 kernel. Namespaces provide process virtualization, which is considered lightweight in comparison to other virtualization solutions like KVM and Xen. There is currently support for six namespaces in the Linux kernel. In order to support network namespaces, a structure called net was added. This structure represents a network namespace. The process descriptor (task_struct) handles the network namespace and the other namespaces via a member which was added for namespaces support, named nsproxy. The nsproxy includes a network namespace object called net_ns, and also four other namespace objects of the following namespaces: the pid namespace, the mount namespace, the uts namespace, and the ipc namespace; the sixth namespace, the user namespace, is kept in struct cred (the credentials object), which is a member of the process descriptor, task_struct.

Network namespaces provide a partitioning and isolation mechanism which enables one process or a group of processes to have a private view of a full network stack of their own. By default, after boot all network interfaces belong to the default network namespace, init_net.
You can create a network namespace with userspace tools using the ip command from the iproute2 package or with the unshare command of util-linux, or by writing your own userspace application and invoking the unshare() or the clone() system calls with the CLONE_NEWNET flag. Moreover, you can also change the network namespace of a process by invoking the setns() system call. The setns() system call and the unshare() system call were added specially to support namespaces. The setns() system call can attach to the calling process an existing namespace of any type (network namespace, pid namespace, mount namespace, and so on). You need the CAP_SYS_ADMIN privilege to call setns() for all namespaces, except the user namespace. See man 2 setns.

A network device belongs to exactly one network namespace at a given moment, and a network socket belongs to exactly one network namespace at a given moment. Namespaces do not have names, but they do have a unique inode which identifies them. This unique inode is generated when the namespace is created and can be read by reading a procfs entry (the command ls -al /proc/<pid>/ns/ shows the symbolic links of a process, with their unique inode numbers; you can also read these symbolic links with the readlink command).

For example, using the ip command, creating a new namespace called myns1 is done thus:

ip netns add myns1

Each newly created network namespace includes only the loopback device and includes no sockets. Each device (like a bridge device or a VLAN device) that is created from a process that runs in that namespace (like a shell) belongs to that namespace.

Removing a namespace is done using the following command:

ip netns del myns1

Note

After deleting a namespace, all its physical network devices are moved to the default network namespace. Local devices (namespace-local devices that have the NETIF_F_NETNS_LOCAL flag set, like the PPP device or the VXLAN device) are not moved to the default network namespace but are deleted.

Showing the list of all network namespaces on the system is done with this command:

ip netns list

Assigning the p2p1 interface to the myns1 network namespace is done by the command:

ip link set p2p1 netns myns1

Opening a shell in myns1 is done thus:

ip netns exec myns1 bash

With the unshare utility, creating a new namespace and starting a bash shell inside is done thus:

unshare --net bash

Two network namespaces can communicate by using a special virtual Ethernet driver, veth (drivers/net/veth.c).

Helper methods:

  * dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat): Moves the network device to a different network namespace, specified by the net parameter. Local devices (devices in which the NETIF_F_NETNS_LOCAL feature is set) are not allowed to change their namespace. This method returns -EINVAL for this type of device. The pat parameter, when it is not NULL, is the name pattern to try if the current device name is already taken in the destination network namespace. The method also sends a KOBJ_REMOVE uevent for removing the old namespace entries from sysfs, and a KOBJ_ADD uevent to add the sysfs entries to the new namespace. This is done by invoking the kobject_uevent() method specifying the corresponding uevent.

  * dev_net(const struct net_device *dev): Returns the network namespace of the specified network device.
  * dev_net_set(struct net_device *dev, struct net *net): Decrements the reference count of the nd_net (namespace object) of the specified device and assigns the specified network namespace to it.

The following four fields are members in a union:

  * struct pcpu_lstats __percpu *lstats

The loopback network device statistics.

  * struct pcpu_tstats __percpu *tstats

The tunnel statistics.

  * struct pcpu_dstats __percpu *dstats

The dummy network device statistics.

  * struct pcpu_vstats __percpu *vstats

The VETH (Virtual Ethernet) statistics.

  * struct device dev

The device object associated with the network device. Every device in the Linux kernel is associated with a device object, which is an instance of the device structure. For more information about the device structure, I suggest you read the "Devices" section in Chapter 14 of Linux Device Drivers, 3rd Edition (O'Reilly, 2005) and Documentation/driver-model/overview.txt.

Helper methods:

  * to_net_dev(d): Returns the net_device object that contains the specified device as its device object.

  * SET_NETDEV_DEV(net, pdev): Sets the parent of the dev member of the specified network device to be the specified device (the second argument, pdev).

With virtual devices, you do not call the SET_NETDEV_DEV() macro. As a result, entries for these virtual devices are created under /sys/devices/virtual/net.

The SET_NETDEV_DEV() macro should be called before calling the register_netdev() method.

  * SET_NETDEV_DEVTYPE(net, devtype): Sets the type of the dev member of the specified network device to be the specified type. The type is a device_type object.

SET_NETDEV_DEVTYPE() is used, for example, in the br_dev_setup() method, in net/bridge/br_device.c:

static struct device_type br_type = {
.name = "bridge",
};

void br_dev_setup(struct net_device *dev)
{
...
SET_NETDEV_DEVTYPE(dev, &br_type);
...
}

With the udevadm tool (the udev management tool), you can find the device type, for example, for a bridge device named mybr:

udevadm info -q all -p /sys/devices/virtual/net/mybr
P: /devices/virtual/net/mybr
E: DEVPATH=/devices/virtual/net/mybr
E: DEVTYPE=bridge
E: ID_MM_CANDIDATE=1
E: IFINDEX=7
E: INTERFACE=mybr
E: SUBSYSTEM=net

  * const struct attribute_group *sysfs_groups[4]

Used by networking sysfs.

  * struct rtnl_link_ops *rtnl_link_ops

The rtnetlink link operations object. It consists of various callbacks for handling network devices, for example:

  * newlink() for configuring and registering a new device.

  * changelink() for changing parameters of an existing device.

  * dellink() for removing a device.

  * get_num_tx_queues() for getting the number of Tx queues.

  * get_num_rx_queues() for getting the number of Rx queues.

Registration and unregistration of an rtnl_link_ops object is done with the rtnl_link_register() method and the rtnl_link_unregister() method, respectively.

  * unsigned int gso_max_size

Helper method:

  * netif_set_gso_max_size(struct net_device *dev, unsigned int size): Sets the specified gso_max_size for the specified network device.

  * u8 num_tc

The number of traffic classes in the net device.

Helper methods:

  * netdev_set_num_tc(struct net_device *dev, u8 num_tc): Sets the num_tc of the specified network device (the maximum value of num_tc can be TC_MAX_QUEUE, which is 16).

  * int netdev_get_num_tc(struct net_device *dev): Returns the num_tc value of the specified network device. (A short sketch showing these helpers in use follows.)
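As promised, here is a short, hypothetical sketch of these helpers in use: a multiqueue driver carves eight Tx queues into two traffic classes and maps priority 5 to the second class. The netdev_set_tc_queue() and netdev_set_prio_tc_map() helpers fill the tc_to_txq and prio_tc_map fields, which are described next:

```c
#include <linux/netdevice.h>

static int mydrv_setup_tc(struct net_device *dev)
{
	int prio;

	netdev_set_num_tc(dev, 2);		/* two traffic classes */
	netdev_set_tc_queue(dev, 0, 4, 0);	/* TC0: 4 Tx queues, starting at queue 0 */
	netdev_set_tc_queue(dev, 1, 4, 4);	/* TC1: 4 Tx queues, starting at queue 4 */

	/* Map all priorities to TC0, except priority 5, which goes to TC1: */
	for (prio = 0; prio <= TC_BITMASK; prio++)
		netdev_set_prio_tc_map(dev, prio, prio == 5 ? 1 : 0);

	return 0;
}
```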
  * struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]

  * u8 prio_tc_map[TC_BITMASK + 1];

  * struct netprio_map __rcu *priomap

The network priority cgroup module provides an interface to set the priority of network traffic. The cgroups layer is a Linux kernel layer that enables process resource management and process isolation. It enables assigning one task or several tasks to a system resource, like a networking resource, a memory resource, a CPU resource, and so on. The cgroups layer implements a Virtual File System (VFS) and is managed by filesystem operations like mounting/unmounting, creating files and directories, writing to cgroup VFS control files, and so forth. The cgroup project was started in 2005 by developers from Google (Paul Menage, Rohit Seth, and others). Some projects are based on cgroups usage, like systemd and lxc (Linux Containers). Google has its own implementation of containers, based on cgroups. There is no relation between the cgroup implementation and the namespaces implementation. In the past, there was a namespace controller in cgroups, but it was removed. No new system calls were added for the cgroups implementation, and the cgroup code additions are not critical in terms of performance. There are two networking cgroups modules: net_prio and net_cls. These two cgroup modules are relatively short and simple.

Setting the priority of network traffic with the netprio cgroup module is done by writing an entry to a cgroup control file, /sys/fs/cgroup/net_prio/<group>/net_prio.ifpriomap. The entry is in the form "deviceName priority." It is true that an application can set the priority of its traffic via the setsockopt() system call with SO_PRIORITY, but this is not always possible. Sometimes you cannot change the code of certain applications. Moreover, you may want to let the system administrator decide on priority according to site-specific setup. The netprio kernel module is a solution when using the setsockopt() system call with SO_PRIORITY is not feasible. The netprio module also exports another entry under /sys/fs/cgroup/net_prio, net_prio.prioidx. The net_prio.prioidx entry is a read-only file and contains a unique integer value that the kernel uses as an internal representation of this cgroup.

netprio is implemented in net/core/netprio_cgroup.c.

net_cls is implemented in net/sched/cls_cgroup.c.

The network classifier cgroup provides an interface to tag network packets with a class identifier (classid). Creating a net_cls cgroups instance creates a net_cls.classid control file. This net_cls.classid value is initialized to 0. You can set up rules for this classid with tc, the traffic control command of iproute2.

For more information, see Documentation/cgroups/net_cls.txt.

  * struct phy_device *phydev

The associated PHY device. The phy_device is the Layer 1 (physical layer) device. It is defined in include/linux/phy.h. For many devices, PHY parameters like autonegotiation, speed, or duplex can be configured via the PHY device with ethtool commands. See man 8 ethtool for more info.

  * int group

The group that the network device belongs to. It is initialized with INIT_NETDEV_GROUP (0) by default. The group is exported by sysfs via /sys/class/net/<device>/netdev_group. Network device group filters are used, for example, in netfilter, in net/netfilter/xt_devgroup.c.

Helper method:

  * void dev_set_group(struct net_device *dev, int new_group): Changes the group of the specified device to be the specified group.
  * struct pm_qos_request pm_qos_req

The Power Management Quality Of Service request object, defined in include/linux/pm_qos.h.

For more details about PM QoS, see Documentation/power/pm_qos_interface.txt.

Next I will describe the netdev_priv() method and the alloc_netdev() macro, which are used a lot in network drivers.

The netdev_priv(struct net_device *netdev) method returns a pointer to the private data area that is allocated at the end of the net_device. This area is used by drivers, which define a private network interface structure in order to store private data. For example, in drivers/net/ethernet/intel/e1000e/netdev.c:

static int e1000_open(struct net_device *netdev)
{
struct e1000_adapter *adapter = netdev_priv(netdev);
...
}

The netdev_priv() method is also used for software devices, like the VLAN device. So you have:

static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev)
{
return netdev_priv(dev);
}

(net/8021q/vlan.h)

  * The alloc_netdev(sizeof_priv, name, setup) macro is for the allocation and initialization of a network device. It is in fact a wrapper around alloc_netdev_mqs(), with one Tx queue and one Rx queue. sizeof_priv is the size of the private data to allocate space for. The setup method is a callback to initialize the network device. For Ethernet devices, it is usually ether_setup().

For Ethernet devices, you can use the alloc_etherdev() or alloc_etherdev_mq() macros, which eventually invoke alloc_etherdev_mqs(); alloc_etherdev_mqs() is also a wrapper around alloc_netdev_mqs(), with ether_setup() as the setup callback method.

  * Software devices usually define a setup method of their own. So, in PPP you have the ppp_setup() method in drivers/net/ppp/ppp_generic.c, and for VLAN you have vlan_setup(struct net_device *dev) in net/8021q/vlan.h.

## RDMA (Remote DMA)

The following sections describe the RDMA API for the following data structures:

  * RDMA device

  * Protection Domain (PD)

  * eXtended Reliable Connected (XRC)

  * Shared Receive Queue (SRQ)

  * Address Handle (AH)

  * Multicast Groups

  * Completion Queue (CQ)

  * Queue Pair (QP)

  * Memory Window (MW)

  * Memory Region (MR)

## RDMA Device

The following methods are related to the RDMA device.

### The ib_register_client() Method

The ib_register_client() method registers a kernel client that wants to use the RDMA stack. The specified callbacks will be called for every RDMA device that currently exists in the system and for every new device that will be detected or removed by the system (using hot-plug). It will return 0 on success or the errno value with the reason for the failure.

int ib_register_client(struct ib_client *client);

  * client: A structure that describes the attributes of the registration.

#### The ib_client Struct

The device registration attributes are represented by struct ib_client:

struct ib_client {
char *name;
void (*add) (struct ib_device *);
void (*remove)(struct ib_device *);
struct list_head list;
};

  * name: The name of the kernel module to be registered.

  * add: A callback to be called for each RDMA device that exists in the system and for every new RDMA device that will be detected by the kernel.

  * remove: A callback to be called for each RDMA device being removed by the kernel.

### The ib_unregister_client() Method

The ib_unregister_client() method unregisters a kernel module that wants to stop using the RDMA stack.
void ib_unregister_client(struct ib_client *client);

  * client: Should be the same object that was used when ib_register_client() was called.

### The ib_get_client_data() Method

The ib_get_client_data() method returns the client context which was associated with the RDMA device using the ib_set_client_data() method.

void *ib_get_client_data(struct ib_device *device, struct ib_client *client);

  * device: The RDMA device to get the client context from.

  * client: The object that describes the attributes of the registration/unregistration.

### The ib_set_client_data() Method

The ib_set_client_data() method sets a client context to be associated with the RDMA device.

void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data);

  * device: The RDMA device to set the client context with.

  * client: The object that describes the attributes of the registration/unregistration.

  * data: The client context to associate.

### The INIT_IB_EVENT_HANDLER Macro

The INIT_IB_EVENT_HANDLER macro initializes an event handler for the asynchronous events that may occur to the RDMA device. This macro should be used before calling the ib_register_event_handler() method:

#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \
do { \
(_ptr)->device = _device; \
(_ptr)->handler = _handler; \
INIT_LIST_HEAD(&(_ptr)->list); \
} while (0)

  * _ptr: A pointer to the event handler that will be provided to the ib_register_event_handler() method.

  * _device: The RDMA device context; upon its events the callback will be called.

  * _handler: The callback that will be called with every asynchronous event.

### The ib_register_event_handler() Method

The ib_register_event_handler() method registers an RDMA event handler to be called with every asynchronous event. It will return 0 on success or the errno value with the reason for the failure.

int ib_register_event_handler(struct ib_event_handler *event_handler);

  * event_handler: The event handler that was initialized with the macro INIT_IB_EVENT_HANDLER. This callback may occur in interrupt context.

#### The ib_event_handler Struct

The RDMA event handler is represented by struct ib_event_handler:

struct ib_event_handler {
struct ib_device *device;
void (*handler)(struct ib_event_handler *, struct ib_event *);
struct list_head list;
};

#### The ib_event Struct

The event callback is called with the new event that happened to the RDMA device. This event is represented by struct ib_event:

struct ib_event {
struct ib_device *device;
union {
struct ib_cq *cq;
struct ib_qp *qp;
struct ib_srq *srq;
u8 port_num;
} element;
enum ib_event_type event;
};

  * device: The RDMA device on which the asynchronous event occurred.

  * element.cq: If this is a CQ event, the CQ on which the asynchronous event occurred.

  * element.qp: If this is a QP event, the QP on which the asynchronous event occurred.

  * element.srq: If this is an SRQ event, the SRQ on which the asynchronous event occurred.

  * element.port_num: If this is a port event, the port number on which the asynchronous event occurred.

  * event: The type of the asynchronous event that occurred. It can be:

  * IB_EVENT_CQ_ERR: CQ event. An error occurred to the CQ and no more Work Completions will be generated for it.

  * IB_EVENT_QP_FATAL: QP event.
An error occurred to the QP that prevents it from reporting an error through a Work Completion.

  * IB_EVENT_QP_REQ_ERR: QP event. An incoming RDMA request caused a transport error violation in the targeted QP.

  * IB_EVENT_QP_ACCESS_ERR: QP event. An incoming RDMA request caused an access violation in the targeted QP.

  * IB_EVENT_COMM_EST: QP event. A communication established event occurred. An incoming message was received by a QP while it was in the RTR state.

  * IB_EVENT_SQ_DRAINED: QP event. Send Queue drain event. The QP's Send Queue was drained.

  * IB_EVENT_PATH_MIG: QP event. Path migration was completed successfully and the primary path was changed.

  * IB_EVENT_PATH_MIG_ERR: QP event. There was an error when trying to perform path migration.

  * IB_EVENT_DEVICE_FATAL: Device event. There was an error with the RDMA device.

  * IB_EVENT_PORT_ACTIVE: Port event. The port state has become active.

  * IB_EVENT_PORT_ERR: Port event. The port state was active and it is no longer active.

  * IB_EVENT_LID_CHANGE: Port event. The LID of the port was changed.

  * IB_EVENT_PKEY_CHANGE: Port event. A P_Key entry was changed in the port's P_Key table.

  * IB_EVENT_SM_CHANGE: Port event. The Subnet Manager that manages this port was changed.

  * IB_EVENT_SRQ_ERR: SRQ event. An error occurred to the SRQ.

  * IB_EVENT_SRQ_LIMIT_REACHED: SRQ event/SRQ limit event. The number of Receive Requests in the SRQ dropped below the requested watermark.

  * IB_EVENT_QP_LAST_WQE_REACHED: QP event. The last Receive Request was reached from the SRQ, and the QP won't consume any more Receive Requests from it.

  * IB_EVENT_CLIENT_REREGISTER: Port event. The client should reregister to all services from the Subnet Administrator.

  * IB_EVENT_GID_CHANGE: Port event. A GID entry was changed in the port's GID table.

### The ib_unregister_event_handler() Method

The ib_unregister_event_handler() method unregisters an RDMA event handler. It will return 0 on success or the errno value with the reason for the failure.

int ib_unregister_event_handler(struct ib_event_handler *event_handler);

  * event_handler: The event handler to be unregistered. It should be the same object that was registered with ib_register_event_handler().

### The ib_query_device() Method

The ib_query_device() method queries the RDMA device for its attributes. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_device(struct ib_device *device, struct ib_device_attr *device_attr);

  * device: The RDMA device to be queried.

  * device_attr: A pointer to an RDMA device attributes structure to be filled.
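The following is a minimal sketch of a kernel RDMA client that ties the methods of this section together: it registers an ib_client, and for each detected device it queries the device attributes (the ib_device_attr structure is described next) and registers an asynchronous event handler. For simplicity the sketch keeps a single global event handler; a real client would allocate a per-device context and associate it with ib_set_client_data(). Error handling is omitted:

```c
#include <linux/kernel.h>
#include <rdma/ib_verbs.h>

static struct ib_event_handler my_event_handler;

static void my_handle_event(struct ib_event_handler *handler,
			    struct ib_event *event)
{
	pr_info("async event %d on device %s\n",
		event->event, event->device->name);
}

static void my_add_device(struct ib_device *device)
{
	struct ib_device_attr attr;

	if (ib_query_device(device, &attr))
		return;
	pr_info("%s: max_qp=%d max_cq=%d\n",
		device->name, attr.max_qp, attr.max_cq);

	INIT_IB_EVENT_HANDLER(&my_event_handler, device, my_handle_event);
	ib_register_event_handler(&my_event_handler);
}

static void my_remove_device(struct ib_device *device)
{
	ib_unregister_event_handler(&my_event_handler);
}

static struct ib_client my_client = {
	.name   = "my_rdma_client",
	.add    = my_add_device,
	.remove = my_remove_device,
};

/* In the module init method:  ib_register_client(&my_client);
 * in the module exit method:  ib_unregister_client(&my_client); */
```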
#### The ib_device_attr Struct

The RDMA device attributes are represented by struct ib_device_attr:

struct ib_device_attr {
u64 fw_ver;
__be64 sys_image_guid;
u64 max_mr_size;
u64 page_size_cap;
u32 vendor_id;
u32 vendor_part_id;
u32 hw_ver;
int max_qp;
int max_qp_wr;
int device_cap_flags;
int max_sge;
int max_sge_rd;
int max_cq;
int max_cqe;
int max_mr;
int max_pd;
int max_qp_rd_atom;
int max_ee_rd_atom;
int max_res_rd_atom;
int max_qp_init_rd_atom;
int max_ee_init_rd_atom;
enum ib_atomic_cap atomic_cap;
enum ib_atomic_cap masked_atomic_cap;
int max_ee;
int max_rdd;
int max_mw;
int max_raw_ipv6_qp;
int max_raw_ethy_qp;
int max_mcast_grp;
int max_mcast_qp_attach;
int max_total_mcast_qp_attach;
int max_ah;
int max_fmr;
int max_map_per_fmr;
int max_srq;
int max_srq_wr;
int max_srq_sge;
unsigned int max_fast_reg_page_list_len;
u16 max_pkeys;
u8 local_ca_ack_delay;
};

  * fw_ver: A number which represents the FW version of the RDMA device. It can be evaluated as ZZZZYYXX: the Zs are the major number, the Ys are the minor number, and the Xs are the build number.

  * sys_image_guid: The system image GUID: it has a unique value for each system.

  * max_mr_size: The maximum supported MR size.

  * page_size_cap: A bitwise OR of all supported memory page sizes.

  * vendor_id: The IEEE vendor ID.

  * vendor_part_id: The device's part ID, as supplied by the vendor.

  * hw_ver: The device's HW version, as supplied by the vendor.

  * max_qp: Maximum supported number of QPs.

  * max_qp_wr: Maximum supported number of Work Requests in each non-RD QP.

  * device_cap_flags: Supported capabilities of the RDMA device. It is a bitwise OR of the masks:

  * IB_DEVICE_RESIZE_MAX_WR: The RDMA device supports resizing of the number of Work Requests in a QP.

  * IB_DEVICE_BAD_PKEY_CNTR: The RDMA device supports the ability to count the number of bad P_Keys.

  * IB_DEVICE_BAD_QKEY_CNTR: The RDMA device supports the ability to count the number of bad Q_Keys.

  * IB_DEVICE_RAW_MULTI: The RDMA device supports raw packet multicast.

  * IB_DEVICE_AUTO_PATH_MIG: The RDMA device supports Automatic Path Migration.

  * IB_DEVICE_CHANGE_PHY_PORT: The RDMA device supports changing the QP's primary port number.

  * IB_DEVICE_UD_AV_PORT_ENFORCE: The RDMA device supports enforcement of the port number of UD QPs and Address Handles.

  * IB_DEVICE_CURR_QP_STATE_MOD: The RDMA device supports the current QP state modifier when calling ib_modify_qp().

  * IB_DEVICE_SHUTDOWN_PORT: The RDMA device supports port shutdown.

  * IB_DEVICE_INIT_TYPE: The RDMA device supports setting InitType and InitTypeReply.

  * IB_DEVICE_PORT_ACTIVE_EVENT: The RDMA device supports the generation of the port active asynchronous event.

  * IB_DEVICE_SYS_IMAGE_GUID: The RDMA device supports a system image GUID.

  * IB_DEVICE_RC_RNR_NAK_GEN: The RDMA device supports RNR-NAK generation for RC QPs.

  * IB_DEVICE_SRQ_RESIZE: The RDMA device supports resizing an SRQ.

  * IB_DEVICE_N_NOTIFY_CQ: The RDMA device supports notification when N Work Completions exist in the CQ.

  * IB_DEVICE_LOCAL_DMA_LKEY: The RDMA device supports the Zero STag (in iWARP) and the reserved LKey (in InfiniBand).

  * IB_DEVICE_RESERVED: Reserved bit.

  * IB_DEVICE_MEM_WINDOW: The RDMA device supports Memory Windows.
  * IB_DEVICE_UD_IP_CSUM: The RDMA device supports insertion of UDP and TCP checksums on outgoing UD IPoIB messages and can verify the validity of those checksums for incoming messages.

  * IB_DEVICE_UD_TSO: The RDMA device supports TCP Segmentation Offload.

  * IB_DEVICE_XRC: The RDMA device supports the eXtended Reliable Connected transport.

  * IB_DEVICE_MEM_MGT_EXTENSIONS: The RDMA device supports memory management extensions.

  * IB_DEVICE_BLOCK_MULTICAST_LOOPBACK: The RDMA device supports blocking multicast loopback.

  * IB_DEVICE_MEM_WINDOW_TYPE_2A: The RDMA device supports Memory Windows type 2A: association with a QP number.

  * IB_DEVICE_MEM_WINDOW_TYPE_2B: The RDMA device supports Memory Windows type 2B: association with a QP number and a PD.

  * max_sge: Maximum supported number of scatter/gather elements per Work Request in a non-RD QP.

  * max_sge_rd: Maximum supported number of scatter/gather elements per Work Request in an RD QP.

  * max_cq: Maximum supported number of CQs.

  * max_cqe: Maximum supported number of entries in each CQ.

  * max_mr: Maximum supported number of MRs.

  * max_pd: Maximum supported number of PDs.

  * max_qp_rd_atom: Maximum number of RDMA Read and Atomic operations that can be sent to a QP as the target of the operation.

  * max_ee_rd_atom: Maximum number of RDMA Read and Atomic operations that can be sent to an EE context as the target of the operation.

  * max_res_rd_atom: Maximum number of incoming RDMA Read and Atomic operations that can be sent to this RDMA device as the target of the operation.

  * max_qp_init_rd_atom: Maximum number of RDMA Read and Atomic operations that can be sent from a QP as the initiator of the operation.

  * max_ee_init_rd_atom: Maximum number of RDMA Read and Atomic operations that can be sent from an EE context as the initiator of the operation.

  * atomic_cap: The ability of the device to support atomic operations. Can be:

  * IB_ATOMIC_NONE: The RDMA device doesn't guarantee any atomicity at all.

  * IB_ATOMIC_HCA: The RDMA device guarantees atomicity between QPs in the same device.

  * IB_ATOMIC_GLOB: The RDMA device guarantees atomicity between this device and any other component.

  * masked_atomic_cap: The ability of the device to support masked atomic operations. Possible values are as described for atomic_cap earlier.

  * max_ee: Maximum supported number of EE contexts.

  * max_rdd: Maximum supported number of RDDs.

  * max_mw: Maximum supported number of MWs.

  * max_raw_ipv6_qp: Maximum supported number of Raw IPv6 Datagram QPs.

  * max_raw_ethy_qp: Maximum supported number of Raw Ethertype Datagram QPs.

  * max_mcast_grp: Maximum supported number of multicast groups.

  * max_mcast_qp_attach: Maximum supported number of QPs that can be attached to each multicast group.

  * max_total_mcast_qp_attach: Maximum total number of QPs that can be attached to any multicast group.

  * max_ah: Maximum supported number of AHs.

  * max_fmr: Maximum supported number of FMRs.

  * max_map_per_fmr: Maximum supported number of map operations allowed per FMR.

  * max_srq: Maximum supported number of SRQs.

  * max_srq_wr: Maximum supported number of Work Requests in each SRQ.

  * max_srq_sge: Maximum supported number of scatter/gather elements per Work Request in an SRQ.

  * max_fast_reg_page_list_len: Maximum number of entries in a page list that can be used when registering memory using a fast registration Work Request.
  * max_pkeys: Maximum supported number of P_Keys.

  * local_ca_ack_delay: The local CA ack delay. This value specifies the maximum expected time interval between the local device receiving a message and transmitting the associated ACK or NAK.

### The ib_query_port() Method

The ib_query_port() method queries the RDMA device port's attributes. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr);

  * device: The RDMA device to be queried.

  * port_num: The port number to be queried.

  * port_attr: A pointer to a structure of RDMA port attributes which will be filled.

#### The ib_port_attr Struct

The RDMA port attributes are represented by struct ib_port_attr:

struct ib_port_attr {
enum ib_port_state state;
enum ib_mtu max_mtu;
enum ib_mtu active_mtu;
int gid_tbl_len;
u32 port_cap_flags;
u32 max_msg_sz;
u32 bad_pkey_cntr;
u32 qkey_viol_cntr;
u16 pkey_tbl_len;
u16 lid;
u16 sm_lid;
u8 lmc;
u8 max_vl_num;
u8 sm_sl;
u8 subnet_timeout;
u8 init_type_reply;
u8 active_width;
u8 active_speed;
u8 phys_state;
};

  * state: The logical port state. Can be:

  * IB_PORT_NOP: Reserved value.

  * IB_PORT_DOWN: Logical link is down.

  * IB_PORT_INIT: Logical link is initialized. The physical link is up, but the Subnet Manager hasn't started to configure the port.

  * IB_PORT_ARMED: Logical link is armed. The physical link is up, but the Subnet Manager started, and did not yet complete, configuring the port.

  * IB_PORT_ACTIVE: Logical link is active.

  * IB_PORT_ACTIVE_DEFER: Logical link is active, but the physical link is down. The link tries to recover from this state.

  * max_mtu: The maximum MTU supported by this port. Can be:

  * IB_MTU_256: 256 bytes.

  * IB_MTU_512: 512 bytes.

  * IB_MTU_1024: 1,024 bytes.

  * IB_MTU_2048: 2,048 bytes.

  * IB_MTU_4096: 4,096 bytes.

  * active_mtu: The actual MTU that this port is configured with. Possible values are as for max_mtu, mentioned earlier.

  * gid_tbl_len: The number of entries in the port's GID table.

  * port_cap_flags: The port's supported capabilities. It is a bitwise OR of the masks:

  * IB_PORT_SM: An indication that the SM that manages the subnet is sending packets from this port.

  * IB_PORT_NOTICE_SUP: An indication that this port supports notices.

  * IB_PORT_TRAP_SUP: An indication that this port supports traps.

  * IB_PORT_OPT_IPD_SUP: An indication that this port supports Inter Packet Delay optional values.

  * IB_PORT_AUTO_MIGR_SUP: An indication that this port supports Automatic Path Migration.

  * IB_PORT_SL_MAP_SUP: An indication that this port supports an SL to VL mapping table.

  * IB_PORT_MKEY_NVRAM: An indication that this port supports saving the M_Key attributes in Non-Volatile RAM.

  * IB_PORT_PKEY_NVRAM: An indication that this port supports saving the P_Key table in Non-Volatile RAM.

  * IB_PORT_LED_INFO_SUP: An indication that this port supports turning the LED on and off using management packets.

  * IB_PORT_SM_DISABLED: An indication that there is an SM which isn't active in this port.

  * IB_PORT_SYS_IMAGE_GUID_SUP: An indication that the port supports a system image GUID.

  * IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP: An indication that the SMA on the switch management port will monitor P_Key mismatches on each switch external port.
  * IB_PORT_EXTENDED_SPEEDS_SUP: An indication that the port supports extended speeds (FDR and EDR).

  * IB_PORT_CM_SUP: An indication that this port supports CM.

  * IB_PORT_SNMP_TUNNEL_SUP: An indication that an SNMP tunneling agent is listening on this port.

  * IB_PORT_REINIT_SUP: An indication that this port supports reinitialization of the node.

  * IB_PORT_DEVICE_MGMT_SUP: An indication that this port supports device management.

  * IB_PORT_VENDOR_CLASS_SUP: An indication that a vendor-specific agent is listening on this port.

  * IB_PORT_DR_NOTICE_SUP: An indication that this port supports Direct Route notices.

  * IB_PORT_CAP_MASK_NOTICE_SUP: An indication that this port supports sending a notice if the port's port_cap_flags is changed.

  * IB_PORT_BOOT_MGMT_SUP: An indication that a boot manager agent is listening on this port.

  * IB_PORT_LINK_LATENCY_SUP: An indication that this port supports link round-trip latency measurement.

  * IB_PORT_CLIENT_REG_SUP: An indication that this port is capable of generating the IB_EVENT_CLIENT_REREGISTER asynchronous event.

  * max_msg_sz: The maximum message size supported by this port.

  * bad_pkey_cntr: A counter for the number of bad P_Keys in messages that this port received.

  * qkey_viol_cntr: A counter for the number of Q_Key violations in messages that this port received.

  * pkey_tbl_len: The number of entries in the port's P_Key table.

  * lid: The port's Local Identifier (LID), as assigned by the SM.

  * sm_lid: The LID of the SM.

  * lmc: The LID mask of this port.

  * max_vl_num: Maximum number of Virtual Lanes supported by this port. Can be:

  * 1: 1 VL is supported: VL0

  * 2: 2 VLs are supported: VL0–VL1

  * 3: 4 VLs are supported: VL0–VL3

  * 4: 8 VLs are supported: VL0–VL7

  * 5: 15 VLs are supported: VL0–VL14

  * sm_sl: The SL to be used when sending messages to the SM.

  * subnet_timeout: The maximum expected subnet propagation delay. This duration of time is calculated as 4.096*2^subnet_timeout usec.

  * init_type_reply: The value that the SM configures before moving the port state to IB_PORT_ARMED or IB_PORT_ACTIVE to specify the type of initialization performed.

  * active_width: The port's active width. Can be:

  * IB_WIDTH_1X: Multiple of 1.

  * IB_WIDTH_4X: Multiple of 4.

  * IB_WIDTH_8X: Multiple of 8.

  * IB_WIDTH_12X: Multiple of 12.

  * active_speed: The port's active speed. Can be:

  * IB_SPEED_SDR: Single Data Rate (SDR): 2.5 Gb/sec, 8/10 bit encoding.

  * IB_SPEED_DDR: Double Data Rate (DDR): 5 Gb/sec, 8/10 bit encoding.

  * IB_SPEED_QDR: Quad Data Rate (QDR): 10 Gb/sec, 8/10 bit encoding.

  * IB_SPEED_FDR10: Fourteen Data Rate 10 (FDR10): 10.3125 Gb/sec, 64/66 bit encoding.

  * IB_SPEED_FDR: Fourteen Data Rate (FDR): 14.0625 Gb/sec, 64/66 bit encoding.

  * IB_SPEED_EDR: Enhanced Data Rate (EDR): 25.78125 Gb/sec.

  * phys_state: The physical port state. There isn't any enumeration for this value.

### The rdma_port_get_link_layer() Method

The rdma_port_get_link_layer() method returns the link layer of the RDMA device port. It will return one of the following values:

  * IB_LINK_LAYER_UNSPECIFIED: Unspecified value, usually a legacy value that indicates that this is an InfiniBand link layer.

  * IB_LINK_LAYER_INFINIBAND: The link layer is InfiniBand.

  * IB_LINK_LAYER_ETHERNET: The link layer is Ethernet. This indicates that the port supports RDMA Over Converged Ethernet (RoCE).
enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num);

  * device: The RDMA device to be queried.

  * port_num: The port number to be queried.

### The ib_query_gid() Method

The ib_query_gid() method queries the RDMA device port's GID table. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid);

  * device: The RDMA device to be queried.

  * port_num: The port number to be queried.

  * index: The index in the GID table to be queried.

  * gid: A pointer to the GID union to be filled.

### The ib_query_pkey() Method

The ib_query_pkey() method queries the RDMA device port's P_Key table. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey);

  * device: The RDMA device to be queried.

  * port_num: The port number to be queried.

  * index: The index in the P_Key table to be queried.

  * pkey: A pointer to the P_Key to be filled.

### The ib_modify_device() Method

The ib_modify_device() method modifies the RDMA device attributes. It will return 0 on success or the errno value with the reason for the failure.

int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify);

  * device: The RDMA device to be modified.

  * device_modify_mask: The device attributes to be changed. It is a bitwise OR of the masks:

  * IB_DEVICE_MODIFY_SYS_IMAGE_GUID: Modifies the system image GUID.

  * IB_DEVICE_MODIFY_NODE_DESC: Modifies the node description.

  * device_modify: The RDMA attributes to be modified, as described in the next section.

#### The ib_device_modify Struct

The RDMA device attributes are represented by struct ib_device_modify:

struct ib_device_modify {
u64 sys_image_guid;
char node_desc[64];
};

  * sys_image_guid: A 64-bit value of the system image GUID.

  * node_desc: A NULL-terminated string that describes the node description.

### The ib_modify_port() Method

The ib_modify_port() method modifies the RDMA device port's attributes. It will return 0 on success or the errno value with the reason for the failure.

int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify);

  * device: The RDMA device to be modified.

  * port_num: The port number to be modified.

  * port_modify_mask: The port's attributes to be changed. It is a bitwise OR of the masks:

  * IB_PORT_SHUTDOWN: Moves the port state to IB_PORT_DOWN.

  * IB_PORT_INIT_TYPE: Sets the port InitType value.

  * IB_PORT_RESET_QKEY_CNTR: Resets the port's Q_Key violation counter.

  * port_modify: The port attributes to be modified, as described in the next section.

#### The ib_port_modify Struct

The RDMA port attributes to be modified are represented by struct ib_port_modify:

struct ib_port_modify {
u32 set_port_cap_mask;
u32 clr_port_cap_mask;
u8 init_type;
};

  * set_port_cap_mask: The port capabilities bits to be set.

  * clr_port_cap_mask: The port capabilities bits to be cleared.

  * init_type: The InitType value to be set.

### The ib_find_gid() Method

The ib_find_gid() method finds the port number and the index where a specific GID value exists in the GID table. It will return 0 on success or the errno value with the reason for the failure.
int ib_find_gid(struct ib_device *device, union ib_gid *gid, u8 *port_num, u16 *index);

  * device: The RDMA device to be queried.

  * gid: A pointer to the GID to search for.

  * port_num: Will be filled with the port number that this GID exists in.

  * index: Will be filled with the index in the GID table that this GID exists in.

### The ib_find_pkey() Method

The ib_find_pkey() method finds the index where a specific P_Key value exists in the P_Key table of a specific port number. It will return 0 on success or the errno value with the reason for the failure.

int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index);

  * device: The RDMA device to be queried.

  * port_num: The port number to search the P_Key in.

  * pkey: The P_Key value to search for.

  * index: Will be filled with the index in the P_Key table where this P_Key exists.

### The rdma_node_get_transport() Method

The rdma_node_get_transport() method returns the RDMA transport type of a specific node type. The available transport types can be:

  * RDMA_TRANSPORT_IB: The transport is InfiniBand.

  * RDMA_TRANSPORT_IWARP: The transport is iWARP.

enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__;

  * node_type: The node type. Can be:

  * RDMA_NODE_IB_CA: Node type is an InfiniBand Channel Adapter.

  * RDMA_NODE_IB_SWITCH: Node type is an InfiniBand Switch.

  * RDMA_NODE_IB_ROUTER: Node type is an InfiniBand Router.

  * RDMA_NODE_RNIC: Node type is an RDMA NIC.

### The ib_mtu_enum_to_int() Method

The ib_mtu_enum_to_int() method returns the number of bytes, as an integer, for an MTU enumeration. It will return a positive value on success or -1 on failure.

static inline int ib_mtu_enum_to_int(enum ib_mtu mtu);

  * mtu: Can be an MTU enumeration, as described earlier.

### The ib_width_enum_to_int() Method

The ib_width_enum_to_int() method returns the width multiple, as an integer, for an IB port width enumeration. It will return a positive value on success or -1 on failure.

static inline int ib_width_enum_to_int(enum ib_port_width width);

  * width: Can be a port width enumeration, as described earlier.

### The ib_rate_to_mult() Method

The ib_rate_to_mult() method returns the multiple of the base rate of 2.5 Gbit/sec, as an integer, for an IB rate enumeration. It will return a positive value on success or -1 on failure.

int ib_rate_to_mult(enum ib_rate rate) __attribute_const__;

  * rate: The rate enumeration to be converted. Can be:

  * IB_RATE_PORT_CURRENT: The current port's rate.

  * IB_RATE_2_5_GBPS: Rate of 2.5 Gbit/sec.

  * IB_RATE_5_GBPS: Rate of 5 Gbit/sec.

  * IB_RATE_10_GBPS: Rate of 10 Gbit/sec.

  * IB_RATE_20_GBPS: Rate of 20 Gbit/sec.

  * IB_RATE_30_GBPS: Rate of 30 Gbit/sec.

  * IB_RATE_40_GBPS: Rate of 40 Gbit/sec.

  * IB_RATE_60_GBPS: Rate of 60 Gbit/sec.

  * IB_RATE_80_GBPS: Rate of 80 Gbit/sec.

  * IB_RATE_120_GBPS: Rate of 120 Gbit/sec.

  * IB_RATE_14_GBPS: Rate of 14 Gbit/sec.

  * IB_RATE_56_GBPS: Rate of 56 Gbit/sec.

  * IB_RATE_112_GBPS: Rate of 112 Gbit/sec.

  * IB_RATE_168_GBPS: Rate of 168 Gbit/sec.

  * IB_RATE_25_GBPS: Rate of 25 Gbit/sec.

  * IB_RATE_100_GBPS: Rate of 100 Gbit/sec.

  * IB_RATE_200_GBPS: Rate of 200 Gbit/sec.

  * IB_RATE_300_GBPS: Rate of 300 Gbit/sec.
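Before turning to the remaining rate-conversion helpers, here is a minimal sketch that ties together the query methods described earlier in this section: it queries the attributes, the link layer, and the first GID and P_Key table entries of a given port. The my_dump_port() name is hypothetical, and error handling is minimal:

```c
#include <linux/kernel.h>
#include <rdma/ib_verbs.h>

static void my_dump_port(struct ib_device *device, u8 port_num)
{
	struct ib_port_attr port_attr;
	union ib_gid gid;
	u16 pkey;

	if (ib_query_port(device, port_num, &port_attr))
		return;
	pr_info("%s port %d: state=%d lid=0x%x active_mtu=%d\n",
		device->name, port_num, port_attr.state,
		port_attr.lid, port_attr.active_mtu);

	if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET)
		pr_info("port %d supports RoCE\n", port_num);

	/* Entry 0 of the GID table and of the P_Key table: */
	if (!ib_query_gid(device, port_num, 0, &gid))
		pr_info("GID[0] subnet prefix: 0x%llx\n",
			(unsigned long long)be64_to_cpu(gid.global.subnet_prefix));
	if (!ib_query_pkey(device, port_num, 0, &pkey))
		pr_info("P_Key[0]: 0x%x\n", pkey);
}
```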
### The ib_rate_to_mbps() Method

The ib_rate_to_mbps() method returns the number of Mbit/sec, as an integer, for an IB rate enumeration. It will return a positive value on success or -1 on failure.

int ib_rate_to_mbps(enum ib_rate rate) __attribute_const__;

  * rate: The rate enumeration to be converted, as described earlier.

### The mult_to_ib_rate() Method

The mult_to_ib_rate() method returns the IB rate enumeration for a multiple of the base rate of 2.5 Gbit/sec. It will return a positive value on success or -1 on failure.

enum ib_rate mult_to_ib_rate(int mult) __attribute_const__;

  * mult: The rate multiple to be converted, as described earlier.

## Protection Domain (PD)

A PD is an RDMA resource that associates QPs and SRQs with MRs, and AHs with QPs. One can look at a PD as a color; for example, a red MR can work with a red QP, and a red AH can work with a red QP. Using a green AH with a red QP will result in an error.

### The ib_alloc_pd() Method

The ib_alloc_pd() method allocates a PD. It will return a pointer to the newly allocated PD on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_pd *ib_alloc_pd(struct ib_device *device);

  * device: The RDMA device that the PD will be associated with.

### The ib_dealloc_pd() Method

The ib_dealloc_pd() method deallocates a PD. It will return 0 on success or the errno value with the reason for the failure.

int ib_dealloc_pd(struct ib_pd *pd);

  * pd: The PD to be deallocated.

## eXtended Reliable Connected (XRC)

XRC is an IB transport extension that provides better scalability, on the sender side, for Reliable Connected QPs than the original Reliable Connected transport can provide. Using XRC will decrease the number of QPs between two specific cores: when using RC QPs, there is a QP for each core, in each machine. When using XRC, there will be one XRC QP in each host. When sending a message, the sender needs to specify the remote SRQ number that will receive the message.

### The ib_alloc_xrcd() Method

The ib_alloc_xrcd() method allocates an XRC domain. It will return a pointer to the newly created XRC domain on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device);

  * device: The RDMA device that this XRC domain will be allocated on.

### The ib_dealloc_xrcd() Method

The ib_dealloc_xrcd() method deallocates an XRC domain. It will return 0 on success or the errno value with the reason for the failure:

int ib_dealloc_xrcd(struct ib_xrcd *xrcd);

  * xrcd: The XRC domain to be deallocated.

## Shared Receive Queue (SRQ)

An SRQ is a resource that helps RDMA to be more scalable. Instead of managing the Receive Requests in the Receive Queues of many QPs, it is possible to manage them in a single Receive Queue that all of them share. This will eliminate starvation in RC QPs or packet drops in unreliable transport types and will help to reduce the total number of posted Receive Requests, thus reducing the consumed memory. Furthermore, unlike a QP, an SRQ can have a watermark to allow a notification if the number of RRs in the SRQ drops below a specific value.

### The ib_srq_attr Struct

The SRQ attributes are represented by struct ib_srq_attr:

struct ib_srq_attr {
u32 max_wr;
u32 max_sge;
u32 srq_limit;
};

  * max_wr: The maximum number of outstanding RRs that this SRQ can hold.
  * max_sge: The maximum number of scatter/gather elements that each RR in the SRQ can hold.

  * srq_limit: The watermark limit that creates an asynchronous event if the number of RRs in the SRQ drops below this value.

### The ib_create_srq() Method

The ib_create_srq() method creates an SRQ. It will return a pointer to the newly created SRQ on success or an ERR_PTR() which specifies the reason for the failure:

struct ib_srq *ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr);

  * pd: The PD that this SRQ is being associated with.

  * srq_init_attr: The attributes that this SRQ will be created with.

#### The ib_srq_init_attr Struct

The created SRQ attributes are represented by struct ib_srq_init_attr:

struct ib_srq_init_attr {
void (*event_handler)(struct ib_event *, void *);
void *srq_context;
struct ib_srq_attr attr;
enum ib_srq_type srq_type;
union {
struct {
struct ib_xrcd *xrcd;
struct ib_cq *cq;
} xrc;
} ext;
};

  * event_handler: A pointer to a callback that will be called in case of an affiliated asynchronous event on the SRQ.

  * srq_context: User-defined context that can be associated with the SRQ.

  * attr: The SRQ attributes, as described earlier.

  * srq_type: The type of the SRQ. Can be:

  * IB_SRQT_BASIC: For a regular SRQ.

  * IB_SRQT_XRC: For an XRC SRQ.

  * ext: If srq_type is IB_SRQT_XRC, specifies the XRC domain and the CQ that this SRQ is associated with.

### The ib_modify_srq() Method

The ib_modify_srq() method modifies the attributes of the SRQ. It will return 0 on success or the errno value with the reason for the failure.

int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask);

  * srq: The SRQ to be modified.

  * srq_attr: The SRQ attributes, as described earlier.

  * srq_attr_mask: The SRQ attributes to be changed. It is a bitwise OR of the masks:

  * IB_SRQ_MAX_WR: Modifies the number of RRs in the SRQ (that is, resizes the SRQ). This can be done only if the device supports SRQ resize, that is, if IB_DEVICE_SRQ_RESIZE is set in the device flags.

  * IB_SRQ_LIMIT: Sets the value of the SRQ watermark limit.

### The ib_query_srq() Method

The ib_query_srq() method queries for the current SRQ attributes. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);

  * srq: The SRQ to be queried.

  * srq_attr: The SRQ attributes, as described earlier.

### The ib_destroy_srq() Method

The ib_destroy_srq() method destroys an SRQ. It will return 0 on success or the errno value with the reason for the failure.

int ib_destroy_srq(struct ib_srq *srq);

  * srq: The SRQ to be destroyed.

### The ib_post_srq_recv() Method

The ib_post_srq_recv() method takes a linked list of Receive Requests and adds them to the SRQ for future processing. Every Receive Request is considered outstanding until a Work Completion is generated after its processing. It will return 0 on success or the errno value with the reason for the failure.

static inline int ib_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr);

  * srq: The SRQ that the Receive Requests will be posted to.

  * recv_wr: A linked list of Receive Requests to be posted.
 * bad_recv_wr: If there was an error with the handling of the Receive Requests, this pointer will be filled with the address of the Receive Request that caused this error.

#### The ib_recv_wr Struct

The Receive Request is represented by struct ib_recv_wr:

    struct ib_recv_wr {
        struct ib_recv_wr *next;
        u64 wr_id;
        struct ib_sge *sg_list;
        int num_sge;
    };

 * next: A pointer to the next Receive Request in the list, or NULL if this is the last Receive Request.
 * wr_id: A 64-bit value that is associated with this Receive Request and will be available in the corresponding Work Completion.
 * sg_list: The array of the scatter/gather elements, as described in the next section.
 * num_sge: The number of entries in sg_list. The value zero means that the message to be received is zero bytes long.

#### The ib_sge Struct

The scatter/gather element is represented by struct ib_sge:

    struct ib_sge {
        u64 addr;
        u32 length;
        u32 lkey;
    };

 * addr: The address of the buffer to access.
 * length: The length of the buffer to access.
 * lkey: The Local Key of the Memory Region that this buffer was registered with.

## Address Handle (AH)

AH is an RDMA resource that describes the path from the local port to the remote port of the destination. It is used with UD QPs.

### The ib_ah_attr Struct

The AH attributes are represented by struct ib_ah_attr:

    struct ib_ah_attr {
        struct ib_global_route grh;
        u16 dlid;
        u8 sl;
        u8 src_path_bits;
        u8 static_rate;
        u8 ah_flags;
        u8 port_num;
    };

 * grh: The Global Routing Header attributes that are used for sending messages to another subnet or to a multicast group in the local or remote subnet.
 * dlid: The destination LID.
 * sl: The Service Level that this message will use.
 * src_path_bits: The source path bits to use. Relevant if LMC is used in this port.
 * static_rate: The level of delay that should be applied between sending the messages. It is used when sending a message to a remote node that supports a slower message rate than the local node.
 * ah_flags: The AH flags. It is a bitwise OR of the masks:
    * IB_AH_GRH: GRH is used in this AH.
 * port_num: The local port number that messages will be sent from.

### The ib_create_ah() Method

The ib_create_ah() method creates an AH. It will return a pointer to the newly created AH on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);

 * pd: The PD that this AH is being associated with.
 * ah_attr: The attributes that this AH will be created with.

### The ib_init_ah_from_wc() Method

The ib_init_ah_from_wc() method initializes an AH attribute structure from a Work Completion and a GRH structure. This is done in order to send a reply for an incoming message on a UD QP. It will return 0 on success or the errno value with the reason for the failure.

int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, struct ib_grh *grh, struct ib_ah_attr *ah_attr);

 * device: The RDMA device that the Work Completion came from and that the AH will be created on.
 * port_num: The port number that the Work Completion came from and the AH will be associated with.
 * wc: The Work Completion of the incoming message.
 * grh: The GRH buffer of the incoming message.
 * ah_attr: The attributes of this AH to be filled.
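Putting the AH attributes together, the following is a minimal sketch (illustrative, not taken from a specific driver) of creating an AH toward a unicast peer in the local subnet. The function name and the destination parameters (remote_dlid, port_num) are hypothetical and assumed to have been obtained earlier, for example from a path query:

    /* Illustrative sketch: create an AH for a UD QP toward a peer in the
     * local subnet. pd was allocated with ib_alloc_pd(); remote_dlid and
     * port_num were learned out of band (e.g., from a path query).
     */
    struct ib_ah *create_unicast_ah(struct ib_pd *pd, u16 remote_dlid,
                                    u8 port_num)
    {
        struct ib_ah_attr ah_attr;
        struct ib_ah *ah;

        memset(&ah_attr, 0, sizeof(ah_attr));
        ah_attr.dlid = remote_dlid;   /* destination LID */
        ah_attr.sl = 0;               /* Service Level 0 */
        ah_attr.port_num = port_num;  /* local port to send from */
        /* ah_flags stays 0: no GRH, since the peer is in the local subnet */

        ah = ib_create_ah(pd, &ah_attr);
        return IS_ERR(ah) ? NULL : ah; /* PTR_ERR(ah) holds the reason */
    }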
### The ib_create_ah_from_wc() Method

The ib_create_ah_from_wc() method creates an AH from a Work Completion and a GRH structure. This is done in order to send a reply for an incoming message on a UD QP. It will return a pointer to the newly created AH on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, struct ib_grh *grh, u8 port_num);

 * pd: The PD that this AH is being associated with.
 * wc: The Work Completion of the incoming message.
 * grh: The GRH buffer of the incoming message.
 * port_num: The port number that the Work Completion came from and the AH will be associated with.

### The ib_modify_ah() Method

The ib_modify_ah() method modifies the attributes of the AH. It will return 0 on success or the errno value with the reason for the failure.

int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);

 * ah: The AH to be modified.
 * ah_attr: The AH attributes, as described earlier.

### The ib_query_ah() Method

The ib_query_ah() method queries for the current AH attributes. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);

 * ah: The AH to be queried.
 * ah_attr: The AH attributes, as described earlier.

### The ib_destroy_ah() Method

The ib_destroy_ah() method destroys an AH. It will return 0 on success or the errno value with the reason for the failure.

int ib_destroy_ah(struct ib_ah *ah);

 * ah: The AH to be destroyed.

## Multicast Groups

Multicast groups are a means to send a message from one UD QP to many UD QPs. Every UD QP that wants to get this message needs to be attached to the multicast group.

### The ib_attach_mcast() Method

The ib_attach_mcast() method attaches a UD QP to a multicast group within an RDMA device. It will return 0 on success or the errno value with the reason for the failure.

int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);

 * qp: A handle of the UD QP to be attached to the multicast group.
 * gid: The GID of the multicast group that the QP will be added to.
 * lid: The LID of the multicast group that the QP will be added to.

### The ib_detach_mcast() Method

The ib_detach_mcast() method detaches a UD QP from a multicast group within an RDMA device. It will return 0 on success or the errno value with the reason for the failure.

int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);

 * qp: A handle of the UD QP to be detached from the multicast group.
 * gid: The GID of the multicast group that the QP will be removed from.
 * lid: The LID of the multicast group that the QP will be removed from.

## Completion Queue (CQ)

A Work Completion specifies that a corresponding Work Request was completed and provides some information about it: its status, the used opcode, its size, and so on. A CQ is an object that consists of Work Completions.

### The ib_create_cq() Method

The ib_create_cq() method creates a CQ. It will return a pointer to the newly created CQ on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_cq *ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, int cqe, int comp_vector);

 * device: The RDMA device that this CQ is being associated with.
 * comp_handler: A pointer to a callback that will be called when a Completion event occurs on the CQ.
 * event_handler: A pointer to a callback that will be called in case of an affiliated asynchronous event to the CQ.
 * cq_context: A user-defined context that can be associated with the CQ.
 * cqe: The requested number of Work Completions that this CQ can hold.
 * comp_vector: The index of the RDMA device's completion vector to work on. If the IRQ affinity masks of these interrupts are spread across the cores, this value can be used to spread the completion workload over all of the cores.

### The ib_resize_cq() Method

The ib_resize_cq() method changes the size of the CQ to hold at least the requested number of Work Completions, either by increasing or decreasing the CQ size. Even if the user asks for a specific new size, the CQ may end up being resized to a different (larger) value.

int ib_resize_cq(struct ib_cq *cq, int cqe);

 * cq: The CQ to be resized.
 * cqe: The requested number of Work Completions that this CQ can hold. This value cannot be lower than the number of Work Completions that currently exist in the CQ.

### The ib_modify_cq() Method

The ib_modify_cq() method changes the moderation parameters of a CQ. A Completion event will be generated when at least a specific number of Work Completions enter the CQ, or when a timeout expires. Using it may help to reduce the number of interrupts that the RDMA device generates. It will return 0 on success or the -errno value with the reason for the failure.

int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);

 * cq: The CQ to be modified.
 * cq_count: The number of Work Completions that must be added to the CQ, since the last Completion event, in order to trigger a CQ event.
 * cq_period: The number of microseconds that must pass, since the last Completion event, in order to trigger a CQ event.

### The ib_peek_cq() Method

The ib_peek_cq() method returns the number of available Work Completions in the CQ. If the number of Work Completions in the CQ is equal to or greater than wc_cnt, it will return wc_cnt. Otherwise it will return the actual number of Work Completions in the CQ. If an error occurred, it will return the errno value with the reason for the failure.

int ib_peek_cq(struct ib_cq *cq, int wc_cnt);

 * cq: The CQ to peek into.
 * wc_cnt: The number of Work Completions to ask about.

### The ib_req_notify_cq() Method

The ib_req_notify_cq() method requests that a Completion event notification be created. Its return value can be:

 * 0: This means that the notification was requested successfully. If IB_CQ_REPORT_MISSED_EVENTS was used, then a return value of 0 means that there aren't any missed events.
 * A positive value is returned only when IB_CQ_REPORT_MISSED_EVENTS is used and there are missed events. The user should call the ib_poll_cq() method in order to read the Work Completions that exist in the CQ.
 * A negative value is returned when an error occurred. The -errno value is returned, specifying the reason for the failure.

static inline int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);

 * cq: The CQ that this Completion event will be generated for.
 * flags: Information about the Work Completion that will cause the Completion event notification to be created. Can be one of:
    * IB_CQ_NEXT_COMP: The next Work Completion that is added to the CQ, after calling this method, will trigger the CQ event.
    * IB_CQ_SOLICITED: The next Solicited Work Completion that is added to the CQ, after calling this method, will trigger the CQ event.

Both of those values can be bitwise ORed with IB_CQ_REPORT_MISSED_EVENTS in order to request a hint about missed events (that is, Work Completions that were already in the CQ when this method was called).

### The ib_req_ncomp_notif() Method

The ib_req_ncomp_notif() method requests that a Completion event notification be created when the number of Work Completions in the CQ equals wc_cnt. It will return 0 on success, or the errno value with the reason for the failure.

static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt);

 * cq: The CQ that this Completion event will be generated for.
 * wc_cnt: The number of Work Completions that the CQ will hold before a Completion event notification is generated.

### The ib_poll_cq() Method

The ib_poll_cq() method polls Work Completions from a CQ. It reads the Work Completions from the CQ and removes them. The Work Completions are read in the order they were added to the CQ. It will return 0 or a positive number indicating the number of Work Completions that were read, or the -errno value with the reason for the failure.

static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);

 * cq: The CQ to be polled.
 * num_entries: The maximum number of Work Completions to be polled.
 * wc: An array in which the polled Work Completions will be stored.

#### The ib_wc Struct

Every Work Completion is represented by struct ib_wc:

    struct ib_wc {
        u64 wr_id;
        enum ib_wc_status status;
        enum ib_wc_opcode opcode;
        u32 vendor_err;
        u32 byte_len;
        struct ib_qp *qp;
        union {
            __be32 imm_data;
            u32 invalidate_rkey;
        } ex;
        u32 src_qp;
        int wc_flags;
        u16 pkey_index;
        u16 slid;
        u8 sl;
        u8 dlid_path_bits;
        u8 port_num;
    };

 * wr_id: A 64-bit value that was associated with the corresponding Work Request.
 * status: Status of the ended Work Request. Can be:
    * IB_WC_SUCCESS: Operation completed successfully.
    * IB_WC_LOC_LEN_ERR: Local length error. Either the sent message is too big to be handled or the incoming message is bigger than the available Receive Request.
    * IB_WC_LOC_QP_OP_ERR: Local QP operation error. An internal QP consistency error was detected while processing a Work Request.
    * IB_WC_LOC_EEC_OP_ERR: Local EE context operation error. Deprecated, since RD QPs aren't supported.
    * IB_WC_LOC_PROT_ERR: Local protection error. The protection of the Work Request buffers is invalid for the requested operation.
    * IB_WC_WR_FLUSH_ERR: Work Request flushed error. The Work Request was completed while the QP was in the Error state.
    * IB_WC_MW_BIND_ERR: Memory Windows bind error. The Memory Window binding operation failed.
    * IB_WC_BAD_RESP_ERR: Bad response error. An unexpected transport layer opcode was returned by the responder.
    * IB_WC_LOC_ACCESS_ERR: Local access error. A protection error occurred on local buffers during the processing of an RDMA Write With Immediate message.
    * IB_WC_REM_INV_REQ_ERR: Remote invalid request error. The incoming message is invalid.
    * IB_WC_REM_ACCESS_ERR: Remote access error. A protection error occurred during an incoming RDMA operation.
    * IB_WC_REM_OP_ERR: Remote operation error. The incoming operation couldn't be completed successfully.
    * IB_WC_RETRY_EXC_ERR: Transport retry counter exceeded. The remote QP didn't send any Ack or Nack, and the timeout expired after the message retransmissions.
    * IB_WC_RNR_RETRY_EXC_ERR: RNR retry exceeded. The RNR NACK retry count was exceeded.
    * IB_WC_LOC_RDD_VIOL_ERR: Local RDD violation error. Deprecated, since RD QPs aren't supported.
    * IB_WC_REM_INV_RD_REQ_ERR: Remote invalid RD request. Deprecated, since RD QPs aren't supported.
    * IB_WC_REM_ABORT_ERR: Remote aborted error. The responder aborted the operation.
    * IB_WC_INV_EECN_ERR: Invalid EE Context number. Deprecated, since RD QPs aren't supported.
    * IB_WC_INV_EEC_STATE_ERR: Invalid EE context state error. Deprecated, since RD QPs aren't supported.
    * IB_WC_FATAL_ERR: Fatal error.
    * IB_WC_RESP_TIMEOUT_ERR: Response timeout error.
    * IB_WC_GENERAL_ERR: General error. Any other error which isn't covered by one of the earlier errors.
 * opcode: The operation of the corresponding Work Request that was ended with this Work Completion. Can be:
    * IB_WC_SEND: A Send operation was completed on the sender side.
    * IB_WC_RDMA_WRITE: An RDMA Write operation was completed on the sender side.
    * IB_WC_RDMA_READ: An RDMA Read operation was completed on the sender side.
    * IB_WC_COMP_SWAP: A Compare and Swap operation was completed on the sender side.
    * IB_WC_FETCH_ADD: A Fetch and Add operation was completed on the sender side.
    * IB_WC_BIND_MW: A Memory bind operation was completed on the sender side.
    * IB_WC_LSO: A Send operation with Large Send Offload (LSO) was completed on the sender side.
    * IB_WC_LOCAL_INV: A Local invalidate operation was completed on the sender side.
    * IB_WC_FAST_REG_MR: A Fast registration operation was completed on the sender side.
    * IB_WC_MASKED_COMP_SWAP: A Masked Compare and Swap operation was completed on the sender side.
    * IB_WC_MASKED_FETCH_ADD: A Masked Fetch and Add operation was completed on the sender side.
    * IB_WC_RECV: A Receive Request for an incoming Send operation was completed on the receiver side.
    * IB_WC_RECV_RDMA_WITH_IMM: A Receive Request for an incoming RDMA Write with immediate operation was completed on the receiver side.
 * vendor_err: A vendor-specific value that provides extra information about the reason for the error.
 * byte_len: If this Work Completion was created by the completion of a Receive Request, the byte_len value indicates the number of bytes that were received.
 * qp: Handle of the QP that got the Work Completion. It is useful when QPs are associated with an SRQ; this way you can know which QP's incoming message consumed the Receive Request from the SRQ.
 * ex.imm_data: Out-of-band data (32 bits), in network order, that was sent with the message. It is available if IB_WC_WITH_IMM is set in wc_flags.
 * ex.invalidate_rkey: The rkey that was invalidated. It is available if IB_WC_WITH_INVALIDATE is set in wc_flags.
 * src_qp: Source QP number, that is, the QP number that sent this message. Only relevant for UD QPs.
 * wc_flags: Flags that provide information about the Work Completion. It is a bitwise OR of the masks:
    * IB_WC_GRH: Indicator that the received message has a GRH and that the first 40 bytes of the Receive Request buffers contain it. Only relevant for UD QPs.
    * IB_WC_WITH_IMM: Indicator that the received message has immediate data.
    * IB_WC_WITH_INVALIDATE: Indicator that a Send with Invalidate message was received.
    * IB_WC_IP_CSUM_OK: Indicator that the received message passed the IP checksum test done by the RDMA device. This is available only if the RDMA device supports IP checksum offload, that is, if IB_DEVICE_UD_IP_CSUM is set in the device flags.
 * pkey_index: The P_Key index; relevant only for GSI QPs.
 * slid: The source LID of the message. Only relevant for UD QPs.
 * sl: The Service Level of the message. Only relevant for UD QPs.
 * dlid_path_bits: The destination LID path bits. Only relevant for UD QPs.
 * port_num: The port number from which the message came in. Only relevant for Direct Route SMPs on switches.
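To tie ib_poll_cq() and ib_req_notify_cq() together, here is a minimal sketch (illustrative, not from a specific driver) of the common drain-and-rearm loop; the batch size of 16 and the logging are arbitrary choices:

    /* Illustrative sketch: read all available Work Completions, then
     * rearm the notification; loop again if completions slipped in
     * between the last poll and the rearm request.
     */
    static void drain_cq(struct ib_cq *cq)
    {
        struct ib_wc wc[16];
        int n, i;

        do {
            while ((n = ib_poll_cq(cq, 16, wc)) > 0) {
                for (i = 0; i < n; i++) {
                    if (wc[i].status != IB_WC_SUCCESS)
                        pr_err("wr_id %llu failed: status %d\n",
                               (unsigned long long)wc[i].wr_id,
                               wc[i].status);
                    /* ... handle wc[i].opcode here ... */
                }
            }
            /* a positive return value means there are missed events */
        } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
                                  IB_CQ_REPORT_MISSED_EVENTS) > 0);
    }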
### The ib_destroy_cq() Method

The ib_destroy_cq() method destroys a CQ. It will return 0 on success or the errno value with the reason for the failure.

int ib_destroy_cq(struct ib_cq *cq);

 * cq: The CQ to be destroyed.

## Queue Pair (QP)

QP is a resource that combines two Work Queues together: the Send Queue and the Receive Queue. Each queue acts as a FIFO: WRs that are posted to a Work Queue will be processed in the order of their arrival. However, there isn't any guarantee about the ordering between the two Queues. This is the resource that sends and receives packets.

### The ib_qp_cap Struct

The QP's Work Queue sizes are represented by struct ib_qp_cap:

    struct ib_qp_cap {
        u32 max_send_wr;
        u32 max_recv_wr;
        u32 max_send_sge;
        u32 max_recv_sge;
        u32 max_inline_data;
    };

 * max_send_wr: The maximum number of outstanding Work Requests that this QP can hold in the Send Queue.
 * max_recv_wr: The maximum number of outstanding Work Requests that this QP can hold in the Receive Queue. This value is ignored if the QP is associated with an SRQ.
 * max_send_sge: The maximum number of scatter/gather elements that each Work Request in the Send Queue will be able to hold.
 * max_recv_sge: The maximum number of scatter/gather elements that each Work Request in the Receive Queue will be able to hold.
 * max_inline_data: The maximum message size that can be sent inline.

### The ib_create_qp() Method

The ib_create_qp() method creates a QP. It will return a pointer to the newly created QP on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr);

 * pd: The PD that this QP is being associated with.
 * qp_init_attr: The attributes that this QP will be created with.

#### The ib_qp_init_attr Struct

The created QP attributes are represented by struct ib_qp_init_attr:

    struct ib_qp_init_attr {
        void (*event_handler)(struct ib_event *, void *);
        void *qp_context;
        struct ib_cq *send_cq;
        struct ib_cq *recv_cq;
        struct ib_srq *srq;
        struct ib_xrcd *xrcd;  /* XRC TGT QPs only */
        struct ib_qp_cap cap;
        enum ib_sig_type sq_sig_type;
        enum ib_qp_type qp_type;
        enum ib_qp_create_flags create_flags;
        u8 port_num;  /* special QP types only */
    };

 * event_handler: A pointer to a callback that will be called in case of an affiliated asynchronous event to the QP.
 * qp_context: User-defined context that can be associated with the QP.
 * send_cq: The CQ that is being associated with the Send Queue of this QP.
 * recv_cq: The CQ that is being associated with the Receive Queue of this QP.
 * srq: The SRQ that is being associated with the Receive Queue of this QP, or NULL if the QP isn't associated with an SRQ.
 * xrcd: The XRC domain that this QP will be associated with. Relevant only if qp_type is IB_QPT_XRC_TGT.
 * cap: A structure that describes the sizes of the Send and Receive Queues. This structure is described earlier.
 * sq_sig_type: The signaling type of the Send Queue. It can be:
    * IB_SIGNAL_ALL_WR: Every Send Request posted to the Send Queue will end with a Work Completion.
    * IB_SIGNAL_REQ_WR: Only Send Requests posted to the Send Queue with an explicit request, that is, with the IB_SEND_SIGNALED flag set, will end with a Work Completion. This is called selective signaling.
 * qp_type: The QP transport type. Can be:
    * IB_QPT_SMI: A Subnet Management Interface QP.
    * IB_QPT_GSI: A General Service Interface QP.
    * IB_QPT_RC: A Reliable Connected QP.
    * IB_QPT_UC: An Unreliable Connected QP.
    * IB_QPT_UD: An Unreliable Datagram QP.
    * IB_QPT_RAW_IPV6: An IPv6 raw datagram QP.
    * IB_QPT_RAW_ETHERTYPE: An EtherType raw datagram QP.
    * IB_QPT_RAW_PACKET: A raw packet QP.
    * IB_QPT_XRC_INI: An XRC-initiator QP.
    * IB_QPT_XRC_TGT: An XRC-target QP.
 * create_flags: QP attribute flags. It is a bitwise OR of the masks:
    * IB_QP_CREATE_IPOIB_UD_LSO: The QP will be used to send IPoIB LSO messages.
    * IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK: Block loopback multicast packets.
 * port_num: The RDMA device port number that this QP is associated with. Only relevant when qp_type is IB_QPT_SMI or IB_QPT_GSI.
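As an illustration of the attributes above, here is a minimal sketch (not from a specific driver) of creating an RC QP that uses one CQ for both the Send and the Receive Queues; the queue sizes are arbitrary, and pd and cq are assumed to have been created earlier:

    /* Illustrative sketch: create an RC QP with modest queue sizes. */
    static struct ib_qp *create_rc_qp(struct ib_pd *pd, struct ib_cq *cq)
    {
        struct ib_qp_init_attr init_attr;
        struct ib_qp *qp;

        memset(&init_attr, 0, sizeof(init_attr));
        init_attr.send_cq = cq;
        init_attr.recv_cq = cq;
        init_attr.qp_type = IB_QPT_RC;
        init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; /* every SR ends with a WC */
        init_attr.cap.max_send_wr = 64;
        init_attr.cap.max_recv_wr = 64;
        init_attr.cap.max_send_sge = 1;
        init_attr.cap.max_recv_sge = 1;

        qp = ib_create_qp(pd, &init_attr);
        return IS_ERR(qp) ? NULL : qp;
    }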
### The ib_modify_qp() Method

The ib_modify_qp() method modifies the attributes of the QP. It will return 0 on success or the errno value with the reason for the failure.

int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask);

 * qp: The QP to be modified.
 * qp_attr: The QP attributes, as described in the next section.
 * qp_attr_mask: The QP attributes to be changed. Each mask specifies the attributes that will be modified in this QP transition, that is, which fields in qp_attr will be used. It is a bitwise OR of the masks:
    * IB_QP_STATE: Modifies the QP state, specified in the qp_state field.
    * IB_QP_CUR_STATE: Modifies the assumed current QP state, specified in the cur_qp_state field.
    * IB_QP_EN_SQD_ASYNC_NOTIFY: Modifies the status of the request for a notification when the QP state is SQD.drained, specified in the en_sqd_async_notify field.
    * IB_QP_ACCESS_FLAGS: Modifies the allowed incoming remote operations, specified in the qp_access_flags field.
    * IB_QP_PKEY_INDEX: Modifies the index in the P_Key table that this QP is associated with in the primary path, specified in the pkey_index field.
    * IB_QP_PORT: Modifies the RDMA device's port number that the QP's primary path is associated with, specified in the port_num field.
    * IB_QP_QKEY: Modifies the Q_Key of the QP, specified in the qkey field.
    * IB_QP_AV: Modifies the Address Vector attributes of the QP, specified in the ah_attr field.
    * IB_QP_PATH_MTU: Modifies the MTU of the path, specified in the path_mtu field.
    * IB_QP_TIMEOUT: Modifies the timeout to wait before retransmission, specified in the timeout field.
    * IB_QP_RETRY_CNT: Modifies the number of retries of the QP for lack of Ack/Nack, specified in the retry_cnt field.
    * IB_QP_RNR_RETRY: Modifies the number of RNR retries of the QP, specified in the rnr_retry field.
    * IB_QP_RQ_PSN: Modifies the start PSN of the received packets, specified in the rq_psn field.
    * IB_QP_MAX_QP_RD_ATOMIC: Modifies the number of RDMA Read and Atomic operations that this QP can process in parallel as an initiator, specified in the max_rd_atomic field.
    * IB_QP_ALT_PATH: Modifies the alternate path of the QP, specified in the alt_ah_attr, alt_pkey_index, alt_port_num, and alt_timeout fields.
    * IB_QP_MIN_RNR_TIMER: Modifies the minimum RNR timer that the QP will report to the remote side in an RNR Nak, specified in the min_rnr_timer field.
    * IB_QP_SQ_PSN: Modifies the start PSN of the sent packets, specified in the sq_psn field.
    * IB_QP_MAX_DEST_RD_ATOMIC: Modifies the number of RDMA Read and Atomic operations that this QP can process in parallel as a destination, specified in the max_dest_rd_atomic field.
    * IB_QP_PATH_MIG_STATE: Modifies the state of the path migration state machine, specified in the path_mig_state field.
    * IB_QP_CAP: Modifies the sizes of the Work Queues in the QP (both the Send and Receive Queues), specified in the cap field.
    * IB_QP_DEST_QPN: Modifies the destination QP number, specified in the dest_qp_num field.

#### The ib_qp_attr Struct

The QP attributes are represented by struct ib_qp_attr:

    struct ib_qp_attr {
        enum ib_qp_state qp_state;
        enum ib_qp_state cur_qp_state;
        enum ib_mtu path_mtu;
        enum ib_mig_state path_mig_state;
        u32 qkey;
        u32 rq_psn;
        u32 sq_psn;
        u32 dest_qp_num;
        int qp_access_flags;
        struct ib_qp_cap cap;
        struct ib_ah_attr ah_attr;
        struct ib_ah_attr alt_ah_attr;
        u16 pkey_index;
        u16 alt_pkey_index;
        u8 en_sqd_async_notify;
        u8 sq_draining;
        u8 max_rd_atomic;
        u8 max_dest_rd_atomic;
        u8 min_rnr_timer;
        u8 port_num;
        u8 timeout;
        u8 retry_cnt;
        u8 rnr_retry;
        u8 alt_port_num;
        u8 alt_timeout;
    };

 * qp_state: The state to move the QP to. Can be:
    * IB_QPS_RESET: Reset state.
    * IB_QPS_INIT: Initialized state.
    * IB_QPS_RTR: Ready To Receive state.
    * IB_QPS_RTS: Ready To Send state.
    * IB_QPS_SQD: Send Queue Drained state.
    * IB_QPS_SQE: Send Queue Error state.
    * IB_QPS_ERR: Error state.
 * cur_qp_state: The assumed current state of the QP. Can take the same values as qp_state.
 * path_mtu: The size of the MTU in the path. Can be:
    * IB_MTU_256: 256 bytes.
    * IB_MTU_512: 512 bytes.
    * IB_MTU_1024: 1,024 bytes.
    * IB_MTU_2048: 2,048 bytes.
    * IB_MTU_4096: 4,096 bytes.
 * path_mig_state: The path migration state machine, used in APM (Automatic Path Migration). Can be:
    * IB_MIG_MIGRATED: Migrated. The state machine of path migration is Migrated (the initial state, or a migration was done).
    * IB_MIG_REARM: Rearm. The state machine of path migration is Rearm (an attempt to coordinate with the remote RC QP to move both the local and remote QPs to the Armed state).
    * IB_MIG_ARMED: Armed. The state machine of path migration is Armed (both the local and remote QPs are ready to perform a path migration).
 * qkey: The Q_Key of the QP.
 * rq_psn: The expected PSN of the first packet in the Receive Queue. The value is 24 bits.
 * sq_psn: The PSN used for the first packet in the Send Queue. The value is 24 bits.
 * dest_qp_num: The QP number on the remote (destination) side. The value is 24 bits.
 * qp_access_flags: The allowed incoming RDMA and Atomic operations. It is a bitwise OR of the masks:
    * IB_ACCESS_REMOTE_WRITE: Incoming RDMA Write operations are allowed.
    * IB_ACCESS_REMOTE_READ: Incoming RDMA Read operations are allowed.
    * IB_ACCESS_REMOTE_ATOMIC: Incoming Atomic operations are allowed.
 * cap: The QP size, that is, the number of Work Requests in the Receive and Send Queues. Modifying this can be done only if the device supports QP resize, that is, if IB_DEVICE_RESIZE_MAX_WR is set in the device flags. This structure is described earlier.
 * ah_attr: Address vector of the primary path of the QP. This structure is described earlier.
 * alt_ah_attr: Address vector of the alternate path of the QP. This structure is described earlier.
 * pkey_index: The P_Key index of the primary path that this QP is associated with.
 * alt_pkey_index: The P_Key index of the alternate path that this QP is associated with.
 * en_sqd_async_notify: If the value isn't zero, it requests that the asynchronous event callback be called when the QP moves to the SQD.drained state.
 * sq_draining: Relevant only for ib_query_qp(). If the value isn't zero, the QP is in the SQD.draining state (and not SQD.drained).
 * max_rd_atomic: The number of RDMA Read and Atomic operations that this QP can process in parallel as an initiator.
 * max_dest_rd_atomic: The number of RDMA Read and Atomic operations that this QP can process in parallel as a destination.
 * min_rnr_timer: The minimum RNR timer that the QP will report to the remote side in an RNR Nak, that is, the delay the remote side should apply before resending the message. The value can be:
    * IB_RNR_TIMER_655_36: Delay of 655.36 milliseconds.
    * IB_RNR_TIMER_000_01: Delay of 0.01 milliseconds.
    * IB_RNR_TIMER_000_02: Delay of 0.02 milliseconds.
    * IB_RNR_TIMER_000_03: Delay of 0.03 milliseconds.
    * IB_RNR_TIMER_000_04: Delay of 0.04 milliseconds.
    * IB_RNR_TIMER_000_06: Delay of 0.06 milliseconds.
    * IB_RNR_TIMER_000_08: Delay of 0.08 milliseconds.
    * IB_RNR_TIMER_000_12: Delay of 0.12 milliseconds.
    * IB_RNR_TIMER_000_16: Delay of 0.16 milliseconds.
    * IB_RNR_TIMER_000_24: Delay of 0.24 milliseconds.
    * IB_RNR_TIMER_000_32: Delay of 0.32 milliseconds.
    * IB_RNR_TIMER_000_48: Delay of 0.48 milliseconds.
    * IB_RNR_TIMER_000_64: Delay of 0.64 milliseconds.
    * IB_RNR_TIMER_000_96: Delay of 0.96 milliseconds.
    * IB_RNR_TIMER_001_28: Delay of 1.28 milliseconds.
    * IB_RNR_TIMER_001_92: Delay of 1.92 milliseconds.
    * IB_RNR_TIMER_002_56: Delay of 2.56 milliseconds.
    * IB_RNR_TIMER_003_84: Delay of 3.84 milliseconds.
    * IB_RNR_TIMER_005_12: Delay of 5.12 milliseconds.
    * IB_RNR_TIMER_007_68: Delay of 7.68 milliseconds.
    * IB_RNR_TIMER_010_24: Delay of 10.24 milliseconds.
    * IB_RNR_TIMER_015_36: Delay of 15.36 milliseconds.
    * IB_RNR_TIMER_020_48: Delay of 20.48 milliseconds.
    * IB_RNR_TIMER_030_72: Delay of 30.72 milliseconds.
    * IB_RNR_TIMER_040_96: Delay of 40.96 milliseconds.
    * IB_RNR_TIMER_061_44: Delay of 61.44 milliseconds.
    * IB_RNR_TIMER_081_92: Delay of 81.92 milliseconds.
    * IB_RNR_TIMER_122_88: Delay of 122.88 milliseconds.
    * IB_RNR_TIMER_163_84: Delay of 163.84 milliseconds.
    * IB_RNR_TIMER_245_76: Delay of 245.76 milliseconds.
    * IB_RNR_TIMER_327_68: Delay of 327.68 milliseconds.
    * IB_RNR_TIMER_491_52: Delay of 491.52 milliseconds.
 * port_num: The RDMA device's port number that this QP is associated with in the primary path.
 * timeout: The timeout to wait before resending the message if the remote side didn't respond with any Ack or Nack in the primary path. This is a 5-bit value; 0 means infinite time, and any other value means that the timeout will be 4.096 * 2 ^ timeout usec.
 * retry_cnt: The number of times to (re)send the message if the remote side didn't respond with any Ack or Nack.
 * rnr_retry: The number of times to (re)send the message if the remote side answered with an RNR Nack. This is a 3-bit value; 7 means infinite retry.
 * alt_port_num: The RDMA device's port number that this QP is associated with in the alternate path.
 * alt_timeout: The timeout to wait before resending the message if the remote side didn't respond with any Ack or Nack in the alternate path. This is a 5-bit value; 0 means infinite time, and any other value means that the timeout will be 4.096 * 2 ^ timeout usec.

### The ib_query_qp() Method

The ib_query_qp() method queries for the current QP attributes. Some of the attributes in qp_attr may change in subsequent calls to ib_query_qp(), for example the state fields. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);

 * qp: The QP to be queried.
 * qp_attr: The QP attributes, as described earlier.
 * qp_attr_mask: The mask of the mandatory attributes to query. Low-level drivers can use it as a hint for the fields to be queried, but they may also ignore it and fill the whole structure.
 * qp_init_attr: The QP init attributes, as described earlier.

### The ib_destroy_qp() Method

The ib_destroy_qp() method destroys a QP. It will return 0 on success or the errno value with the reason for the failure.

int ib_destroy_qp(struct ib_qp *qp);

 * qp: The QP to be destroyed.

### The ib_open_qp() Method

The ib_open_qp() method obtains a reference to an existing sharable QP among multiple processes. The process that created the QP may exit, allowing the transfer of ownership of the QP to another process. It will return a pointer to the sharable QP on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, struct ib_qp_open_attr *qp_open_attr);

 * xrcd: The XRC domain that the QP will be associated with.
 * qp_open_attr: The attributes of the existing QP to be opened.

#### The ib_qp_open_attr Struct

The shared QP attributes are represented by struct ib_qp_open_attr:

    struct ib_qp_open_attr {
        void (*event_handler)(struct ib_event *, void *);
        void *qp_context;
        u32 qp_num;
        enum ib_qp_type qp_type;
    };

 * event_handler: A pointer to a callback that will be called in case of an affiliated asynchronous event to the QP.
 * qp_context: User-defined context that can be associated with the QP.
 * qp_num: The QP number of the QP to be opened.
 * qp_type: The QP transport type. Only IB_QPT_XRC_TGT is supported.

### The ib_close_qp() Method

The ib_close_qp() method releases an external reference to a QP. The underlying shared QP won't be destroyed until all internal references that were acquired by the ib_open_qp() method are released. It will return 0 on success or the errno value with the reason for the failure.

int ib_close_qp(struct ib_qp *qp);

 * qp: The QP to be closed.

### The ib_post_recv() Method

The ib_post_recv() method takes a linked list of Receive Requests and adds them to the Receive Queue for future processing. Every Receive Request is considered outstanding until a Work Completion is generated after its processing. It will return 0 on success or the errno value with the reason for the failure.

static inline int ib_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr);

 * qp: The QP that the Receive Requests will be posted to.
 * recv_wr: A linked list of Receive Requests to be posted.
 * bad_recv_wr: If there was an error with the handling of the Receive Requests, this pointer will be filled with the address of the Receive Request that caused this error.
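As an example of how qp_attr and qp_attr_mask work together, here is a minimal sketch (illustrative) of the first ib_modify_qp() transition of an RC QP, from the Reset state to the Init state; later calls, with the path and PSN attributes described above, move the QP to RTR and then to RTS:

    /* Illustrative sketch: move a freshly created RC QP to the INIT state. */
    static int qp_to_init(struct ib_qp *qp, u8 port_num)
    {
        struct ib_qp_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.qp_state = IB_QPS_INIT;
        attr.pkey_index = 0;       /* first entry of the P_Key table */
        attr.port_num = port_num;  /* port of the primary path */
        attr.qp_access_flags = IB_ACCESS_REMOTE_READ |
                               IB_ACCESS_REMOTE_WRITE;

        return ib_modify_qp(qp, &attr,
                            IB_QP_STATE | IB_QP_PKEY_INDEX |
                            IB_QP_PORT | IB_QP_ACCESS_FLAGS);
    }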
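The following minimal sketch (illustrative) posts a single Receive Request; buf_dma is assumed to be a DMA address that was mapped with one of the ib_dma_*() methods described later in this chapter, and lkey is the local key of the MR covering it:

    /* Illustrative sketch: post one Receive Request to a QP. */
    static int post_one_recv(struct ib_qp *qp, u64 buf_dma, u32 len, u32 lkey)
    {
        struct ib_sge sge;
        struct ib_recv_wr wr, *bad_wr;

        sge.addr = buf_dma;
        sge.length = len;
        sge.lkey = lkey;

        memset(&wr, 0, sizeof(wr));
        wr.wr_id = 1;      /* application-chosen cookie, returned in the WC */
        wr.sg_list = &sge;
        wr.num_sge = 1;

        return ib_post_recv(qp, &wr, &bad_wr);
    }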
### The ib_post_send() Method

The ib_post_send() method takes a linked list of Send Requests as an argument and adds them to the Send Queue for future processing. Every Send Request is considered outstanding until a Work Completion is generated after its processing. It will return 0 on success or the errno value with the reason for the failure.

static inline int ib_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, struct ib_send_wr **bad_send_wr);

 * qp: The QP that the Send Requests will be posted to.
 * send_wr: A linked list of Send Requests to be posted.
 * bad_send_wr: If there was an error with the handling of the Send Requests, this pointer will be filled with the address of the Send Request that caused this error.

#### The ib_send_wr Struct

The Send Request is represented by struct ib_send_wr:

    struct ib_send_wr {
        struct ib_send_wr *next;
        u64 wr_id;
        struct ib_sge *sg_list;
        int num_sge;
        enum ib_wr_opcode opcode;
        int send_flags;
        union {
            __be32 imm_data;
            u32 invalidate_rkey;
        } ex;
        union {
            struct {
                u64 remote_addr;
                u32 rkey;
            } rdma;
            struct {
                u64 remote_addr;
                u64 compare_add;
                u64 swap;
                u64 compare_add_mask;
                u64 swap_mask;
                u32 rkey;
            } atomic;
            struct {
                struct ib_ah *ah;
                void *header;
                int hlen;
                int mss;
                u32 remote_qpn;
                u32 remote_qkey;
                u16 pkey_index;  /* valid for GSI only */
                u8 port_num;     /* valid for DR SMPs on switch only */
            } ud;
            struct {
                u64 iova_start;
                struct ib_fast_reg_page_list *page_list;
                unsigned int page_shift;
                unsigned int page_list_len;
                u32 length;
                int access_flags;
                u32 rkey;
            } fast_reg;
            struct {
                struct ib_mw *mw;
                /* The new rkey for the memory window. */
                u32 rkey;
                struct ib_mw_bind_info bind_info;
            } bind_mw;
        } wr;
        u32 xrc_remote_srq_num;  /* XRC TGT QPs only */
    };

 * next: A pointer to the next Send Request in the list, or NULL if this is the last Send Request.
 * wr_id: A 64-bit value that is associated with this Send Request and will be available in the corresponding Work Completion.
 * sg_list: The array of the scatter/gather elements, as described earlier.
 * num_sge: The number of entries in sg_list. The value zero means that the message size is zero bytes.
 * opcode: The operation to perform. This affects the way that data is transferred, its direction, whether a Receive Request will be consumed on the remote side, and which fields in the Send Request (send_wr) will be used. Can be:
    * IB_WR_RDMA_WRITE: RDMA Write operation.
    * IB_WR_RDMA_WRITE_WITH_IMM: RDMA Write with immediate operation.
    * IB_WR_SEND: Send operation.
    * IB_WR_SEND_WITH_IMM: Send with immediate operation.
    * IB_WR_RDMA_READ: RDMA Read operation.
    * IB_WR_ATOMIC_CMP_AND_SWP: Compare and Swap operation.
    * IB_WR_ATOMIC_FETCH_AND_ADD: Fetch and Add operation.
    * IB_WR_LSO: Send an IPoIB message with LSO (let the RDMA device fragment big SKBs into multiple MSS-sized packets). LSO is an optimization feature that allows using large packets while reducing CPU overhead.
    * IB_WR_SEND_WITH_INV: Send with invalidate operation.
    * IB_WR_RDMA_READ_WITH_INV: RDMA Read with invalidate operation.
    * IB_WR_LOCAL_INV: Local invalidate operation.
    * IB_WR_FAST_REG_MR: Fast MR registration operation.
    * IB_WR_MASKED_ATOMIC_CMP_AND_SWP: Masked Compare and Swap operation.
    * IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: Masked Fetch and Add operation.
    * IB_WR_BIND_MW: Memory bind operation.
 * send_flags: Extra attributes for the Send Request. It is a bitwise OR of the masks:
    * IB_SEND_FENCE: Before performing this operation, wait until the processing of prior Send Requests has ended.
    * IB_SEND_SIGNALED: If the QP was created with selective signaling, the end of the processing of this Send Request will generate a Work Completion.
    * IB_SEND_SOLICITED: Mark that a Solicited event will be created on the remote side.
    * IB_SEND_INLINE: Post this Send Request as inline, that is, let the low-level driver read the memory buffers in sg_list instead of the RDMA device; this may increase the latency.
    * IB_SEND_IP_CSUM: Send an IPoIB message and calculate the IP checksum in HW (checksum offload).
 * ex.imm_data: The immediate data to send. This value is relevant if opcode is IB_WR_SEND_WITH_IMM or IB_WR_RDMA_WRITE_WITH_IMM.
 * ex.invalidate_rkey: The rkey to be invalidated. This value is relevant if opcode is IB_WR_SEND_WITH_INV.

The following union is relevant if opcode is IB_WR_RDMA_WRITE, IB_WR_RDMA_WRITE_WITH_IMM, or IB_WR_RDMA_READ:

 * wr.rdma.remote_addr: The remote address that this Send Request is going to access.
 * wr.rdma.rkey: The Remote Key (rkey) of the MR that this Send Request is going to access.

The following union is relevant if opcode is IB_WR_ATOMIC_CMP_AND_SWP, IB_WR_ATOMIC_FETCH_AND_ADD, IB_WR_MASKED_ATOMIC_CMP_AND_SWP, or IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:

 * wr.atomic.remote_addr: The remote address that this Send Request is going to access.
 * wr.atomic.compare_add: If opcode is one of the Fetch and Add operations, this is the value to add to the content of remote_addr. Otherwise, this is the value to compare the content of remote_addr with.
 * wr.atomic.swap: The value to place in remote_addr if its content is equal to compare_add. This value is relevant if opcode is IB_WR_ATOMIC_CMP_AND_SWP or IB_WR_MASKED_ATOMIC_CMP_AND_SWP.
 * wr.atomic.compare_add_mask: If opcode is IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, this is the mask of the bits to change when adding the value of compare_add to the content of remote_addr. Otherwise, this is the mask to use on the content of remote_addr when comparing it with swap.
 * wr.atomic.swap_mask: The mask of the bits to change in the content of remote_addr. Relevant only if opcode is IB_WR_MASKED_ATOMIC_CMP_AND_SWP.
 * wr.atomic.rkey: The rkey of the MR that this Send Request is going to access.

The following union is relevant if the QP type that this Send Request is being posted to is UD:

 * wr.ud.ah: The Address Handle that describes the path to the target node(s).
 * wr.ud.header: A pointer that contains the header. Relevant if opcode is IB_WR_LSO.
 * wr.ud.hlen: The length of wr.ud.header. Relevant if opcode is IB_WR_LSO.
 * wr.ud.mss: The Maximum Segment Size that the message will be fragmented to. Relevant if opcode is IB_WR_LSO.
 * wr.ud.remote_qpn: The remote QP number to send the message to. The enumeration IB_MULTICAST_QPN should be used if sending this message to a multicast group.
 * wr.ud.remote_qkey: The remote Q_Key value to use. If the MSB of this value is set, then the value of the Q_Key will be taken from the QP attributes.
 * wr.ud.pkey_index: The P_Key index that the message will be sent with. Relevant if the QP type is IB_QPT_GSI.
 * wr.ud.port_num: The port number that the message will be sent from. Relevant for Direct Route SMPs on a switch.

The following union is relevant if opcode is IB_WR_FAST_REG_MR:

 * wr.fast_reg.iova_start: The I/O Virtual Address of the newly created FMR.
 * wr.fast_reg.page_list: The list of pages to map in the FMR.
 * wr.fast_reg.page_shift: Log 2 of the size of the "pages" to be mapped.
 * wr.fast_reg.page_list_len: The number of pages in page_list.
 * wr.fast_reg.length: The size, in bytes, of the FMR.
 * wr.fast_reg.access_flags: The allowed operations on this FMR.
 * wr.fast_reg.rkey: The value of the remote key to be assigned to the FMR.

The following union is relevant if opcode is IB_WR_BIND_MW:

 * wr.bind_mw.mw: The MW to be bound.
 * wr.bind_mw.rkey: The value of the remote key to be assigned to the MW.
 * wr.bind_mw.bind_info: The bind attributes, as explained in the next section.

The following member is relevant if the QP type that this Send Request is being posted to is XRC TGT:

 * xrc_remote_srq_num: The remote SRQ number that will receive the message.
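To make the union usage concrete, here is a minimal sketch (illustrative) of posting an RDMA Write Send Request; sge describes a local source buffer, while remote_addr and rkey describe a buffer that the peer exposed:

    /* Illustrative sketch: post an RDMA Write Send Request. */
    static int post_rdma_write(struct ib_qp *qp, struct ib_sge *sge,
                               u64 remote_addr, u32 rkey)
    {
        struct ib_send_wr wr, *bad_wr;

        memset(&wr, 0, sizeof(wr));
        wr.wr_id = 2;                      /* application-chosen cookie */
        wr.sg_list = sge;
        wr.num_sge = 1;
        wr.opcode = IB_WR_RDMA_WRITE;
        wr.send_flags = IB_SEND_SIGNALED;  /* request a Work Completion */
        wr.wr.rdma.remote_addr = remote_addr;
        wr.wr.rdma.rkey = rkey;

        return ib_post_send(qp, &wr, &bad_wr);
    }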
#### The ib_mw_bind_info Struct

The MW binding attributes, for both MW type 1 and type 2, are represented by struct ib_mw_bind_info:

    struct ib_mw_bind_info {
        struct ib_mr *mr;
        u64 addr;
        u64 length;
        int mw_access_flags;
    };

 * mr: The Memory Region that this Memory Window will be bound to.
 * addr: The address that the Memory Window will start at.
 * length: The length, in bytes, of the Memory Window.
 * mw_access_flags: The allowed incoming RDMA and Atomic operations. It is a bitwise OR of the masks:
    * IB_ACCESS_REMOTE_WRITE: Incoming RDMA Write operations are allowed.
    * IB_ACCESS_REMOTE_READ: Incoming RDMA Read operations are allowed.
    * IB_ACCESS_REMOTE_ATOMIC: Incoming Atomic operations are allowed.

## Memory Windows (MW)

Memory Windows are used as a lightweight way to change the allowed permissions of incoming remote operations and to invalidate them.

### The ib_alloc_mw() Method

The ib_alloc_mw() method allocates a Memory Window. It will return a pointer to the newly allocated MW on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);

 * pd: The PD that this MW is being associated with.
 * type: The type of the Memory Window. Can be:
    * IB_MW_TYPE_1: An MW that can be bound using a verb, and supports only the association of a PD.
    * IB_MW_TYPE_2: An MW that can be bound using a Work Request, and supports the association of a QP number only, or of a QP number and a PD.

### The ib_bind_mw() Method

The ib_bind_mw() method binds a Memory Window to a specified Memory Region with a specific address, size, and remote permissions. If there isn't any immediate error, the rkey of the MW will be updated to the new value, but the bind operation may still fail asynchronously (and end with a completion with error). It will return 0 on success or the errno value with the reason for the failure.

static inline int ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind);

 * qp: The QP that the bind WR will be posted to.
 * mw: The MW to bind.
 * mw_bind: The bind attributes, as explained next.

#### The ib_mw_bind Struct

The MW binding attributes for a type 1 MW are represented by struct ib_mw_bind:
    struct ib_mw_bind {
        u64 wr_id;
        int send_flags;
        struct ib_mw_bind_info bind_info;
    };

 * wr_id: A 64-bit value that is associated with this bind Send Request. The value of the Work Request id (wr_id) will be available in the corresponding Work Completion.
 * send_flags: Extra attributes for the bind Send Request, as explained earlier. Only IB_SEND_FENCE and IB_SEND_SIGNALED are supported here.
 * bind_info: More attributes for the bind operation, as explained earlier.

### The ib_dealloc_mw() Method

The ib_dealloc_mw() method deallocates an MW. It will return 0 on success or the errno value with the reason for the failure.

int ib_dealloc_mw(struct ib_mw *mw);

 * mw: The MW to be deallocated.

## Memory Region (MR)

Every memory buffer that is being accessed by the RDMA device needs to be registered. During the registration process, the memory will be pinned (prevented from being swapped out), and the memory translation information (from virtual to physical addresses) will be saved in the RDMA device. After the registration, every Memory Region has two keys: one for local access and one for remote access. Those keys are used when specifying those memory buffers in Work Requests.

### The ib_get_dma_mr() Method

The ib_get_dma_mr() method returns a Memory Region for system memory that is usable for DMA. Creating this MR isn't enough; the ib_dma_*() methods below are needed in order to create or destroy the addresses that the lkey and rkey of this MR will be used with. It will return a pointer to the newly allocated MR on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags);

 * pd: The PD that this MR is being associated with.
 * mr_access_flags: The allowed operations on this MR. Local Write is always supported in this MR. It is a bitwise OR of the masks:
    * IB_ACCESS_LOCAL_WRITE: Local write to this Memory Region is allowed.
    * IB_ACCESS_REMOTE_WRITE: Incoming RDMA Write operations to this Memory Region are allowed.
    * IB_ACCESS_REMOTE_READ: Incoming RDMA Read operations to this Memory Region are allowed.
    * IB_ACCESS_REMOTE_ATOMIC: Incoming Atomic operations to this Memory Region are allowed.
    * IB_ACCESS_MW_BIND: MW binding to this Memory Region is allowed.
    * IB_ZERO_BASED: Indication that the virtual address is zero based.

### The ib_dma_mapping_error() Method

The ib_dma_mapping_error() method checks whether a DMA address that was returned from an ib_dma_*() method indicates a failure. It will return a non-zero value if there was a failure and zero if the operation finished successfully.

static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr);

 * dev: The RDMA device for which the DMA address was created using an ib_dma_*() method.
 * dma_addr: The DMA address to verify.

### The ib_dma_map_single() Method

The ib_dma_map_single() method maps a kernel virtual address to a DMA address. It will return a DMA address that needs to be checked for errors with the ib_dma_mapping_error() method:

static inline u64 ib_dma_map_single(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction);

 * dev: The RDMA device on which the DMA address will be created.
 * cpu_addr: The kernel virtual address to map for DMA.
 * size: The size, in bytes, of the region to map.
 * direction: The direction of the DMA. Can be:
    * DMA_TO_DEVICE: DMA from the main memory to the device.
    * DMA_FROM_DEVICE: DMA from the device to the main memory.
    * DMA_BIDIRECTIONAL: DMA from the main memory to the device or from the device to the main memory.
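Combining ib_get_dma_mr() with the DMA mapping methods, here is a minimal sketch (illustrative) that registers a DMA MR and then maps a kernel buffer so the device may read it, for example for a Send Request; the function and variable names are hypothetical:

    /* Illustrative sketch: obtain a DMA MR and map a buffer for sending. */
    static int map_for_send(struct ib_device *dev, struct ib_pd *pd,
                            void *buf, u32 len, struct ib_sge *sge)
    {
        struct ib_mr *mr;
        u64 dma_addr;

        mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
        if (IS_ERR(mr))
            return PTR_ERR(mr);

        dma_addr = ib_dma_map_single(dev, buf, len, DMA_TO_DEVICE);
        if (ib_dma_mapping_error(dev, dma_addr)) {
            ib_dereg_mr(mr);  /* described at the end of this section */
            return -ENOMEM;
        }

        sge->addr = dma_addr;  /* DMA address from the mapping */
        sge->length = len;
        sge->lkey = mr->lkey;  /* local key of the DMA MR */
        return 0;
    }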
### The ib_dma_unmap_single() Method

The ib_dma_unmap_single() method unmaps a DMA mapping that was assigned using ib_dma_map_single():

static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction);

 * dev: The RDMA device on which the DMA address was created.
 * addr: The DMA address to unmap.
 * size: The size, in bytes, of the region to unmap. This value must be the same value that was used in the ib_dma_map_single() method.
 * direction: The direction of the DMA. This value must be the same value that was used in the ib_dma_map_single() method.

### The ib_dma_map_single_attrs() Method

The ib_dma_map_single_attrs() method maps a kernel virtual address to a DMA address according to the given DMA attributes. It will return a DMA address that needs to be checked for errors with the ib_dma_mapping_error() method.

static inline u64 ib_dma_map_single_attrs(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs);

 * dev: The RDMA device on which the DMA address will be created.
 * cpu_addr: The kernel virtual address to map for DMA.
 * size: The size, in bytes, of the region to map.
 * direction: The direction of the DMA, as described earlier.
 * attrs: The DMA attributes for the mapping. If this value is NULL, this method behaves like the ib_dma_map_single() method.

### The ib_dma_unmap_single_attrs() Method

The ib_dma_unmap_single_attrs() method unmaps a DMA mapping that was assigned using the ib_dma_map_single_attrs() method:

static inline void ib_dma_unmap_single_attrs(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs);

 * dev: The RDMA device on which the DMA address was created.
 * addr: The DMA address to unmap.
 * size: The size, in bytes, of the region to unmap. This value must be the same value that was used in the ib_dma_map_single_attrs() method.
 * direction: The direction of the DMA. This value must be the same value that was used in the ib_dma_map_single_attrs() method.
 * attrs: The DMA attributes of the mapping. This value must be the same value that was used in the ib_dma_map_single_attrs() method. If this value is NULL, this method behaves like the ib_dma_unmap_single() method.

### The ib_dma_map_page() Method

The ib_dma_map_page() method maps a physical page to a DMA address. It will return a DMA address that needs to be checked for errors with the ib_dma_mapping_error() method:

static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction);

 * dev: The RDMA device on which the DMA address will be created.
 * page: The physical page address to map for DMA.
 * offset: The offset within the page that the mapping will start from.
 * size: The size, in bytes, of the region.
 * direction: The direction of the DMA, as described earlier.

### The ib_dma_unmap_page() Method

The ib_dma_unmap_page() method unmaps a DMA mapping that was assigned using the ib_dma_map_page() method:

static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction);

 * dev: The RDMA device on which the DMA address was created.
 * addr: The DMA address to unmap.
 * size: The size, in bytes, of the region to unmap. This value must be the same value that was used in the ib_dma_map_page() method.
 * direction: The direction of the DMA. This value must be the same value that was used in the ib_dma_map_page() method.

### The ib_dma_map_sg() Method

The ib_dma_map_sg() method maps a scatter/gather list to a DMA address. It will return a non-zero value on success and 0 on a failure.

static inline int ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);

 * dev: The RDMA device on which the DMA address will be created.
 * sg: An array of the scatter/gather entries to map.
 * nents: The number of scatter/gather entries in sg.
 * direction: The direction of the DMA, as described earlier.

### The ib_dma_unmap_sg() Method

The ib_dma_unmap_sg() method unmaps a DMA mapping that was assigned using the ib_dma_map_sg() method:

static inline void ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);

 * dev: The RDMA device on which the DMA address was created.
 * sg: An array of the scatter/gather entries to unmap. This value must be the same value that was used in the ib_dma_map_sg() method.
 * nents: The number of scatter/gather entries in sg. This value must be the same value that was used in the ib_dma_map_sg() method.
 * direction: The direction of the DMA. This value must be the same value that was used in the ib_dma_map_sg() method.

### The ib_dma_map_sg_attrs() Method

The ib_dma_map_sg_attrs() method maps a scatter/gather list to a DMA address according to the given DMA attributes. It will return a non-zero value on success and 0 on a failure.

static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs);

 * dev: The RDMA device on which the DMA address will be created.
 * sg: An array of the scatter/gather entries to map.
 * nents: The number of scatter/gather entries in sg.
 * direction: The direction of the DMA, as described earlier.
 * attrs: The DMA attributes for the mapping. If this value is NULL, this method behaves like the ib_dma_map_sg() method.

### The ib_dma_unmap_sg_attrs() Method

The ib_dma_unmap_sg_attrs() method unmaps a DMA mapping that was assigned using the ib_dma_map_sg_attrs() method:

static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs);

 * dev: The RDMA device on which the DMA address was created.
 * sg: An array of the scatter/gather entries to unmap. This value must be the same value that was used in the ib_dma_map_sg_attrs() method.
 * nents: The number of scatter/gather entries in sg. This value must be the same value that was used in the ib_dma_map_sg_attrs() method.
 * direction: The direction of the DMA. This value must be the same value that was used in the ib_dma_map_sg_attrs() method.
 * attrs: The DMA attributes of the mapping. This value must be the same value that was used in the ib_dma_map_sg_attrs() method. If this value is NULL, this method behaves like the ib_dma_unmap_sg() method.
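The following minimal sketch (illustrative) maps a scatter/gather list and walks the mapped entries with the ib_sg_dma_address() and ib_sg_dma_len() helpers that are described next:

    /* Illustrative sketch: map a scatterlist and print the DMA address
     * and length of each mapped entry.
     */
    static int map_sg_for_dma(struct ib_device *dev, struct scatterlist *sg,
                              int nents)
    {
        struct scatterlist *s;
        int i, mapped;

        mapped = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
        if (!mapped)
            return -ENOMEM;  /* a return value of 0 means failure */

        for_each_sg(sg, s, mapped, i)
            pr_debug("entry %d: addr 0x%llx len %u\n", i,
                     (unsigned long long)ib_sg_dma_address(dev, s),
                     ib_sg_dma_len(dev, s));
        return mapped;
    }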
### The ib_sg_dma_address() Method

The ib_sg_dma_address() method returns the DMA address of a scatter/gather entry.

static inline u64 ib_sg_dma_address(struct ib_device *dev, struct scatterlist *sg);

 * dev: The RDMA device on which the DMA address was created.
 * sg: A scatter/gather entry.

### The ib_sg_dma_len() Method

The ib_sg_dma_len() method returns the DMA length of a scatter/gather entry.

static inline unsigned int ib_sg_dma_len(struct ib_device *dev, struct scatterlist *sg);

 * dev: The RDMA device on which the DMA address was created.
 * sg: A scatter/gather entry.

### The ib_dma_sync_single_for_cpu() Method

The ib_dma_sync_single_for_cpu() method transfers ownership of a DMA region to the CPU. This method must be called before the CPU accesses a DMA-mapped buffer in order to read or modify its content; it prevents the device from accessing the buffer in the meantime:

static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir);

 * dev: The RDMA device on which the DMA address was created.
 * addr: The DMA address to sync.
 * size: The size, in bytes, of the region.
 * dir: The direction of the DMA, as described earlier.

### The ib_dma_sync_single_for_device() Method

The ib_dma_sync_single_for_device() method transfers ownership of a DMA region back to the device. This method must be called before the device can access a DMA-mapped buffer again after the ib_dma_sync_single_for_cpu() method was called.

static inline void ib_dma_sync_single_for_device(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir);

 * dev: The RDMA device on which the DMA address was created.
 * addr: The DMA address to sync.
 * size: The size, in bytes, of the region.
 * dir: The direction of the DMA, as described earlier.

### The ib_dma_alloc_coherent() Method

The ib_dma_alloc_coherent() method allocates a memory block that can be accessed by the CPU and maps it for DMA. It will return the virtual address that the CPU can access on success, or NULL in case of a failure:

static inline void *ib_dma_alloc_coherent(struct ib_device *dev, size_t size, u64 *dma_handle, gfp_t flag);

 * dev: The RDMA device on which the DMA address will be created.
 * size: The size, in bytes, of the memory to allocate and map.
 * dma_handle: A pointer that will be filled with the DMA address of the region if the allocation succeeds.
 * flag: Memory allocation flags. Can be:
    * GFP_KERNEL: To allow blocking (not in interrupt context, not holding SMP locks).
    * GFP_ATOMIC: Prevent blocking.

### The ib_dma_free_coherent() Method

The ib_dma_free_coherent() method frees a memory block that was allocated using the ib_dma_alloc_coherent() method:

static inline void ib_dma_free_coherent(struct ib_device *dev, size_t size, void *cpu_addr, u64 dma_handle);

 * dev: The RDMA device on which the DMA address was created.
 * size: The size, in bytes, of the memory region. This value must be the same value that was used in the ib_dma_alloc_coherent() method.
 * cpu_addr: The CPU memory address to free. This value must be the value that was returned by the ib_dma_alloc_coherent() method.
 * dma_handle: The DMA address to free. This value must be the value that was returned by the ib_dma_alloc_coherent() method.
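A short usage sketch (illustrative) of the coherent allocation pair; the buffer size is arbitrary:

    /* Illustrative sketch: allocate a CPU-visible, DMA-mapped buffer,
     * use its DMA address in Work Requests, and free it afterwards.
     */
    u64 dma_handle;
    void *buf = ib_dma_alloc_coherent(dev, 4096, &dma_handle, GFP_KERNEL);

    if (!buf)
        return -ENOMEM;
    /* ... use dma_handle in scatter/gather elements ... */
    ib_dma_free_coherent(dev, 4096, buf, dma_handle);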
### The ib_reg_phys_mr() Method

The ib_reg_phys_mr() method takes a set of physical pages, registers them, and prepares a virtual address that can be accessed by an RDMA device. It will return a pointer to the newly allocated MR on success or an ERR_PTR(), which specifies the reason for the failure.

struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, struct ib_phys_buf *phys_buf_array, int num_phys_buf, int mr_access_flags, u64 *iova_start);

* pd: The PD that this MR is being associated with.

* phys_buf_array: An array of physical buffers to use in the Memory Region.

* num_phys_buf: The number of physical buffers in phys_buf_array.

* mr_access_flags: The allowed operations on this MR. As specified earlier.

* iova_start: A pointer to the requested I/O Virtual Address to be associated with the Region, which is allowed to begin anywhere within the first physical buffer. The RDMA device will set this value to the actual I/O virtual address of the Region, which may be different from the requested one.

#### The ib_phys_buf Struct

The physical buffer is represented by struct ib_phys_buf.

struct ib_phys_buf {
u64 addr;
u64 size;
};

* addr: The physical address of the buffer.

* size: The size of the buffer.

### The ib_rereg_phys_mr() Method

The ib_rereg_phys_mr() method modifies the attributes of an existing Memory Region. This method can be thought of as a call to the ib_dereg_mr() method followed by a call to the ib_reg_phys_mr() method. Where possible, resources are reused instead of being deallocated and reallocated. It will return 0 on success or the errno value with the reason for the failure:

int ib_rereg_phys_mr(struct ib_mr *mr, int mr_rereg_mask, struct ib_pd *pd, struct ib_phys_buf *phys_buf_array, int num_phys_buf, int mr_access_flags, u64 *iova_start);

* mr: The Memory Region to be reregistered.

* mr_rereg_mask: The Memory Region attributes to be changed. It is a bitwise OR of the masks:

  * IB_MR_REREG_TRANS: Modify the memory pages of this Memory Region.

  * IB_MR_REREG_PD: Modify the PD of this Memory Region.

  * IB_MR_REREG_ACCESS: Modify the allowed operations of this Memory Region.

* pd: The new Protection Domain that this Memory Region will be associated with.

* phys_buf_array: The new physical pages to be used.

* num_phys_buf: The number of physical pages to be used.

* mr_access_flags: The new allowed operations of this Memory Region.

* iova_start: The new I/O Virtual Address of this Memory Region.

### The ib_query_mr() Method

The ib_query_mr() method retrieves the attributes of a specific MR. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);

* mr: The MR to be queried.

* mr_attr: The MR attributes, as described in the next section.

The MR attributes are represented by struct ib_mr_attr.

#### The ib_mr_attr Struct

struct ib_mr_attr {
struct ib_pd *pd;
u64 device_virt_addr;
u64 size;
int mr_access_flags;
u32 lkey;
u32 rkey;
};

* pd: The PD that the MR is associated with.

* device_virt_addr: The address of the virtual block that this MR covers.

* size: The size, in bytes, of the Memory Region.

* mr_access_flags: The access permissions of this Memory Region.

* lkey: The local key of this Memory Region.

* rkey: The remote key of this Memory Region.
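A minimal sketch, assuming pd is an allocated Protection Domain and phys_addr holds the physical address of a page-sized buffer, of registering a physical Memory Region, querying its keys, and deregistering it with the ib_dereg_mr() method described next; the function name example_phys_mr() and the chosen access flags are illustrative assumptions:

#include <linux/err.h>
#include <linux/printk.h>
#include <rdma/ib_verbs.h>

static int example_phys_mr(struct ib_pd *pd, u64 phys_addr)
{
        struct ib_phys_buf buf = {
                .addr = phys_addr,      /* physical address of the buffer */
                .size = 4096,           /* one page */
        };
        u64 iova = phys_addr;           /* requested I/O virtual address */
        struct ib_mr_attr attr;
        struct ib_mr *mr;
        int ret;

        mr = ib_reg_phys_mr(pd, &buf, 1,
                            IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ,
                            &iova);
        if (IS_ERR(mr))
                return PTR_ERR(mr);

        /* Retrieve the MR attributes, including the lkey and rkey. */
        ret = ib_query_mr(mr, &attr);
        if (!ret)
                pr_info("lkey=0x%x rkey=0x%x\n", attr.lkey, attr.rkey);

        return ib_dereg_mr(mr);
}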
### The ib_dereg_mr() Method

The ib_dereg_mr() method deregisters an MR. This method may fail if a Memory Window is bound to it. It will return 0 on success or the errno value with the reason for the failure:

int ib_dereg_mr(struct ib_mr *mr);

* mr: The MR to be deregistered.

# Network Administration

This appendix reviews some of the most popular tools for network administration and debugging. These tools can help a lot in finding solutions to common problems and in developing, debugging, benchmarking, analyzing, troubleshooting, and researching network projects. Most of these tools have very good documentation resources, either with man pages or with wiki pages, and a lot of other information resources about them are on the Internet. Many of them have active mailing lists (for users and developers) and a bug reporting system. Some of the most commonly used tools are described here by specifying their purpose and relevant links, accompanied by several examples. The tools mentioned in this appendix appear in alphabetical order.

## arp

This command is for ARP table management. Examples of usage:

You can display the ARP table by running arp from the command line; arp -n will display the ARP table without name resolution.

You can add static entries to the ARP table by:

arp -s 192.168.2.10 00:e0:4c:11:22:33

The arp utility belongs to the net-tools package. Website: http://net-tools.sourceforge.net .

## arping

A utility to send ARP requests. The -D flag is for Duplicate Address Detection (DAD). The arping utility belongs to the iputils package. Website: http://www.skbuff.net/iputils/ .

## arptables

A userspace tool for configuring rules for a Linux-based ARP rules firewall. Website: http://ebtables.sourceforge.net/ .

## arpwatch

A userspace tool for monitoring ARP traffic. Website: http://ee.lbl.gov/ .

## ApacheBench (ab)

A command-line utility for measuring the performance of HTTP web servers. The ApacheBench tool is part of the Apache open source project. In many distributions (for example, Ubuntu) it is part of the apache2-utils package. Example of usage:

ab -n 100 http://www.google.com/

The -n option is the number of requests to perform for the benchmarking session.

## brctl

A command-line utility for administration of Ethernet bridges, enabling the setup of a bridge configuration. The brctl utility belongs to the bridge-utils package. Examples of usage:

* brctl addbr mybr: Add a bridge named mybr.
* brctl delbr mybr: Delete the bridge named mybr.
* brctl addif mybr eth1: Add the eth1 interface to the bridge.
* brctl delif mybr eth1: Delete the eth1 interface from the bridge.
* brctl show: Show information about the bridge and its attached ports.

The maintainer of the bridge-utils package is Stephen Hemminger. Fetching the git repository can be done by:

git clone git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/bridge-utils.git

Website: http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge .

## conntrack-tools

A set of userspace tools for management of netfilter connection tracking. It consists of a userspace daemon, conntrackd, and a command-line tool, conntrack. Website: http://conntrack-tools.netfilter.org/ .

## crtools

A utility for checkpoint/restore of a process. Website: http://criu.org/Installation .

## ebtables

A userspace tool for configuring rules for a Linux-based bridging firewall. Website: http://ebtables.sourceforge.net/ .

## ether-wake

A utility to send Wake-On-LAN Magic Packets. The ether-wake utility belongs to the net-tools package.

## ethtool

The ethtool utility provides a way to query or control network driver and hardware settings, get statistics, get diagnostic information, and more. With ethtool you can control parameters of Ethernet devices, such as speed, duplex, auto-negotiation, and flow control. Many features of ethtool require support in the network driver code.

Examples:

* Output of ethtool eth0:

Settings for eth0:
Supported ports: [ TP MII ]
Supported link modes: 10baseT/Half 10baseT/Full
100baseT/Half 100baseT/Full
1000baseT/Half 1000baseT/Full
Supported pause frame use: No
Supports auto-negotiation: Yes
Advertised link modes: 10baseT/Half 10baseT/Full
100baseT/Half 100baseT/Full
1000baseT/Half 1000baseT/Full
Advertised pause frame use: Symmetric Receive-only
Advertised auto-negotiation: Yes
Speed: 10Mb/s
Duplex: Half
Port: MII
PHYAD: 0
Transceiver: internal
Auto-negotiation: on
Supports Wake-on: pumbg
Wake-on: g
Current message level: 0x00000033 (51)
drv probe ifdown ifup
Link detected: no

* Getting offload parameters is done by: ethtool -k eth1.
* Setting offload parameters is done by: ethtool -K eth1 offloadParameter.
* Querying the network device for associated driver information is done by: ethtool -i eth1.
* Showing statistics is done by: ethtool -S eth1 (note that not all the network device drivers implement this feature).
* Show permanent hardware (MAC) address: ethtool -P eth0.

The development of ethtool is done by sending patches to the netdev mailing list. The maintainer of ethtool as of this writing is Ben Hutchings. The ethtool project is developed over a git repository. It can be downloaded by: git clone git://git.kernel.org/pub/scm/network/ethtool/ethtool.git.

Website: www.kernel.org/pub/software/network/ethtool/ .

## git

A distributed version control system started by Linus Torvalds. The Linux kernel, as well as many Linux-related projects, is developed with git. One can also use the git send-email command in order to send patches by mail. Website: http://git-scm.com/ .

## hciconfig

A command-line tool for configuring Bluetooth devices. With hciconfig, you can display information such as the Bluetooth interface type (BR/EDR or AMP), its Bluetooth address, its flags, and more. The hciconfig tool belongs to the bluez package. Example:

hciconfig
hci0: Type: BR/EDR Bus: USB
BD Address: 00:02:72:AA:FB:94 ACL MTU: 1021:7 SCO MTU: 64:1
UP RUNNING PSCAN
RX bytes:964 acl:0 sco:0 events:41 errors:0
TX bytes:903 acl:0 sco:0 commands:41 errors:0

Website: http://www.bluez.org/ .

## hcidump

A command-line utility for dumping raw HCI data coming from and going to a Bluetooth device. The hcidump utility belongs to the bluez-hcidump package. Website: http://www.bluez.org/ .

## hcitool

A command-line utility for configuring Bluetooth connections and for sending some special commands to Bluetooth devices. For example, you can scan for nearby Bluetooth devices by: hcitool scan. The hcitool utility belongs to the bluez package.

## ifconfig

The ifconfig command allows you to configure various network interface parameters, including the IP address of the device, the MTU, the MAC address, the Tx queue length (txqueuelen), flags, and more. The ifconfig tool belongs to the net-tools package, which is older than the iproute2 package (discussed later in this appendix). Here are three examples of usage:

* ifconfig eth0 mtu 1300: Change the MTU to 1300.
* ifconfig eth0 txqueuelen 1100: Change the Tx queue length to 1100.
* ifconfig eth0 -arp: Disable the ARP protocol on eth0.

Website: http://net-tools.sourceforge.net .

## ifenslave

A utility for attaching and detaching slave network devices to a bonding device. Bonding puts multiple physical Ethernet devices into a single logical one, which is often termed link aggregation/trunking/link bundling. The source file is in Documentation/networking/ifenslave.c. You can attach eth0, for example, to a bonding device bond0 by:

ifenslave bond0 eth0

The ifenslave utility belongs to the iputils package, maintained by Yoshifuji Hideaki. Website: www.skbuff.net/iputils/ .

## iperf

The iperf project is an open source project that provides a benchmarking tool to measure TCP and UDP bandwidth performance. It allows you to tune various parameters. The iperf tool reports bandwidth, delay jitter, and datagram loss. It was originally developed in C++ by the Distributed Applications Support Team (DAST) at the National Laboratory for Applied Network Research (NLANR). It works in a client-server model. A new implementation from scratch, iperf3, which is not backwards compatible with the original iperf, is available from https://code.google.com/p/iperf/ . iperf3 is said to have a simpler code base, and it can also report the average CPU utilization of the client and the server.

### Using iperf

Following is a simple example of using iperf for measuring TCP performance. On one device (which has an IP address of 192.168.2.104), run the next command, which starts the server side (by default, a TCP socket on port 5001):

iperf -s

On a second device, run the iperf TCP client to connect to the iperf server:

iperf -c 192.168.2.104

On the client side you will see the following:

------------------------------------------------------------
Client connecting to 192.168.2.104, TCP port 5001
TCP window size: 22.9 KByte (default)
------------------------------------------------------------
[ 3] local 192.168.2.200 port 35146 connected with 192.168.2.104 port 5001

The default time interval is 10 seconds.
After 10 seconds, the client will be disconnected, and you will see a message like this on the terminal:

[ ID] Interval Transfer Bandwidth
[ 3] 0.0-10.3 sec 7.62 MBytes 6.20 Mbits/sec

You can tune many parameters of iperf, like these:

* -u: For using a UDP socket.
* -t: For using a different time interval in seconds instead of the default of 10 seconds.
* -T: Sets a TTL for multicast (the default is 1).
* -B: Bind to a host, an interface, or a multicast address.

See man iperf. Website: http://iperf.sourceforge.net/ .

## iproute2

The iproute2 package provides many tools for interaction between the userspace and the kernel networking subsystem. The most well known is the ip command. It is based on netlink sockets (discussed in Chapter 2). With the ip command, you can perform various operations in a wide range of networking areas, and it has numerous options; see man 8 ip. Here are several examples of using the ip command for various tasks:

* Configuration of a network device with ip addr:
  * ip addr add 192.168.0.10/24 dev eth0: Sets an IP address on eth0.
  * ip addr show: Displays the addresses of all network interfaces (both IPv4 and IPv6).

See man ip address.

* Configuration of a network device with ip link:
  * ip link add mybr type bridge: Creates a bridge named mybr.
  * ip link add name myteam type team: Creates a teaming device named myteam. (The teaming device driver aggregates multiple physical Ethernet devices into one logical one and is in fact the new bonding device. The teaming driver is discussed in Chapter 14.)
  * ip link set eth1 mtu 1450: Sets the MTU of eth1 to be 1450.

See man ip link.

* Management of ARP tables (IPv4) and NDISC (IPv6) tables:
  * ip neigh show: Shows both the IPv4 neighbouring table (ARP table) and the IPv6 neighbouring table.
  * ip -6 neigh show: Shows only the IPv6 neighbouring table.
  * ip neigh flush dev eth0: Removes all entries from the neighbouring tables associated with eth0.
  * ip neigh add 192.168.2.20 dev eth2 lladdr 00:11:22:33:44:55 nud permanent: Adds a permanent neighbour entry (parallel to adding static entries in an ARP table).
  * ip neigh change 192.168.2.20 dev eth2 lladdr 55:44:33:22:11:00 nud permanent: Updates a neighbour entry.

See man ip neighbour.

* Management of the parameters for the neighbour tables:
  * ip ntable show: Displays the neighbour tables parameters.
  * ip ntable change name arp_cache locktime 1200 dev eth0: Changes the locktime parameter for the IPv4 neighbouring table associated with eth0.

See man ip ntable.

* Network namespaces management:
  * ip netns add myNamespace: Adds a network namespace named myNamespace.
  * ip netns del myNamespace: Deletes the network namespace named myNamespace.
  * ip netns list: Shows all network namespaces on the host.
  * ip netns monitor: Displays a line on the screen for each network namespace that is added or removed by the ip netns command.

See man ip netns.

* Configuration of multicast addresses:
  * ip maddr show: Shows all multicast addresses on the host (both IPv4 and IPv6).
  * ip maddr add 00:10:02:03:04:05 dev eth1: Adds a multicast address on eth1.

See man ip maddress.

* Monitor netlink messages. For example:
  * ip monitor route: Displays on the screen messages about various network events, like adding or deleting a route.

See man ip monitor.

* Management of routing tables:
  * ip route show: Shows the routing table.
  * ip route flush dev eth1: Removes routing entries associated with eth1 from the routing table.
  * ip route add default via 192.168.2.1: Adds 192.168.2.1 as a default gateway.
  * ip route get 192.168.2.10: Gets the route to 192.168.2.10 and displays it.

See man ip route.

* Management of rules in the RPDB (Routing Policy DataBase). For example:
  * ip rule add tos 0x02 table 200: Adds a rule that sets the routing subsystem to perform a lookup in routing table 200 for packets whose TOS value is 0x02 (TOS is a field in the IPv4 header).
  * ip rule del tos 0x02 table 200: Deletes the specified rule from the RPDB.
  * ip rule show: Displays the rules in the RPDB.

See man ip rule.

* Management of TUN/TAP devices:
  * ip tuntap add tun1 mode tun: Creates a TUN device named tun1.
  * ip tuntap del tun1 mode tun: Deletes a TUN device named tun1.
  * ip tuntap add tap1 mode tap: Creates a TAP device named tap1.
  * ip tuntap del tap1 mode tap: Deletes a TAP device named tap1.

* Management of IPsec policies:
  * ip xfrm policy show: Shows IPsec policies.
  * ip xfrm state show: Shows IPsec states.

See man ip xfrm.

The ss tool is used to dump socket statistics. For example, running

ss -t -a

will show all TCP sockets:

State Recv-Q Send-Q Local Address:Port Peer Address:Port
LISTEN 0 32 *:ftp *:*
LISTEN 0 128 *:ssh *:*
LISTEN 0 128 127.0.0.1:ipp *:*
ESTAB 0 0 192.168.2.200:ssh 192.168.2.104:52089
ESTAB 0 52 192.168.2.200:ssh 192.168.2.104:51352
ESTAB 0 0 192.168.2.200:ssh 192.168.2.104:51523
ESTAB 0 0 192.168.2.200:59532 107.21.231.190:http
LISTEN 0 128 :::ssh :::*
LISTEN 0 128 ::1:ipp :::*
CLOSE-WAIT 1 0 ::1:48723 ::1:ipp

There are other tools in iproute2:

* bridge: Shows/manipulates bridge addresses and devices. For example:
  * bridge fdb show: Displays forwarding entries.

See man bridge.

* genl: Gets information (like id, header size, max attributes, and more) about registered generic netlink families. For example, running genl ctrl list can have this as a result:

Name: nlctrl
ID: 0x10 Version: 0x2 header size: 0 max attribs: 7
commands supported:
#1: ID-0x3
Capabilities (0xe):
can doit; can dumpit; has policy
multicast groups:
#1: ID-0x10 name: notify

* lnstat: Displays Linux network statistics.
* rtmon: Monitors rtnetlink sockets.
* tc: Shows/manipulates traffic control settings. For example:
  * tc qdisc show: Running this command shows which queueing discipline (qdisc) entries are installed, for example:

qdisc pfifo_fast 0: dev eth1 root refcnt 2 bands 3 priomap 1 2 . . .

  * This shows that the pfifo_fast qdisc is associated with the eth1 network device. The pfifo_fast qdisc, which is a classless queueing discipline, is the default qdisc in Linux.
  * tc -s qdisc show dev eth1: Shows statistics of the qdisc associated with eth1.

See man tc.

See: Linux Advanced Routing & Traffic Control HOWTO: www.lartc.org/howto/ .

The development of iproute2 is done by sending patches to the netdev mailing list. The maintainer of iproute2 as of this writing is Stephen Hemminger. iproute2 is developed over a git repository, which can be downloaded by: git clone git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git.

## iptables and ip6tables

The iptables and ip6tables are administration tools for packet filtering and NAT management for IPv4 and IPv6, respectively. With iptables/ip6tables, you can define lists of rules.
Each such rule tells what should be done with the packet (for example, discard it or accept it). Each rule specifies some matching condition for a packet, for example, that it be a UDP packet. Following are some examples of using the iptables command:

* iptables -A INPUT -p tcp --dport=80 -j LOG --log-level 1: The meaning of this rule is that incoming TCP packets with destination port 80 will be dumped to the syslog.
* iptables -L: Lists all rules in the Filter table. (There is no table mentioned in the command, so it accesses the Filter table, which is the default table.)
* iptables -t nat -L: Lists all rules in the NAT table.
* iptables -F: Flushes the selected table.
* iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE: Sets a MASQUERADE rule.

Website: www.netfilter.org/ .

## ipvsadm

A tool for Linux Virtual Server administration. Website: www.linuxvirtualserver.org/software/ipvs.html .

## iw

Shows/manipulates wireless devices and their configuration. The iw package is based on generic netlink sockets (see Chapter 2). For example, you can perform these operations:

* iw dev wlan0 scan: Scans for nearby wireless devices.
* iw wlan0 station dump: Displays statistics about a station.
* iw list: Gets information about a wireless device (such as band information and 802.11n information).
* iw dev wlan0 get power_save: Gets the power save mode.
* iw dev wlan0 set type ibss: Changes the wireless interface mode to be ibss (Ad-Hoc).
* iw dev wlan0 set type mesh: Changes the wireless interface mode to be mesh mode.
* iw dev wlan0 set type monitor: Changes the wireless interface mode to be monitor mode.
* iw dev wlan0 set type managed: Changes the wireless interface mode to be managed mode.

See man iw.

Gitweb: http://git.kernel.org/cgit/linux/kernel/git/jberg/iw.git .

Website: http://wireless.kernel.org/en/users/Documentation/iw .

## iwconfig

The old tool for administering wireless devices. The iwconfig utility belongs to the wireless-tools package and is based on IOCTLs. Website: www.hpl.hp.com/personal/Jean_Tourrilhes/Linux/Tools.html .

## libreswan Project

An IPsec software solution that forked from openswan version 2.6.38. Website: http://libreswan.org/ .

## l2ping

A command-line utility for sending L2CAP echo requests and receiving answers over a Bluetooth device. The l2ping utility belongs to the bluez package. Website: www.bluez.org/ .

## lowpan-tools

A set of utilities to manage the Linux LoWPAN stack. Website: http://sourceforge.net/projects/linux-zigbee/files/linux-zigbee-sources/0.3/ .

## lshw

A utility that displays information about the hardware configuration of the machine. Website: http://ezix.org/project/wiki/HardwareLiSter .

## lscpu

A utility for displaying information about the CPUs on the system. It is based on information from /proc/cpuinfo and sysfs. The lscpu utility belongs to the util-linux package.

## lspci

A utility for displaying information about PCI buses in the system and devices connected to them. Sometimes you need to get some information about a PCI network device with the lspci command. The lspci utility belongs to the pciutils package. Website: http://mj.ucw.cz/sw/pciutils/ .

## mrouted

A multicast routing daemon, implementing the IPv4 Distance Vector Multicast Routing Protocol (DVMRP), which is specified in RFC 1075 from 1988. Website: http://troglobit.com/mrouted.html .

## nc

A command-line utility that reads and writes data across networks. The nc utility belongs to the nmap-ncat package. Website: http://nmap.org/ .

## ngrep

A command-line tool, based on the well-known grep command, that allows you to specify extended expressions to match against data payloads of packets. It recognizes TCP, UDP, and ICMP across Ethernet, PPP, SLIP, FDDI, and null interfaces. Website: http://ngrep.sourceforge.net/ .

## netperf

Netperf is a networking benchmarking tool. Website: www.netperf.org/netperf/ .

## netsniff-ng

netsniff-ng is an open source networking toolkit that, among other things, can help in analyzing network traffic, performing stress tests, generating packets at a very high speed, and more. It uses the PF_PACKET zero-copy RINGs (TX and RX). Among the tools it provides are the following:

* netsniff-ng is a fast zero-copy analyzer and pcap capturing and replaying tool. The netsniff-ng tool is Linux-specific and does not support other operating systems, unlike many of the tools mentioned in this appendix. Example: Running netsniff-ng --in eth1 --out dump.pcap -s -b 0 creates a pcap file that can be read by wireshark or by tcpdump. The -s flag is for silence, and the -b 0 is for binding to CPU 0. See man netsniff-ng.
* trafgen is a zero-copy high-performance network packet traffic generator utility.
* ifpps is a small utility that periodically provides top-like networking and system statistics from the kernel. ifpps gathers its data directly from procfs files.
* bpfc is a small Berkeley Packet Filter assembler and compiler.

Fetching the git repository: git clone git://github.com/borkmann/netsniff-ng.git. Website: http://netsniff-ng.org/ .

## netstat

The netstat tool enables you to print multicast memberships, routing tables, network connections, interface statistics, state of sockets, and more. The netstat tool belongs to the net-tools package. Useful flags:

* netstat -s: Displays summary statistics for each protocol.
* netstat -g: Displays multicast group membership information for IPv4 and IPv6.
* netstat -r: Shows the kernel IP routing table.
* netstat -nl: Shows the listening sockets (the -n flag is for showing numerical addresses instead of trying to determine symbolic host, port, or user names).
* netstat -aw: Shows all raw sockets.
* netstat -ax: Shows all Unix sockets.
* netstat -at: Shows all TCP sockets.
* netstat -au: Shows all UDP sockets.

Website: http://net-tools.sourceforge.net .

## nmap (Network Mapper)

Nmap is an open source security project that provides a network exploration and probing tool and a security/port scanner. It has features like port scanning (detecting the open ports on target hosts), OS detection, detecting MAC addresses, and more. For example,

nmap www.google.com

can give output such as:

Starting Nmap 6.00 ( http://nmap.org ) at 2013-09-26 16:37 IDT
Nmap scan report for www.google.com (212.179.154.227)
Host is up (0.013s latency).
Other addresses for www.google.com (not scanned): 212.179.154.221 212.179.154.251 212.179.154.232 212.179.154.237 212.179.154.216 212.179.154.231 212.179.154.241 212.179.154.247 212.179.154.222 212.179.154.226 212.179.154.236 212.179.154.246 212.179.154.212 212.179.154.217 212.179.154.242
Not shown: 998 filtered ports
PORT STATE SERVICE
80/tcp open http
443/tcp open https
Nmap done: 1 IP address (1 host up) scanned in 5.24 seconds

The nping utility of nmap can be used to generate raw packets for ARP poisoning, networking stress tests, and Denial of Service attacks, as well as to test connectivity like the ordinary ping utility. You can use the nping utility for setting IP options in generated traffic. See http://nmap.org/book/nping-man-ip-options.html . Website: http://nmap.org/ .

## openswan

An open source project implementing an IPsec-based VPN solution. It is based on the FreeS/WAN project. Website: www.openswan.org/projects/openswan .

## OpenVPN

An open source project implementing VPN based on SSL/TLS. Website: www.openvpn.net/ .

## packeth

A packet generator tool for Ethernet. The tool has both a GUI and a CLI. Website: http://packeth.sourceforge.net/packeth/Home.html .

## ping

The well-known utility for testing connectivity by sending ICMP ECHO request messages. Here are four useful options that are also mentioned in this book:

* -Q tos: Enables setting Quality of Service bits in an ICMP packet. Mentioned in this appendix in the explanation about tshark filters.
* -R: Sets the Record Route IP option (discussed in Chapter 4).
* -T: Sets the timestamp IP option (discussed in Chapter 4).
* -f: Flood ping.

See man ping for more command-line options. The ping utility belongs to the iputils package. Website: www.skbuff.net/iputils/ .

## pimd

An open source lightweight stand-alone Protocol Independent Multicast - Sparse Mode (PIM-SM) v2 multicast daemon. Maintained by Joachim Nilsson. See http://troglobit.com/pimd.html . git repository: https://github.com/troglobit/pimd/ .

## poptop

PPTP server for Linux. Website: http://poptop.sourceforge.net/dox/ .

## ppp

An open source PPP daemon. git repository: git://ozlabs.org/~paulus/ppp.git. Website: http://ppp.samba.org/download.html .

## pktgen

The pktgen kernel module (net/core/pktgen.c) can generate packets at very high speed. Monitoring and controlling is done by writing to /proc/net/pktgen entries. For "HOWTO for the linux packet generator," see Documentation/networking/pktgen.txt. (A minimal configuration sketch in C appears below, following the RP-PPPoE section.)

## radvd

This is a Router Advertisement Daemon for IPv6. It is an open source project maintained by Reuben Hawkins. It can be used for IPv6 stateless autoconfiguration and for renumbering. Website: www.litech.org/radvd/ . git repository: https://github.com/reubenhwk/radvd .

## route

A command-line tool for routing tables management. It belongs to the net-tools package, which is based on IOCTLs and which is older than the iproute2 package. Examples:

* route -n: Shows the routing table without name resolving.
* route add default gw 192.168.1.1: Adds 192.168.1.1 as a default gateway.
* route -C: Displays the routing cache (keep in mind that the IPv4 routing cache was removed in kernel 3.6; see the "IPv4 Routing Cache" section in Chapter 5).

See man route.

## RP-PPPoE

An open source PPP over Ethernet (PPPoE) client for Linux and Solaris systems. Website: www.roaringpenguin.com/products/pppoe .
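As referenced in the pktgen section above, the following is a minimal sketch of driving the pktgen /proc interface from a small C program; the interface name eth1, the thread file kpktgend_0, and the chosen parameter values are illustrative assumptions, and the command strings follow Documentation/networking/pktgen.txt:

/* pktgen_demo.c - assumes the pktgen module is loaded (modprobe pktgen). */
#include <stdio.h>
#include <stdlib.h>

/* Write one pktgen command string to a /proc/net/pktgen file. */
static void pgset(const char *path, const char *cmd)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                exit(EXIT_FAILURE);
        }
        fprintf(f, "%s\n", cmd);
        fclose(f);
}

int main(void)
{
        /* Bind eth1 to the first pktgen kernel thread. */
        pgset("/proc/net/pktgen/kpktgend_0", "rem_device_all");
        pgset("/proc/net/pktgen/kpktgend_0", "add_device eth1");

        /* Configure the packets to be generated on eth1. */
        pgset("/proc/net/pktgen/eth1", "count 100000");
        pgset("/proc/net/pktgen/eth1", "pkt_size 300");
        pgset("/proc/net/pktgen/eth1", "dst 192.168.2.104");
        pgset("/proc/net/pktgen/eth1", "dst_mac 00:e0:4c:11:22:33");

        /* Start transmission (blocks until the threads finish). */
        pgset("/proc/net/pktgen/pgctrl", "start");
        return 0;
}

The same command strings can, of course, also be written with simple echo redirections from a shell.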
## sar

A command-line tool to collect and report statistics about system activity. It is part of the sysstat package. As an example, running the following command will display the CPU statistics four times at 1-second intervals, with the average at the end:

sar 1 4
Linux 3.6.10-4.fc18.x86_64 (a) 10/22/2013 _x86_64_ (2 CPU)
07:47:10 PM CPU %user %nice %system %iowait %steal %idle
07:47:11 PM all 0.00 0.00 0.00 0.00 0.00 100.00
07:47:12 PM all 0.00 0.00 0.00 0.00 0.00 100.00
07:47:13 PM all 0.00 0.00 0.00 0.00 0.00 100.00
07:47:14 PM all 0.00 0.00 0.50 0.00 0.00 99.50
Average: all 0.00 0.00 0.13 0.00 0.00 99.87

Website: http://sebastien.godard.pagesperso-orange.fr/ .

## smcroute

A command-line tool for multicast routing manipulation. Website: www.cschill.de/smcroute/ .

## snort

An open source project that provides a network intrusion detection system (IDS) and a network intrusion prevention system (IPS). Website: www.snort.org/ .

## suricata

An open source project that provides an IDS/IPS and a network security monitoring engine. Website: http://suricata-ids.org/ .

## strongSwan

An open source project that implements IPsec solutions for Linux, Android, and other operating systems. Both IKEv1 and IKEv2 are implemented. The maintainer is Professor Andreas Steffen. Website: www.strongswan.org/ .

## sysctl

The sysctl utility displays kernel parameters (including network parameters) at runtime. It can also set kernel parameters. For example, sysctl -a shows all kernel parameters. The sysctl utility belongs to the procps-ng package.

## taskset

A command-line utility for setting or retrieving a process's CPU affinity. The taskset utility is from the util-linux package.

## tcpdump

Tcpdump is an open source command-line protocol analyzer, available from www.tcpdump.org . It is based on a C/C++ network traffic capture library called libpcap. Like wireshark, it can write its results to a file, read them from a file, and it supports filtering. Unlike wireshark, it does not have a front-end GUI. However, its output files can be read by wireshark. Example of sniffing with tcpdump:

tcpdump -i eth1

Website: www.tcpdump.org .

## top

The top utility provides a real-time view of the system (parameters like memory usage, CPU usage, and more) and a system summary. This utility is part of the procps-ng package. Website: https://gitorious.org/procps .

## tracepath

The tracepath command traces a path to a destination address, discovering the MTU along this path. For IPv6 destination addresses, you can use tracepath6. The tracepath utility belongs to the iputils package. Website: www.skbuff.net/iputils/ .

## traceroute

Prints the path that packets traverse to some destination. The traceroute utility uses the IP protocol's Time To Live (TTL) field to cause hosts on the packet path to return an ICMP TIME EXCEEDED response. The traceroute utility is discussed in Chapter 3, which deals with the ICMP protocol. Website: http://traceroute.sourceforge.net .

## tshark

The tshark utility provides a command-line packet analyzer. It is part of the wireshark package. It has many command-line options. For example, you can write the output to a file with the -w option. You can set various filters for packet filtering with tshark, some of which can be complex filters (as you will soon see). Example of setting a filter for capturing only ICMPv4 packets:

tshark -R icmp
Capturing on eth1
17.609101 192.168.2.200 -> 81.218.16.241 ICMP 98 Echo (ping) request id=0x0dc6, seq=1/256, ttl=64
17.617101 81.218.16.241 -> 192.168.2.200 ICMP 98 Echo (ping) reply id=0x0dc6, seq=1/256, ttl=58

You can also set a filter on the value of a field in the IPv4 header. For example, the following command sets a filter on the DS field in the IPv4 header:

tshark -R "ip.dsfield==0x2"

If from a second terminal you send traffic with the DS field set to 0x2 in the IPv4 header (such traffic can be sent, for example, with ping -Q 0x2 destinationAddress), it will be displayed onscreen by tshark.

Example of filtering by source MAC address:

tshark ether src host 00:e0:4c:11:22:33

Example of filtering for UDP packets whose ports are in the port range 6000-8000:

tshark -R udp portrange 6000-8000

Example of setting a filter for capturing traffic where the source IP address is 192.168.2.200 and the port is 80 (it does not have to be TCP traffic only, because no filter is set here on a specific protocol):

tshark -i eth1 -f "src host 192.168.2.200 and port 80"

## tunctl

tunctl is an older tool for creating TUN/TAP devices. It is available from http://tunctl.sourceforge.net . Note that you can also create or remove a TUN/TAP device with the ip command (see the iproute2 section earlier in this appendix) and with the openvpn command-line tool of the openvpn package:

openvpn --mktun --dev tun1
openvpn --rmtun --dev tun1

## udevadm

You can get the network device type by running udevadm on its sysfs entry. For example, if the device has this entry under sysfs:

/sys/devices/virtual/net/eth1.100

then you can find that its DEVTYPE is vlan:

udevadm info -q all -p /sys/devices/virtual/net/eth1.100/
P: /devices/virtual/net/eth1.100
E: COMMENT=net device ()
E: DEVPATH=/devices/virtual/net/eth1.100
E: DEVTYPE=vlan
E: IFINDEX=4
E: INTERFACE=eth1.100
E: MATCHADDR=00:e0:4c:53:44:58
E: MATCHDEVID=0x0
E: MATCHIFTYPE=1
E: SUBSYSTEM=net
E: UDEV_LOG=3
E: USEC_INITIALIZED=28392625695

udevadm belongs to the udev package. Website: www.kernel.org/pub/linux/utils/kernel/hotplug/udev.html .

## unshare

The unshare utility enables you to create a namespace and run a program within that namespace, unshared from its parent. The unshare utility belongs to the util-linux package. For various command-line options of the unshare utility, see man unshare. Examples of usage:

unshare -u /bin/bash

This will create a UTS namespace.

unshare --net /bin/bash

This will create a new network namespace, in which a bash process will be started.

Gitweb: http://git.kernel.org/cgit/utils/util-linux/util-linux.git . Website: http://userweb.kernel.org/~kzak/util-linux/ .

## vconfig

The vconfig utility enables you to configure VLAN (802.1q) interfaces. Examples of usage:

* vconfig add eth2 100: Adds a VLAN interface. This will create a VLAN interface, eth2.100.
* vconfig rem eth2.100: Removes the eth2.100 VLAN interface.
* Note that you can also add and delete VLAN interfaces with the ip command, for example: ip link add link eth0 name eth0.100 type vlan id 100.
* vconfig set_egress_map eth2.100 0 4: Maps SKB priority 0 to VLAN priority 4, so that outgoing packets whose SKB priority is 0 will be tagged with VLAN priority 4. The default VLAN priority is 0.
* vconfig set_ingress_map eth2.100 1 5: Maps VLAN priority 5 to SKB priority 1, so that incoming packets with VLAN priority 5 will be queued with SKB priority 1. The default SKB priority is 0.

See man vconfig.

Note that if VLAN support is compiled as a kernel module, then you must load the VLAN kernel module before trying to add the VLAN interface, by modprobe 8021q. Website: www.candelatech.com/~greear/vlan.html .

## wpa_supplicant

Open source software that provides a wireless supplicant for Linux and other OSs. It supports WPA and WPA2. Website: http://hostap.epitest.fi/wpa_supplicant/ .

## wireshark

The wireshark project provides a free and open source analyzer ("sniffer"). It has two flavors: a GTK+ based GUI front end and a command-line utility, tshark (mentioned earlier in this appendix). It is available on many operating systems and evolves dynamically: when new features are added to existing protocols and new protocols are added, new parsers ("dissectors") are added or existing ones are modified. Wireshark has many features:

* Enables defining a wide range of filters (ports, destination or source address, protocol identifier, fields in headers, and more).
* Enables sorting the results according to various parameters (protocol type, time, and so on).
* Saves the sniffer output to a file and reads sniffer output from a file.
* Reads/writes many different capture file formats: tcpdump (libpcap), Pcap NG, and more.
* Capture Filters and Display Filters.

Activating the wireshark or tshark sniffer puts the network interface into promiscuous mode to enable it to handle packets that are not destined to the local host. A lot of information is available in the man pages: man wireshark and man tshark. You can find more than 75 sniff samples of different protocols at http://wiki.wireshark.org/SampleCaptures . Wireshark users mailing list: www.wireshark.org/mailman/listinfo/wireshark-users . Website: www.wireshark.org . Wiki: http://wiki.wireshark.org/ .

## XORP

An open source project implementing various routing protocols, like BGP, IGMP, OLSR, OSPF, PIM, and RIP. The name XORP is derived from eXtensible Open Router Platform. Website: www.xorp.org/ .

# Glossary

The following glossary terms are covered in this book.

ACL—Asynchronous Connection-oriented Link. A Bluetooth protocol.

ADB—Android Debug Bridge.

AVDTP—Audio/Video Distribution Transport Protocol. A Bluetooth protocol.

AEAD—Authenticated Encryption with Associated Data.

AES-NI—AES instruction set.

AH—Authentication Header protocol. Used in IPsec, has protocol number 51.

AID—Association ID. A unique number that a wireless client gets when it associates to an Access Point. It is assigned by the Access Point, and it is in the range 1–2007.

AMP—Alternate MAC/PHY.

AMPDU—Aggregated Mac Protocol Data Unit. A type of packet aggregation in IEEE 802.11n.

AMSDU—Aggregated Mac Service Data Unit. A type of packet aggregation in IEEE 802.11n.

AOSP—Android Open Source Project.

AP—Access Point. In wireless networks, a wireless device to which wireless clients associate and which enables them to connect to a wired network.

API—Application Programming Interface.
A set of methods and data structures that define the interface to a software layer, such as an interface for a library.

ABRO—Authoritative Border Router Option. Added for Neighbour Discovery Optimization for IPv6. See RFC 6775.

ABS—Android Builders Summit.

ARO—Address Registration Option. Added for Neighbour Discovery Optimization for IPv6. See RFC 6775.

ARP—Address Resolution Protocol. A protocol used to map a network address (such as an IPv4 address) to a link layer address (like a 48-bit Ethernet address).

ARPD—ARP daemon. A userspace daemon that implements the ARP functionality.

Ashmem—Android shared memory.

ASM—Any-Source Multicast. In the any-source model, you do not specify interest in receiving multicast traffic from a single particular source address or from a set of addresses.

BA—Block Acknowledgement mechanism used in IEEE 802.11n.

BGP—Border Gateway Protocol. A core routing protocol.

BLE—Bluetooth Low Energy.

BNEP—Bluetooth Network Encapsulation Protocol.

BTH—Base Transport Header. An InfiniBand header of 12 bytes. It specifies the source and destination QPs, the operation, packet sequence number, and partition.

CM—Communication Manager in the InfiniBand stack.

CIDR—Classless Inter-Domain Routing. A way to allocate Internet addresses used in inter-domain routing.

CQ—Completion Queue (InfiniBand).

CRIU—Checkpoint/Restore In Userspace. CRIU is a software tool, mainly implemented in userspace, with which you can freeze a running process and checkpoint it to a filesystem as a collection of files. You can then use these files to restore and run the application from the point where it was frozen. See http://criu.org/Main_Page .

CSMA/CD—Carrier Sense Multiple Access/Collision Detection. A Media Access Control method used in Ethernet networks.

CSMA/CA—Carrier Sense Multiple Access/Collision Avoidance. A Media Access Control method used in wireless networks.

CT—Connection Tracking. A netfilter layer that is the basis for NAT.

DAD—Duplicate Address Detection. A mechanism that helps to detect the existence of duplicate L3 addresses on different hosts on a LAN.

DAC—Duplicate Address Confirmation. An ICMPv6 type, added in RFC 6775, with numeric value 158.

DAR—Duplicate Address Request. An ICMPv6 type, added in RFC 6775, with numeric value 157.

DCCP—Datagram Congestion Control Protocol. An unreliable, congestion-controlled transport layer protocol. The use of DCCP would make sense, for instance, in applications that require low delays and where a small degree of data loss is permitted, like telephony and streaming media applications.

DHCP—Dynamic Host Configuration Protocol. A protocol for configuring network device parameters like an IP address, a default route, and one or more DNS server addresses.

DMA—Direct Memory Access.

DNAT—Destination NAT. A NAT that changes the destination address.

DNS—Domain Name System. A system for translating domain names to IP addresses.

DSCP—Differentiated Services Code Point. A classifying mechanism.

DVMRP—Distance Vector Multicast Routing Protocol. A protocol for routing multicast datagrams. Suitable for use within an autonomous system. Defined in RFC 1075 from 1988.

ECN—Explicit Congestion Notification. See RFC 3168, "The Addition of Explicit Congestion Notification (ECN) to IP."

EDR—Enhanced Data Rate.

EGP—Exterior Gateway Protocol. A routing protocol that is now considered obsolete. It was first formalized in RFC 827 in 1982.

ERTM—Enhanced Retransmission Mode. A reliable protocol with error and flow control, used in Bluetooth.

ESP—Encapsulating Security Payload. Used in IPsec, has protocol number 50.

ETH—Extended Transport Header. An InfiniBand header with size from 4 to 28 bytes. This header represents an extra family of headers that may be present depending on the class of the service and the used operation.

ETSI—European Telecommunications Standards Institute.

FCS—Frame Check Sequence.

FIB—Forwarding Information Base. The database that contains the routing tables information.

FMR—Fast Memory Region (InfiniBand).

FSF—Free Software Foundation.

FTP—File Transfer Protocol. A protocol for transferring files between two hosts, based on TCP.

GCC—GNU Compiler Collection.

GID—Global Identifier.

GMP—Group Management Protocol. A term that refers to both IGMP and MLD. See RFC 4604, section 1.

GRE—Generic Routing Encapsulation. A tunneling protocol.

GRH—Global Routing Header. An InfiniBand header of 40 bytes. It describes the source and destination port using GIDs, and its format is identical to the IPv6 header.

GRO—Generic Receive Offload. A technique with which incoming packets are merged at reception time into a bigger packet to improve performance.

GSO—Generic Segmentation Offload. A technique with which outgoing packets are segmented not in the transport layer but as close as possible to the network driver or in the network driver itself.

GUID—Global Unique Identifier.

HAL—Hardware Abstraction Layer.

HCA—Host Channel Adapter.

HCI—Host Controller Interface. Used, for example, in Bluetooth, PCI, and more.

HDP—Health Device Profile. Used by Bluetooth.

HFP—Hands-Free Profile. Used by Bluetooth.

HoL Blocking—Head-of-line blocking. A performance-limiting phenomenon that occurs when a line of packets is held up by the first packet, for example, in multiple requests in HTTP pipelining.

HPC—High Performance Computing. Management of computer resources in a way that gives high performance for heavy tasks such as solving large-scale problems in science, engineering, or economics.

HS—High Speed.

HTTP—Hypertext Transfer Protocol. The basic protocol for accessing the World Wide Web.

HWMP—Hybrid Wireless Mesh Protocol. A routing protocol used in wireless Mesh networks that consists of two types of routing: on-demand routing and proactive routing.

iWARP—Internet Wide Area RDMA Protocol.

iSER—iSCSI extension for RDMA.

IANA—Internet Assigned Numbers Authority. Responsible for IP addressing, global coordination of the DNS Root, and other IP-related symbols and numbers. Operated by the Internet Corporation for Assigned Names and Numbers (ICANN).

IBTA—InfiniBand Trade Association.

ICMP—Internet Control Message Protocol. An IP protocol for control and informational messages. The well-known ping utility is based on ICMP. The ICMP protocol is known to be used in various types of security DoS attacks, like the Smurf attack.

ICE—Interactive Connectivity Establishment. Specified in RFC 5245. A protocol for NAT traversal.

ICRC—Invariant CRC. An InfiniBand header of 4 bytes. Covers all fields that should not be changed as the packet travels in the subnet.

IDS—Intrusion Detection System.

IoT—Internet of Things. Networking of everyday objects.

IEEE—Institute of Electrical and Electronics Engineers.

IGMP—Internet Group Management Protocol. Multicast group memberships protocol.

IKE—Internet Key Exchange. A protocol for setting up an IPsec Security Association.

IOMMU—I/O Memory Management Unit.

IP—Internet Protocol. The primary addressing and routing protocol for the Internet. IPv4 was first specified in RFC 791 from 1981, and IPv6 was first specified in RFC 1883 from 1995.

IPoIB—IP over InfiniBand.

IPS—Intrusion Prevention System.

ISAKMP—Internet Security Association & Key Management Protocol.

IOCTL—Input/Output Control. A system call that provides access from userspace to the kernel.

IPC—Inter Process Communication. There are many different mechanisms for IPC, such as shared memory, semaphores, message queues, and more.

IPCOMP—IP Payload Compression Protocol. A compressing protocol intended to reduce the size of data sent over a slow network connection. Using IPComp increases the overall communication performance between two network nodes.

IPsec—IP security. A set of protocols developed by the IETF for secure exchange of packets over the IP protocol. IPsec is mandatory in IPv6 according to the IPv6 spec and optional in IPv4, though many operating systems implemented it also in IPv4. IPsec uses two encryption modes: Transport and Tunnel.

IPVS—IP Virtual Server. A Linux kernel load balancing infrastructure, supports IPv4 and IPv6. See http://www.linuxvirtualserver.org/software/ipvs.html .

ISR—Interrupt Service Routine. An interrupt handler that is invoked when an interrupt is received.

ISM—Industrial, scientific, and medical radio band.

jumbo frames—Packets with size up to 9K. Some network interfaces allow using an MTU of up to 9K. Using jumbo frames can improve network performance in some cases, such as in bulk data transfers.

KVM—Kernel-based Virtual Machine. A Linux virtualization project.

LACP—Link Aggregation Control Protocol.

LAN—Local Area Network. A network that connects a limited area, such as an office building.

LID—Local Identifier. A 16-bit value assigned to every subnet port by the Subnet Manager (InfiniBand).

L2CAP—Logical Link Control and Adaptation Protocol. Used in Bluetooth.

L2TP—Layer 2 Tunneling Protocol, used by VPNs. L2TPv3 is specified in RFC 3931 (RFC 5641 has some updates).

LKML—Linux Kernel Mailing List.

LLCP—Logical Link Control Protocol. Used by NFC.

LLN—Low-power and Lossy Network.

LoWPAN—Low-power Wireless Personal Area Network.

LMP—Link Management Protocol. Controls the radio link between two Bluetooth devices.

LPM—Longest Prefix Match. An algorithm used by the routing subsystem.

LRH—Local Routing Header. An InfiniBand header of 8 bytes. It identifies the local source and destination ports of the packet. It also specifies the requested QoS attributes (SL and VL) of the message.

LRO—Large Receive Offload.

LR-WPAN—Low-Rate Wireless Personal Area Network. Used in IEEE 802.15.4.

LSB—Least significant bit.

LSRR—Loose Source Record Route.

LTE—Long Term Evolution.

MAC—Media Access Control. A sublayer of the Data Link Layer (L2) of the OSI model.

MAD—Management Datagram (InfiniBand).

MFC—Multicast Forwarding Cache. A data structure in the kernel that consists of multicast forwarding entries.

MIB—Management Information Base.

MLD—Multicast Listener Discovery protocol. Enables each IPv6 router to discover the presence of multicast listeners. The MLD protocol is specified in RFC 3810, from 2004.

MLME—MAC Layer Management Entity. A component in the IEEE 802.11 management layer responsible for operations such as scanning, authentication, association, and reassociation.

MR—Memory Region (InfiniBand).

MSF—Multicast Source Filtering. The feature to set filters so that multicast traffic from sources other than the expected ones will be dropped.

MSI—Message Signaled Interrupts.

MSS—Maximum Segment Size. A parameter of the TCP protocol.

MTU—Maximum Transmission Unit. The size of the largest packet that a network protocol can transmit.

MW—Memory Window (InfiniBand).

NAP—Network Access Point.

NAPI—New API. A technique by which network drivers are not interrupt-driven but use polling. NAPI is discussed in Chapter 1.

NAT—Network Address Translation. A layer responsible for modifying IP headers. In Linux, support for IPv6 NAT was merged in kernel 3.7.

NAT-T—NAT traversal.

NCI—NFC Controller Interface.

ND / NDISC—Neighbour Discovery protocol. Used in IPv6. Among its tasks: discovering network nodes on the same link, autoconfiguration of addresses, finding the Link Layer addresses of other nodes, and maintaining reachability information about other nodes.

NFC—Near Field Communication.

NDEF—NFC Data Exchange Format.

NIC—Network Interface Card, also known as Network Interface Controller or Network Adapter. The hardware network device.

NUMA—Non-Uniform Memory Access.

NPP—NDEF Push Protocol.

NPAR—NIC Partitioning. A technology that enables you to split up network card (NIC) traffic in partitions.

NUD—Network Unreachability Detection. A mechanism responsible for determining whether a neighbour can be reached.

OBEX—Object Exchange. A protocol for exchange of binary objects between devices, used in Bluetooth.

OEM—Original Equipment Manufacturer.

OFA—OpenFabrics Alliance.

OCF—Open Cryptography Framework.

OHA—Open Handset Alliance.

OOTB—Out of the Blue packet (a term of the SCTP protocol). A packet is an OOTB packet if it is correctly formed (that is, no checksum error), but the receiver is not able to identify the SCTP association to which the packet belongs (see section 8.4 in RFC 4960).

OPP—Object Push Profile. Used by Bluetooth.

OSI Model—Open Systems Interconnection model.

OSPF—Open Shortest Path First. An interior gateway routing protocol developed for IP networks.

PADI—PPPoE Active Discovery Initiation.

PADO—PPPoE Active Discovery Offer.

PADR—PPPoE Active Discovery Request.

PADS—PPPoE Active Discovery Session.

PADT—PPPoE Active Discovery Terminate.

PAN—Personal Area Networking. A profile used in Bluetooth.

PCI—Peripheral Component Interconnect. A bus for attaching devices. Many network interface cards are PCI devices.

PD—Protection Domain.

PHDC—Personal Health Device Communication. Used by NFC.

PID—Process Identifier.

PIM—Protocol Independent Multicast protocol. A multicast routing protocol.

PIM-SM—Protocol Independent Multicast—Sparse Mode.

PLME—Physical Layer Management Entity in IEEE 802.11.

PM—Power Management.

PPP—Point-to-Point Protocol. A data link protocol for direct communication between two hosts.

PPPoE—PPP over Ethernet. The PPPoE protocol is specified in RFC 2516 from 1999.

PERR—Path Error. A message that informs about some failure in wireless Mesh network routing.

PREP—Path Reply. A unicast packet sent as a reply to a PREQ message in a wireless Mesh network.

PREQ—Path Request. A broadcast packet sent when looking for some address in a wireless Mesh network.
+ +PSK—Preshared Key. + +Qdisc—Queuing Disciplines. + +QP—Queue Pair (InfinBand). + +RA—Router Alert. One of the IPv4 options. It notifies transit routers to more closely examine the contents of an IP packet. It is used by many protocols, such as IGMP, MLD, and more. + +RANN—Root Announcement. A broadcast packet sent periodically by a Root Mesh point in a wireless Mesh network. + +RARP—Reverse Address Resolution Protocol. A protocol used to find the mapping between a link layer address (like a 48-bit Ethernet address) to a network address (like an IPv4 address). + +RC—A QP transport type in InfiniBand. + +RDMA—Remote Direct Memory Access. A direct memory access from one host to another. + +RDS—Reliable Datagram Socket. A reliable connectionless protocol developed by Oracle. + +RFC—Request For Comments. A document that specifies Internet specifications, communications protocols, procedures, and events. The standardization process of RFCs is documented at http://tools.ietf.org/html/rfc2026 , "The Internet Standards Process." + +RFID—Radio Frequency ID. + +RFCOMM—Radio Frequency Communications protocol. Used in Bluetooth. + +RFS—Receive Flow Steering. + +RIP—Routing Information Protocol: A distance-vector routing protocol. + +RoCE—RDMA over Converged Ethernet. + +RP—Rendezvous Point. + +RPL—IPv6 Routing Protocol for Low-Power and Lossy Networks. The RPL protocol is specified in RFC 6550. + +RPDB—Routing Policy DataBase. + +RPF—Reverse Path Filter. A technique intended to prevent source address spoofing. + +RPC—Remote Procedure Call. + +RPS—Receive Packet Steering. + +RS—Router Solicitations. + +RSA—A cryptography algorithm. RSA stands for Ron Rivest, Adi Shamir, and Leonard Adleman, the people who developed it. + +RTP—Real-time Transport Protocol. A protocol for transmitting audio and video over IP networks. + +RTR—Ready To Receive. A state in InfiniBand QP State Machine. + +RTS—Ready To Send. A state in InfiniBand QP State Machine. + +SA—Security Association. A logical relationship between two hosts that consists of various parameters, such as cryptographic key, cryptographic algorithm, SPI, and more. + +SACK—Selective Acknowledgments. See RFC 2018, "TCP Selective Acknowledgment Options," from 1996. + +SAD—Security Association Database. + +SAR—Segmentation and Reassembly. + +SBC—Session Border Controllers. + +SCO—Synchronous Connection Oriented link. A Bluetooth protocol. + +SDP—Service Discovery Protocol. Used in Bluetooth. + +SCTP—Stream Control Transmission Protocol. A transport protocol that has features of both UDP and TCP. + +SE—Security Element (NFC). + +SIG—Special Interest Group. + +SIP—Session Initiation Protocol. A signaling protocol for VoIP, intended for creating and modifying VoIP sessions. + +SLAAC—Stateless Address autoconfiguration. Specified in RFC 4862. + +SKB—Socket Buffer. A kernel data structure representing a network packet (implemented by the sk_buff structure, include/linux/skbuff.h). + +SL—Service Level. The QoS in InfiniBand is implemented using the SL to VL mapping and the resources for each VL. + +SLAAC—Stateless Address Autoconfiguration. + +SM—Subnet Manager. + +SMA—Subnet Management Agent. + +SME—System Management Entity in IEEE 802.11. + +SMP—Symmetrical Multiprocessing. An architecture where two or more identical processors are connected to a single shared main memory. + +SNAT—Source NAT. A NAT that changes the source address. + +SNEP—Simple NDEF Exchange Protocol (SNEP) for exchanging NDEF-formatted data. + +SNMP—Simple Network Management Protocol. 
+ +SPI—Security Parameter Index. Used by IPsec. + +SPD—Security Policy Database. + +SQD—Send Queue Drained. A state in InfiniBand QP State Machine. + +SQE—Send Queue Error. A state in InfiniBand QP State Machine. + +SRP—SCSI RDMA Protocol. + +SR-IOV—Single Root I/O Virtualization. A specification that allows a PCIe device to appear to be multiple separate physical PCIe devices. + +SRQ—Shared Receive Queue (InfiniBand). + +SSM—Source Specific Multicast. + +STUN—Session Traversal Utilities for NAT. + +SSP—Secure Simple Pairing. A security feature required by Bluetooth v2.1. + +TCP—Transmission Control Protocol. The TCP protocol is the most commonly used transport protocol on the Internet today. Many protocols run on top of TCP, including FTP, HTTP, and more. TCP is specified in RFC 793 from 1981, and in the years since there have been many updates, variations, and additions to the base protocol. + +TIPC—Transparent Inter-process Communication protocol. See http://tipc.sourceforge.net/. + +TOS—Type Of Service. + +TSO—TCP Segmentation Offload. + +TTL—Time To Live. A counter in the IPv4 header (its counterpart in IPv6 is called Hop Limit) that is decremented in each forwarding device. When this counter reaches 0, an ICMP Time Exceeded message is sent back, and the packet is discarded. Both the ttl member of the IPv4 header and the hop_limit member of the IPv6 header are 8-bit fields. + +TURN—Traversal Using Relays around NAT. + +UC—Unreliable Connected. A QP transport type in InfiniBand. + +UD—Unreliable Datagram. A QP transport type in InfiniBand. + +UDP—User Datagram Protocol. UDP is an unreliable protocol, as there is no guarantee that packets will be delivered to upper-layer protocols. There is no handshaking phase in UDP, in contrast to TCP. The UDP header is simple and consists of only 4 fields: source port, destination port, checksum, and length. + +USAGI—UniverSAl playGround for Ipv6. A project that developed IPv6 and IPsec (for both IPv4 and IPv6) stacks for the Linux kernel. + +UTS—Unix Time-sharing System. + +VCRC—Variant CRC. A 2-byte InfiniBand header that covers all the fields of the packet. + +VETH—Virtual Ethernet. A network driver that enables communication between two network devices in different network namespaces. + +VoIP—Voice Over IP. + +VFS—Virtual File System. + +VL—Virtual Lanes. A mechanism for creating multiple virtual links over a single physical link. + +VLAN—Virtual Local Area Network. + +VPN—Virtual Private Network. + +VXLAN—Virtual Extensible Local Area Network. VXLAN is a standard protocol to transfer Layer 2 Ethernet packets over UDP. VXLAN is needed because there are cases where firewalls block tunnels and allow, for example, only TCP/UDP traffic. + +WDS—Wireless Distribution System. + +WLAN—Wireless LAN. + +WOL—Wake On LAN. + +WSN—Wireless Sensor Networks. + +XRC—eXtended Reliable Connected. A QP transport type in InfiniBand. + +XFRM—IPsec Transformer. A Linux kernel framework for handling IPsec transformations. The two most fundamental data structures of the XFRM framework are the XFRM policy and the XFRM state. 
+Rami Rosen, Linux Kernel Networking: Implementation and Theory, DOI 10.1007/978-1-4302-6197-1, © Apress 2014 + +Index + +A + +Access point (AP) + +Address registration option (ARO) + +Address resolution protocol (ARP) + +arp_constructor() method + +arp_create() method + +arp_filter() method + +arphdr structure + +arp_ignore() + +arp_process() method + +arp_rcv() method + +arp_send() method + +daemon + +dst_neigh_output() method + +ethernet packet + +inet_addr_onlink() method + +inet_select_addr() method + +MAC addresses + +neigh_lookup() + +neigh_resolve_output() method + +NF_HOOK() macro + +pneigh_enqueue() method + +solicit() method + +AES instruction set (AES-NI) + +Aggregated MAC protocol data unit (AMPDU) + +Aggregated MAC service data unit (AMSDU) + +Alternate MAC/PHY (AMP) + +Android + +internal resources + +networking + +android debug bridge (ADB) + +Bluetooth + +near field communication (NFC) + +netfilter + +security privileges and networking + +Android debug bridge (ADB) + +Android open source project (AOSP) + +Any-source multicast (ASM) + +Application programming interface (API) + +ARP protocol. + +See Address resolution protocol (ARP) + +Association ID (AID) + +Audio/video distribution transport protocol (AVDTP) + +Authentication header protocol (AH) + +Authoritative border router option (ABRO) + +B + +Base Transport Header (BTH) + +Beacons + +Block Acknowledgement (BA) + +Block Ack Request (BAR) + +Bluetooth Low Energy (BLE) + +Bluetooth Network Encapsulation Protocol (BNEP) + +Bluetooth protocol + +ACL packets + +Bluetooth profiles + +Bluetooth stack + +HCI connection + +Bluetooth Network Encapsulation Protocol (BNEP) + +logical link control and adaptation protocol (L2CAP) + +HCI layer, struct hci_dev + +host controller interface (HCI) + +L2CAP/SCO layers + +link controller + +logical link control and adaptation protocol (L2CAP) features + +personal area networks (PANs) + +radio frequency communications (RFCOMM) + +service discovery protocol (SDP) + +special interest group (SIG) + +synchronous connection-oriented (SCO) + +tools + +Board Support Packages (BSPs) + +Border Gateway Protocol (BGP) + +Busy poll sockets + +busy_poll controls + +busy_read controls + +ndo_busy_poll callback + +performance + +SO_BUSY_POLL socket option + +tuning and configuration + +C + +Carrier Sense Multiple Access/Collision Avoidance (CSMA/CA) + +Carrier Sense Multiple Access/Collision Detection (CSMA/CD) + +Cgroups + +cls_cgroup classifier + +device controller + +implementation + +cgroup_subsys structure + +css_set object + +register_filesystem() method + +release_agent + +libcg library + +memory controller + +mounting cgroup subsystems + +net_prio Module + +Checkpoint/Restore In Userspace (CRIU) + +Chunk types + +Classless Inter-Domain Routing (CIDR) + +Common Development and Distribution License (CDDL) + +Communication Manager (CM) + +Completion Queue (CQ) + +Connection tracking + +callbacks + +dst structure + +entries + +ipv4_confirm() method + +network namespace object + +nf_conn structure description + +nf_ct_timeout_lookup() method + +reference counter + +resolve_normal_ct() method + +specific packet() method + +extensions + +hook callbacks + +DNAT rule + +ipv4_conntrack_in() + +NAT and netfilter hooks + +nf_nat_ipv4_in() + +hooks + +initialization + +IPTables + +Filter table rule + +log-level modifier + +LOG target + +network namespace object + +parts + +IPv4 NAT module + +local host delivery + +NAT + +NAT hook callbacks + +nf_conntrack method + +nf_conntrack_tuple structure + +NF_INET_PRE_ROUTING 
hook + +packet forwarding + +Constructor + +Control packets + +CSMA/CA + +D + +Datagram Congestion Control Protocol (DCCP) + +and NAT + +development of + +header + +initialization + +packet types + +receiving packets + +sending packets + +socket initialization + +Datagram sockets + +Data links sockets + +Data packets + +Dccp_init_sock() method + +DCCP. + +See Datagram Congestion Control Protocol (DCCP) + +Dccp_v4_rcv() method + +Delayed ACK timer + +Destination NAT (DNAT) + +Distance Vector Multicast Routing Protocol (DVMRP) + +Domain Name System (DNS) + +Duplicate Address Confirmation (DAC) + +Duplicate Address Detection (DAD) + +Duplicate Address Request (DAR) + +Dynamic Host Configuration Protocol (DHCP) + +Dynamic Host Configuration Protocol version 6 (DHCPv6) + +E + +Encapsulating Security Payload (ESP) + +Enhanced data rate (EDR) + +Enhanced Retransmission Mode (ERTM) + +ESP protocol + +Authentication Data + +ESP format + +initialization + +Padding + +Payload Data + +Security Parameter Index + +Sequence Number + +Extended Service Set (ESS) + +Extended Transport Header (ETH) + +Exterior Gateway Protocol (EGP) + +F + +Failover + +Fast Memory Region (FMR) + +Fib_select_multipath() method + +File Transfer Protocol (FTP) + +Forwarding Information Base (FIB) + +Free Software Foundation (FSF) + +G + +General Public License (GPL) + +Generic netlink protocol + +acpi subsystem + +command identifier + +ctrl_getfamily() method + +flags + +generic netlink messages + +genl_ops structure + +genl_pernet_init() method + +genl_sock pointer + +hostapd package + +internal_flags + +multicast group + +netlink_kernel_create() method + +NFC subsystem + +nl_send_auto() + +policy + +socket monitoring interface + +CRIU projects + +sock_diag_handler + +sock_diag_register() + +ss tool + +UNIX diag module + +wireless subsystem + +wireless-tools + +Generic Receive Offload (GRO) packets + +Generic Segmentation Offload (GSO) + +Genl_connect() method + +Git trees + +Global IDentifier (GID) + +Global Routing Header (GRH) + +Group Management Protocol (GMP) + +H + +Head-of-Line (HoL) blocking + +HEARTBEAT mechanism + +High Performance Computing (HPC) + +High Throughput Task Group (TGn) + +AMPDU aggregation + +AMSDU aggregation + +Block Ack Request (BAR) + +del_timer_sync() + +vendors + +Host Channel Adapter (HCA) + +Hybrid Wireless Mesh Protocol (HWMP) + +I, J + +ICMP protocol. 
+ +See Internet control message protocol (ICMP) + +ICMPv4 messages + +categories + +destination unreachable + +ICMP_FRAG_NEEDED code + +ICMP_PORT_UNREACH code + +ICMP_PROT_UNREACH code + +icmp_reply() method + +icmp_send() method + +ICMP_SR_FAILED code + +header + +conditions + +DHCP + +icmp_bxm structure + +icmp_control objects + +icmp_control structure + +icmp_discard() + +icmp_echo() method + +ICMP_QUENCH message + +icmp_redirect() + +ICMP sockets/ping sockets + +ip_local_deliver_finish() method + +NTP + +ping_rcv() method + +raw_local_deliver() + +struct icmphdr + +timestamps + +TTL + +icmp_echo() method + +inet_init() method + +IP broadcast or IP multicast address + +ip_local_deliver_finish() method + +ping and traceroute utility + +ping_rcv() method + +ICMPv4 redirect message + +ip_do_redirect() method + +ip_forward() method + +ip_rt_send_redirect() method + +mkroute_input() method + +ICMPv6 messages + +icmpv6_rcv() method + +destination unreachable + +ICMP_FRAG_NEEDED code + +ICMPV6_EXC_FRAGTIME code + +ICMPV6_EXC_HOPLIMIT code + +parameter problem + +port unreachable + +header + +icmpv6_init() method + +icmpv6_notify() method + +igmp6_event_report() + +ND messages + +pskb_may_pull() method + +IEEE 802.15.4 + +ieee802154_dev object + +ieee802154_ops object + +low-rate wireless personal area networks (LR-WPANs) + +medium access control (MAC) + +wireless sensor networks (WSNs) + +IKE. + +See Internet Key Exchange (IKE) + +Inet_create() method + +InfiniBand subsystem + +addressing + +Communication Manager + +features + +hardware components + +methods + +packet headers + +(see Packet headers) + +RDMA + +(see RDMA device; Remote Direct Memory Access (RDMA)) + +Subnet Administrator + +Subnet Management Agent + +InfiniBand Trade Association (IBTA) + +Internet Assigned Numbers Authority (IANA) + +Internet control message protocol (ICMP) + +definition + +ICMPv4 messages + +(see ICMPv4 messages) + +ICMPv6 messages + +(see ICMPv6 messages) + +ping sockets + +Internet Key Exchange (IKE) + +Internet Key Exchange Protocol Version 2 (IKEv2) + +Internet of Things (IoT) + +Internet Protocol (IP) + +Internet Protocol security (IPsec) subsystem + +cryptography + +definition + +ESP protocol + +Authentication Data + +ESP format + +initialization + +Padding + +Payload Data + +Security Parameter Index + +Sequence Number + +IKE + +methods + +NAT traversal + +Main Mode, IKE + +SBCs + +TCP/UDP header + +VoIP NAT-traversal + +transport mode + +receiving IPv4 ESP packet + +transmitting IPv4 ESP packet + +VPN technology + +XFRM framework + +dummy bundle + +flow_cache_lookup() method + +netns_xfrm structure + +Security Association (SA) + +security policy + +(see Security policy) + +xfrm_init() method + +xfrm_lookup() method + +xfrm_route_forward() method + +XFRM SNMP MIB counters + +Internet service provider (ISP) + +Internet Wide Area RDMA Protocol (iWARP) + +Inter Process Communication (IPC) + +Ip_cmsg_send() method + +Ip_mc_leave_group() method + +Ipmr_rules_init() method + +IP Payload Compression Protocol (IPCOMP) + +IPsec subsystem. 
+ +See Internet protocol security (IPsec) subsystem + +IPv4 protocol + +defragmentation + +hash function + +ip_defrag() method + +ip_expire() method + +ip_forward() method + +ip_frag_queue() + +ip_frag_reasm() method + +ipq_kill() method + +dst_input() method + +dst_output() method + +fragmentation + +fast path fragmentation + +ip_fragment() method + +slow path fragmentation + +fragmentation needed code + +header + +fragment offset + +id field + +internet header length + +L4 protocol + +struct iphdr + +Time To Live + +total length + +Type of Service + +initialization + +internet header length + +ip_append_data() method + +ip_fast_csum() method + +ip_forward_options() method + +IP_HDRINCL socket option + +ip_local_deliver_finish() method + +IP options + +copied flag + +IPOPT_CIPSO option + +IPOPT_END option + +ip_options_fragment() method + +IPOPT_LSRR option + +IPOPT_NOOP option + +IPOPT_SEC option + +linux symbol + +memset() function + +Multibyte option + +option class + +option number + +optptr pointer + +record route option + +(see Record route option) + +Single byte option + +timestamp option + +while loop + +ip_options_build() method + +ip_queue_xmit() method + +ip_rcv_finish() method + +ip_rcv() method + +ip_route_input_noref() method + +ip_route_output_ports() + +MSG_PROBE flag + +multicast packets + +netfilter hooks + +receiving path (Rx) + +routing subsystem + +RPF + +RTCF_DOREDIRECT flag + +skb_dst() + +skb_push() method + +strict route flag + +transport layer + +TTL count exceeded code + +IPv4 routing cache + +Rx Path + +Tx Path + +IPv6 header + +destination address + +extension headers + +Authentication Header + +Destination Options header + +ESP + +Fragment Options header + +Hop-by-Hop Options header + +protocol handler + +Routing Options header + +upper-layer protocol + +flow_lbl + +hop_limit + +ip_decrease_ttl() method + +nexthdr + +payload_len + +source address + +traffic class/priority + +version + +IPv6 protocol + +addresses + +Anycast + +ARP protocol + +Global Unicast + +in6_addr structure + +IPv4-compatible format + +link-local unicast address + +multicast address + +multicast address + +(see Multicast address) + +Site local addresses + +Unicast + +autoconfiguration + +definition + +DHCPv6 + +interface flag + +preferred lifetime + +RA + +router solicitation + +valid lifetime + +features + +in6_addr structure + +inet6_add_protocol() method + +inet6_dev structure + +inet6_init() method + +INET6_PROTO_NOPOLICY flag + +ip6_append_data() method + +ip6_forward() method + +ip6_input() method + +ip6_rcv_finish() method + +ip6_xmit() method + +IPv6 header + +(see IPv6 header) + +ipv6_is_mld() method + +ipv6_rcv() method + +Linux symbol and value + +macros + +methods + +MLD + +(see Multicast Listener Discovery (MLD)) + +multicast packets + +ip6_input_finish() method + +ip6_mc_input() method + +ip6_mr_input() method + +ipv6_chk_mcast_addr() method + +routing + +routing tables + +Rx path + +SKB + +IP Virtual Server (IPVS) + +K + +Keep Alive timer + +Kernel netlink sockets + +callbacks + +EPERM error + +input callback + +netlink_bind() + +netlink_kernel_create() prototype + +netlink_lookup() method + +rtmsg_ifinfo() method + +rtnetlink_net_init() method + +rtnetlink_rcv() method + +rtnl_register() + +KLIPS stack + +L + +Large Receive Offload (LRO) packets + +Linux API + +net_device structure + +(see Net_device structure) + +RDMA + +(see Remote Direct Memory Access (RDMA)) + +sk_buff Structure + +Bluetooth protocol + +checksum values + +connection tracking + 
+dev member + +dropcounter + +dst_entry struct + +eth_type_trans() method + +handling buffers + +headroom and tailroom + +ip_queue_xmit() method + +IP virtual server + +link layer + +netfilter packet trace flag + +network layer + +PMTUD + +preceding rule + +secmark field + +security path pointer + +setsockopt() + +skb_clone() method + +skb_pfmemalloc() function + +skb_shared_info struct + +sock_create_kern() method + +timestamp + +transport layer + +VLAN protocol + +Linux Kernel Mailing List (LKML) + +Linux neighbouring subsystem + +arp_netdev_event() method + +ARP protocol + +(see Address resolution protocol (ARP)) + +Ethernet + +macros + +methods + +NDISC Protocol + +(see Neighbour Discovery (NDISC) protocol) + +neighbour solicitations + +neighbour structure + +dead flag + +neigh_parms object + +neigh_resolve_output() method + +neigh_timer_handler() method + +NUD state + +primary_key + +reference counter + +neigh_create() method + +neigh_statistics structure + +neigh_table structure + +arp_hash() method + +arp_rcv() method + +asynchronous garbage collector handler + +constructor + +function pointers + +IPv4 procfs + +ndisc_init() method + +neigh_alloc() method + +neigh_table_init_no_netlink() method + +pdestructor method + +phash_buckets + +proxy_timer + +sizeof + +thresholds + +neighbour unreachability detection states + +vs. userspace + +Linux network stack + +development model + +git trees + +IPv4/IPv6 + +network device drivers + +(see Network device drivers) + +Open Systems Interconnection (OSI) model + +application layer + +data link layer + +network layer + +physical layer + +presentation layer + +protocol layer/transport layer + +session layer + +protocol rules + +TCP/UDP listening sockets + +Linux routing subsystem + +Linux wireless stack + +development trees + +Mac802.11 subsystem + +(see Mac802.11 subsystem) + +methods + +MLME + +(see Management Layer (MLME)) + +network topologies + +IBSS/Ad Hoc Mode + +infrastructure BSS mode + +power save mode + +entering + +exiting + +multicast/broadcast buffer + +PS-Poll packets + +Rx Flags and Linux symbol + +Local IDentifier (LID) + +Local key (lkey) + +Local Routing Header (LRH) + +Logical link control and adaptation protocol (L2CAP) + +6LoWPAN + +implementation + +initialization + +adaptation layer + +PHY layer + +neighbor discovery optimization + +6LoWPAN context option (6CO) + +Address Registration Option (ARO) + +authoritative border router option (ABRO) + +duplicate address detection (DAD) messages + +Low-rate wireless personal area networks (LR-WPANs) + +M + +Mac802.11 subsystem + +802.11 amendment types + +802.11 vs. 
802.3 wired Ethernet + +add_interface() method + +Ad Hoc (IBSS) mode + +AP mode + +architecture + +configure_filter() + +debugfs + +fragmentation + +header + +addresses + +frame control + +HT control field + +ieee80211_hdr structure + +Network allocation vector + +QoS Control + +sequence control + +ieee80211_alloc_hw() method + +management layer + +Mesh mode + +mesh networking + +advantages + +Full Mesh + +HWMP Protocol + +Partial Mesh + +Set Up + +Monitor mode + +remove_interface() + +Rx Path function + +start() method + +Station infrastructure mode + +stop() + +TGn + +(see High Throughput Task Group (TGn)) + +tx() function + +Tx Path + +Wireless Distribution System (WDS) mode + +WLANs + +Management Layer (MLME) + +association + +authentication + +components + +reassociation + +scanning + +Management packets + +Memory windows + +ib_alloc_mw() method + +ib_bind_mw() method + +ib_dealloc_mw() method + +Mesh networking + +advantages + +Full Mesh + +HWMP Protocol + +Partial Mesh + +Set Up + +Message Signaled Interrupts (MSIs) + +Mroute_sk pointer + +MSF + +filters + +group_filter structure + +igmp6_event_query() method + +mld2_grec structure + +MLDv1 message types + +multicast traffic + +parameters + +setsockopt() method + +Msghdr structure + +Multicast address + +Linux symbol and value + +MLD + +ndisc_send_na() method + +Multicast Forwarding Cache (MFC) + +Multicast Listener Discovery (MLD) + +ASM model + +dev_forward_change() method + +GMP + +Hop-by-Hop header + +ipv6_add_dev() method + +IPV6_ADD_MEMBERSHIP socket + +IPV6_JOIN_GROUP socket + +mld2_grec structure + +MLDv2 protocol + +MSF + +filters + +group_filter structure + +igmp6_event_query() method + +mld2_grec structure + +MLDv1 message types + +multicast traffic + +parameters + +setsockopt() method + +router join + +setsockopt() + +Multicast routing + +CIDR + +fib_rules_lookup() method + +IGMP protocol + +IGMPv1 (RFC 1112) + +IGMPv2 (RFC 2236) + +IGMPv3 (RFC 3376, updated by RFC 4604) + +ipmr_forward_finish() method + +ip_mr_forward() method + +ip_mroute_setsockopt() method + +ipmr_queue_xmit() method + +MFC + +mr_table structure, routing table + +PIM protocol + +IPv4 Multicast Rx Path + +ip_call_ra_chain() method + +ipmr_cache_alloc_unres() + +ipmr_cache_find() method + +ipmr_cache_unresolved() method + +ip_mr_forward() + +ip_mr_input() method + +ipmr_rt_fib_lookup() method + +raw_rcv() method + +setsockopt() method + +thresholds + +topology + +unicast IPv4 traffic + +vifc_flags + +vif_device structure + +Multicast Source Filtering (MSF) + +Multipath routing + +N + +Native Netkey stack + +NDISC protocol. 
+ +See Neighbour Discovery (NDISC) protocol + +Near field communication (NFC) + +Android + +communication and operation modes + +devices + +drivers API + +Kernel architecture + +nfc_allocate_device() method + +probe() callback + +probe() method + +host-controller interfaces + +initialization + +netlink API + +NFC tags + +overview + +sockets + +LLCP sockets + +raw sockets + +subsystem + +userspace architecture + +Neigh_add() method + +Neighbour Discovery (NDISC) protocol + +duplicate address detection + +addrconf_dad_start() method + +ICMPv6 message types + +ipv6_addr_any() method + +ndisc_rcv() method + +ndisc_recv_na() + +ndisc_recv_ns() method + +ndisc_send_na() method + +ndisc_send_ns() method + +ndisc_solicit() + +nud_state + +override flag + +router flag + +solicited flag + +Neighbour discovery (ND) messages + +Neighbour structure + +dead flag + +neigh_parms object + +neigh_resolve_output() method + +neigh_timer_handler() method + +NUD state + +primary_key + +reference counter + +Neigh_delete() method + +Net_device structure + +allmulti counter + +boolean flag + +definition + +dev_uc_init() method + +enum + +Ethernet addresses + +eth_hw_addr_random() method + +features + +flag + +hardware address assignment type + +header_ops struct + +Interrupt Request (IRQ) + +int flags + +int priv_flags + +kobject structure + +message signaled interrupts + +MTU + +NAPI stands + +neigh_alloc() method + +netdev_ops structure + +netdev_run_todo() method + +NETIF_F_GRO + +NETIF_F_HIGHDMA + +NETIF_F_HW_VLAN_CTAG_RX + +NETIF_F_NETNS_LOCAL + +NETIF_F_VLAN_CHALLENGED + +network namespaces + +network partitioning + +promiscuity counter + +protocol-specific pointers + +Qdisc + +qdisc of pfifo_fast + +rx_handler + +Rx queues + +SET_ETHTOOL_OPS + +short gflags + +state flag + +Tx queue + +union + +VLAN devices + +watchdog timer + +Netfilter subsystem + +connection tracking + +(see Connection tracking) + +frameworks + +IP sets + +iptables + +iptables types + +IPVS + +IPv4 and ipv6 network namespace + +methods + +netfilter hooks + +NF_INET_FORWARD + +NF_INET_LOCAL_IN + +NF_INET_LOCAL_OUT + +NF_INET_POST_ROUTING + +NF_INET_PRE_ROUTING + +parameters + +registration + +return value + +Netlink sockets + +advantages + +BSD-style sockets + +generic netlink protocol + +(see Generic netlink protocol) + +IPC mechanism + +kernel netlink sockets + +(see Kernel netlink sockets) + +libnl library + +netlink_kernel_create() method + +netlink message header + +attribute validation policy + +generic netlink message + +nlmsg_flags field + +nlmsg_len + +sequence number + +struct nlmsghdr + +TLV format + +types + +NETLINK_ROUTE messages + +routing table + +sockaddr_nl structure + +TCP/IP networking + +Network Address Translation (NAT) + +Network administration + +ApacheBench + +arping + +ARP table management + +arptables + +arpwatch + +brctl + +conntrack-tools + +crtools + +ebtables + +ether-wake + +ethtool + +git + +hciconfig + +hcidump + +hcitool + +ifconfig command + +ifenslave + +iperf + +iproute2 package + +iptables and ip6tables + +ipvsadm + +iwconfig tool + +iw package + +l2ping + +libreswan Project + +lowpan-tools + +lscpu + +lshw + +lspci + +mrouted + +netperf tool + +netsniff-ng + +netstat tool + +ngrep tool + +nmap + +nmap-ncat package + +openswan + +OpenVPN + +packeth + +pimd + +ping + +pktgen + +poptop + +ppp daemon + +radvd + +route tool + +RP-PPPoE + +sar tool + +smcroute + +snort + +suricata + +sysctl utility + +taskset + +tcpdump + +top utility + +tracepath command + +traceroute utility + +tshark utility 
+ +tunctl tool + +udevadm + +unshare utility + +vconfig utility + +wireshark + +wpa_supplicant + +XORP + +Network Allocation Vector (NAV) + +Network device drivers + +IPsec policy + +NAPI + +netfilter subsystem + +nf_register_hooks() method + +promiscuity counter + +socket buffer + +datagram and stream sockets + +Ethernet packet + +eth_type_trans() method + +ICMP protocol + +ip_rcv_finish() method + +IPv4 packet + +ipv6_rcv() method + +netdev_alloc_skb() method + +RDMA + +structure + +topologies + +transport protocols + +virtualization + +wireless subsystem + +structure + +traversal + +TTL Count Exceeded + +VPN solutions + +Network driver + +Network namespaces + +implementation + +data structures + +net structure + +management + +communication + +ip netns command + +network interface + +namespaces implementation + +clone() system call + +clone_uts_ns() method + +copy_net_ns() method + +copy_utsname() method + +create_nsproxy() method + +exit_task_namespaces() method + +get_net_ns_by_fd() method + +get_net_ns_by_pid() method + +IPC namespaces + +ip netns command + +mnt_namespace + +network namespaces + +nsproxy structure + +PID namespaces + +setns() system call + +unshare() system call + +user_namespace + +UTS namespaces + +uts_namespace + +proc_do_uts_string() method + +sethostname() + +Network topologies + +IBSS/Ad Hoc Mode + +infrastructure BSS mode + +Next Hop Resolution Protocol (NHRP) + +Non-Broadcast Multiple Access (NBMA) + +Notification chains + +call_netdevice_notifier() method + +network device events + +notifier_chain_register() method + +register_netdevice_notifier() method + +rtmsg_ifinfo() method + +subsystems + +O + +Open Cryptography Framework (OCF) + +Open Systems Interconnection (OSI) model + +application layer + +data link layer + +network layer + +physical layer + +presentation layer + +protocol layer/transport layer + +session layer + +Out of the Blue packet (OOTB) + +P + +Packet headers + +Base Transport Header + +Extended Transport Header + +Global Routing Header + +Immediate data + +Invariant CRC + +Local Routing Header + +Payload + +Variant CRC + +Peripheral Component Interconnect (PCI) subsystem + +configuration space + +pci_driver structure + +struct pci_dev structure + +Wake-On-LAN (WOL) + +Persistent timer. 
+ +See Zero window probe timer + +Personal area networks (PANs) + +Ping sockets + +Policy routing + +definition + +fib_default_rules_init() method + +fib_lookup() method + +fib_rules module, implementation + +rules + +PPPoE protocol + +internet service provider (ISP) + +initialization, PPPoX sockets + +link control protocol (LCP) + +password authentication protocol (PAP) + +PPPoE active discovery initiation (PADI) + +PPPoE active discovery offer (PADO) + +PPPoE active discovery request (PADR) + +PPPoE active discovery session (PADS) + +PPPoE active discovery terminate (PADT) + +PPPoE header + +sending and receiving packets + +Primary_key + +Protection domain (PD) + +address handle + +Fast Memory Region (FMR) Pool + +ib_alloc_pd() method + +ib_dealloc_pd() method + +memory region (MR) + +memory window + +QP + +(see Queue Pair (QP)) + +SRQ + +(see Shared Receive Queue (SRQ)) + +Q + +Queue Key (Q_Key) + +Queue pair (QP) + +attributes + +ib_close_qp() method + +ib_create_qp() method + +ib_modify_qp() + +ib_post_recv() + +ib_post_send() method + +MW binding attributes + +struct ib_send_wr + +ib_query_qp() method + +selective signaling + +state machine + +Error state + +ib_modify_qp() method + +ib_query_qp() method + +Initialized state + +Ready To Receive (RTR) state + +Ready To Send (RTS) state + +Reset state + +Send Queue Drained (SQD) state + +SQE state + +struct ib_qp_cap + +struct ib_qp_open_attr + +transport types + +Quick Mode + +R + +Radio Frequency Communications protocol (RFCOMM) + +Raw sockets + +RDMA device. + +See also Remote Direct Memory Access (RDMA) + +Real-time Transport Protocol (RTP) + +Receive path (Rx) + +Record route option + +for loop + +ip_options_compile() + +ip_options structure + +ip_rcv_options() method + +optptr pointer + +parameter problem + +router alert + +SSRR + +stream ID + +Reliably delivered message + +Remote Direct Memory Access (RDMA) + +address handle + +attributes + +ib_create_ah_from_wc() method + +ib_create_ah() method + +ib_destroy_ah() method + +ib_init_ah_from_wc() + +ib_modify_ah() method + +ib_query_ah() + +advantages + +CPU offload + +High Bandwidth + +Kernel bypass + +Low latency + +Zero copy + +attributes + +completion queue + +first-in, first-out (FIFO) + +ib_create_cq() method + +ib_destroy_cq() + +ib_modify_cq() method + +ib_peek_cq() method + +ib_poll_cq() + +ib_req_ncomp_notif() + +ib_req_notify_cq() method + +ib_resize_cq() + +QP + +(see Queue Pair (QP)) + +struct ib_wc + +device modification + +event handler + +eXtended Reliable Connected + +ib_alloc_xrcd() method + +ib_dealloc_xrcd_cq() method + +hierarchy + +ib_attach_mcast() method + +ib_detach_mcast() + +ib_find_gid() + +ib_find_pkey() method + +ib_get_client_data() method + +ib_modify_port() method + +ib_mtu_to_int() + +ib_query_device() method + +ib_query_gid() + +ib_query_pkey() + +ib_query_port() + +ib_rate_to_mbps() method + +ib_rate_to_mult() + +ib_register_client() method + +ib_register_event_handler() + +ib_set_client_data() method + +ib_unregister_client() method + +ib_width_enum_to_int() + +include/rdma/ib_verbs.h + +INIT_IB_EVENT_HANDLER macro + +memory region + +CPU accesses + +ib_dereg_mr() method + +ib_dma_alloc_coherent() method + +ib_dma_free_coherent() method + +ib_dma_map_page() method + +ib_dma_mapping_error() + +ib_dma_map_sg_attr() + +ib_dma_map_sg() method + +ib_dma_map_single() method + +ib_dma_unmap_page() method + +ib_dma_unmap_sg() method + +ib_dma_unmap_single() + +ib_dma_unmap_single_attrs() method + +ib_get_dma_mr() + 
+ib_mr_attr struct + +ib_reg_phys_mr() method + +ib_rereg_phys_mr() method + +ib_sg_dma_len() method + +kernel virtual address + +physical buffer + +memory windows + +ib_alloc_mw() method + +ib_bind_mw() method + +ib_dealloc_mw() method + +multicast groups + +network protocols + +node type + +operation types + +PD + +(see Protection domain (PD)) + +port attributes + +protection domain + +ib_alloc_pd() method + +ib_dealloc_pd() + +QP + +(see Queue pair (QP)) + +rdma_node_get_transport() + +rdma_port_get_link_layer() method + +request processing flow + +retry flow + +RNR Flow + +SRQ + +(see Shared Receive Queue (SRQ)) + +stack architecture + +struct ib_client + +struct ib_event + +Userspace vs. Kernel-Level RDMA API + +Remote key (rkey) + +Retransmit timer + +Retry flow + +Reverse Path Filter (RPF) + +RNR Flow + +Root Announcement (RANN) + +Router + +Router Advertisement (RA) + +Router Alert (RA) + +Routing subsystem + +FIB + +fib_table structure + +caching + +fib_alias object + +fib_info + +fib_nh_exceptions + +nexthop + +policy routing + +forwarding packets + +forwarding router + +IP rule selectors + +lookup + +fib_lookup() method + +flowi4 object + +rtable structure + +macros + +MFC_HASH + +VIF_EXISTS + +methods + +multicast routing + +(see Multicast routing) + +multipath routing + +policy routing + +definition + +fib_default_rules_init() method + +fib_lookup() method + +fib_rules module, implementation + +rules + +procfs multicast + +redirect message + +route flags + +route metrics + +route types + +routing + +rtmsg_ifinfo() method + +rtnl_notify() + +S + +SCTP. + +See Stream Control Transmission Protocol (SCTP) + +Security Association (SA) + +Security policy + +action + +current lifetime + +definition + +polq queue + +SPD + +xfrm_policy structure + +reference counter + +xfrm_policy_timer() method + +Security Policy Database (SPD) + +Sequenced packet stream + +Service Level (SL) + +Session Initiation Protocol (SIP) + +Setsockopt() method + +Shared Receive Queue (SRQ) + +attributes + +ib_create_srq() method + +ib_destroy_srq() method + +ib_modify_srq() method + +ib_post_srq_recv() method + +ib_query_srq() + +limit asynchronous event + +QP + +scatter/gather element + +struct ib_recv_wr + +Sock_create() method + +Socket Buffer (SKB) + +Socketcall() method + +Sockets + +API + +accept() + +bind() + +connect() + +datagram + +data links + +DCCP + +listen() + +raw + +recv() + +reliably delivered message + +send() + +sequenced packet stream + +socket() + +stream + +creation + +msghdr structure + +socket() system call + +implementation + +parameters of + +return value of + +struct socket + +structure + +Sock_map_fd() method + +Sock structure + +Stream Control Transmission Protocol (SCTP) + +association + +members + +multiple addresses, addition/removal of + +representation + +setting up + +chunk + +chunk header + +common header + +features + +HEARTBEAT mechanism + +initialization + +multihoming + +multistreaming + +receiving packets + +registration + +sending packets + +Stream sockets + +Strict source record route (SSRR) + +Struct sock + +Switch + +Sys_socket() method + +T + +TCP. 
+ +See Transmission Control Protocol (TCP) + +Tcp_init_sock() method + +TCP/IP networking + +Time To Live (TTL) + +Traditional receive flow vs. Busy Poll Sockets receive flow + +Transmission Control Protocol (TCP) + +connection setup + +description + +flags + +header + +initialization + +prot_ops objects + +receiving packets + +sending packets + +socket initialization + +timers + +Transport layer protocols + +DCCP + +(see Datagram Congestion Control Protocol (DCCP)) + +macros + +methods + +SCTP + +(see Stream Control Transmission Protocol (SCTP)) + +TCP + +connection setup + +description + +header + +initialization + +receiving packets + +sending packets + +timers + +UDP + +(see User Datagram Protocol (UDP)) + +Type-Length-Value (TLV) format + +U + +User Datagram Protocol (UDP) + +description + +header + +initialization + +prot_ops objects + +receiving packets + +sending packets + +V + +Virtual Ethernet (VETH) + +Virtual Extensible Local Area Network (VXLAN) + +Virtual Lanes (VL) + +Virtual private network (VPN) + +W + +Wireless local area networks (WLANs) + +X, Y + +XFRM framework + +dummy bundle + +flow_cache_lookup() method + +netns_xfrm structure + +Security Association (SA) + +security policy + +(see Security policy) + +xfrm_init() method + +xfrm_lookup() method + +xfrm_route_forward() method + +Z + +Zero window probe timer + diff --git a/kag/examples/csqa/builder/data/machine_learning_with_spark.txt b/kag/examples/csqa/builder/data/machine_learning_with_spark.txt new file mode 100644 index 00000000..bc2864f4 --- /dev/null +++ b/kag/examples/csqa/builder/data/machine_learning_with_spark.txt @@ -0,0 +1,8844 @@ +Machine Learning with Spark + +# Table of Contents + +Machine Learning with Spark + +Credits + +About the Author + +Acknowledgments + +About the Reviewers + +www.PacktPub.com + +Support files, eBooks, discount offers, and more + +Why subscribe? + +Free access for Packt account holders + +Preface + +What this book covers + +What you need for this book + +Who this book is for + +Conventions + +Reader feedback + +Customer support + +Downloading the example code + +Errata + +Piracy + +Questions + +1. Getting Up and Running with Spark + +Installing and setting up Spark locally + +Spark clusters + +The Spark programming model + +SparkContext and SparkConf + +The Spark shell + +Resilient Distributed Datasets + +Creating RDDs + +Spark operations + +Caching RDDs + +Broadcast variables and accumulators + +The first step to a Spark program in Scala + +The first step to a Spark program in Java + +The first step to a Spark program in Python + +Getting Spark running on Amazon EC2 + +Launching an EC2 Spark cluster + +Summary + +2. Designing a Machine Learning System + +Introducing MovieStream + +Business use cases for a machine learning system + +Personalization + +Targeted marketing and customer segmentation + +Predictive modeling and analytics + +Types of machine learning models + +The components of a data-driven machine learning system + +Data ingestion and storage + +Data cleansing and transformation + +Model training and testing loop + +Model deployment and integration + +Model monitoring and feedback + +Batch versus real time + +An architecture for a machine learning system + +Practical exercise + +Summary + +3. 
Obtaining, Processing, and Preparing Data with Spark + +Accessing publicly available datasets + +The MovieLens 100k dataset + +Exploring and visualizing your data + +Exploring the user dataset + +Exploring the movie dataset + +Exploring the rating dataset + +Processing and transforming your data + +Filling in bad or missing data + +Extracting useful features from your data + +Numerical features + +Categorical features + +Derived features + +Transforming timestamps into categorical features + +Text features + +Simple text feature extraction + +Normalizing features + +Using MLlib for feature normalization + +Using packages for feature extraction + +Summary + +4. Building a Recommendation Engine with Spark + +Types of recommendation models + +Content-based filtering + +Collaborative filtering + +Matrix factorization + +Explicit matrix factorization + +Implicit matrix factorization + +Alternating least squares + +Extracting the right features from your data + +Extracting features from the MovieLens 100k dataset + +Training the recommendation model + +Training a model on the MovieLens 100k dataset + +Training a model using implicit feedback data + +Using the recommendation model + +User recommendations + +Generating movie recommendations from the MovieLens 100k dataset + +Inspecting the recommendations + +Item recommendations + +Generating similar movies for the MovieLens 100k dataset + +Inspecting the similar items + +Evaluating the performance of recommendation models + +Mean Squared Error + +Mean average precision at K + +Using MLlib's built-in evaluation functions + +RMSE and MSE + +MAP + +Summary + +5. Building a Classification Model with Spark + +Types of classification models + +Linear models + +Logistic regression + +Linear support vector machines + +The naïve Bayes model + +Decision trees + +Extracting the right features from your data + +Extracting features from the Kaggle/StumbleUpon evergreen classification dataset + +Training classification models + +Training a classification model on the Kaggle/StumbleUpon evergreen classification dataset + +Using classification models + +Generating predictions for the Kaggle/StumbleUpon evergreen classification dataset + +Evaluating the performance of classification models + +Accuracy and prediction error + +Precision and recall + +ROC curve and AUC + +Improving model performance and tuning parameters + +Feature standardization + +Additional features + +Using the correct form of data + +Tuning model parameters + +Linear models + +Iterations + +Step size + +Regularization + +Decision trees + +Tuning tree depth and impurity + +The naïve Bayes model + +Cross-validation + +Summary + +6. 
Building a Regression Model with Spark + +Types of regression models + +Least squares regression + +Decision trees for regression + +Extracting the right features from your data + +Extracting features from the bike sharing dataset + +Creating feature vectors for the linear model + +Creating feature vectors for the decision tree + +Training and using regression models + +Training a regression model on the bike sharing dataset + +Evaluating the performance of regression models + +Mean Squared Error and Root Mean Squared Error + +Mean Absolute Error + +Root Mean Squared Log Error + +The R-squared coefficient + +Computing performance metrics on the bike sharing dataset + +Linear model + +Decision tree + +Improving model performance and tuning parameters + +Transforming the target variable + +Impact of training on log-transformed targets + +Tuning model parameters + +Creating training and testing sets to evaluate parameters + +The impact of parameter settings for linear models + +Iterations + +Step size + +L2 regularization + +L1 regularization + +Intercept + +The impact of parameter settings for the decision tree + +Tree depth + +Maximum bins + +Summary + +7. Building a Clustering Model with Spark + +Types of clustering models + +K-means clustering + +Initialization methods + +Variants + +Mixture models + +Hierarchical clustering + +Extracting the right features from your data + +Extracting features from the MovieLens dataset + +Extracting movie genre labels + +Training the recommendation model + +Normalization + +Training a clustering model + +Training a clustering model on the MovieLens dataset + +Making predictions using a clustering model + +Interpreting cluster predictions on the MovieLens dataset + +Interpreting the movie clusters + +Evaluating the performance of clustering models + +Internal evaluation metrics + +External evaluation metrics + +Computing performance metrics on the MovieLens dataset + +Tuning parameters for clustering models + +Selecting K through cross-validation + +Summary + +8. Dimensionality Reduction with Spark + +Types of dimensionality reduction + +Principal Components Analysis + +Singular Value Decomposition + +Relationship with matrix factorization + +Clustering as dimensionality reduction + +Extracting the right features from your data + +Extracting features from the LFW dataset + +Exploring the face data + +Visualizing the face data + +Extracting facial images as vectors + +Loading images + +Converting to grayscale and resizing the images + +Extracting feature vectors + +Normalization + +Training a dimensionality reduction model + +Running PCA on the LFW dataset + +Visualizing the Eigenfaces + +Interpreting the Eigenfaces + +Using a dimensionality reduction model + +Projecting data using PCA on the LFW dataset + +The relationship between PCA and SVD + +Evaluating dimensionality reduction models + +Evaluating k for SVD on the LFW dataset + +Summary + +9. Advanced Text Processing with Spark + +What's so special about text data? 
+ +Extracting the right features from your data + +Term weighting schemes + +Feature hashing + +Extracting the TF-IDF features from the 20 Newsgroups dataset + +Exploring the 20 Newsgroups data + +Applying basic tokenization + +Improving our tokenization + +Removing stop words + +Excluding terms based on frequency + +A note about stemming + +Training a TF-IDF model + +Analyzing the TF-IDF weightings + +Using a TF-IDF model + +Document similarity with the 20 Newsgroups dataset and TF-IDF features + +Training a text classifier on the 20 Newsgroups dataset using TF-IDF + +Evaluating the impact of text processing + +Comparing raw features with processed TF-IDF features on the 20 Newsgroups dataset + +Word2Vec models + +Word2Vec on the 20 Newsgroups dataset + +Summary + +10. Real-time Machine Learning with Spark Streaming + +Online learning + +Stream processing + +An introduction to Spark Streaming + +Input sources + +Transformations + +Keeping track of state + +General transformations + +Actions + +Window operators + +Caching and fault tolerance with Spark Streaming + +Creating a Spark Streaming application + +The producer application + +Creating a basic streaming application + +Streaming analytics + +Stateful streaming + +Online learning with Spark Streaming + +Streaming regression + +A simple streaming regression program + +Creating a streaming data producer + +Creating a streaming regression model + +Streaming K-means + +Online model evaluation + +Comparing model performance with Spark Streaming + +Summary + +Index + +# **Machine Learning with Spark** + +* * * + +# Machine Learning with Spark + +Copyright (C) 2015 Packt Publishing + +All rights reserved. No part of this book may be reproduced, stored in a retrieval system, or transmitted in any form or by any means, without the prior written permission of the publisher, except in the case of brief quotations embedded in critical articles or reviews. + +Every effort has been made in the preparation of this book to ensure the accuracy of the information presented. However, the information contained in this book is sold without warranty, either express or implied. Neither the author, nor Packt Publishing, and its dealers and distributors will be held liable for any damages caused or alleged to be caused directly or indirectly by this book. + +Packt Publishing has endeavored to provide trademark information about all of the companies and products mentioned in this book by the appropriate use of capitals. However, Packt Publishing cannot guarantee the accuracy of this information. + +First published: February 2015 + +Production reference: 1170215 + +Published by Packt Publishing Ltd. + +Livery Place + +35 Livery Street + +Birmingham B3 2PB, UK. 
+ +ISBN 978-1-78328-851-9 + +www.packtpub.com + +Cover image by Akshay Paunikar (``) + +# Credits + + **Author** + +Nick Pentreath + + **Reviewers** + +Andrea Mostosi + +Hao Ren + +Krishna Sankar + + **Commissioning Editor** + +Rebecca Youe + + **Acquisition Editor** + +Rebecca Youe + + **Content Development Editor** + +Susmita Sabat + + **Technical Editors** + +Vivek Arora + +Pankaj Kadam + + **Copy Editor** + +Karuna Narayanan + + **Project Coordinator** + +Milton Dsouza + + **Proofreaders** + +Simran Bhogal + +Maria Gould + +Ameesha Green + +Paul Hindle + + **Indexer** + +Priya Sane + + **Graphics** + +Sheetal Aute + +Abhinash Sahu + + **Production Coordinator** + +Nitesh Thakur + + **Cover Work** + +Nitesh Thakur + +# About the Author + + **Nick Pentreath** has a background in financial markets, machine learning, and software development. He has worked at Goldman Sachs Group, Inc.; as a research scientist at the online ad targeting start-up Cognitive Match Limited, London; and led the Data Science and Analytics team at Mxit, Africa's largest social network. + +He is a cofounder of Graphflow, a big data and machine learning company focused on user-centric recommendations and customer intelligence. He is passionate about combining commercial focus with machine learning and cutting-edge technology to build intelligent systems that learn from data to add value to the bottom line. + +Nick is a member of the Apache Spark Project Management Committee. + +# Acknowledgments + +Writing this book has been quite a rollercoaster ride over the past year, with many ups and downs, late nights, and working weekends. It has also been extremely rewarding to combine my passion for machine learning with my love of the Apache Spark project, and I hope to bring some of this out in this book. + +I would like to thank the Packt Publishing team for all their assistance throughout the writing and editing process: Rebecca, Susmita, Sudhir, Amey, Neil, Vivek, Pankaj, and everyone who worked on the book. + +Thanks also go to Debora Donato at StumbleUpon for assistance with data- and legal-related queries. + +Writing a book like this can be a somewhat lonely process, so it is incredibly helpful to get the feedback of reviewers to understand whether one is headed in the right direction (and what course adjustments need to be made). I'm deeply grateful to Andrea Mostosi, Hao Ren, and Krishna Sankar for taking the time to provide such detailed and critical feedback. + +I could not have gotten through this project without the unwavering support of all my family and friends, especially my wonderful wife, Tammy, who will be glad to have me back in the evenings and on weekends once again. Thank you all! + +Finally, thanks to all of you reading this; I hope you find it useful! + +# About the Reviewers + + **Andrea Mostosi** is a technology enthusiast. An innovation lover since he was a child, he started a professional job in 2003 and worked on several projects, playing almost every role in the computer science environment. He is currently the CTO at The Fool, a company that tries to make sense of web and social data. During his free time, he likes traveling, running, cooking, biking, and coding. + + **** + +I would like to thank my geek friends: Simone M, Daniele V, Luca T, Luigi P, Michele N, Luca O, Luca B, Diego C, and Fabio B. They are the smartest people I know, and comparing myself with them has always pushed me to be better. 
+ + **Hao Ren** is a software developer who is passionate about Scala, distributed systems, machine learning, and Apache Spark. He was an exchange student at EPFL when he learned about Scala in 2012. He is currently working in Paris as a backend and data engineer for ClaraVista--a company that focuses on high-performance marketing. His work responsibility is to build a Spark-based platform for purchase prediction and a new recommender system. + +Besides programming, he enjoys running, swimming, and playing basketball and badminton. You can learn more at his blog . + + **Krishna Sankar** is a chief data scientist at BlackArrow, where he is focusing on enhancing user experience via inference, intelligence, and interfaces. Earlier stints include working as a principal architect and data scientist at Tata America International Corporation, director of data science at a bioinformatics start-up company, and as a distinguished engineer at Cisco Systems, Inc. He has spoken at various conferences about data science (), machine learning (), and social media analysis (). He has also been a guest lecturer at the Naval Postgraduate School. He has written a few books on Java, wireless LAN security, Web 2.0, and now on Spark. His other passion is LEGO robotics. Earlier in April, he was at the St. Louis FLL World Competition as a robots design judge. + +# www.PacktPub.com + +# Support files, eBooks, discount offers, and more + +For support files and downloads related to your book, please visit www.PacktPub.com. + +Did you know that Packt offers eBook versions of every book published, with PDF and ePub files available? You can upgrade to the eBook version at www.PacktPub.com and as a print book customer, you are entitled to a discount on the eBook copy. Get in touch with us at `` for more details. + +At www.PacktPub.com, you can also read a collection of free technical articles, sign up for a range of free newsletters and receive exclusive discounts and offers on Packt books and eBooks. + + + +Do you need instant solutions to your IT questions? PacktLib is Packt's online digital book library. Here, you can search, access, and read Packt's entire library of books. + +## Why subscribe? + + * Fully searchable across every book published by Packt + * Copy and paste, print, and bookmark content + * On demand and accessible via a web browser + +## Free access for Packt account holders + +If you have an account with Packt at www.PacktPub.com, you can use this to access PacktLib today and view 9 entirely free books. Simply use your login credentials for immediate access. + +# Preface + +In recent years, the volume of data being collected, stored, and analyzed has exploded, in particular in relation to the activity on the Web and mobile devices, as well as data from the physical world collected via sensor networks. While previously large-scale data storage, processing, analysis, and modeling was the domain of the largest institutions such as Google, Yahoo!, Facebook, and Twitter, increasingly, many organizations are being faced with the challenge of how to handle a massive amount of data. + +When faced with this quantity of data and the common requirement to utilize it in real time, human-powered systems quickly become infeasible. This has led to a rise in the so-called big data and machine learning systems that learn from this data to make automated decisions. 
+ +In answer to the challenge of dealing with ever larger-scale data without any prohibitive cost, new open source technologies emerged at companies such as Google, Yahoo!, Amazon, and Facebook, which aimed at making it easier to handle massive data volumes by distributing data storage and computation across a cluster of computers. + +The most widespread of these is Apache Hadoop, which made it significantly easier and cheaper to both store large amounts of data (via the Hadoop Distributed File System, or HDFS) and run computations on this data (via Hadoop MapReduce, a framework to perform computation tasks in parallel across many nodes in a computer cluster). + +However, MapReduce has some important shortcomings, including high overheads to launch each job and reliance on storing intermediate data and results of the computation to disk, both of which make Hadoop relatively ill-suited for use cases of an iterative or low-latency nature. Apache Spark is a new framework for distributed computing that is designed from the ground up to be optimized for low-latency tasks and to store intermediate data and results in memory, thus addressing some of the major drawbacks of the Hadoop framework. Spark provides a clean, functional, and easy-to-understand API to write applications and is fully compatible with the Hadoop ecosystem. + +Furthermore, Spark provides native APIs in Scala, Java, and Python. The Scala and Python APIs allow all the benefits of the Scala or Python language, respectively, to be used directly in Spark applications, including using the relevant interpreter for real-time, interactive exploration. Spark itself now provides a toolkit (called MLlib) of distributed machine learning and data mining models that is under heavy development and already contains high-quality, scalable, and efficient algorithms for many common machine learning tasks, some of which we will delve into in this book. + +Applying machine learning techniques to massive datasets is challenging, primarily because most well-known machine learning algorithms are not designed for parallel architectures. In many cases, designing such algorithms is not an easy task. The nature of machine learning models is generally iterative, hence the strong appeal of Spark for this use case. While there are many competing frameworks for parallel computing, Spark is one of the few that combines speed, scalability, in-memory processing, and fault tolerance with ease of programming and a flexible, expressive, and powerful API design. + +Throughout this book, we will focus on real-world applications of machine learning technology. While we may briefly delve into some theoretical aspects of machine learning algorithms, the book will generally take a practical, applied approach with a focus on using examples and code to illustrate how to effectively use the features of Spark and MLlib, as well as other well-known and freely available packages for machine learning and data analysis, to create a useful machine learning system. + +# What this book covers + +Chapter 1, _Getting Up and Running with Spark_ , shows how to install and set up a local development environment for the Spark framework as well as how to create a Spark cluster in the cloud using Amazon EC2. The Spark programming model and API will be introduced, and a simple Spark application will be created using each of Scala, Java, and Python. + +Chapter 2, _Designing a Machine Learning System_ , presents an example of a real-world use case for a machine learning system. 
We will design a high-level architecture for an intelligent system in Spark based on this illustrative use case. + +Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_ , details how to go about obtaining data for use in a machine learning system, in particular from various freely and publicly available sources. We will learn how to process, clean, and transform the raw data into features that may be used in machine learning models, using available tools, libraries, and Spark's functionality. + +Chapter 4, _Building a Recommendation Engine with Spark_ , deals with creating a recommendation model based on the collaborative filtering approach. This model will be used to recommend items to a given user as well as create lists of items that are similar to a given item. Standard metrics to evaluate the performance of a recommendation model will be covered here. + +Chapter 5, _Building a Classification Model with Spark_ , details how to create a model for binary classification as well as how to utilize standard performance-evaluation metrics for classification tasks. + +Chapter 6, _Building a Regression Model with Spark_ , shows how to create a model for regression, extending the classification model created in Chapter 5, _Building a Classification Model with Spark_. Evaluation metrics for the performance of regression models will be detailed here. + +Chapter 7, _Building a Clustering Model with Spark_ , explores how to create a clustering model as well as how to use related evaluation methodologies. You will learn how to analyze and visualize the clusters generated. + +Chapter 8, _Dimensionality Reduction with Spark_ , takes us through methods to extract the underlying structure from and reduce the dimensionality of our data. You will learn some common dimensionality-reduction techniques and how to apply and analyze them, as well as how to use the resulting data representation as input to another machine learning model. + +Chapter 9, _Advanced Text Processing with Spark_ , introduces approaches to deal with large-scale text data, including techniques for feature extraction from text and dealing with the very high-dimensional features typical in text data. + +Chapter 10, _Real-time Machine Learning with Spark Streaming_ , provides an overview of Spark Streaming and how it fits in with the online and incremental learning approaches to apply machine learning on data streams. + +# What you need for this book + +Throughout this book, we assume that you have some basic experience with programming in Scala, Java, or Python and have some basic knowledge of machine learning, statistics, and data analysis. + +# Who this book is for + +This book is aimed at entry-level to intermediate data scientists, data analysts, software engineers, and practitioners involved in machine learning or data mining with an interest in large-scale machine learning approaches, but who are not necessarily familiar with Spark. You may have some experience of statistics or machine learning software (perhaps including MATLAB, scikit-learn, Mahout, R, Weka, and so on) or distributed systems (perhaps including some exposure to Hadoop). + +# Conventions + +In this book, you will find a number of styles of text that distinguish between different kinds of information. Here are some examples of these styles, and an explanation of their meaning. 
Code words in text, database table names, folder names, filenames, file extensions, pathnames, dummy URLs, user input, and Twitter handles are shown as follows: "Spark places user scripts to run Spark in the `bin` directory."

A block of code is set as follows:

    val conf = new SparkConf()
      .setAppName("Test Spark App")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)

Any command-line input or output is written as follows:

    **> tar xfvz spark-1.2.0-bin-hadoop2.4.tgz**
    **> cd spark-1.2.0-bin-hadoop2.4**

**New terms** and **important words** are shown in bold. Words that you see on the screen, in menus or dialog boxes for example, appear in the text like this: "These can be obtained from the AWS homepage by clicking **Account** | **Security Credentials** | **Access Credentials**."

### Note

Warnings or important notes appear in a box like this.

### Tip

Tips and tricks appear like this.

# Reader feedback

Feedback from our readers is always welcome. Let us know what you think about this book--what you liked or may have disliked. Reader feedback is important for us to develop titles that you really get the most out of.

To send us general feedback, simply send an e-mail to ``, and mention the book title in the subject of your message.

If there is a topic that you have expertise in and you are interested in either writing or contributing to a book, see our author guide on www.packtpub.com/authors.

# Customer support

Now that you are the proud owner of a Packt book, we have a number of things to help you to get the most from your purchase.

## Downloading the example code

You can download the example code files for all Packt books you have purchased from your account at . If you purchased this book elsewhere, you can visit and register to have the files e-mailed directly to you.

## Errata

Although we have taken every care to ensure the accuracy of our content, mistakes do happen. If you find a mistake in one of our books--maybe a mistake in the text or the code--we would be grateful if you would report this to us. By doing so, you can save other readers from frustration and help us improve subsequent versions of this book. If you find any errata, please report them by visiting , selecting your book, clicking on the **Errata Submission Form** link, and entering the details of your errata. Once your errata are verified, your submission will be accepted and the errata will be uploaded to our website or added to the list of existing errata under the Errata section of that title.

To view the previously submitted errata, go to and enter the name of the book in the search field. The required information will appear under the **Errata** section.

## Piracy

Piracy of copyright material on the Internet is an ongoing problem across all media. At Packt, we take the protection of our copyright and licenses very seriously. If you come across any illegal copies of our works, in any form, on the Internet, please provide us with the location address or website name immediately so that we can pursue a remedy.

Please contact us at `` with a link to the suspected pirated material.

We appreciate your help in protecting our authors, and our ability to bring you valuable content.

## Questions

You can contact us at `` if you are having a problem with any aspect of the book, and we will do our best to address it.

# Chapter 1. Getting Up and Running with Spark
Apache Spark is a framework for distributed computing; this framework aims to make it simpler to write programs that run in parallel across many nodes in a cluster of computers. It tries to abstract the tasks of resource scheduling, job submission, execution, tracking, and communication between nodes, as well as the low-level operations that are inherent in parallel data processing. It also provides a higher-level API to work with distributed data. In this way, it is similar to other distributed processing frameworks such as Apache Hadoop; however, the underlying architecture is somewhat different.

Spark began as a research project at the University of California, Berkeley. The project was focused on the use case of distributed machine learning algorithms. Hence, Spark is designed from the ground up for high performance in applications of an iterative nature, where the same data is accessed multiple times. This performance is achieved primarily through caching datasets in memory, combined with low latency and overhead to launch parallel computation tasks. Together with other features such as fault tolerance, flexible distributed-memory data structures, and a powerful functional API, Spark has proved to be broadly useful for a wide range of large-scale data processing tasks, over and above machine learning and iterative analytics.

### Note

For more background on Spark, including the research papers underlying Spark's development, see the project's history page at .

Spark runs in four modes:

  * The standalone local mode, where all Spark processes are run within the same **Java Virtual Machine** ( **JVM** ) process
  * The standalone cluster mode, using Spark's own built-in job-scheduling framework
  * Using Mesos, a popular open source cluster-computing framework
  * Using YARN (commonly referred to as NextGen MapReduce), a Hadoop-related cluster-computing and resource-scheduling framework

In this chapter, we will:

  * Download the Spark binaries and set up a development environment that runs in Spark's standalone local mode. This environment will be used throughout the rest of the book to run the example code.
  * Explore Spark's programming model and API using Spark's interactive console.
  * Write our first Spark program in Scala, Java, and Python.
  * Set up a Spark cluster using Amazon's **Elastic Compute Cloud** ( **EC2** ) platform, which can be used for larger data sizes and heavier computational requirements than running in the local mode allows.

### Tip

Spark can also be run on Amazon's Elastic MapReduce service using custom bootstrap action scripts, but this is beyond the scope of this book. The following article is a good reference guide: .

At the time of writing this book, the article covers running Spark Version 1.1.0.

If you have previous experience in setting up Spark and are familiar with the basics of writing a Spark program, feel free to skip this chapter.

# Installing and setting up Spark locally

Spark can be run using the built-in standalone cluster scheduler in the local mode. This means that all the Spark processes are run within the same JVM--effectively, a single, multithreaded instance of Spark. The local mode is very useful for prototyping, development, debugging, and testing. However, this mode can also be useful in real-world scenarios to perform parallel computation across multiple cores on a single computer.
As Spark's local mode is fully compatible with the cluster mode, programs written and tested locally can be run on a cluster with just a few additional steps.

The first step in setting up Spark locally is to download the latest version (at the time of writing this book, the version is 1.2.0). The download page of the Spark project website, found at , contains links to download various versions as well as to obtain the latest source code via GitHub.

### Tip

The Spark project documentation website at is a comprehensive resource to learn more about Spark. We highly recommend that you explore it!

Spark needs to be built against a specific version of Hadoop in order to access **Hadoop Distributed File System** ( **HDFS** ) as well as standard and custom Hadoop input sources. The download page provides prebuilt binary packages for Hadoop 1, CDH4 (Cloudera's Hadoop Distribution), MapR's Hadoop distribution, and Hadoop 2 (YARN). Unless you wish to build Spark against a specific Hadoop version, we recommend that you download the prebuilt Hadoop 2.4 package from an Apache mirror using this link: .

Spark requires the Scala programming language (version 2.10.4 at the time of writing this book) in order to run. Fortunately, the prebuilt binary package comes with the Scala runtime packages included, so you don't need to install Scala separately in order to get started. However, you will need to have a **Java Runtime Environment** ( **JRE** ) or **Java Development Kit** ( **JDK** ) installed (see the software and hardware list in this book's code bundle for installation instructions).

Once you have downloaded the Spark binary package, unpack the contents of the package and change into the newly created directory by running the following commands:

    **> tar xfvz spark-1.2.0-bin-hadoop2.4.tgz**
    **> cd spark-1.2.0-bin-hadoop2.4**

Spark places user scripts to run Spark in the `bin` directory. You can test whether everything is working correctly by running one of the example programs included in Spark:

    **>./bin/run-example org.apache.spark.examples.SparkPi**

This will run the example in Spark's local standalone mode. In this mode, all the Spark processes are run within the same JVM, and Spark uses multiple threads for parallel processing. By default, the preceding example uses a number of threads equal to the number of cores available on your system. Once the program is finished running, you should see something similar to the following lines near the end of the output:

    **...**
    **14/11/27 20:58:47 INFO SparkContext: Job finished: reduce at SparkPi.scala:35, took 0.723269 s**
    **Pi is roughly 3.1465**
    **...**

To configure the level of parallelism in the local mode, you can pass in a `master` parameter of the `local[N]` form, where `N` is the number of threads to use. For example, to use only two threads, run the following command instead:

    **> MASTER=local[2] ./bin/run-example org.apache.spark.examples.SparkPi**

# Spark clusters

A Spark cluster is made up of two types of processes: a driver program and multiple executors. In the local mode, all these processes are run within the same JVM. In a cluster, these processes are usually run on separate nodes.
For example, a typical cluster that runs in Spark's standalone mode (that is, using Spark's built-in cluster-management modules) will have:

  * A master node that runs the Spark standalone master process as well as the driver program
  * A number of worker nodes, each running an executor process

While we will be using Spark's local standalone mode throughout this book to illustrate concepts and examples, the same Spark code that we write can be run on a Spark cluster. In the preceding example, if we run the code on a Spark standalone cluster, we could simply pass in the URL for the master node as follows:

    **> MASTER=spark://IP:PORT ./bin/run-example org.apache.spark.examples.SparkPi**

Here, `IP` is the IP address, and `PORT` is the port of the Spark master. This tells Spark to run the program on the cluster where the Spark master process is running.

A full treatment of Spark's cluster management and deployment is beyond the scope of this book. However, we will briefly teach you how to set up and use an Amazon EC2 cluster later in this chapter.

### Note

For an overview of the Spark cluster-application deployment, take a look at the following links:

  * 
  * 

# The Spark programming model

Before we delve into a high-level overview of Spark's design, we will introduce the `SparkContext` object as well as the Spark shell, which we will use to interactively explore the basics of the Spark programming model.

### Tip

While this section provides a brief overview and examples of using Spark, we recommend that you read the following documentation to get a detailed understanding:

  * Spark Quick Start: 
  * _Spark Programming guide_ , which covers Scala, Java, and Python: 

## SparkContext and SparkConf

The starting point of writing any Spark program is `SparkContext` (or `JavaSparkContext` in Java). `SparkContext` is initialized with an instance of a `SparkConf` object, which contains various Spark cluster-configuration settings (for example, the URL of the master node).

Once initialized, we will use the various methods found in the `SparkContext` object to create and manipulate distributed datasets and shared variables. The Spark shell (available in both Scala and Python, but unfortunately not in Java) takes care of this context initialization for us, but the following lines of code show an example of creating a context running in the local mode in Scala:

    val conf = new SparkConf()
      .setAppName("Test Spark App")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)

This creates a context running in the local mode with four threads, with the name of the application set to `Test Spark App`. If we wish to use default configuration values, we could also call the following simple constructor for our `SparkContext` object, which works in exactly the same way:

    val sc = new SparkContext("local[4]", "Test Spark App")

### Tip

 **Downloading the example code**

You can download the example code files for all Packt books you have purchased from your account at . If you purchased this book elsewhere, you can visit and register to have the files e-mailed directly to you.

## The Spark shell

Spark supports writing programs interactively using either the Scala or Python REPL (that is, the **Read-Eval-Print-Loop** , or interactive shell). The shell provides instant feedback as we enter code, as this code is immediately evaluated. In the Scala shell, the return result and its type are also displayed after a piece of code is run.
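For instance, entering a short expression at the Scala prompt immediately echoes the resulting value along with its inferred type. The following is a minimal illustration (it uses a plain Scala collection rather than Spark, so it behaves the same in any Scala REPL):

    scala> val lengths = List("a", "ab", "abc").map(line => line.size)
    lengths: List[Int] = List(1, 2, 3)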
To use the Spark shell with Scala, simply run `./bin/spark-shell` from the Spark base directory. This will launch the Scala shell and initialize `SparkContext`, which is available to us as the Scala value `sc`. Your console output should look similar to the following screenshot:

To use the Python shell with Spark, simply run the `./bin/pyspark` command. Like the Scala shell, the Python `SparkContext` object should be available as the Python variable `sc`. You should see an output similar to the one shown in this screenshot:

## Resilient Distributed Datasets

The core of Spark is a concept called the **Resilient Distributed Dataset** ( **RDD** ). An RDD is a collection of "records" (strictly speaking, objects of some type) that is distributed or partitioned across many nodes in a cluster (for the purposes of the Spark local mode, the single multithreaded process can be thought of in the same way). An RDD in Spark is fault-tolerant; this means that if a given node or task fails (for some reason other than erroneous user code, such as hardware failure, loss of communication, and so on), the RDD can be reconstructed automatically on the remaining nodes and the job will still complete.

### Creating RDDs

RDDs can be created from existing collections, for example, in the Scala Spark shell that you launched earlier:

    val collection = List("a", "b", "c", "d", "e")
    val rddFromCollection = sc.parallelize(collection)

RDDs can also be created from Hadoop-based input sources, including the local filesystem, HDFS, and Amazon S3. A Hadoop-based RDD can utilize any input format that implements the Hadoop `InputFormat` interface, including text files, other standard Hadoop formats, HBase, Cassandra, and many more. The following code is an example of creating an RDD from a text file located on the local filesystem:

    val rddFromTextFile = sc.textFile("LICENSE")

The preceding `textFile` method returns an RDD where each record is a `String` object that represents one line of the text file.

### Spark operations

Once we have created an RDD, we have a distributed collection of records that we can manipulate. In Spark's programming model, operations are split into transformations and actions. Generally speaking, a transformation operation applies some function to all the records in the dataset, changing the records in some way. An action typically runs some computation or aggregation operation and returns the result to the driver program where `SparkContext` is running.

Spark operations are functional in style. For programmers familiar with functional programming in Scala or Python, these operations should seem natural. For those without experience in functional programming, don't worry; the Spark API is relatively easy to learn.

One of the most common transformations that you will use in Spark programs is the `map` operator. This applies a function to each record of an RDD, thus _mapping_ the input to some new output. For example, the following code fragment takes the RDD we created from a local text file and applies the `size` function to each record in the RDD. Remember that we created an RDD of `Strings`.
Using `map`, we can transform each string to an integer, thus returning an RDD of `Ints`:

    val intsFromStringsRDD = rddFromTextFile.map(line => line.size)

You should see output similar to the following line in your shell; this indicates the type of the RDD:

    **intsFromStringsRDD: org.apache.spark.rdd.RDD[Int] = MappedRDD[5] at map at <console>:14**

In the preceding code, we saw the `=>` syntax used. This is the Scala syntax for an anonymous function, which is a function that is not a named method (that is, one defined using the `def` keyword in Scala or Python, for example).

### Note

While a detailed treatment of anonymous functions is beyond the scope of this book, they are used extensively in Spark code in Scala and Python, as well as in Java 8 (both in examples and real-world applications), so it is useful to cover a few practicalities.

The `line => line.size` syntax means that we are applying a function where the input variable is to the left of the `=>` operator, and the output is the result of the code to the right of the `=>` operator. In this case, the input is `line`, and the output is the result of calling `line.size`. In Scala, this function that maps a string to an integer is expressed as `String => Int`.

This syntax saves us from having to separately define functions every time we use methods such as `map`; this is useful when the function is simple and will only be used once, as in this example.

Now, we can apply a common action operation, `count`, to return the number of records in our RDD:

    intsFromStringsRDD.count

The result should look something like the following console output:

    **14/01/29 23:28:28 INFO SparkContext: Starting job: count at <console>:17**
    **...**
    **14/01/29 23:28:28 INFO SparkContext: Job finished: count at <console>:17, took 0.019227 s**
    **res4: Long = 398**

Perhaps we want to find the average length of each line in this text file. We can first use the `sum` function to add up all the lengths of all the records and then divide the sum by the number of records:

    val sumOfRecords = intsFromStringsRDD.sum
    val numRecords = intsFromStringsRDD.count
    val aveLengthOfRecord = sumOfRecords / numRecords

The result will be as follows:

    **aveLengthOfRecord: Double = 52.06030150753769**

Spark operations, in most cases, return a new RDD; the exception is most actions, which return the result of a computation (such as `Long` for `count` and `Double` for `sum` in the preceding example). This means that we can naturally chain together operations to make our program flow more concise and expressive. For example, the same result as the one in the preceding line of code can be achieved using the following code:

    val aveLengthOfRecordChained = rddFromTextFile.map(line => line.size).sum / rddFromTextFile.count

An important point to note is that Spark transformations are lazy. That is, invoking a transformation on an RDD does not immediately trigger a computation. Instead, transformations are chained together and are effectively only computed when an action is called. This allows Spark to be more efficient by only returning results to the driver when necessary so that the majority of operations are performed in parallel on the cluster.

This means that if your Spark program never uses an action operation, it will never trigger an actual computation, and you will not get any results.
For example, the following code will simply return a new RDD that represents the chain of transformations:

    val transformedRDD = rddFromTextFile.map(line => line.size).filter(size => size > 10).map(size => size * 2)

This returns the following result in the console:

    **transformedRDD: org.apache.spark.rdd.RDD[Int] = MappedRDD[8] at map at <console>:14**

Notice that no actual computation happens and no result is returned. If we now call an action, such as `sum`, on the resulting RDD, the computation will be triggered:

    val computation = transformedRDD.sum

You will now see that a Spark job is run, and it results in the following console output:

    **...**
    **14/11/27 21:48:21 INFO SparkContext: Job finished: sum at <console>:16, took 0.193513 s**
    **computation: Double = 60468.0**

### Tip

The complete list of transformations and actions possible on RDDs, as well as a set of more detailed examples, is available in the Spark programming guide (located at ), while the Scala API documentation is located at .

### Caching RDDs

One of the most powerful features of Spark is the ability to cache data in memory across a cluster. This is achieved through use of the `cache` method on an RDD:

    rddFromTextFile.cache

Calling `cache` on an RDD tells Spark that the RDD should be kept in memory. The first time an action is called on the RDD that initiates a computation, the data is read from its source and put into memory. Hence, the first time such an operation is called, the time it takes to run the task is partly dependent on the time it takes to read the data from the input source. However, when the data is accessed the next time (for example, in subsequent queries in analytics or iterations in a machine learning model), the data can be read directly from memory, thus avoiding expensive I/O operations and speeding up the computation, in many cases, by a significant factor.

If we now call the `count` or `sum` function on our cached RDD, we will see that the RDD is loaded into memory:

    val aveLengthOfRecordChained = rddFromTextFile.map(line => line.size).sum / rddFromTextFile.count

Indeed, in the following output, we see that the dataset was cached in memory on the first call, taking up approximately 62 KB and leaving us with around 297 MB of memory free:

    **...**
    **14/01/30 06:59:27 INFO MemoryStore: ensureFreeSpace(63454) called with curMem=32960, maxMem=311387750**
    **14/01/30 06:59:27 INFO MemoryStore: Block rdd_2_0 stored as values to memory (estimated size 62.0 KB, free 296.9 MB)**
    **14/01/30 06:59:27 INFO BlockManagerMasterActor$BlockManagerInfo: Added rdd_2_0 in memory on 10.0.0.3:55089 (size: 62.0 KB, free: 296.9 MB)**
    **...**

Now, we will call the same function again:

    val aveLengthOfRecordChainedFromCached = rddFromTextFile.map(line => line.size).sum / rddFromTextFile.count

We will see from the console output that the cached data is read directly from memory:

    **...**
    **14/01/30 06:59:34 INFO BlockManager: Found block rdd_2_0 locally**
    **...**

### Tip

Spark also allows more fine-grained control over caching behavior. You can use the `persist` method to specify what approach Spark uses to cache data. More information on `RDD` caching can be found here: .

## Broadcast variables and accumulators

Another core feature of Spark is the ability to create two special types of variables: broadcast variables and accumulators.
A **broadcast variable** is a _read-only_ variable that is made available from the driver program that runs the `SparkContext` object to the nodes that will execute the computation. This is very useful in applications that need to make the same data available to the worker nodes in an efficient manner, such as machine learning algorithms. Spark makes creating broadcast variables as simple as calling a method on `SparkContext` as follows:

    val broadcastAList = sc.broadcast(List("a", "b", "c", "d", "e"))

The console output shows that the broadcast variable was stored in memory, taking up approximately 488 bytes, and it also shows that we still have roughly 297 MB available to us:

    **14/01/30 07:13:32 INFO MemoryStore: ensureFreeSpace(488) called with curMem=96414, maxMem=311387750**
    **14/01/30 07:13:32 INFO MemoryStore: Block broadcast_1 stored as values to memory (estimated size 488.0 B, free 296.9 MB)**
    **broadcastAList: org.apache.spark.broadcast.Broadcast[List[String]] = Broadcast(1)**

A broadcast variable can be accessed from nodes other than the driver program that created it (that is, the worker nodes) by calling `value` on the variable:

    sc.parallelize(List("1", "2", "3")).map(x => broadcastAList.value ++ x).collect

This code creates a new RDD with three records from a collection (in this case, a Scala `List`) of `("1", "2", "3")`. In the `map` function, we return a new collection, with the relevant record from our new RDD appended to `broadcastAList`, our broadcast variable.

Notice that we used the `collect` method in the preceding code. This is a Spark _action_ that returns the entire RDD to the driver as a Scala (or Python or Java) collection.

We will often use `collect` when we wish to apply further processing to our results locally within the driver program.

### Note

Note that `collect` should generally only be used in cases where we really want to return the full result set to the driver and perform further processing. If we try to call `collect` on a very large dataset, we might run out of memory on the driver and crash our program.

It is preferable to perform as much heavy-duty processing on our Spark cluster as possible, preventing the driver from becoming a bottleneck. In many cases, however, collecting results to the driver is necessary, such as during iterations in many machine learning models.

On inspecting the result, we will see that for each of the three records in our new RDD, we now have a record that is our original broadcasted `List`, with the new element appended to it (that is, there is now either `"1"`, `"2"`, or `"3"` at the end):

    **...**
    **14/01/31 10:15:39 INFO SparkContext: Job finished: collect at <console>:15, took 0.025806 s**
    **res6: Array[List[Any]] = Array(List(a, b, c, d, e, 1), List(a, b, c, d, e, 2), List(a, b, c, d, e, 3))**

An **accumulator** is also a variable that is broadcast to the worker nodes. The key difference between a broadcast variable and an accumulator is that while the broadcast variable is read-only, the accumulator can be added to. There are limitations to this; in particular, the addition must be an associative operation so that the global accumulated value can be correctly computed in parallel and returned to the driver program. Each worker node can only access and add to its own local accumulator value, and only the driver program can access the global value. Accumulators are also accessed within the Spark code using the `value` method.
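As a minimal sketch of how this works in practice (reusing the `sc` and `rddFromTextFile` values from earlier in this chapter), the following lines create an accumulator on the driver, add to it from within a Spark operation running on the workers, and finally read the global value back on the driver:

    // count the empty lines in our text file using an accumulator
    val emptyLines = sc.accumulator(0)
    rddFromTextFile.foreach { line =>
      // this runs on the workers; each task adds to its local accumulator value
      if (line.isEmpty) emptyLines += 1
    }
    // only the driver can read the final, global value
    println("Empty lines: " + emptyLines.value)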
### Tip

For more details on broadcast variables and accumulators, see the _Shared Variables_ section of the _Spark Programming Guide_ : .

# The first step to a Spark program in Scala

We will now use the ideas we introduced in the previous section to write a basic Spark program to manipulate a dataset. We will start with Scala and then write the same program in Java and Python. Our program will be based on exploring some data from an online store that records which users have purchased which products. The data is contained in a **comma-separated-value** ( **CSV** ) file called `UserPurchaseHistory.csv`, and the contents are shown in the following snippet. The first column of the CSV is the username, the second column is the product name, and the final column is the price:

    **John,iPhone Cover,9.99**
    **John,Headphones,5.49**
    **Jack,iPhone Cover,9.99**
    **Jill,Samsung Galaxy Cover,8.95**
    **Bob,iPad Cover,5.49**

For our Scala program, we need to create two files: our Scala code and our project build configuration file for the **Scala Build Tool** ( **SBT** ). For ease of use, we recommend that you download the sample project code called `scala-spark-app` for this chapter. This code also contains the CSV file under the `data` directory. You will need SBT installed on your system in order to run this example program (we use version 0.13.1 at the time of writing this book).

### Tip

Setting up SBT is beyond the scope of this book; however, you can find more information at .

Our SBT configuration file, `build.sbt`, looks like this (note that the empty lines between each line of code are required):

    name := "scala-spark-app"

    version := "1.0"

    scalaVersion := "2.10.4"

    libraryDependencies += "org.apache.spark" %% "spark-core" % "1.2.0"

The last line adds the dependency on Spark to our project.

Our Scala program is contained in the `ScalaApp.scala` file. We will walk through the program piece by piece. First, we need to import the required Spark classes:

    import org.apache.spark.SparkContext
    import org.apache.spark.SparkContext._

    /**
     * A simple Spark app in Scala
     */
    object ScalaApp {

In our main method, we need to initialize our `SparkContext` object and use this to access our CSV data file with the `textFile` method.
We will then map the raw text by splitting the string on the delimiter character (a comma in this case) and extracting the relevant records for username, product, and price: + + def main(args: Array[String]) { + val sc = new SparkContext("local[2]", "First Spark App") + // we take the raw data in CSV format and convert it into a set of records of the form (user, product, price) + val data = sc.textFile("data/UserPurchaseHistory.csv") + .map(line => line.split(",")) + .map(purchaseRecord => (purchaseRecord(0), purchaseRecord(1), purchaseRecord(2))) + +Now that we have an RDD, where each record is made up of `(user, product, price)`, we can compute various interesting metrics for our store, such as the following ones: + + * The total number of purchases + * The number of unique users who purchased + * Our total revenue + * Our most popular product + +Let's compute the preceding metrics: + + // let's count the number of purchases + val numPurchases = data.count() + // let's count how many unique users made purchases + val uniqueUsers = data.map{ case (user, product, price) => user }.distinct().count() + // let's sum up our total revenue + val totalRevenue = data.map{ case (user, product, price) => price.toDouble }.sum() + // let's find our most popular product + val productsByPopularity = data + .map{ case (user, product, price) => (product, 1) } + .reduceByKey(_ + _) + .collect() + .sortBy(-_._2) + val mostPopular = productsByPopularity(0) + +This last piece of code to compute the most popular product is an example of the _Map/Reduce_ pattern made popular by Hadoop. First, we mapped our records of `(user, product, price)` to the records of `(product, 1)`. Then, we performed a `reduceByKey` operation, where we summed up the 1s for each unique product. + +Once we have this transformed RDD, which contains the number of purchases for each product, we will call `collect`, which returns the results of the computation to the driver program as a local Scala collection. We will then sort these counts locally (note that in practice, if the amount of data is large, we will perform the sorting in parallel, usually with a Spark operation such as `sortByKey`). + +Finally, we will print out the results of our computations to the console: + + println("Total purchases: " + numPurchases) + println("Unique users: " + uniqueUsers) + println("Total revenue: " + totalRevenue) + println("Most popular product: %s with %d purchases".format(mostPopular._1, mostPopular._2)) + } + } + +We can run this program by running `sbt run` in the project's base directory or by running the program in your Scala IDE if you are using one. The output should look similar to the following: + + **...** + **[info] Compiling 1 Scala source to ...** + **[info] Running ScalaApp** + **...** + **14/01/30 10:54:40 INFO spark.SparkContext: Job finished: collect at ScalaApp.scala:25, took 0.045181 s** + **Total purchases: 5** + **Unique users: 4** + **Total revenue: 39.91** + **Most popular product: iPhone Cover with 2 purchases** + +We can see that we have five purchases from four different users with a total revenue of 39.91. Our most popular product is an iPhone cover with 2 purchases. + +# The first step to a Spark program in Java + +The Java API is very similar in principle to the Scala API. However, while Scala can call the Java code quite easily, in some cases, it is not possible to call the Scala code from Java. 
This is particularly the case when such Scala code makes use of certain Scala features such as implicit conversions, default parameters, and the Scala reflection API.

Spark makes heavy use of these features in general, so it is necessary to have a separate API specifically for Java that includes Java versions of the common classes. Hence, `SparkContext` becomes `JavaSparkContext`, and `RDD` becomes `JavaRDD`.

Java versions prior to version 8 do not support anonymous functions and do not have succinct syntax for functional-style programming, so functions in the Spark Java API must implement one of Spark's `Function` interfaces, each of which declares a `call` method. While it is significantly more verbose, we will often create one-off anonymous classes that implement these interfaces and the `call` method to pass to our Spark operations, achieving much the same effect as anonymous functions in Scala.

Spark provides support for Java 8's anonymous function (or _lambda_ ) syntax. Using this syntax makes a Spark program written in Java 8 look very close to the equivalent Scala program.

In Scala, an RDD of key/value pairs provides special operators (such as `reduceByKey` and `saveAsSequenceFile`, for example) that are accessed automatically via implicit conversions. In Java, special types of `JavaRDD` classes are required in order to access similar functions. These include `JavaPairRDD` to work with key/value pairs and `JavaDoubleRDD` to work with numerical records.

### Tip

In this section, we covered the standard Java API syntax. For more details and examples related to working with RDDs in Java as well as the Java 8 lambda syntax, see the Java sections of the _Spark Programming Guide_ found at .

We will see examples of most of these differences in the following Java program, which is included in the example code of this chapter in the directory named `java-spark-app`. The code directory also contains the CSV data file under the `data` subdirectory.

We will build and run this project with the Maven build tool, which we assume you have installed on your system.

### Tip

Installing and setting up Maven is beyond the scope of this book. Usually, Maven can easily be installed using the package manager on your Linux system or Homebrew or MacPorts on Mac OS X.

Detailed installation instructions can be found here: .

The project contains a Java file called `JavaApp.java`, which contains our program code:

    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.DoubleFunction;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.Function2;
    import org.apache.spark.api.java.function.PairFunction;
    import scala.Tuple2;

    import java.util.Collections;
    import java.util.Comparator;
    import java.util.List;

    /**
     * A simple Spark app in Java
     */
    public class JavaApp {

        public static void main(String[] args) {

As in our Scala example, we first need to initialize our context. Notice that we will use the `JavaSparkContext` class here instead of the `SparkContext` class that we used earlier. We will use the `JavaSparkContext` class in the same way to access our data using `textFile` and then split each row into the required fields.
Note how we used an anonymous class to define a split function that performs the string processing:

    JavaSparkContext sc = new JavaSparkContext("local[2]", "First Spark App");
    // we take the raw data in CSV format and convert it into a set of records of the form (user, product, price)
    JavaRDD<String[]> data = sc.textFile("data/UserPurchaseHistory.csv")
        .map(new Function<String, String[]>() {
            @Override
            public String[] call(String s) throws Exception {
                return s.split(",");
            }
        });

Now, we can compute the same metrics as we did in our Scala example. Note how some methods are the same (for example, `distinct` and `count`) for the Java and Scala APIs. Also note the use of the anonymous classes that we pass to the `map` and `mapToDouble` functions:

    // let's count the number of purchases
    long numPurchases = data.count();
    // let's count how many unique users made purchases
    long uniqueUsers = data.map(new Function<String[], String>() {
        @Override
        public String call(String[] strings) throws Exception {
            return strings[0];
        }
    }).distinct().count();
    // let's sum up our total revenue
    double totalRevenue = data.mapToDouble(new DoubleFunction<String[]>() {
        @Override
        public double call(String[] strings) throws Exception {
            return Double.parseDouble(strings[2]);
        }
    }).sum();

In the following lines of code, we can see that the approach to compute the most popular product is the same as that in the Scala example. The extra code might seem complex, but it is mostly related to the Java code required to create the anonymous functions. The actual functionality is the same:

    // let's find our most popular product
    // first we map the data to records of (product, 1) using a PairFunction
    // and the Tuple2 class.
    // then we call a reduceByKey operation with a Function2, which is essentially the sum function
    List<Tuple2<String, Integer>> pairs = data.mapToPair(new PairFunction<String[], String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String[] strings) throws Exception {
            return new Tuple2<String, Integer>(strings[1], 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer integer, Integer integer2) throws Exception {
            return integer + integer2;
        }
    }).collect();
    // finally, we sort the result. Note that we need to create a Comparator
    // that reverses the sort order.
    Collections.sort(pairs, new Comparator<Tuple2<String, Integer>>() {
        @Override
        public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
            return -(o1._2() - o2._2());
        }
    });
    String mostPopular = pairs.get(0)._1();
    int purchases = pairs.get(0)._2();
    System.out.println("Total purchases: " + numPurchases);
    System.out.println("Unique users: " + uniqueUsers);
    System.out.println("Total revenue: " + totalRevenue);
    System.out.println(String.format("Most popular product: %s with %d purchases", mostPopular, purchases));
        }
    }

As can be seen, the general structure is similar to the Scala version, apart from the extra boilerplate code to declare variables and functions via anonymous inner classes. It is a good exercise to work through both examples and compare the same lines of Scala code to those in Java to understand how the same result is achieved in each language.
This program can be run with the following command executed from the project's base directory:

    **> mvn exec:java -Dexec.mainClass="JavaApp"**

You will see output that looks very similar to the Scala version, with the results of the computation identical:

    **...**
    **14/01/30 17:02:43 INFO spark.SparkContext: Job finished: collect at JavaApp.java:46, took 0.039167 s**
    **Total purchases: 5**
    **Unique users: 4**
    **Total revenue: 39.91**
    **Most popular product: iPhone Cover with 2 purchases**

# The first step to a Spark program in Python

Spark's Python API exposes virtually all the functionalities of Spark's Scala API in the Python language. There are some features that are not yet supported (for example, graph processing with GraphX and a few API methods here and there). See the Python section of the _Spark Programming Guide_ () for more details.

Following on from the preceding examples, we will now write a Python version. We assume that you have Python version 2.6 or higher installed on your system (for example, most Linux and Mac OS X systems come with Python preinstalled).

The example program is included in the sample code for this chapter, in the directory named `python-spark-app`, which also contains the CSV data file under the `data` subdirectory. The project contains a script, `pythonapp.py`, provided here:

    """A simple Spark app in Python"""
    from pyspark import SparkContext

    sc = SparkContext("local[2]", "First Spark App")
    # we take the raw data in CSV format and convert it into a set of records of the form (user, product, price)
    data = sc.textFile("data/UserPurchaseHistory.csv").map(lambda line: line.split(",")).map(lambda record: (record[0], record[1], record[2]))
    # let's count the number of purchases
    numPurchases = data.count()
    # let's count how many unique users made purchases
    uniqueUsers = data.map(lambda record: record[0]).distinct().count()
    # let's sum up our total revenue
    totalRevenue = data.map(lambda record: float(record[2])).sum()
    # let's find our most popular product
    products = data.map(lambda record: (record[1], 1.0)).reduceByKey(lambda a, b: a + b).collect()
    mostPopular = sorted(products, key=lambda x: x[1], reverse=True)[0]

    print "Total purchases: %d" % numPurchases
    print "Unique users: %d" % uniqueUsers
    print "Total revenue: %2.2f" % totalRevenue
    print "Most popular product: %s with %d purchases" % (mostPopular[0], mostPopular[1])

If you compare the Scala and Python versions of our program, you will see that, generally, the syntax looks very similar. One key difference is how we express anonymous functions (also called `lambda` functions; hence, the use of this keyword in the Python syntax). In Scala, we've seen that an anonymous function mapping an input `x` to an output `y` is expressed as `x => y`, while in Python, it is `lambda x: y`. In the `reduceByKey` call in the preceding code, we are applying an anonymous function that maps two inputs, `a` and `b`, generally of the same type, to an output. In this case, the function that we apply is the _plus_ function; hence, `lambda a, b: a + b`.

The best way to run the script is to run the following command from the base directory of the sample project:

    **> $SPARK_HOME/bin/spark-submit pythonapp.py**

Here, the `SPARK_HOME` variable should be replaced with the path of the directory in which you originally unpacked the Spark prebuilt binary package at the start of this chapter.
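If you have not already set this variable, you can export it for the current shell session and then run the script. The following is a minimal sketch, assuming you unpacked the package into your home directory; adjust the path to wherever you extracted it:

    **> export SPARK_HOME=~/spark-1.2.0-bin-hadoop2.4**
    **> $SPARK_HOME/bin/spark-submit pythonapp.py**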
Upon running the script, you should see output similar to that of the Scala and Java examples, with the results of our computation being the same:

    **...**
    **14/01/30 11:43:47 INFO SparkContext: Job finished: collect at pythonapp.py:14, took 0.050251 s**
    **Total purchases: 5**
    **Unique users: 4**
    **Total revenue: 39.91**
    **Most popular product: iPhone Cover with 2 purchases**

# Getting Spark running on Amazon EC2

The Spark project provides scripts to run a Spark cluster in the cloud on Amazon's EC2 service. These scripts are located in the `ec2` directory. You can run the `spark-ec2` script contained in this directory with the following command:

    **>./ec2/spark-ec2**

Running it in this way without an argument will show the help output:

    **Usage: spark-ec2 [options] <action>**
    **<action> can be: launch, destroy, login, stop, start, get-master**

    **Options:**
    **...**

Before creating a Spark EC2 cluster, you will need to ensure you have an Amazon account.

### Tip

If you don't have an Amazon Web Services account, you can sign up at .

The AWS console is available at .

You will also need to create an Amazon EC2 key pair and retrieve the relevant security credentials. The Spark documentation for EC2 (available at ) explains the requirements:

> _Create an Amazon EC2 key pair for yourself. This can be done by logging into your Amazon Web Services account through the AWS console, clicking on_ **Key Pairs** _on the left sidebar, and creating and downloading a key. Make sure that you set the permissions for the private key file to 600 (that is, only you can read and write it) so that`ssh` will work._
>
> _Whenever you want to use the_ `spark-ec2` _script, set the environment variables_ `AWS_ACCESS_KEY_ID` _and_ `AWS_SECRET_ACCESS_KEY` _to your Amazon EC2 access key ID and secret access key, respectively. These can be obtained from the AWS homepage by clicking_ **Account** | **Security Credentials** | **Access Credentials**.

When creating a key pair, choose a name that is easy to remember. We will simply use `spark` for the key pair name. The key pair file itself will be called `spark.pem`. As mentioned earlier, ensure that the key pair file permissions are set appropriately and that the environment variables for the AWS credentials are exported using the following commands:

    **> chmod 600 spark.pem**
    **> export AWS_ACCESS_KEY_ID="..."**
    **> export AWS_SECRET_ACCESS_KEY="..."**

You should also be careful to keep your downloaded key pair file safe and not lose it, as it can only be downloaded once when it is created!

Note that launching an Amazon EC2 cluster in the following section will _incur costs_ to your AWS account.

## Launching an EC2 Spark cluster

We're now ready to launch a small Spark cluster by changing into the `ec2` directory and then running the cluster launch command:

    **> cd ec2**
    **>./spark-ec2 -k spark -i spark.pem -s 1 --instance-type m3.medium --hadoop-major-version 2 launch test-cluster**

This will launch a new Spark cluster called `test-cluster` with one master and one slave node of instance type `m3.medium`. This cluster will be launched with a Spark version built for Hadoop 2. The key pair name we used is `spark`, and the key pair file is `spark.pem` (if you gave the files different names or have an existing AWS key pair, use that name instead).

It might take quite a while for the cluster to fully launch and initialize.
You should see something like this screenshot immediately after running the launch command:

If the cluster has launched successfully, you should eventually see console output similar to the following screenshot:

To test whether we can connect to our new cluster, we can run the following command:

    **> ssh -i spark.pem root@ec2-54-227-127-14.compute-1.amazonaws.com**

Remember to replace the public domain name of the master node (the address after `root@` in the preceding command) with the correct Amazon EC2 public domain name that will be shown in your console output after launching the cluster.

You can also retrieve your cluster's master public domain name by running this line of code:

    **>./spark-ec2 -i spark.pem get-master test-cluster**

After successfully running the `ssh` command, you will be connected to your Spark master node in EC2, and your terminal output should match the following screenshot:

We can test whether our cluster is correctly set up with Spark by changing into the Spark directory and running an example in the local mode:

    **> cd spark**
    **> MASTER=local[2] ./bin/run-example SparkPi**

You should see output similar to running the same command on your local computer:

    **...**
    **14/01/30 20:20:21 INFO SparkContext: Job finished: reduce at SparkPi.scala:35, took 0.864044012 s**
    **Pi is roughly 3.14032**
    **...**

Now that we have an actual cluster with multiple nodes, we can test Spark in the cluster mode. We can run the same example on the cluster, using our one slave node, by passing in the master URL instead of the local version:

    **> MASTER=spark://ec2-54-227-127-14.compute-1.amazonaws.com:7077 ./bin/run-example SparkPi**

### Tip

Note that you will need to substitute the preceding master domain name with the correct domain name for your specific cluster.

Again, the output should be similar to running the example locally; however, the log messages will show that your driver program has connected to the Spark master:

    **...**
    **14/01/30 20:26:17 INFO client.Client$ClientActor: Connecting to master spark://ec2-54-220-189-136.eu-west-1.compute.amazonaws.com:7077**
    **14/01/30 20:26:17 INFO cluster.SparkDeploySchedulerBackend: Connected to Spark cluster with app ID app-20140130202617-0001**
    **14/01/30 20:26:17 INFO client.Client$ClientActor: Executor added: app-20140130202617-0001/0 on worker-20140130201049-ip-10-34-137-45.eu-west-1.compute.internal-57119 (ip-10-34-137-45.eu-west-1.compute.internal:57119) with 1 cores**
    **14/01/30 20:26:17 INFO cluster.SparkDeploySchedulerBackend: Granted executor ID app-20140130202617-0001/0 on hostPort ip-10-34-137-45.eu-west-1.compute.internal:57119 with 1 cores, 2.4 GB RAM**
    **14/01/30 20:26:17 INFO client.Client$ClientActor: Executor updated: app-20140130202617-0001/0 is now RUNNING**
    **14/01/30 20:26:18 INFO spark.SparkContext: Starting job: reduce at SparkPi.scala:39**
    **...**

Feel free to experiment with your cluster. Try out the interactive console in Scala, for example:

    **> ./bin/spark-shell --master spark://ec2-54-227-127-14.compute-1.amazonaws.com:7077**

Once you've finished, type `exit` to leave the console. You can also try the PySpark console by running the following command:

    **> ./bin/pyspark --master spark://ec2-54-227-127-14.compute-1.amazonaws.com:7077**

You can use the Spark Master web interface to see the applications registered with the master.
To load the Master Web UI, navigate to `ec2-54-227-127-14.compute-1.amazonaws.com:8080` (again, remember to replace this domain name with your own master domain name). You should see something similar to the following screenshot showing the example you ran as well as the two console applications you launched:

Remember that _you will be charged by Amazon_ for usage of the cluster. Don't forget to stop or terminate this test cluster once you're done with it. To do this, you can first exit the `ssh` session by typing `exit` to return to your own local system and then run the following command:

    **>./ec2/spark-ec2 -k spark -i spark.pem destroy test-cluster**

You should see the following output:

    **Are you sure you want to destroy the cluster test-cluster?**
    **The following instances will be terminated:**
    **Searching for existing cluster test-cluster...**
    **Found 1 master(s), 1 slaves**
    **> ec2-54-227-127-14.compute-1.amazonaws.com**
    **> ec2-54-91-61-225.compute-1.amazonaws.com**
    **ALL DATA ON ALL NODES WILL BE LOST!!**
    **Destroy cluster test-cluster (y/N): y**
    **Terminating master...**
    **Terminating slaves...**

Type _y_ and then press _Enter_ to destroy the cluster.

Congratulations! You've just set up a Spark cluster in the cloud, run a fully parallel example program on this cluster, and terminated it. If you would like to try out any of the example code in the subsequent chapters (or your own Spark programs) on a cluster, feel free to experiment with the Spark EC2 scripts and launch a cluster of your chosen size and instance profile (just be mindful of the costs and remember to shut it down when you're done!).

# Summary

In this chapter, we covered how to set up Spark locally on our own computer as well as in the cloud as a cluster running on Amazon EC2. You learned the basics of Spark's programming model and API using the interactive Scala console, and we wrote the same basic Spark program in Scala, Java, and Python.

In the next chapter, we will consider how to go about using Spark to create a machine learning system.

# Chapter 2. Designing a Machine Learning System

In this chapter, we will design a high-level architecture for an intelligent, distributed machine learning system that uses Spark as its core computation engine. The problem we will focus on will be taking the existing architecture for a web-based business and redesigning it to use automated machine learning systems to power key areas of the business. In this chapter, we will:

  * Introduce our hypothetical business scenario
  * Provide an overview of the current architecture
  * Explore various ways in which machine learning systems can enhance or replace certain business functions
  * Provide a new architecture based on these ideas

A modern large-scale data environment includes the following requirements:

  * It must integrate with other components of the system, especially with data collection and storage systems, analytics and reporting, and frontend applications.
  * It should be easily scalable and independent of the rest of the architecture. Ideally, this should be in the form of horizontal as well as vertical scalability.
  * It should allow efficient computation with respect to the type of workload in mind, that is, machine learning and iterative analytics applications.
  * If possible, it should support both batch and real-time workloads.

As a framework, Spark meets these criteria.
However, we must ensure that the machine learning systems designed on Spark also meet these criteria. It is no good implementing an algorithm that ends up having bottlenecks that cause our system to fail to meet one or more of these requirements.

# Introducing MovieStream

To better illustrate the design of our architecture, we will introduce a practical scenario. Let's assume that we have just been appointed to head the data science team of MovieStream, a fictitious Internet business that streams movies and television shows to its users.

MovieStream is growing rapidly, adding both users and titles to its catalogue. The current MovieStream system is outlined in the following diagram:

MovieStream's current architecture

As we can see in the preceding diagram, currently, MovieStream's content editorial team is responsible for deciding which movies and shows are promoted and shown on the various parts of the site. They are also responsible for creating the content for MovieStream's bulk marketing campaigns, which include e-mail and other direct marketing channels. Currently, MovieStream collects basic data on what titles are viewed by users on an aggregate basis and has access to some demographic data collected from users when they sign up to the service. In addition, they have access to some basic metadata about the titles in their catalogue.

The MovieStream team is stretched thin due to their rapid growth, and they can't keep up with the number of new releases and the growing activity of their users. The CEO of MovieStream has heard a lot about big data, machine learning, and artificial intelligence, and would like us to build a machine learning system for MovieStream that can handle many of the functions currently handled by the content team in an automated manner.

# Business use cases for a machine learning system

Perhaps the first question we should answer is, "Why use machine learning at all?" Why doesn't MovieStream simply continue with human-driven decisions? There are many reasons to use machine learning (and certainly some reasons not to), but the most important ones are mentioned here:

  * The scale of data involved means that full human involvement quickly becomes infeasible as MovieStream grows
  * Model-driven approaches such as machine learning and statistics can often benefit from uncovering patterns that cannot be seen by humans (due to the size and complexity of the datasets)
  * Model-driven approaches can avoid human and emotional biases (as long as the correct processes are carefully applied)

However, there is no reason why both model-driven and human-driven processes and decision making cannot coexist. For example, many machine learning systems rely on receiving labeled data in order to train models. Often, labeling such data is costly, time consuming, and requires human input. A good example of this is classifying textual data into categories or assigning a sentiment indicator to the text. Many real-world systems use some form of human-driven system to generate labels for such data (or at least part of it) to provide training data to models. These models are then used to make predictions in the live system at a larger scale.

In the context of MovieStream, we need not fear that our machine learning system will make the content team redundant.
Indeed, we will see that our aim is to lift the burden of time-consuming tasks at which machine learning might perform better, while providing tools that allow the team to better understand the users and content. This might, for example, help them in selecting which new content to acquire for the catalogue (which involves a significant amount of cost and is therefore a critical aspect of the business).

## Personalization

Perhaps one of the most important potential applications of machine learning in MovieStream's business is personalization. Generally speaking, personalization refers to adapting the experience of a user and the content presented to them based on various factors, which might include user behavior data as well as external factors.

**Recommendations** are essentially a subset of personalization. Recommendation generally refers to presenting a user with a list of items that we hope the user will be interested in. Recommendations might be used in web pages (for example, recommending related products), via e-mails or other direct marketing channels, via mobile apps, and so on.

Personalization is very similar to recommendations, but while recommendations are usually focused on an _explicit_ presentation of products or content to the user, personalization is more generic and, often, more _implicit_. For example, applying personalization to search on the MovieStream site might allow us to adapt the search results for a given user, based on the data available about that user. This might include recommendation-based data (in the case of a search for products or content) but might also include various other factors such as geolocation and past search history. It might not be apparent to the user that the search results are adapted to their specific profile; this is why personalization tends to be more implicit.

## Targeted marketing and customer segmentation

In a manner similar to recommendations, targeted marketing uses a model to select what to offer to which users. While recommendations and personalization are generally focused on a one-to-one situation, segmentation approaches try to assign users to groups based on their characteristics and, possibly, behavioral data. The approach might be fairly simple or might involve a machine learning model such as clustering. Either way, the result is a set of segment assignments that might allow us to understand the broad characteristics of each group of users, what makes them similar to each other within a group, and what makes them different from others in different groups.

This could help MovieStream to better understand the drivers of user behavior and might also allow a broader targeting approach where groups are targeted as opposed to (or more likely, in addition to) direct one-to-one targeting with personalization.

These methods can also help when we don't necessarily have labeled data available (as is the case with certain user and content profile data) but we still wish to perform more focused targeting than a complete _one-size-fits-all_ approach.

## Predictive modeling and analytics

A third area where machine learning can be applied is in predictive analytics. This is a very broad term, and in some ways, it encompasses recommendations, personalization, and targeting too. In this context, since recommendations and segmentation are somewhat distinct, we use the term **predictive modeling** to refer to other models that seek to make predictions.
An example of this could be a model to predict the potential viewing activity and revenue of new titles before any data is available on how popular the title might be. MovieStream can use past activity and revenue data, together with content attributes, to create a **regression model** that can be used to make predictions for brand new titles.

As another example, we can use a **classification model** to automatically assign tags, keywords, or categories to new titles for which we only have partial data.

# Types of machine learning models

While we have highlighted a few use cases for machine learning in the context of the preceding MovieStream example, there are many other examples, some of which we will touch on in the relevant chapters when we introduce each machine learning task.

However, we can broadly divide the preceding use cases and methods into two categories of machine learning:

 * **Supervised learning** : These types of models use _labeled_ data to learn. Recommendation engines, regression, and classification are examples of supervised learning methods. The labels in these models can be user-movie ratings (for recommendation), movie tags (in the case of the preceding classification example), or revenue figures (for regression). We will cover supervised learning models in Chapter 4, _Building a Recommendation Engine with Spark_, Chapter 5, _Building a Classification Model with Spark_, and Chapter 6, _Building a Regression Model with Spark_.
 * **Unsupervised learning** : When a model does not require labeled data, we refer to this as unsupervised learning. These types of models try to learn or extract some underlying structure in the data or reduce the data down to its most important features. Clustering, dimensionality reduction, and some forms of feature extraction, such as text processing, are all unsupervised techniques and will be dealt with in Chapter 7, _Building a Clustering Model with Spark_, Chapter 8, _Dimensionality Reduction with Spark_, and Chapter 9, _Advanced Text Processing with Spark_.

# The components of a data-driven machine learning system

The high-level components of our machine learning system are outlined in the following diagram. The diagram illustrates the machine learning pipeline: we obtain data and store it; we then transform it into a form that is usable as input to a machine learning model; we train, test, and refine our model; and then, we deploy the final model to our production system. The process is then repeated as new data is generated.

A general machine learning pipeline

## Data ingestion and storage

The first step in our machine learning pipeline will be taking in the data that we require for training our models. Like that of many other businesses, MovieStream's data is typically generated by user activity, other systems (this is commonly referred to as machine-generated data), and external sources (for example, the time of day and weather during a particular user's visit to the site).

This data can be ingested in various ways, for example, by gathering user activity data from browser and mobile application event logs or by accessing external web APIs to collect data on geolocation or weather; the sketch that follows illustrates the idea.
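The following is a minimal, hypothetical sketch of what a raw user-activity event might look like and how it could be parsed during ingestion. The log format and the field names here are purely illustrative assumptions, not part of any MovieStream system:

    # a hypothetical raw event log line (the format and fields are illustrative only)
    raw_event = "2015-03-01T20:15:00|user=1234|event=view|movie=567"

    def parse_event(line):
        # split the pipe-delimited line into its component fields
        timestamp, user, event, movie = line.split("|")
        return {
            "timestamp": timestamp,
            "user_id": int(user.split("=")[1]),
            "event": event.split("=")[1],
            "movie_id": int(movie.split("=")[1]),
        }

    print parse_event(raw_event)

In practice, a parser like this would be applied to an entire log file using a `map` transformation over an RDD of log lines.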
Once the collection mechanisms are in place, the data usually needs to be stored. This includes the raw data, data resulting from intermediate processing, and the final model results to be used in production.

Data storage can be complex and involve a wide variety of systems, including HDFS, Amazon S3, and other filesystems; SQL databases such as MySQL or PostgreSQL; distributed NoSQL data stores such as HBase, Cassandra, and DynamoDB; search engines such as Solr or Elasticsearch; and streaming data systems such as Kafka, Flume, or Amazon Kinesis.

For the purposes of this book, we will assume that the relevant data is available to us, so we will focus on the processing and modeling steps of the pipeline.

## Data cleansing and transformation

The majority of machine learning models operate on features, which are typically numerical representations of the input variables that will be used for the model.

While we might want to spend the majority of our time exploring machine learning models, data collected via various systems and sources in the preceding ingestion step is, in most cases, in a raw form. For example, we might log user events such as details of when a user views the information page for a movie, when they watch a movie, or when they provide some other feedback. We might also collect external information such as the location of the user (as provided through their IP address, for example). These event logs will typically contain some combination of textual and numeric information about the event (and also, perhaps, other forms of data such as images or audio).

In order to use this raw data in our models, in almost all cases, we need to perform preprocessing, which might include:

 * **Filtering data** : Let's assume that we want to create a model from a subset of the raw data, such as only the most recent few months of activity data or only events that match certain criteria.
 * **Dealing with missing, incomplete, or corrupted data** : Many real-world datasets are incomplete in some way. This might include data that is missing (for example, due to a missing user input) or data that is incorrect or flawed (for example, due to an error in data ingestion or storage, technical issues or bugs, or software or hardware failure). We might need to filter out bad data or alternatively decide on a method to fill in missing data points (such as using the average value from the dataset for missing points, for example).
 * **Dealing with potential anomalies, errors, and outliers** : Erroneous or outlier data might skew the results of model training, so we might wish to filter these cases out or use techniques that are able to deal with outliers.
 * **Joining together disparate data sources** : For example, we might need to match up the event data for each user with different internal data sources, such as user profiles, as well as external data, such as geolocation, weather, and economic data.
 * **Aggregating data** : Certain models might require input data that is aggregated in some way, such as computing the sum of a number of different event types per user.

Once we have performed initial preprocessing on our data, we often need to transform the data into a representation that is suitable for machine learning models. For many model types, this representation will take the form of a vector or matrix structure that contains numerical data. Common challenges during data transformation and feature extraction include:

 * Taking categorical data (such as country for geolocation or category for a movie) and encoding it in a numerical representation.
 * Extracting useful features from text data.
 * Dealing with image or audio data.
 * Converting numerical data into categorical data to reduce the number of values a variable can take on. An example of this is converting a variable for age into buckets (such as 25-35, 45-55, and so on).
 * Transforming numerical features; for example, applying a log transformation to a numerical variable can help deal with variables that take on a very large range of values.
 * Normalizing and standardizing numerical features so that all the different input variables for a model have a consistent scale. Many machine learning models require standardized input to work properly.
 * Engineering new features by combining or transforming existing variables; this is known as feature engineering. For example, we can create a new variable that is the average of some other data, such as the average number of times a user watches a movie.

We will cover all of these techniques through the examples in this book.

These data-cleansing, exploration, aggregation, and transformation steps can be carried out using Spark's core API functions as well as the SparkSQL engine, not to mention external Scala, Java, or Python libraries. We can take advantage of Spark's Hadoop compatibility to read data from and write data to the various storage systems mentioned earlier.

## Model training and testing loop

Once we have our training data in a form that is suitable for our model, we can proceed with the model's training and testing phase. During this phase, we are primarily concerned with **model selection**. This can refer to choosing the best modeling approach for our task, or the best parameter settings for a given model. In fact, the term model selection often refers to both of these processes, as, in many cases, we might wish to try out various models and select the best performing model (with the best performing parameter settings for each model). It is also common to explore the application of combinations of different models (known as **ensemble methods**) in this phase.

This is typically a fairly straightforward process of running our chosen model on our training dataset and testing its performance on a test dataset, that is, a set of data held out for evaluation that the model has not seen during the training phase. This process is referred to as **cross-validation**; a minimal sketch of such a hold-out split appears at the end of this section.

However, due to the large scale of data we are typically working with, it is often useful to carry out this initial train-test loop on a smaller representative sample of our full dataset or perform model selection using parallel methods where possible.

For this part of the pipeline, Spark's built-in machine learning library, MLlib, is a perfect fit. We will focus most of our attention in this book on the model training, evaluation, and cross-validation steps for various machine learning techniques, using MLlib and Spark's core features.
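As a minimal sketch of the hold-out idea, assume `examples_rdd` is a hypothetical RDD of training examples; the 80/20 split, the seed, and the variable names here are arbitrary choices for illustration:

    # sample roughly 80% of the data (without replacement) for training
    train_data = examples_rdd.sample(False, 0.8, 42)
    # hold out the remaining ~20% for evaluating the trained model
    test_data = examples_rdd.subtract(train_data)
    print "Training set size: %d, test set size: %d" % (train_data.count(), test_data.count())

The model is then fit on `train_data` only, and its performance is measured on `test_data`, which it has never seen.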
## Model deployment and integration

Once we have found the optimal model based on the train-test loop, we might still face the task of deploying the model to a production system so that it can be used to make actionable predictions.

Usually, this process involves exporting the trained model to a central data store from where the production-serving system can obtain the latest version. Thus, the live system _refreshes_ the model periodically as a new model is trained.

## Model monitoring and feedback

It is critically important to monitor the performance of our machine learning system in production. Once we deploy our optimal trained model, we wish to understand how it is doing in the "wild". Is it performing as we expect on new, unseen data? Is its accuracy good enough? The reality is that regardless of how much model selection and tuning we do in the earlier phases, the only way to measure true performance is to observe what happens in our production system.

Also, bear in mind that model accuracy and predictive performance are only one aspect of a real-world system. Usually, we are concerned with other metrics related to business performance (for example, revenue and profitability) or user experience (such as the time spent on our site and how active our users are overall). In most cases, we cannot easily map model-predictive performance to these business metrics. The accuracy of a recommendation or targeting system might be important, but it relates only indirectly to the true metrics we are concerned about, namely whether we are improving user experience, activity, and ultimately, revenue.

So, in real-world systems, we should monitor both model-accuracy metrics and business metrics. If possible, we should be able to experiment with different models running in production to allow us to optimize against these business metrics by making changes to the models. This is often done using live split tests. However, doing this correctly is not an easy task, and live testing and experimentation are expensive, in the sense that mistakes, poor performance, and the use of baseline models (which provide a control against which we test our production models) can negatively impact user experience and revenue.

Another important aspect of this phase is **model feedback**. This is the process where the predictions of our model feed through into user behavior; this, in turn, feeds back into our model. In a real-world system, our models are essentially influencing their own future training data by impacting decision-making and potential user behavior.

For example, if we have deployed a recommendation system, then, by making recommendations, we might be influencing user behavior because we are only allowing users a limited selection of choices. We hope that this selection is relevant due to our model; however, this feedback loop, in turn, can influence our model's training data. This, in turn, feeds back into real-world performance. It is possible to get into an ever-narrowing feedback loop; ultimately, this can negatively affect both model accuracy and our important business metrics.

Fortunately, there are mechanisms by which we can try to limit the potential negative impact of this feedback loop. These include providing some unbiased training data by having a small portion of data come from users who are not exposed to our models, and being principled in the way we balance _exploration_ (learning more about our data) and _exploitation_ (using what we have learned to improve our system's performance).

We will briefly cover some aspects of real-time monitoring and model updates in Chapter 10, _Real-time Machine Learning with Spark Streaming_.

## Batch versus real time

In the previous sections, we outlined the common batch processing approach, where the model is periodically retrained using all of the data or a subset of it. As the preceding pipeline takes some time to complete, it might not be possible to use this approach to update models immediately as new data arrives.
While we will be mostly covering batch machine learning approaches in this book, there is a class of machine learning algorithms known as **online learning**; these models update immediately as new data is fed into them, thus enabling a real-time system. A common example is an online optimization algorithm for a linear model, such as stochastic gradient descent, which updates the model one example at a time (a minimal sketch of this update appears at the end of this section). The advantages of these methods are that the system can react very quickly to new information and also that the system can adapt to changes in the underlying behavior (that is, if the characteristics and distribution of the input data are changing over time, which is almost always the case in real-world situations).

However, online-learning models come with their own unique challenges in a production context. For example, it might be difficult to ingest and transform data in real time. It can also be complex to properly perform model selection in a purely online setting. Latency of the online training and the model selection and deployment phases might be too high for true real-time requirements (for example, in online advertising, latency requirements are measured in single-digit milliseconds). Finally, batch-oriented frameworks might make it awkward to handle real-time processes of a streaming nature.

Fortunately, Spark's real-time stream processing component, **Spark Streaming**, is a good potential fit for real-time machine learning workflows. We will explore Spark Streaming and online learning in Chapter 10, _Real-time Machine Learning with Spark Streaming_.

Due to the complexities inherent in a true real-time machine learning system, in practice, many systems target near real-time operations. This is essentially a hybrid approach where models are not necessarily updated immediately as new data arrives; instead, the new data is collected into mini-batches of a small set of training data. These mini-batches can be fed to an online-learning algorithm. In many cases, this approach is combined with a periodic batch process that might recompute the model on the entire data set and perform more complex processing and model selection. This can help ensure that the real-time model does not degrade over time.

Another similar approach involves making approximate updates to a more complex model as new data arrives while recomputing the entire model in a batch process periodically. In this way, the model can learn from new data, with a short delay (usually measured in seconds or, perhaps, a few minutes), but will become more and more inaccurate over time due to the approximation applied. The periodic recomputation takes care of this by retraining the model on all available data.
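The following is a minimal sketch of the online stochastic gradient descent update mentioned previously, for a linear model with a squared-error loss. The learning rate, the data, and the function names are illustrative assumptions only:

    import numpy as np

    def sgd_update(weights, x, y, learning_rate=0.01):
        # gradient of the squared-error loss for a single (x, y) example
        error = np.dot(weights, x) - y
        return weights - learning_rate * error * x

    # a hypothetical stream of (feature vector, target) examples
    stream = [(np.array([1.0, 2.0]), 3.0), (np.array([0.5, 1.5]), 2.0)]
    w = np.zeros(2)
    for x, y in stream:
        w = sgd_update(w, x, y)  # the model is refreshed after every single example

The key property is that each update touches only one example, so the model can be kept current as events arrive, rather than waiting for a full batch retraining cycle.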
# An architecture for a machine learning system

Now that we have explored how our machine learning system might work in the context of MovieStream, we can outline a possible architecture for our system:

MovieStream's future architecture

As we can see, our system incorporates the machine learning pipeline outlined in the preceding diagram; this system also includes:

 * Collecting data about users, their behavior, and our content titles
 * Transforming this data into features
 * Training our models, including our training-testing and model-selection phases
 * Deploying the trained models to both our live model-serving system as well as using these models for offline processes
 * Feeding back the model results into the MovieStream website through recommendation and targeting pages
 * Feeding back the model results into MovieStream's personalized marketing channels
 * Using the offline models to provide tools to MovieStream's various teams to better understand user behavior, characteristics of the content catalogue, and drivers of revenue for the business

## Practical exercise

Imagine that you now need to provide input to the frontend and infrastructure engineering team about the data that your machine learning system will need. Prepare a brief for them on how they should structure the data-collection mechanisms. Write down some examples of what the raw data might look like (for example, web logs, event logs, and so on) and how it should flow through the system. Take into account the following aspects:

 * What data sources will be required
 * What format should the data be in
 * How often should data be collected, processed, potentially aggregated, and stored
 * What data storage will you use to ensure scalability

# Summary

In this chapter, you learned about the components inherent in a data-driven, automated machine learning system. We also outlined how a possible high-level architecture for such a system might look in a real-world situation.

In the next chapter, we will discuss how to obtain publicly available datasets for common machine learning tasks. We will also explore general concepts related to processing, cleaning, and transforming data so that it can be used to train a machine learning model.

# Chapter 3. Obtaining, Processing, and Preparing Data with Spark

Machine learning is an extremely broad field, and these days, applications can be found across areas that include web and mobile applications, Internet of Things and sensor networks, financial services, healthcare, and various scientific fields, to name just a few.

Therefore, the range of data available for potential use in machine learning is enormous. In this book, we will focus mostly on business applications. In this context, the data available often consists of data internal to an organization (such as transactional data for a financial services company) as well as external data sources (such as financial asset price data for the same financial services company).

For example, recall from Chapter 2, _Designing a Machine Learning System_, that the main internal source of data for our hypothetical Internet business, MovieStream, consists of data on the movies available on the site, the users of the service, and their behavior.
This includes data about movies and other content (for example, title, categories, description, images, actors, and directors), user information (for example, demographics, location, and so on), and user activity data (for example, web page views, title previews and views, ratings, reviews, and social data such as _likes_, _shares_, and social network profiles on services including Facebook and Twitter).

External data sources in this example might include weather and geolocation services, third-party movie ratings and review sites such as _IMDB_ and _Rotten Tomatoes_, and so on.

Generally speaking, it is quite difficult to obtain data of an internal nature for real-world services and businesses, as it is commercially sensitive (in particular, data on purchasing activity, user or customer behavior, and revenue) and of great potential value to the organization concerned. This is also why it is often the most useful and interesting data on which to apply machine learning; a good machine learning model that can make accurate predictions can be highly valuable (witness the success of machine learning competitions such as the _Netflix Prize_ and _Kaggle_).

In this book, we will make use of datasets that are publicly available to illustrate concepts around data processing and the training of machine learning models.

In this chapter, we will:

 * Briefly cover the types of data typically used in machine learning.
 * Provide examples of where to obtain interesting datasets, often publicly available on the Internet. We will use some of these datasets throughout the book to illustrate the use of the models we introduce.
 * Discover how to process, clean, explore, and visualize our data.
 * Introduce various techniques to transform our raw data into features that can be used as input to machine learning algorithms.
 * Learn how to normalize input features using external libraries as well as Spark's built-in functionality.

# Accessing publicly available datasets

Fortunately, while commercially sensitive data can be hard to come by, there are still a number of useful datasets available publicly. Many of these are often used as benchmark datasets for specific types of machine learning problems. Examples of common data sources include:

 * **UCI Machine Learning Repository** : This is a collection of almost 300 datasets of various types and sizes for tasks including classification, regression, clustering, and recommender systems. The list is available at .
 * **Amazon AWS public datasets** : This is a set of often very large datasets that can be accessed via Amazon S3. These datasets include the Human Genome Project, the Common Crawl web corpus, Wikipedia data, and Google Books Ngrams. Information on these datasets can be found at .
 * **Kaggle** : This is a collection of datasets used in machine learning competitions run by Kaggle. Areas include classification, regression, ranking, recommender systems, and image analysis. These datasets can be found under the _Competitions_ section at .
 * **KDnuggets** : This has a detailed list of public datasets, including some of those mentioned earlier. The list is available at .

### Tip

There are many other resources for finding public datasets, depending on the specific domain and machine learning task. You might also have access to some interesting academic or commercial data of your own!
To illustrate a few key concepts related to data processing, transformation, and feature extraction in Spark, we will download a commonly used dataset for movie recommendations; this dataset is known as the **MovieLens** dataset. As it is applicable to recommender systems as well as potentially other machine learning tasks, it serves as a useful example dataset.

### Note

Spark's machine learning library, MLlib, has been under heavy development since its inception, and unlike the Spark core, it is still not in a fully stable state with regard to its overall API and design.

As of Spark Version 1.2.0, a new, experimental API for MLlib has been released under the `ml` package (whereas the current library resides under the `mllib` package). This new API aims to enhance the APIs and interfaces for models as well as feature extraction and transformation so as to make it easier to build pipelines that chain together steps that include feature extraction, normalization, dataset transformations, model training, and cross-validation.

In the upcoming chapters, we will only cover the existing, more developed MLlib API, since the new API is still experimental and may be subject to major changes in the next few Spark releases. Over time, the various feature-processing techniques and models that we will cover will simply be ported to the new API; however, the core concepts and most underlying code will remain largely unchanged.

## The MovieLens 100k dataset

The MovieLens 100k dataset is a set of 100,000 data points related to ratings given by a set of users to a set of movies. It also contains movie metadata and user profiles. While it is a small dataset, you can quickly download it and run Spark code on it. This makes it ideal for illustrative purposes.

You can download the dataset from .

Once you have downloaded the data, unzip it using your terminal:

    **> unzip ml-100k.zip**
    **inflating: ml-100k/allbut.pl**
    **inflating: ml-100k/mku.sh**
    **inflating: ml-100k/README**
    **...**
    **inflating: ml-100k/ub.base**
    **inflating: ml-100k/ub.test**

This will create a directory called `ml-100k`. Change into this directory and examine the contents. The important files are `u.user` (user profiles), `u.item` (movie metadata), and `u.data` (the ratings given by users to movies):

    **> cd ml-100k**

The `README` file contains more information on the dataset, including the variables present in each data file. We can use the `head` command to examine the contents of the various files.

For example, we can see that the `u.user` file contains the `user id`, `age`, `gender`, `occupation`, and `ZIP code` fields, separated by a pipe (the `|` character):

    **> head -5 u.user**
    **1|24|M|technician|85711**
    **2|53|F|other|94043**
    **3|23|M|writer|32067**
    **4|24|M|technician|43537**
    **5|33|F|other|15213**

The `u.item` file contains the `movie id`, `title`, `release date`, and `IMDB link` fields and a set of fields related to movie category data.
It is also separated by a `|` character:

    **> head -5 u.item**
    **1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0**
    **2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0**
    **3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0**
    **4|Get Shorty (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)|0|1|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0**
    **5|Copycat (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Copycat%20(1995)|0|0|0|0|0|0|1|0|1|0|0|0|0|0|0|0|1|0|0**

Finally, the `u.data` file contains the `user id`, `movie id`, `rating (1-5 scale)`, and `timestamp` fields and is separated by a tab (the `\t` character):

    **> head -5 u.data**
    **196 242 3 881250949**
    **186 302 3 891717742**
    **22 377 1 878887116**
    **244 51 2 880606923**
    **166 346 1 886397596**

# Exploring and visualizing your data

Now that we have our data available, let's fire up an interactive Spark console and explore it! For this section, we will use Python and the PySpark shell, as we are going to use the IPython interactive console and the matplotlib plotting library to process and visualize our data.

### Note

IPython is an advanced, interactive shell for Python. It includes a useful set of features called pylab, which includes NumPy and SciPy for numerical computing and matplotlib for interactive plotting and visualization.

We recommend that you use the latest version of IPython (2.3.1 at the time of writing this book). To install IPython for your platform, follow the instructions available at . If this is the first time you are using IPython, you can find a tutorial at .

You will need to install all the packages listed earlier in order to work through the code in this chapter. Instructions to install the packages can be found in the code bundle. If you are starting out with Python or are unfamiliar with the process of installing these packages, we strongly recommend that you use a prebuilt scientific Python installation such as Anaconda (available at ) or Enthought (available at ). These make the installation process much easier and include everything you will need to follow the example code.

The PySpark console allows you to set which Python executable is used to run the shell. We can choose to use IPython, as opposed to the standard Python shell, when launching our PySpark console. We can also pass in additional options to IPython, including telling it to launch with the pylab functionality enabled.

We can do this by running the following command from the Spark home directory (that is, the same directory that we used previously to explore the Spark interactive console):

    **> IPYTHON=1 IPYTHON_OPTS="--pylab" ./bin/pyspark**

You will see the PySpark console start up, showing output similar to the following screenshot:

The PySpark console using IPython

### Tip

Notice the `IPython 2.3.1 -- An enhanced Interactive Python` and `Using matplotlib backend: MacOSX` lines; they indicate that both the IPython and pylab functionalities are being used by the PySpark shell.

You might see a slightly different output, depending on your operating system and software versions.

Now that we have our IPython console open, we can start to explore the MovieLens dataset and do some basic analysis.
### Note

You can follow along with this chapter by entering the code examples into your IPython console. IPython also provides an HTML-enabled Notebook application. It provides some enhanced functionality over the standard IPython console, such as inline graphics for plotting, HTML markup functionality, and the ability to run cells of code independently.

The images used in this chapter were generated using the IPython Notebook, so don't worry if yours look a little bit different in style, as long as they contain the same content! You can also use the Notebook for the code in this chapter, if you prefer. In addition to the Python code for this chapter, we have provided a version saved in the IPython Notebook format, which you can load into your own IPython Notebook.

Check out the instructions on how to use the IPython Notebook at .

## Exploring the user dataset

First, we will analyze the characteristics of MovieLens users. Enter the following lines into your console (where `PATH` refers to the base directory in which you performed the `unzip` command to unzip the preceding MovieLens 100k dataset):

    user_data = sc.textFile("/PATH/ml-100k/u.user")
    user_data.first()

You should see output similar to this:

    **u'1|24|M|technician|85711'**

As we can see, this is the first line of our user data file, separated by the `"|"` character.

### Tip

The `first` function is similar to `collect`, but it only returns the first element of the RDD to the driver. We can also use `take(k)` to collect only the first _k_ elements of the RDD to the driver.

Let's transform the data by splitting each line around the `"|"` character. This will give us an RDD where each record is a Python list that contains the user ID, age, gender, occupation, and ZIP code fields.

We will then count the number of users, genders, occupations, and ZIP codes. We can achieve this by running the following code in the console, line by line. Note that we do not cache the data, as it is unnecessary for this small dataset size:

    user_fields = user_data.map(lambda line: line.split("|"))
    num_users = user_fields.map(lambda fields: fields[0]).count()
    num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
    num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()
    num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()
    print "Users: %d, genders: %d, occupations: %d, ZIP codes: %d" % (num_users, num_genders, num_occupations, num_zipcodes)

You will see the following output:

    **Users: 943, genders: 2, occupations: 21, ZIP codes: 795**

Next, we will create a histogram to analyze the distribution of user ages, using matplotlib's `hist` function:

    ages = user_fields.map(lambda x: int(x[1])).collect()
    hist(ages, bins=20, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

We passed in the `ages` array, together with the number of `bins` for our histogram (`20` in this case), to the `hist` function. Using the `normed=True` argument, we also specified that we want the histogram to be normalized so that each bucket represents the percentage of the overall data that falls into that bucket.

You will see an image containing the histogram chart, which looks something like the one shown here. As we can see, the ages of MovieLens users are somewhat skewed towards younger viewers. A large number of users are between the ages of about 15 and 35.
Distribution of user ages

We might also want to explore the relative frequencies of the various occupations of our users. We can do this using the following code snippet. First, we will use the MapReduce approach introduced previously to count the occurrences of each occupation in the dataset. Then, we will use `matplotlib` to display a bar chart of occupation counts, using the `bar` function.

Since part of our data is textual (the occupation descriptions), we will need to manipulate it a little to get it to work with the `bar` function:

    count_by_occupation = user_fields.map(lambda fields: (fields[3], 1)).reduceByKey(lambda x, y: x + y).collect()
    x_axis1 = np.array([c[0] for c in count_by_occupation])
    y_axis1 = np.array([c[1] for c in count_by_occupation])

Once we have collected the RDD of counts per occupation, we will convert it into two arrays for the _x_ axis (the occupations) and the _y_ axis (the counts) of our chart. The `collect` function returns the count data to us in no particular order. We need to sort the count data so that our bar chart is ordered from the lowest to the highest count.

We will achieve this by first creating two `numpy` arrays and then using the `argsort` method of `numpy` to select the elements from each array, ordered by the count data in an ascending fashion. Notice that here, we will sort both the _x_ and _y_ axis arrays by the _y_ axis (that is, by the counts):

    x_axis = x_axis1[np.argsort(y_axis1)]
    y_axis = y_axis1[np.argsort(y_axis1)]

Once we have the _x_ and _y_ axis data for our chart, we will create the bar chart with the occupations as labels on the _x_ axis and the counts as the values on the _y_ axis. We will also add a few lines, such as the `plt.xticks(rotation=30)` code, to display a better-looking chart:

    pos = np.arange(len(x_axis))
    width = 1.0

    ax = plt.axes()
    ax.set_xticks(pos + (width / 2))
    ax.set_xticklabels(x_axis)

    plt.bar(pos, y_axis, width, color='lightblue')
    plt.xticks(rotation=30)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

The image you have generated should look like the one here. It appears that the most prevalent occupations are **student**, **other**, **educator**, **administrator**, **engineer**, and **programmer**.

Distribution of user occupations

Spark provides a convenience method on RDDs called `countByValue`; this method counts the occurrences of each unique value in the RDD and returns it to the driver as a Python `dict` (or a Scala or Java `Map`). We can create the `count_by_occupation` variable using this method:

    count_by_occupation2 = user_fields.map(lambda fields: fields[3]).countByValue()
    print "Map-reduce approach:"
    print dict(count_by_occupation2)
    print ""
    print "countByValue approach:"
    print dict(count_by_occupation)

You should see that the results are the same for each approach.

## Exploring the movie dataset

Next, we will investigate a few properties of the movie catalogue.
We can inspect a row of the movie data file, as we did for the user data earlier, and then count the number of movies:

    movie_data = sc.textFile("/PATH/ml-100k/u.item")
    print movie_data.first()
    num_movies = movie_data.count()
    print "Movies: %d" % num_movies

You will see the following output on your console:

    **1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0**
    **Movies: 1682**

In the same manner as we did for user ages and occupations earlier, we can plot the distribution of movie age, that is, the year of release relative to the current date (note that for this dataset, the current year is 1998).

In the following code block, we can see that we need a small function called `convert_year` to handle errors in the parsing of the `release date` field. This is due to some bad data in one line of the movie data:

    def convert_year(x):
        try:
            return int(x[-4:])
        except:
            # there is a 'bad' data point with a blank year,
            # which we set to 1900 and will filter out later
            return 1900

Once we have our utility function to parse the year of release, we can apply it to the movie data using a `map` transformation:

    movie_fields = movie_data.map(lambda lines: lines.split("|"))
    years = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x))

Since we have assigned the value `1900` to any error in parsing, we can filter these bad values out of the resulting data using Spark's `filter` transformation:

    years_filtered = years.filter(lambda x: x != 1900)

This is a good example of how real-world datasets can often be messy and require a more in-depth approach to parsing data. In fact, this also illustrates why data exploration is so important, as many of these issues in data integrity and quality are picked up during this phase.

After filtering out bad data, we will transform the list of movie release years into movie ages by subtracting each year of release from the current year, use `countByValue` to compute the counts for each movie age, and finally, plot our histogram of movie ages (again, using the `hist` function, where the `values` variable contains the values of the result from `countByValue` and the `bins` variable contains the keys):

    movie_ages = years_filtered.map(lambda yr: 1998 - yr).countByValue()
    values = movie_ages.values()
    bins = movie_ages.keys()
    hist(values, bins=bins, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

You will see an image similar to the one here; it illustrates that most of the movies were released in the last few years before 1998:

Distribution of movie ages

## Exploring the rating dataset

Let's now take a look at the ratings data:

    rating_data_raw = sc.textFile("/PATH/ml-100k/u.data")
    print rating_data_raw.first()
    num_ratings = rating_data_raw.count()
    print "Ratings: %d" % num_ratings

This gives us the following result:

    **196 242 3 881250949**
    **Ratings: 100000**

There are 100,000 ratings, and unlike the user and movie datasets, these records are split with a tab character (`"\t"`). As you might have guessed, we'd probably want to compute some basic summary statistics and frequency histograms for the rating values.
Let's do this now:

    rating_data = rating_data_raw.map(lambda line: line.split("\t"))
    ratings = rating_data.map(lambda fields: int(fields[2]))
    max_rating = ratings.reduce(lambda x, y: max(x, y))
    min_rating = ratings.reduce(lambda x, y: min(x, y))
    mean_rating = ratings.reduce(lambda x, y: x + y) / float(num_ratings)
    median_rating = np.median(ratings.collect())
    ratings_per_user = num_ratings / num_users
    ratings_per_movie = num_ratings / num_movies
    print "Min rating: %d" % min_rating
    print "Max rating: %d" % max_rating
    print "Average rating: %2.2f" % mean_rating
    print "Median rating: %d" % median_rating
    print "Average # of ratings per user: %2.2f" % ratings_per_user
    print "Average # of ratings per movie: %2.2f" % ratings_per_movie

After running these lines on your console, you will see output similar to the following result:

    **Min rating: 1**
    **Max rating: 5**
    **Average rating: 3.53**
    **Median rating: 4**
    **Average # of ratings per user: 106.00**
    **Average # of ratings per movie: 59.00**

We can see that the minimum rating is 1, while the maximum rating is 5. This is in line with what we expect, since the ratings are on a scale of 1 to 5.

Spark also provides a `stats` function for RDDs containing numeric data (such as `ratings` in this case); it computes similar summary statistics:

    ratings.stats()

Here is the output:

    **(count: 100000, mean: 3.52986, stdev: 1.12566797076, max: 5.0, min: 1.0)**

Looking at the results, the average rating given by a user to a movie is around 3.5 and the median rating is 4, so we might expect that the distribution of ratings will be skewed towards slightly higher ratings. Let's see whether this is true by creating a bar chart of rating values using a similar procedure as we did for occupations:

    count_by_rating = ratings.countByValue()
    x_axis = np.array(count_by_rating.keys())
    y_axis = np.array([float(c) for c in count_by_rating.values()])
    # we normalize the y-axis here to percentages
    y_axis_normed = y_axis / y_axis.sum()
    pos = np.arange(len(x_axis))
    width = 1.0

    ax = plt.axes()
    ax.set_xticks(pos + (width / 2))
    ax.set_xticklabels(x_axis)

    plt.bar(pos, y_axis_normed, width, color='lightblue')
    plt.xticks(rotation=30)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

The preceding code should produce the following chart:

Distribution of rating values

In line with what we might have expected after seeing some summary statistics, it is clear that the distribution of ratings is skewed towards average to high ratings.

We can also look at the distribution of the number of ratings made by each user. Recall that we previously computed the `rating_data` RDD used in the preceding code by splitting the ratings with the tab character. We will now use the `rating_data` variable again in the following code.

To compute the distribution of ratings per user, we will first extract the user ID as the key and the rating as the value from the `rating_data` RDD.
We will then group the ratings by user ID using Spark's `groupByKey` function:

    user_ratings_grouped = rating_data.map(lambda fields: (int(fields[0]), int(fields[2]))).\
        groupByKey()

Next, for each key (user ID), we will find the size of the set of ratings; this will give us the number of ratings for that user:

    user_ratings_byuser = user_ratings_grouped.map(lambda (k, v): (k, len(v)))
    user_ratings_byuser.take(5)

We can inspect the resulting RDD by taking a few records from it; this should give us an RDD of (user ID, number of ratings) pairs:

    **[(1, 272), (2, 62), (3, 54), (4, 24), (5, 175)]**

Finally, we will plot the histogram of the number of ratings per user using our favorite `hist` function:

    user_ratings_byuser_local = user_ratings_byuser.map(lambda (k, v): v).collect()
    hist(user_ratings_byuser_local, bins=200, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

Your chart should look similar to the following screenshot. We can see that most of the users give fewer than 100 ratings. The distribution of the ratings shows, however, that there is a fairly large number of users who provide hundreds of ratings.

Distribution of ratings per user

We leave it to you to perform a similar analysis to create a histogram plot for the number of ratings given to each movie. Perhaps, if you're feeling adventurous, you could also extract a dataset of movie ratings by date (taken from the timestamps in the last column of the rating dataset) and chart a time series of the total number of ratings, the number of unique users who gave a rating, and the number of unique movies rated, for each day.

# Processing and transforming your data

Now that we have done some initial exploratory analysis of our dataset and we know a little more about the characteristics of our users and movies, what do we do next?

In order to make the raw data usable in a machine learning algorithm, we first need to clean it up and possibly transform it in various ways before extracting useful features from the transformed data. The transformation and feature extraction steps are closely linked, and in some cases, certain transformations are themselves a case of feature extraction.

We have already seen an example of the need to clean data in the movie dataset. Generally, real-world datasets contain bad data, missing data points, and outliers. Ideally, we would correct bad data; however, this is often not possible, as many datasets derive from some form of collection process that cannot be repeated (this is the case, for example, in web activity data and sensor data). Missing values and outliers are also common and can be dealt with in a manner similar to bad data. Overall, the broad options are as follows:

 * **Filter out or remove records with bad or missing values** : This is sometimes unavoidable; however, this means losing the good part of a bad or missing record.
 * **Fill in bad or missing data** : We can try to assign a value to bad or missing data based on the rest of the data we have available. Approaches can include assigning a zero value, assigning the global mean or median, interpolating nearby or similar data points (usually, in a time-series dataset), and so on. Deciding on the correct approach is often a tricky task and depends on the data, situation, and one's own experience.
 * **Apply robust techniques to outliers** : The main issue with outliers is that they might be correct values, even though they are extreme.
They might also be errors. It is often very difficult to know which case you are dealing with. Outliers can also be removed or filled in, although fortunately, there are statistical techniques (such as robust regression) to handle outliers and extreme values.
 * **Apply transformations to potential outliers** : Another approach for outliers or extreme values is to apply transformations, such as a logarithmic or Gaussian kernel transformation, to features that have potential outliers or that display large ranges of potential values. These types of transformations have the effect of dampening the impact of large changes in the scale of a variable and can turn a nonlinear relationship into a linear one.

## Filling in bad or missing data

We have already seen an example of filtering out bad data. Following on from the preceding code, the following code snippet applies the fill-in approach to the bad release date record by assigning a value to the data point that is equal to the median year of release:

    years_pre_processed = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x)).collect()
    years_pre_processed_array = np.array(years_pre_processed)

First, we will compute the mean and median year of release after selecting all the year of release data, _except_ the bad data point. We will then use the `numpy` function, `where`, to find the index of the bad value in `years_pre_processed_array` (recall that we assigned the value `1900` to this data point). Finally, we will use this index to assign the median release year to the bad value:

    mean_year = np.mean(years_pre_processed_array[years_pre_processed_array != 1900])
    median_year = np.median(years_pre_processed_array[years_pre_processed_array != 1900])
    index_bad_data = np.where(years_pre_processed_array == 1900)[0][0]
    years_pre_processed_array[index_bad_data] = median_year
    print "Mean year of release: %d" % mean_year
    print "Median year of release: %d" % median_year
    print "Index of '1900' after assigning median: %s" % np.where(years_pre_processed_array == 1900)[0]

You should expect to see the following output:

    **Mean year of release: 1989**
    **Median year of release: 1995**
    **Index of '1900' after assigning median: []**

We computed both the mean and the median year of release here. As can be seen from the output, the median release year is noticeably higher than the mean because of the skewed distribution of the release years. While it is not always straightforward to decide on precisely which fill-in value to use for a given situation, in this case, it is certainly feasible to use the median due to this skew.

### Tip

Note that the preceding code example is, strictly speaking, not very scalable, as it requires collecting all the data to the driver. We can use Spark's `mean` function for numeric RDDs to compute the mean, but there is no median function available currently. We can solve this by creating our own or by computing the median on a sample of the dataset created using the `sample` function (we will see more of this in the upcoming chapters); a minimal sketch of the sampling idea follows.
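As a rough illustration of that approach (the sampling fraction and seed here are arbitrary choices, and `approx_median_year` is a hypothetical variable name), we could estimate the median from a sample so that only the sampled values are collected to the driver:

    # sample ~10% of the parsed release years, excluding the bad data point
    sampled_years = movie_fields.map(lambda fields: fields[2]).\
        map(lambda x: convert_year(x)).\
        filter(lambda x: x != 1900).\
        sample(False, 0.1, 42).collect()
    approx_median_year = np.median(sampled_years)
    print "Approximate median year of release: %d" % approx_median_year

This keeps driver memory usage bounded at the cost of some accuracy in the estimate.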
# Extracting useful features from your data

Once we have completed the initial exploration, processing, and cleaning of our data, we are ready to get down to the business of extracting actual features from the data, with which our machine learning model can be trained.

**Features** refer to the variables that we use to train our model. Each row of data contains various information that we would like to extract into a training example. Almost all machine learning models ultimately work on numerical representations in the form of a **vector** ; hence, we need to convert raw data into numbers.

Features broadly fall into a few categories, which are as follows:

 * **Numerical features** : These features are typically real or integer numbers, for example, the user age that we used in an example earlier.
 * **Categorical features** : These features refer to variables that can take one of a set of possible states at any given time. Examples from our dataset might include a user's gender or occupation or movie categories.
 * **Text features** : These are features derived from the text content in the data, for example, movie titles, descriptions, or reviews.
 * **Other features** : Most other types of features are ultimately represented numerically. For example, images, video, and audio can be represented as sets of numerical data. Geographical locations can be represented as latitude and longitude or geohash data.

Here we will cover numerical, categorical, and text features.

## Numerical features

What is the difference between any old number and a numerical feature? Well, in reality, any numerical data can be used as an input variable. However, in a machine learning model, we learn a vector of weights, one for each feature. The weights play a role in mapping feature values to an outcome or target variable (in the case of supervised learning models).

Thus, we want to use features that make sense, that is, where the model can learn the relationship between feature values and the target variable. For example, age might be a reasonable feature. Perhaps there is a direct relationship between increasing age and a certain outcome. Similarly, height is a good example of a numerical feature that can be used directly.

We will often see that numerical features are less useful in their raw form, but can be turned into representations that are more useful. Location is an example of such a case. Using raw locations (say, latitude and longitude) might not be that useful unless our data is very dense indeed, since our model might not be able to learn about a useful relationship between the raw location and an outcome. However, a relationship might exist between some aggregated or binned representation of the location (for example, a city or country) and the outcome.

## Categorical features

Categorical features cannot be used as input in their raw form, as they are not numbers; instead, they are members of a set of possible values that the variable can take. In the example mentioned earlier, user occupation is a categorical variable that can take the value of student, programmer, and so on.

Such categorical variables are also known as **nominal** variables when there is no concept of order between their values. By contrast, when there is a concept of order between the values (such as the ratings mentioned earlier, where a rating of 5 is conceptually higher or better than a rating of 1), we refer to them as **ordinal** variables.

To transform categorical variables into a numerical representation, we can use a common approach known as **1-of-k** encoding. An approach such as 1-of-k encoding is required to represent nominal variables in a way that makes sense for machine learning tasks. Ordinal variables might be used in their raw form but are often encoded in the same way as nominal variables.

Assume that there are k possible values that the variable can take.
If we assign each possible value an index from the set of 1 to k, then we can represent a given state of the variable using a binary vector of length k; here, all entries are zero, except the entry at the index that corresponds to the given state of the variable. This entry is set to one. + +For example, we can collect all the possible states of the `occupation` variable: + + all_occupations = user_fields.map(lambda fields: fields[3]).distinct().collect() + all_occupations.sort() + +We can then assign index values to each possible occupation in turn (note that we start from zero, since Python, Scala, and Java arrays all use zero-based indices): + + idx = 0 + all_occupations_dict = {} + for o in all_occupations: + all_occupations_dict[o] = idx + idx +=1 + # try a few examples to see what "1-of-k" encoding is assigned + print "Encoding of 'doctor': %d" % all_occupations_dict['doctor'] + print "Encoding of 'programmer': %d" % all_occupations_dict['programmer'] + +You will see the following output: + + **Encoding of 'doctor': 2** + **Encoding of 'programmer': 14** + +Finally, we can encode the value of `programmer`. We will start by creating a `numpy` array of a length that is equal to the number of possible occupations (k in this case) and filling it with zeros. We will use the `zeros` function of `numpy` to create this array. + +We will then extract the index of the word `programmer` and assign a value of `1` to the array value at this index: + + K = len(all_occupations_dict) + binary_x = np.zeros(K) + k_programmer = all_occupations_dict['programmer'] + binary_x[k_programmer] = 1 + print "Binary feature vector: %s" % binary_x + print "Length of binary vector: %d" % K + +This will give us the resulting binary feature vector of length `21`: + + **Binary feature vector: [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]** + **Length of binary vector: 21** + +## Derived features + +As we mentioned earlier, it is often useful to compute a derived feature from one or more available variables. We hope that the derived feature can add more information than only using the variable in its raw form. + +For instance, we can compute the average rating given by each user to all the movies they rated. This would be a feature that could provide a _user-specific_ intercept in our model (in fact, this is a commonly used approach in recommendation models). We have taken the raw rating data and created a new feature that can allow us to learn a better model. + +Examples of features derived from raw data include computing average values, median values, variances, sums, differences, maximums or minimums, and counts. We have already seen a case of this when we created a new `movie age` feature from the year of release of the movie and the current year. Often, the idea behind using these transformations is to summarize the numerical data in some way that might make it easier for a model to learn. + +It is also common to transform numerical features into categorical features, for example, by binning features. Common examples of this include variables such as age, geolocation, and time. + +### Transforming timestamps into categorical features + +To illustrate how to derive categorical features from numerical data, we will use the times of the ratings given by users to movies. These are in the form of Unix timestamps. We can use Python's `datetime` module to extract the date and time from the timestamp and, in turn, extract the `hour` of the day. 
This will result in an RDD of the hour of the day for each rating. + +We will need a function to extract a `datetime` representation of the rating timestamp (in seconds); we will create this function now: + + def extract_datetime(ts): + import datetime + return datetime.datetime.fromtimestamp(ts) + +We will again use the `rating_data` RDD that we computed in the earlier examples as our starting point. + +First, we will use a `map` transformation to extract the timestamp field, converting it to a Python `int` datatype. We will then apply our `extract_datetime` function to each timestamp and extract the hour from the resulting `datetime` object: + + timestamps = rating_data.map(lambda fields: int(fields[3])) + hour_of_day = timestamps.map(lambda ts: extract_datetime(ts).hour) + hour_of_day.take(5) + +If we take the first five records of the resulting RDD, we will see the following output: + + **[17, 21, 9, 7, 7]** + +We have transformed the raw time data into a categorical feature that represents the hour of the day in which the rating was given. + +Now, say that we decide this is too coarse a representation. Perhaps we want to further refine the transformation. We can assign each hour-of-the-day value into a defined bucket that represents a time of day. + +For example, we can say that morning is from 7 a.m. to 11 a.m., while lunch is from 12 p.m. to 2 p.m., and so on. Using these buckets, we can create a function to assign a time of day, given the hour of the day as input (note that the night bucket wraps past midnight, so we concatenate two ranges to cover 11 p.m. through 6 a.m.): + + def assign_tod(hr): + times_of_day = { + 'morning' : range(7, 12), + 'lunch' : range(12, 14), + 'afternoon' : range(14, 18), + 'evening' : range(18, 23), + 'night' : range(23, 24) + range(0, 7) + } + for k, v in times_of_day.iteritems(): + if hr in v: + return k + +Now, we will apply the `assign_tod` function to the hour of each rating event contained in the `hour_of_day` RDD: + + time_of_day = hour_of_day.map(lambda hr: assign_tod(hr)) + time_of_day.take(5) + +If we again take the first five records of this new RDD, we will see the following transformed values: + + **['afternoon', 'evening', 'morning', 'morning', 'morning']** + +We have now transformed the timestamp variable (which can take on thousands of values and is probably not useful to a model in its raw form) into hours (taking on 24 values) and then into a time of day (taking on five possible values). Now that we have a categorical feature, we can use the same 1-of-k encoding method outlined earlier to generate a binary feature vector. + +## Text features + +In some ways, text features are a form of categorical and derived features. Let's take the example of the description for a movie (which we do not have in our dataset). Here, the raw text could not be used directly, even as a categorical feature, since there are virtually unlimited possible combinations of words that could occur if each piece of text was a possible value. Our model would almost never see two occurrences of the same feature and would not be able to learn effectively. Therefore, we would like to turn raw text into a form that is more amenable to machine learning. + +There are numerous ways of dealing with text, and the field of natural language processing is dedicated to processing, representing, and modeling textual content. A full treatment is beyond the scope of this book, but we will introduce a simple and standard approach for text-feature extraction; this approach is known as the **bag-of-words** representation.
+ +The bag-of-words approach treats a piece of text content as a set of the words, and possibly numbers, in the text (these are often referred to as terms). The process of the bag-of-words approach is as follows: + + * **Tokenization** : First, some form of tokenization is applied to the text to split it into a set of tokens (generally words, numbers, and so on). An example of this is simple whitespace tokenization, which splits the text on each space and might remove punctuation and other characters that are not alphabetical or numerical. + * **Stop word removal** : Next, it is usual to remove very common words such as "the", "and", and "but" (these are known as **stop words** ). + * **Stemming** : The next step can include stemming, which refers to taking a term and reducing it to its base form or stem. A common example is plural terms becoming singular (for example, dogs becomes dog and so on). There are many approaches to stemming, and text-processing libraries often contain various stemming algorithms. + * **Vectorization** : The final step is turning the processed terms into a vector representation. The simplest form is, perhaps, a binary vector representation, where we assign a value of one if a term exists in the text and zero if it does not. This is essentially identical to the categorical 1-of-k encoding we encountered earlier. Like 1-of-k encoding, this requires a dictionary of terms mapping a given term to an index number. As you might gather, there are potentially millions of individual possible terms (even after stop word removal and stemming). Hence, it becomes critical to use a sparse vector representation where only the fact that a term is present is stored, to save memory and disk space as well as compute time. + +### Note + +In Chapter 9, _Advanced Text Processing with Spark_ , we will cover more complex text processing and feature extraction, including methods to weight terms; these methods go beyond the basic binary encoding we saw earlier. + +### Simple text feature extraction + +To show an example of extracting textual features in the binary vector representation, we can use the movie titles that we have available. + +First, we will create a function to strip away the year of release for each movie, if the year is present, leaving only the title of the movie. + +We will use Python's regular expression module, `re`, to search for the year between parentheses in the movie titles. If we find a match with this regular expression, we will extract only the title up to the index of the first match (that is, the index in the title string of the opening parenthesis). 
This is done with the following `raw[:grps.start()]` code snippet: + + def extract_title(raw): + import re + # this regular expression finds the text (the year) between parentheses + grps = re.search("\((\w+)\)", raw) + if grps: + # we take only the title part, and strip the trailing whitespace from the remaining text, below + return raw[:grps.start()].strip() + else: + return raw + +Next, we will extract the raw movie titles from the `movie_fields` RDD: + + raw_titles = movie_fields.map(lambda fields: fields[1]) + +We can test out our `extract_title` function on the first five raw titles as follows: + + for raw_title in raw_titles.take(5): + print extract_title(raw_title) + +We can verify that our function works by inspecting the results, which should look like this: + + **Toy Story** + **GoldenEye** + **Four Rooms** + **Get Shorty** + **Copycat** + +We will then apply our function to the raw titles and apply a tokenization scheme to the extracted titles to convert them to terms. We will use the simple whitespace tokenization we covered earlier: + + movie_titles = raw_titles.map(lambda m: extract_title(m)) + # next we tokenize the titles into terms. We'll use simple whitespace tokenization + title_terms = movie_titles.map(lambda t: t.split(" ")) + print title_terms.take(5) + +Applying this simple tokenization gives the following result: + + **[[u'Toy', u'Story'], [u'GoldenEye'], [u'Four', u'Rooms'], [u'Get', u'Shorty'], [u'Copycat']]** + +We can see that we have split each title on spaces so that each word becomes a token. + +### Tip + +Here, we do not cover details such as converting text to lowercase, removing non-word or non-numerical characters such as punctuation and special characters, removing stop words, and stemming. These steps will be important in a real-world application. We will cover many of these topics in Chapter 9, _Advanced Text Processing with Spark_. + +This additional processing can be done fairly simply using string functions, regular expressions, and the Spark API (apart from stemming). Perhaps you would like to give it a try! + +In order to assign each term to an index in our vector, we need to create the term dictionary, which maps each term to an integer index. + +First, we will use Spark's `flatMap` function (shown in the following code snippet) to expand the list of strings in each record of the `title_terms` RDD into a new RDD of strings, called `all_terms`, where each record is a term. + +We can then collect all the unique terms and assign indexes in exactly the same way that we did for the 1-of-k encoding of user occupations earlier: + + # next we would like to collect all the possible terms, in order to build our dictionary of term <-> index mappings + all_terms = title_terms.flatMap(lambda x: x).distinct().collect() + # create a new dictionary to hold the terms, and assign the "1-of-k" indexes + idx = 0 + all_terms_dict = {} + for term in all_terms: + all_terms_dict[term] = idx + idx +=1 + +We can print out the total number of unique terms and test out our term mapping on a few terms: + + print "Total number of terms: %d" % len(all_terms_dict) + print "Index of term 'Dead': %d" % all_terms_dict['Dead'] + print "Index of term 'Rooms': %d" % all_terms_dict['Rooms'] + +This will result in the following output: + + **Total number of terms: 2645** + **Index of term 'Dead': 147** + **Index of term 'Rooms': 1963** + +We can also achieve the same result more efficiently using Spark's `zipWithIndex` function.
This function takes an RDD of values and merges them together with an index to create a new RDD of key-value pairs, where the key will be the term and the value will be the index in the term dictionary. We will use `collectAsMap` to collect the key-value RDD to the driver as a Python `dict`: + + all_terms_dict2 = title_terms.flatMap(lambda x: x).distinct().zipWithIndex().collectAsMap() + print "Index of term 'Dead': %d" % all_terms_dict2['Dead'] + print "Index of term 'Rooms': %d" % all_terms_dict2['Rooms'] + +The output is as follows: + + **Index of term 'Dead': 147** + **Index of term 'Rooms': 1963** + +The final step is to create a function that converts a set of terms into a sparse vector representation. To do this, we will create an empty sparse matrix with one row and a number of columns equal to the total number of terms in our dictionary. We will then step through each term in the input list of terms and check whether this term is in our term dictionary. If it is, we assign a value of 1 to the vector at the index that corresponds to the term in our dictionary mapping: + + # this function takes a list of terms and encodes it as a scipy sparse vector using an approach + # similar to the 1-of-k encoding + def create_vector(terms, term_dict): + from scipy import sparse as sp + num_terms = len(term_dict) + x = sp.csc_matrix((1, num_terms)) + for t in terms: + if t in term_dict: + idx = term_dict[t] + x[0, idx] = 1 + return x + +Once we have our function, we will apply it to each record in our RDD of extracted terms: + + all_terms_bcast = sc.broadcast(all_terms_dict) + term_vectors = title_terms.map(lambda terms: create_vector(terms, all_terms_bcast.value)) + term_vectors.take(5) + +We can then inspect the first few records of our new RDD of sparse vectors: + + **[ <1x2645 sparse matrix of type '<type 'numpy.float64'>'** + **with 2 stored elements in Compressed Sparse Column format>,** + **<1x2645 sparse matrix of type '<type 'numpy.float64'>'** + **with 1 stored elements in Compressed Sparse Column format>,** + **<1x2645 sparse matrix of type '<type 'numpy.float64'>'** + **with 2 stored elements in Compressed Sparse Column format>,** + **<1x2645 sparse matrix of type '<type 'numpy.float64'>'** + **with 2 stored elements in Compressed Sparse Column format>,** + **<1x2645 sparse matrix of type '<type 'numpy.float64'>'** + **with 1 stored elements in Compressed Sparse Column format>]** + +We can see that each movie title has now been transformed into a sparse vector. We can see that the titles where we extracted two terms have two non-zero entries in the vector, titles where we extracted only one term have one non-zero entry, and so on. + +### Tip + +Note the use of Spark's `broadcast` method in the preceding example code to create a broadcast variable that contains the term dictionary. In real-world applications, such term dictionaries can be extremely large, so using a broadcast variable is advisable. + +## Normalizing features + +Once the features have been extracted into the form of a vector, a common preprocessing step is to normalize the numerical data. The idea behind this is to transform each numerical feature in a way that scales it to a standard size. We can perform different kinds of normalization, which are as follows: + + * **Normalize a feature** : This is usually a transformation applied to an individual feature across the dataset, for example, subtracting the mean ( _centering_ the feature) or applying the standard normal transformation (such that the feature has a mean of zero and a standard deviation of 1).
+ * **Normalize a feature vector** : This is usually a transformation applied to all features in a given row of the dataset such that the resulting feature vector has a normalized length. That is, we will ensure that each feature in the vector is scaled such that the vector has a norm of 1 (typically, on an L1 or L2 norm). + +We will use the second case as an example. We can use the `norm` function in `numpy.linalg` to achieve the vector normalization by first computing the L2 norm of a random vector and then dividing each element in the vector by this norm to create our normalized vector: + + np.random.seed(42) + x = np.random.randn(10) + norm_x_2 = np.linalg.norm(x) + normalized_x = x / norm_x_2 + print "x:\n%s" % x + print "2-Norm of x: %2.4f" % norm_x_2 + print "Normalized x:\n%s" % normalized_x + print "2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x) + +This should give the following result (note that in the preceding code snippet, we set the random seed equal to 42 so that the result will always be the same): + + **x: [ 0.49671415 -0.1382643 0.64768854 1.52302986 -0.23415337 -0.23413696 1.57921282 0.76743473 -0.46947439 0.54256004]** + **2-Norm of x: 2.5908** + **Normalized x: [ 0.19172213 -0.05336737 0.24999534 0.58786029 -0.09037871 -0.09037237 0.60954584 0.29621508 -0.1812081 0.20941776]** + **2-Norm of normalized_x: 1.0000** + +### Using MLlib for feature normalization + +Spark provides some built-in functions for feature scaling and standardization in its MLlib machine learning library. These include `StandardScaler`, which applies the standard normal transformation, and `Normalizer`, which applies the same feature vector normalization we showed you in our preceding example code. + +We will explore the use of these methods in the upcoming chapters, but for now, let's simply compare the results of using MLlib's `Normalizer` to our own results: + + from pyspark.mllib.feature import Normalizer + normalizer = Normalizer() + vector = sc.parallelize([x]) + +After importing the required class, we will instantiate `Normalizer` (by default, it will use the L2 norm as we did earlier). Note that as in most situations in Spark, we need to provide `Normalizer` with an RDD as input (an RDD that contains `numpy` arrays or MLlib vectors); hence, we will create a single-element RDD from our vector `x` for illustrative purposes. + +We will then use the `transform` function of `Normalizer` on our RDD. Since the RDD only has one vector in it, we will return our vector to the driver by calling `first` and finally by calling the `toArray` function to convert the vector back into a `numpy` array: + + normalized_x_mllib = normalizer.transform(vector).first().toArray() + +Finally, we can print out the same details as we did previously, comparing the results: + + print "x:\n%s" % x + print "2-Norm of x: %2.4f" % norm_x_2 + print "Normalized x MLlib:\n%s" % normalized_x_mllib + print "2-Norm of normalized_x_mllib: %2.4f" % np.linalg.norm(normalized_x_mllib) + +You will end up with exactly the same normalized vector as the one we computed with our own code. However, using MLlib's built-in methods is certainly more convenient and efficient than writing our own functions! + +## Using packages for feature extraction + +While we have covered many different approaches to feature extraction, it will be rather painful to have to create the code to perform these common tasks each and every time.
Certainly, we can create our own reusable code libraries for this purpose; however, fortunately, we can rely on the existing tools and packages. + +Since Spark supports Scala, Java, and Python bindings, we can use packages available in these languages that provide sophisticated tools to process and extract features and represent them as vectors. A few examples of packages for feature extraction include scikit-learn, gensim, scikit-image, matplotlib, and NLTK in Python; OpenNLP in Java; and Breeze and Chalk in Scala. In fact, Breeze has been part of Spark MLlib since version 1.0, and we will see how to use some Breeze functionality for linear algebra in the later chapters. + +# Summary + +In this chapter, we saw how to find common, publicly-available datasets that can be used to test various machine learning models. You learned how to load, process, and clean data, as well as how to apply common techniques to transform raw data into feature vectors that can be used as training examples for our models. + +In the next chapter, you will learn the basics of recommender systems and explore how to create a recommendation model, use the model to make predictions, and evaluate the model. + +# Chapter 4. Building a Recommendation Engine with Spark + +Now that you have learned the basics of data processing and feature extraction, we will move on to explore individual machine learning models in detail, starting with recommendation engines. + +Recommendation engines are probably among the best types of machine learning model known to the general public. Even if people do not know exactly what a recommendation engine is, they have most likely experienced one through the use of popular websites such as Amazon, Netflix, YouTube, Twitter, LinkedIn, and Facebook. Recommendations are a core part of all these businesses, and in some cases, they drive significant percentages of their revenue. + +The idea behind recommendation engines is to predict what people might like and to uncover relationships between items to aid in the discovery process (in this way, it is similar and, in fact, often complementary to search engines, which also play a role in discovery). However, unlike search engines, recommendation engines try to present people with relevant content that they did not necessarily search for or that they might not even have heard of. + +Typically, a recommendation engine tries to model the connections between users and some type of item. In our MovieStream scenario from Chapter 2, _Designing a Machine Learning System_ , for example, we could use a recommendation engine to show our users movies that they might enjoy. If we can do this well, we could keep our users engaged using our service, which is good for both our users and us. Similarly, if we can do a good job of showing our users movies related to a given movie, we could aid in discovery and navigation on our site, again improving our users' experience, engagement, and the relevance of our content to them. + +However, recommendation engines are not limited to movies, books, or products. The techniques we will explore in this chapter can be applied to just about any user-to-item relationship as well as user-to-user connections, such as those found on social networks, allowing us to make recommendations such as people you may know or who to follow. + +Recommendation engines are most effective in two general scenarios (which are not mutually exclusive). 
They are explained here: + + * **Large number of available options for users** : When there are a very large number of available items, it becomes increasingly difficult for the user to find something they want. Searching can help when the user knows what they are looking for, but often, the right item might be something previously unknown to them. In this case, being recommended relevant items that the user may not already know about can help them discover new items. + * **A significant degree of personal taste involved** : When personal taste plays a large role in selection, recommendation models, which often utilize a wisdom of the crowd approach, can be helpful in discovering items based on the behavior of others that have similar taste profiles. + +In this chapter, we will: + + * Introduce the various types of recommendation engines + * Build a recommendation model using data about user preferences + * Use the trained model to compute recommendations for a given user as well as compute similar items for a given item (that is, related items) + * Apply standard evaluation metrics to the model that we created to measure how well it performs in terms of predictive capability + +# Types of recommendation models + +Recommender systems are widely studied, and there are many approaches used, but there are two that are probably most prevalent: content-based filtering and collaborative filtering. Recently, other approaches such as ranking models have also gained in popularity. In practice, many approaches are hybrids, incorporating elements of many different methods into a model or combination of models. + +## Content-based filtering + +Content-based methods try to use the content or attributes of an item, together with some notion of similarity between two pieces of content, to generate items similar to a given item. These attributes are often textual content (such as titles, names, tags, and other metadata attached to an item), or in the case of media, they could include other features of the item, such as attributes extracted from audio and video content. + +In a similar manner, user recommendations can be generated based on attributes of users or user profiles, which are then matched to item attributes using the same measure of similarity. For example, a user can be represented by the combined attributes of the items they have interacted with. This becomes their user profile, which is then compared to item attributes to find items that match the user profile. + +## Collaborative filtering + +Collaborative filtering is a form of wisdom of the crowd approach where the set of preferences of many users with respect to items is used to generate estimated preferences of users for items with which they have not yet interacted. The idea behind this is the notion of similarity. + +In a user-based approach, if two users have exhibited similar preferences (that is, patterns of interacting with the same items in broadly the same way), then we would assume that they are similar to each other in terms of taste. To generate recommendations for unknown items for a given user, we can use the known preferences of other users that exhibit similar behavior. We can do this by selecting a set of similar users and computing some form of combined score based on the items they have shown a preference for. The overall logic is that if other users with similar tastes like a set of items, these items would tend to be good candidates for recommendation.
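+ +To make this combined score concrete, here is a minimal sketch (in Python with `numpy`, using made-up neighbor ratings and hypothetical similarity values rather than anything computed from a real dataset) of one common weighting scheme, where each neighbor's rating is weighted by that neighbor's similarity to the target user: + + import numpy as np + + # hypothetical ratings by three users similar to our target user + # (rows are neighbors, columns are items; 0 means not rated) + neighbor_ratings = np.array([[5.0, 4.0, 0.0], [4.0, 0.0, 3.0], [5.0, 5.0, 4.0]]) + # hypothetical similarities between the target user and each neighbor + similarities = np.array([0.9, 0.8, 0.4]) + # weight each rating by the similarity of the neighbor that gave it, then + # divide by the total similarity of the neighbors that rated each item + rated = neighbor_ratings > 0 + weighted_sums = (neighbor_ratings * similarities[:, np.newaxis]).sum(axis=0) + similarity_sums = (rated * similarities[:, np.newaxis]).sum(axis=0) + print "Estimated scores per item: %s" % (weighted_sums / similarity_sums) + +Real systems differ in how they select the neighbors and combine the scores; this is only one plausible choice.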
+ +We can also take an item-based approach that computes some measure of similarity between items. This is usually based on the existing user-item preferences or ratings. Items that tend to be rated the same by similar users will be classed as similar under this approach. Once we have these similarities, we can represent a user in terms of the items they have interacted with and find items that are similar to these known items, which we can then recommend to the user. Again, a set of items similar to the known items is used to generate a combined score that estimates the user's preference for an unknown item. + +The user- and item-based approaches are usually referred to as nearest-neighbor models, since the estimated scores are computed based on the set of most similar users or items (that is, their neighbors). + +Finally, there are many model-based methods that attempt to model the user-item preferences themselves so that new preferences can be estimated directly by applying the model to unknown user-item combinations. + +### Matrix factorization + +Since Spark's recommendation models currently only include an implementation of matrix factorization, we will focus our attention on this class of models. This focus is with good reason, however: these types of models have consistently been shown to perform extremely well in collaborative filtering and were among the best models in well-known competitions such as the Netflix prize. + +### Note + +For more information on and a brief overview of the performance of the best algorithms for the Netflix prize, see . + +#### Explicit matrix factorization + +When we deal with data that consists of preferences of users that are provided by the users themselves, we refer to it as explicit preference data. This includes, for example, ratings, thumbs up, likes, and so on that are given by users to items. + +We can take these ratings and form a two-dimensional matrix with users as rows and items as columns. Each entry represents a rating given by a user to a certain item. Since in most cases, each user has only interacted with a relatively small set of items, this matrix has only a few non-zero entries (that is, it is very sparse). + +As a simple example, let's assume that we have the following user ratings for a set of movies: + + **Tom, Star Wars, 5** + **Jane, Titanic, 4** + **Bill, Batman, 3** + **Jane, Star Wars, 2** + **Bill, Titanic, 3** + +We will form the following ratings matrix: + +A simple movie-rating matrix + +Matrix factorization (or matrix completion) attempts to directly model this user-item matrix by representing it as a product of two smaller matrices of lower dimension. Thus, it is a dimensionality-reduction technique. If we have **U** users and **I** items, then our user-item matrix is of dimension U x I and might look something like the one shown in the following diagram: + +A sparse ratings matrix + +If we want to find a lower dimension (low-rank) approximation to our user-item matrix with the dimension **k** , we would end up with two matrices: one for users of size U x k and one for items of size I x k. These are known as factor matrices. If we multiply these two factor matrices, we would reconstruct an approximate version of the original ratings matrix.
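+ +As a rough illustration of the shapes involved, here is a minimal `numpy` sketch with randomly generated factor matrices (illustrative only, not actual model output): + + import numpy as np + + np.random.seed(42) + U, I, k = 5, 4, 2 # 5 users, 4 items, and 2 latent factors + user_factors = np.random.randn(U, k) # user-factor matrix of size U x k + item_factors = np.random.randn(I, k) # item-factor matrix of size I x k + # multiplying the factors reconstructs an approximate U x I ratings matrix + approx_ratings = np.dot(user_factors, item_factors.T) + print "Shape of reconstructed matrix: %s" % str(approx_ratings.shape) + # the predicted score for a user u and item i is a single dot product + print "Predicted score for user 0, item 1: %.4f" % np.dot(user_factors[0], item_factors[1]) +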
Note that while the original ratings matrix is typically very sparse, each factor matrix is dense, as shown in the following diagram: + +The user- and item-factor matrices + +These models are often also called latent feature models, as we are trying to discover some form of hidden features (which are represented by the factor matrices) that account for the structure of behavior inherent in the user-item rating matrix. While the latent features or factors are not directly interpretable, they might, perhaps, represent things such as the tendency of a user to like movies from a certain director, genre, style, or group of actors, for example. + +As we are directly modeling the user-item matrix, the prediction in these models is relatively straightforward: to compute a predicted rating for a user and item, we compute the vector dot product between the relevant row of the user-factor matrix (that is, the user's factor vector) and the relevant row of the item-factor matrix (that is, the item's factor vector). + +This is illustrated with the highlighted vectors in the following diagram: + +Computing recommendations from user- and item-factor vectors + +To find out the similarity between two items, we can use the same measures of similarity as we would use in the nearest-neighbor models, except that we can use the factor vectors directly by computing the similarity between two item-factor vectors, as illustrated in the following diagram: + +Computing similarity with item-factor vectors + +The benefit of factorization models is the relative ease of computing recommendations once the model is created. However, for very large user and item sets, this can become a challenge as it requires storage and computation across potentially many millions of user- and item-factor vectors. Another advantage, as mentioned earlier, is that they tend to offer very good performance. + +### Note + +Projects such as Oryx () and Prediction.io () focus on model serving for large-scale models, including recommenders based on matrix factorization. + +On the downside, factorization models are relatively more complex to understand and interpret compared to nearest-neighbor models and are often more computationally intensive during the model's training phase. + +#### Implicit matrix factorization + +So far, we have dealt with explicit preferences such as ratings. However, much of the preference data that we might be able to collect is implicit feedback, where the preferences between a user and item are not given to us, but are, instead, implied from the interactions they might have with an item. Examples include binary data (such as whether a user viewed a movie, whether they purchased a product, and so on) as well as count data (such as the number of times a user watched a movie). + +There are many different approaches to deal with implicit data. MLlib implements a particular approach that treats the input rating matrix as two matrices: a binary preference matrix, **P** , and a matrix of confidence weights, **C**. + +For example, let's assume that the user-movie ratings we saw previously were, in fact, the number of times each user had viewed that movie. The two matrices would look something like the ones shown in the following screenshot. Here, the matrix **P** informs us that a movie was viewed by a user, and the matrix **C** represents the confidence weighting, in the form of the view counts; generally, the more a user has watched a movie, the higher the confidence that they actually like it.
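+ +As a small sketch of how such a pair of matrices could be derived (in `numpy`, with made-up view counts; this mirrors the description above rather than MLlib's internal representation): + + import numpy as np + + # hypothetical view counts (rows are users, columns are movies; + # 0 means the user never viewed that movie) + view_counts = np.array([[5, 0, 1], [0, 3, 0], [2, 1, 0]]) + P = (view_counts > 0).astype(int) # binary preference matrix + C = view_counts # confidence weights taken directly from the view counts + print "P:\n%s" % P + print "C:\n%s" % C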
+ +Representation of an implicit preference and confidence matrix + +The implicit model still creates a user- and item-factor matrix. In this case, however, the matrix that the model is attempting to approximate is not the overall ratings matrix but the preference matrix P. If we compute a recommendation by calculating the dot product of a user- and item-factor vector, the score will not be an estimate of a rating directly. It will rather be an estimate of the preference of a user for an item (though the scores are not strictly between 0 and 1, they will generally be fairly close to that range). + +#### Alternating least squares + + **Alternating Least Squares** ( **ALS** ) is an optimization technique to solve matrix factorization problems; this technique is powerful, achieves good performance, and has proven to be relatively easy to implement in a parallel fashion. Hence, it is well suited for platforms such as Spark. At the time of writing this book, it is the only recommendation model implemented in MLlib. + +ALS works by iteratively solving a series of least squares regression problems. In each iteration, one of the user- or item-factor matrices is treated as fixed, while the other one is updated using the fixed factor and the rating data. Then, the factor matrix that was solved for is, in turn, treated as fixed, while the other one is updated. This process continues until the model has converged (or for a fixed number of iterations). + +### Note + +Spark's documentation for collaborative filtering contains references to the papers that underlie the ALS algorithms implemented for both the explicit and implicit data cases. You can view the documentation at . + +# Extracting the right features from your data + +In this section, we will use explicit rating data, without additional user or item metadata or other information related to the user-item interactions. Hence, the features that we need as inputs are simply the user IDs, movie IDs, and the ratings assigned to each user and movie pair. + +## Extracting features from the MovieLens 100k dataset + +Start the Spark shell in the Spark base directory, ensuring that you provide enough memory via the `--driver-memory` option: + + **>./bin/spark-shell --driver-memory 4g** + +In this example, we will use the same MovieLens dataset that we used in the previous chapter. Use the directory in which you placed the MovieLens 100k dataset as the input path in the following code. + +First, let's inspect the raw ratings dataset: + + val rawData = sc.textFile("/PATH/ml-100k/u.data") + rawData.first() + +You will see output similar to these lines of code: + + **14/03/30 11:42:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform...
using builtin-java classes where applicable** + **14/03/30 11:42:41 WARN LoadSnappy: Snappy native library not loaded** + **14/03/30 11:42:41 INFO FileInputFormat: Total input paths to process : 1** + **14/03/30 11:42:41 INFO SparkContext: Starting job: first at :15** + **14/03/30 11:42:41 INFO DAGScheduler: Got job 0 (first at :15) with 1 output partitions (allowLocal=true)** + **14/03/30 11:42:41 INFO DAGScheduler: Final stage: Stage 0 (first at :15)** + **14/03/30 11:42:41 INFO DAGScheduler: Parents of final stage: List()** + **14/03/30 11:42:41 INFO DAGScheduler: Missing parents: List()** + **14/03/30 11:42:41 INFO DAGScheduler: Computing the requested partition locally** + **14/03/30 11:42:41 INFO HadoopRDD: Input split: file:/Users/Nick/workspace/datasets/ml-100k/u.data:0+1979173** + **14/03/30 11:42:41 INFO SparkContext: Job finished: first at :15, took 0.030533 s** + **res0: String = 196 242 3 881250949** + +Recall that this dataset consisted of the `user id`, `movie id`, `rating`, and `timestamp` fields separated by a tab (`"\t"`) character. We don't need the time when the rating was made to train our model, so let's simply extract the first three fields: + + val rawRatings = rawData.map(_.split("\t").take(3)) + +We will first split each record on the `"\t"` character, which gives us an `Array[String]`. We will then use Scala's `take` function to keep only the first `3` elements of the array, which correspond to `user id`, `movie id`, and `rating`, respectively. + +We can inspect the first record of our new RDD by calling `rawRatings.first()`, which collects just the first record of the RDD back to the driver program. This will result in the following output: + + **14/03/30 12:24:00 INFO SparkContext: Starting job: first at :21** + **14/03/30 12:24:00 INFO DAGScheduler: Got job 1 (first at :21) with 1 output partitions (allowLocal=true)** + **14/03/30 12:24:00 INFO DAGScheduler: Final stage: Stage 1 (first at :21)** + **14/03/30 12:24:00 INFO DAGScheduler: Parents of final stage: List()** + **14/03/30 12:24:00 INFO DAGScheduler: Missing parents: List()** + **14/03/30 12:24:00 INFO DAGScheduler: Computing the requested partition locally** + **14/03/30 12:24:00 INFO HadoopRDD: Input split: file:/Users/Nick/workspace/datasets/ml-100k/u.data:0+1979173** + **14/03/30 12:24:00 INFO SparkContext: Job finished: first at :21, took 0.00391 s** + **res6: Array[String] = Array(196, 242, 3)** + +We will use Spark's MLlib library to train our model. Let's take a look at what methods are available for us to use and what input is required. First, import the `ALS` model from MLlib: + + import org.apache.spark.mllib.recommendation.ALS + +On the console, we can inspect the available methods on the ALS object using tab completion. Type in `ALS.` (note the dot) and then press the _Tab_ key. You should see the autocompletion of the methods: + + **ALS.** + **asInstanceOf isInstanceOf main toString train trainImplicit** + +The method we want to use is `train`. If we type `ALS.train` and hit _Enter_ , we will get an error.
However, this error will tell us what the method signature looks like: + + **ALS.train** + **<console>:12: error: ambiguous reference to overloaded definition,** + **both method train in object ALS of type (ratings: org.apache.spark.rdd.RDD[org.apache.spark.mllib.recommendation.Rating], rank: Int** + **, iterations: Int)org.apache.spark.mllib.recommendation.MatrixFactorizationModel** + **and method train in object ALS of type (ratings: org.apache.spark.rdd.RDD[org.apache.spark.mllib.recommendation.Rating], rank: Int, iterations: Int, lambda: Double)org.apache.spark.mllib.recommendation.MatrixFactorizationModel** + **match expected type ?** + **ALS.train** + **^** + +So, we can see that at a minimum, we need to provide the input arguments, `ratings`, `rank`, and `iterations`. The second method also requires an argument called `lambda`. We'll cover `rank`, `iterations`, and `lambda` shortly, but let's take a look at the `ratings` argument first. Let's import the `Rating` class that it references and use a similar approach to find out what an instance of `Rating` requires, by typing in `Rating()` and hitting _Enter_ : + + **import org.apache.spark.mllib.recommendation.Rating** + **Rating()** + **<console>:13: error: not enough arguments for method apply: (user: Int, product: Int, rating: Double)org.apache.spark.mllib.recommendation.Rating in object Rating.** + **Unspecified value parameters user, product, rating.** + **Rating()** + **^** + +As we can see from the preceding output, we need to provide the `ALS` model with an RDD that consists of `Rating` records. A `Rating` class, in turn, is just a wrapper around `user id`, `movie id` (called `product` here), and the actual `rating` arguments. We'll create our rating dataset using the `map` method, transforming the array of IDs and ratings into a `Rating` object: + + val ratings = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) } + +### Note + +Notice that we need to use `toInt` or `toDouble` to convert the raw rating data (which was extracted as `Strings` from the text file) to `Int` or `Double` numeric inputs. Also, note the use of a `case` statement that allows us to extract the relevant variable names and use them directly (this saves us from having to use something like `val user = ratings(0)`). + +For more on Scala case statements and pattern matching as used here, take a look at . + +We now have an `RDD[Rating]` that we can verify by calling: + + **ratings.first()** + **14/03/30 12:32:48 INFO SparkContext: Starting job: first at :24** + **14/03/30 12:32:48 INFO DAGScheduler: Got job 2 (first at :24) with 1 output partitions (allowLocal=true)** + **14/03/30 12:32:48 INFO DAGScheduler: Final stage: Stage 2 (first at :24)** + **14/03/30 12:32:48 INFO DAGScheduler: Parents of final stage: List()** + **14/03/30 12:32:48 INFO DAGScheduler: Missing parents: List()** + **14/03/30 12:32:48 INFO DAGScheduler: Computing the requested partition locally** + **14/03/30 12:32:48 INFO HadoopRDD: Input split: file:/Users/Nick/workspace/datasets/ml-100k/u.data:0+1979173** + **14/03/30 12:32:48 INFO SparkContext: Job finished: first at :24, took 0.003752 s** + **res8: org.apache.spark.mllib.recommendation.Rating = Rating(196,242,3.0)** + +# Training the recommendation model + +Once we have extracted these simple features from our raw data, we are ready to proceed with model training; MLlib takes care of this for us.
All we have to do is provide the correctly-parsed input RDD we just created as well as our chosen model parameters. + +## Training a model on the MovieLens 100k dataset + +We're now ready to train our model! The other inputs required for our model are as follows: + + * `rank`: This refers to the number of factors in our ALS model, that is, the number of hidden features in our low-rank approximation matrices. Generally, the greater the number of factors, the better, but this has a direct impact on memory usage, both for computation and to store models for serving, particularly for large numbers of users or items. Hence, this is often a trade-off in real-world use cases. A rank in the range of 10 to 200 is usually reasonable. + * `iterations`: This refers to the number of iterations to run. While each iteration in `ALS` is guaranteed to decrease the reconstruction error of the ratings matrix, `ALS` models will converge to a reasonably good solution after relatively few iterations. So, we don't need to run for too many iterations in most cases (around 10 is often a good default). + * `lambda`: This parameter controls the regularization of our model. Thus, `lambda` controls overfitting. The higher the value of `lambda`, the more regularization is applied. What constitutes a sensible value is very dependent on the size, nature, and sparsity of the underlying data, and as with almost all machine learning models, the regularization parameter is something that should be tuned using out-of-sample test data and cross-validation approaches. + +We'll use a `rank` of `50`, `10` iterations, and a `lambda` parameter of `0.01` to illustrate how to train our model: + + val model = ALS.train(ratings, 50, 10, 0.01) + +This returns a `MatrixFactorizationModel` object, which contains the user and item factors in the form of an RDD of `(id, factor)` pairs. These are called `userFeatures` and `productFeatures`, respectively. For example: + + model.userFeatures + +You will see the following output: + + **res14: org.apache.spark.rdd.RDD[(Int, Array[Double])] = FlatMappedRDD[659] at flatMap at ALS.scala:231** + +We can see that the factors are in the form of an `Array[Double]`. + +Note that the operations used in MLlib's `ALS` implementation are lazy transformations, so the actual computation will only be performed once we call some sort of action on the resulting `RDDs` of the user and item factors.
We can force the computation using a Spark action such as `count`: + + **model.userFeatures.count** + +This will trigger the computation, and we will see quite a bit of output text similar to the following lines of code: + + **14/03/30 13:10:40 INFO SparkContext: Starting job: count at :26** + **14/03/30 13:10:40 INFO DAGScheduler: Registering RDD 665 (map at ALS.scala:147)** + **14/03/30 13:10:40 INFO DAGScheduler: Registering RDD 664 (map at ALS.scala:146)** + **14/03/30 13:10:40 INFO DAGScheduler: Registering RDD 674 (mapPartitionsWithIndex at ALS.scala:164)** + **...** + **14/03/30 13:10:45 INFO SparkContext: Job finished: count at :26, took 5.068255 s** + **res16: Long = 943** + +If we call `count` for the movie factors, we will see the following output: + + **model.productFeatures.count** + **14/03/30 13:15:21 INFO SparkContext: Starting job: count at :26** + **14/03/30 13:15:21 INFO DAGScheduler: Got job 10 (count at :26) with 1 output partitions (allowLocal=false)** + **14/03/30 13:15:21 INFO DAGScheduler: Final stage: Stage 165 (count at :26)** + **14/03/30 13:15:21 INFO DAGScheduler: Parents of final stage: List(Stage 169, Stage 166)** + **14/03/30 13:15:21 INFO DAGScheduler: Missing parents: List()** + **14/03/30 13:15:21 INFO DAGScheduler: Submitting Stage 165 (FlatMappedRDD[883] at flatMap at ALS.scala:231), which has no missing parents** + **14/03/30 13:15:21 INFO DAGScheduler: Submitting 1 missing tasks from Stage 165 (FlatMappedRDD[883] at flatMap at ALS.scala:231)** + **...** + **14/03/30 13:15:21 INFO SparkContext: Job finished: count at :26, took 0.030044 s** + **res21: Long = 1682** + +As expected, we have a factor array for each user (`943` factors) and movie (`1682` factors). + +### Training a model using implicit feedback data + +The standard matrix factorization approach in MLlib deals with explicit ratings. To work with implicit data, you can use the `trainImplicit` method. It is called in a manner similar to the standard `train` method. There is an additional parameter, `alpha`, that can be set (and in the same way, the regularization parameter, `lambda`, should be selected via testing and cross-validation methods). + +The `alpha` parameter controls the baseline level of confidence weighting applied. A higher level of `alpha` tends to make the model more confident about the fact that missing data equates to no preference for the relevant user-item pair. + +### Note + +As an exercise, try to take the existing MovieLens dataset and convert it into an implicit dataset. One possible approach is to convert it to binary feedback (0s and 1s) by applying a threshold on the ratings at some level. + +Another approach could be to convert the rating values into confidence weights (for example, perhaps, low ratings could imply zero weights, or even negative weights, which are supported by MLlib's implementation). + +Train a model on this dataset and compare the results of the following section with those generated by your implicit model. + +# Using the recommendation model + +Now that we have our trained model, we're ready to use it to make predictions. These predictions typically take one of two forms: recommendations for a given user and related or similar items for a given item. + +## User recommendations + +In this case, we would like to generate recommended items for a given user. This usually takes the form of a _top-K_ list, that is, the _K_ items that our model predicts will have the highest probability of the user liking them.
This is done by computing the predicted score for each item and ranking the list based on this score. + +The exact method to perform this computation depends on the model involved. For example, in user-based approaches, the ratings of similar users on items are used to compute the recommendations for a user, while in an item-based approach, the computation is based on the similarity of items the user has rated to the candidate items. + +In matrix factorization, because we are modeling the ratings matrix directly, the predicted score can be computed as the vector dot product between a user-factor vector and an item-factor vector. + +### Generating movie recommendations from the MovieLens 100k dataset + +As MLlib's recommendation model is based on matrix factorization, we can use the factor matrices computed by our model to compute predicted scores (or ratings) for a user. We will focus on the explicit rating case using MovieLens data; however, the approach is the same when using the implicit model. + +The `MatrixFactorizationModel` class has a convenient `predict` method that will compute a predicted score for a given user and item combination: + + val predictedRating = model.predict(789, 123) + +The output is as follows: + + **14/03/30 16:10:10 INFO SparkContext: Starting job: lookup at MatrixFactorizationModel.scala:45** + **14/03/30 16:10:10 INFO DAGScheduler: Got job 30 (lookup at MatrixFactorizationModel.scala:45) with 1 output partitions (allowLocal=false)** + **...** + **14/03/30 16:10:10 INFO SparkContext: Job finished: lookup at MatrixFactorizationModel.scala:46, took 0.023077 s** + **predictedRating: Double = 3.128545693368485** + +As we can see, this model predicts a rating of `3.12` for user `789` and movie `123`. + +### Tip + +Note that you might see different results than those shown in this section because the `ALS` model is initialized randomly. So, different runs of the model will lead to different solutions. + +The `predict` method can also take an RDD of `(user, item)` IDs as the input and will generate predictions for each of these. We can use this method to make predictions for many users and items at the same time. + +To generate the _top-K_ recommended items for a user, `MatrixFactorizationModel` provides a convenience method called `recommendProducts`. This takes two arguments: `user` and `num`, where `user` is the user ID, and `num` is the number of items to recommend. + +It returns the top `num` items ranked in the order of the predicted score. Here, the scores are computed as the dot product between the user-factor vector and each item-factor vector. + +Let's generate the top `10` recommended items for user `789`: + + val userId = 789 + val K = 10 + val topKRecs = model.recommendProducts(userId, K) + +We now have a set of predicted ratings for each movie for user `789`. 
If we print this out, we could inspect the top 10 recommendations for this user: + + println(topKRecs.mkString("\n")) + +You should see the following output on your console: + + **Rating(789,715,5.931851273771102)** + **Rating(789,12,5.582301095666215)** + **Rating(789,959,5.516272981542168)** + **Rating(789,42,5.458065302395629)** + **Rating(789,584,5.449949837103569)** + **Rating(789,750,5.348768847643657)** + **Rating(789,663,5.30832117499004)** + **Rating(789,134,5.278933936827717)** + **Rating(789,156,5.250959077906759)** + **Rating(789,432,5.169863417126231)** + +#### Inspecting the recommendations + +We can give these recommendations a sense check by taking a quick look at the titles of the movies a user has rated and the recommended movies. First, we need to load the movie data (which is one of the datasets we explored in the previous chapter). We'll collect this data as a `Map[Int, String]`, mapping the movie ID to the title: + + val movies = sc.textFile("/PATH/ml-100k/u.item") + val titles = movies.map(line => line.split("\\|").take(2)).map(array => (array(0).toInt,array(1))).collectAsMap() + titles(123) + +The preceding code will produce the following output: + + **res68: String = Frighteners, The (1996)** + +For our user `789`, we can find out what movies they have rated, take the `10` movies with the highest rating, and then check the titles. We will do this now by first using the `keyBy` Spark function to create an RDD of key-value pairs from our `ratings` RDD, where the key will be the user ID. We will then use the `lookup` function to return just the ratings for this key (that is, that particular user ID) to the driver: + + val moviesForUser = ratings.keyBy(_.user).lookup(789) + +Let's see how many movies this user has rated. This will be the `size` of the `moviesForUser` collection: + + println(moviesForUser.size) + +We will see that this user has rated `33` movies. + +Next, we will take the 10 movies with the highest ratings by sorting the `moviesForUser` collection using the `rating` field of the `Rating` object.
We will then extract the movie title for the relevant product ID attached to the `Rating` class from our mapping of movie titles and print out the top `10` titles with their ratings: + + moviesForUser.sortBy(-_.rating).take(10).map(rating => (titles(rating.product), rating.rating)).foreach(println) + +You will see the following output displayed: + + **(Godfather, The (1972),5.0)** + **(Trainspotting (1996),5.0)** + **(Dead Man Walking (1995),5.0)** + **(Star Wars (1977),5.0)** + **(Swingers (1996),5.0)** + **(Leaving Las Vegas (1995),5.0)** + **(Bound (1996),5.0)** + **(Fargo (1996),5.0)** + **(Last Supper, The (1995),5.0)** + **(Private Parts (1997),4.0)** + +Now, let's take a look at the top 10 recommendations for this user and see what the titles are using the same approach as the one we used earlier (note that the recommendations are already sorted): + + topKRecs.map(rating => (titles(rating.product), rating.rating)).foreach(println) + +The output is as follows: + + **(To Die For (1995),5.931851273771102)** + **(Usual Suspects, The (1995),5.582301095666215)** + **(Dazed and Confused (1993),5.516272981542168)** + **(Clerks (1994),5.458065302395629)** + **(Secret Garden, The (1993),5.449949837103569)** + **(Amistad (1997),5.348768847643657)** + **(Being There (1979),5.30832117499004)** + **(Citizen Kane (1941),5.278933936827717)** + **(Reservoir Dogs (1992),5.250959077906759)** + **(Fantasia (1940),5.169863417126231)** + +We leave it to you to decide whether these recommendations make sense. + +## Item recommendations + +Item recommendations are about answering the following question: for a certain item, what are the items most similar to it? Here, the precise definition of similarity is dependent on the model involved. In most cases, similarity is computed by comparing the vector representation of two items using some similarity measure. Common similarity measures include Pearson correlation and cosine similarity for real-valued vectors and Jaccard similarity for binary vectors. + +### Generating similar movies for the MovieLens 100k dataset + +The current `MatrixFactorizationModel` API does not directly support item-to-item similarity computations. Therefore, we will need to create our own code to do this. + +We will use the cosine similarity metric, and we will use the jblas linear algebra library (a dependency of MLlib) to compute the required vector dot products. This is similar to how the existing `predict` and `recommendProducts` methods work, except that we will use cosine similarity as opposed to just the dot product. + +We would like to compare the factor vector of our chosen item with each of the other items, using our similarity metric. In order to perform linear algebra computations, we will first need to create a vector object out of the factor vectors, which are in the form of an `Array[Double]`. The `JBLAS` class, `DoubleMatrix`, takes an `Array[Double]` as the constructor argument as follows: + + import org.jblas.DoubleMatrix + val aMatrix = new DoubleMatrix(Array(1.0, 2.0, 3.0)) + +Here is the output of the preceding code: + + **aMatrix: org.jblas.DoubleMatrix = [1.000000; 2.000000; 3.000000]** + +### Tip + +Note that using jblas, vectors are represented as a one-dimensional `DoubleMatrix` class, while matrices are a two-dimensional `DoubleMatrix` class. + +We will need a method to compute the cosine similarity between two vectors. Cosine similarity is a measure of the angle between two vectors in an _n_ -dimensional space. 
It is computed by first calculating the dot product between the vectors, and then dividing the result by the product of the norms (or lengths) of the two vectors; specifically, the L2-norm is used. In this way, cosine similarity is a normalized dot product:

    cosineSimilarity(v1, v2) = (v1 · v2) / (||v1|| * ||v2||)

The cosine similarity measure takes on values between -1 and 1. A value of 1 implies that the two vectors are perfectly aligned (completely similar), while a value of 0 implies that they are orthogonal (that is, there is no similarity). This measure is useful because it also captures negative similarity: a value of -1 implies that not only are the vectors not similar, they point in exactly opposite directions.

Let's create our `cosineSimilarity` function here:

    def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double = {
      vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())
    }

### Tip

Note that we defined a return type of `Double` for this function. We are not required to do this, since Scala features type inference. However, it can often be useful to document return types for Scala functions.

Let's try it out on one of our item factors for item `567`. We will need to collect an item factor from our model; we will do this using the `lookup` method, in a similar way to how we collected the ratings for a specific user earlier. In the following lines of code, we also use the `head` function, since `lookup` returns an array of values, and we only need the first value (in fact, there will only be one value, which is the factor vector for this item).

Since this will be an `Array[Double]`, we will then need to create a `DoubleMatrix` object from it and compute the cosine similarity with itself:

    val itemId = 567
    val itemFactor = model.productFeatures.lookup(itemId).head
    val itemVector = new DoubleMatrix(itemFactor)
    cosineSimilarity(itemVector, itemVector)

A similarity metric should measure how close, in some sense, two vectors are to each other. Here, we can see that our cosine similarity metric tells us that this item vector is identical to itself, which is what we would expect:

    **res113: Double = 1.0**

Now, we are ready to apply our similarity metric to each item:

    val sims = model.productFeatures.map{ case (id, factor) =>
      val factorVector = new DoubleMatrix(factor)
      val sim = cosineSimilarity(factorVector, itemVector)
      (id, sim)
    }

Next, we can compute the top 10 most similar items by sorting the items by their similarity scores:

    // recall we defined K = 10 earlier
    val sortedSims = sims.top(K)(Ordering.by[(Int, Double), Double] { case (id, similarity) => similarity })

In the preceding code snippet, we used Spark's `top` function, which is an efficient way to compute _top-K_ results in a distributed fashion, instead of using `collect` to return all the data to the driver and sorting it locally (remember that we could be dealing with millions of users and items in the case of recommendation models).

We need to tell Spark how to sort the `(item id, similarity score)` pairs in the `sims` RDD. To do this, we will pass an extra argument to `top`, which is a Scala `Ordering` object that tells Spark that it should sort by the value in the key-value pair (that is, sort by `similarity`).
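
For contrast, here is a minimal sketch of the naive approach the preceding paragraph warns against; the variable name is our own, and this is only practical when the item set is small:

    // Pull every (id, similarity) pair back to the driver and sort locally.
    // Workable for the 1682 MovieLens movies, but not for millions of items.
    val sortedLocally = sims.collect().sortBy { case (_, sim) => -sim }.take(10)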

Finally, we can print the 10 items with the highest computed similarity metric to our given item:

    println(sortedSims.take(10).mkString("\n"))

You will see output like the following one:

    **(567,1.0000000000000002)**
    **(1471,0.6932331537649621)**
    **(670,0.6898690594544726)**
    **(201,0.6897964975027041)**
    **(343,0.6891221044611473)**
    **(563,0.6864214133620066)**
    **(294,0.6812075443259535)**
    **(413,0.6754663844488256)**
    **(184,0.6702643811753909)**
    **(109,0.6594872765176396)**

Not surprisingly, we can see that the top-ranked similar item is our chosen item itself (its score differs from 1.0 only by floating-point rounding error). The rest are the other items in our set of items, ranked in order of our similarity metric.

#### Inspecting the similar items

Let's see what the title of our chosen movie is:

    println(titles(itemId))

The preceding code will print the following output:

    **Wes Craven's New Nightmare (1994)**

As we did for user recommendations, we can sense check our item-to-item similarity computations and take a look at the titles of the most similar movies. This time, we will take the top 11 results and keep elements 1 to 11 of the list, skipping element 0, which is the given movie itself:

    val sortedSims2 = sims.top(K + 1)(Ordering.by[(Int, Double), Double] { case (id, similarity) => similarity })
    sortedSims2.slice(1, 11).map{ case (id, sim) => (titles(id), sim) }.mkString("\n")

You will see the movie titles and scores displayed similar to this output:

    **(Hideaway (1995),0.6932331537649621)**
    **(Body Snatchers (1993),0.6898690594544726)**
    **(Evil Dead II (1987),0.6897964975027041)**
    **(Alien: Resurrection (1997),0.6891221044611473)**
    **(Stephen King's The Langoliers (1995),0.6864214133620066)**
    **(Liar Liar (1997),0.6812075443259535)**
    **(Tales from the Crypt Presents: Bordello of Blood (1996),0.6754663844488256)**
    **(Army of Darkness (1993),0.6702643811753909)**
    **(Mystery Science Theater 3000: The Movie (1996),0.6594872765176396)**
    **(Scream (1996),0.6538249646863378)**

### Tip

Once again, note that you might see quite different results due to random model initialization.

Now that you have computed similar items using cosine similarity, see if you can do the same with the user-factor vectors to compute similar users for a given user.

# Evaluating the performance of recommendation models

How do we know whether the model we have trained is a good model? We need to be able to evaluate its predictive performance in some way. **Evaluation metrics** are measures of a model's predictive capability or accuracy. Some are direct measures of how well a model predicts the target variable (such as Mean Squared Error), while others are concerned with how well the model performs at predicting things that might not be directly optimized in the model, but are often closer to what we care about in the real world (such as mean average precision).

Evaluation metrics provide a standardized way of comparing the performance of the same model with different parameter settings, and of comparing performance across different models. Using these metrics, we can perform model selection to choose the best-performing model from the set of models we wish to evaluate.

Here, we will show you how to calculate two common evaluation metrics used in recommender systems and collaborative filtering models: Mean Squared Error and mean average precision at K.

## Mean Squared Error

The **Mean Squared Error** ( **MSE** ) is a direct measure of the reconstruction error of the user-item rating matrix.
It is also the objective function being minimized in certain models, specifically many matrix-factorization techniques, including `ALS`. As such, it is commonly used in explicit ratings settings.

It is defined as the sum of the squared errors divided by the number of observations. The squared error, in turn, is the square of the difference between the predicted rating for a given user-item pair and the actual rating.

We will use our user `789` as an example. Let's take the first rating for this user from the `moviesForUser` set of `Ratings` that we previously computed:

    val actualRating = moviesForUser.take(1)(0)

Here is the output:

    **actualRating: org.apache.spark.mllib.recommendation.Rating = Rating(789,1012,4.0)**

We will see that the rating for this user-item combination is 4. Next, we will compute the model's predicted rating:

    val predictedRating = model.predict(789, actualRating.product)

The output of the model's predicted rating is as follows:

    **...**
    **14/04/13 13:01:15 INFO SparkContext: Job finished: lookup at MatrixFactorizationModel.scala:46, took 0.025404 s**
    **predictedRating: Double = 4.001005374200248**

We will see that the predicted rating is about 4, which is very close to the actual rating. Finally, we will compute the squared error between the actual rating and the predicted rating:

    val squaredError = math.pow(predictedRating - actualRating.rating, 2.0)

The preceding code will output the squared error:

    **squaredError: Double = 1.010777282523947E-6**

So, in order to compute the overall MSE for the dataset, we need to compute this squared error for each `(user, movie, actual rating, predicted rating)` entry, sum them up, and divide by the number of ratings. We will do this in the following code snippet.

### Tip

Note that the following code is adapted from the Apache Spark programming guide for ALS at .

First, we will extract the user and product IDs from the `ratings` RDD and make predictions for each user-item pair using `model.predict`. We will use the user-item pair as the key and the predicted rating as the value:

    val usersProducts = ratings.map{ case Rating(user, product, rating) => (user, product)}
    val predictions = model.predict(usersProducts).map{
      case Rating(user, product, rating) => ((user, product), rating)
    }

Next, we extract the actual ratings by also mapping the `ratings` RDD so that the user-item pair is the key and the actual rating is the value. Now that we have two RDDs with the same form of key, we can join them together to create a new RDD with the actual and predicted ratings for each user-item combination:

    val ratingsAndPredictions = ratings.map{
      case Rating(user, product, rating) => ((user, product), rating)
    }.join(predictions)

Finally, we will compute the MSE by summing up the squared errors using `reduce` and dividing by the number of records obtained from the `count` method:

    val MSE = ratingsAndPredictions.map{
      case ((user, product), (actual, predicted)) => math.pow((actual - predicted), 2)
    }.reduce(_ + _) / ratingsAndPredictions.count
    println("Mean Squared Error = " + MSE)

The output is as follows:

    **Mean Squared Error = 0.08231947642632852**

It is common to use the **Root Mean Squared Error** ( **RMSE** ), which is just the square root of the MSE metric. This is somewhat more interpretable, as it is in the same units as the underlying data (that is, the ratings in this case).
It is equivalent to the standard deviation of the differences between the predicted and actual ratings. We can compute it simply as follows:

    val RMSE = math.sqrt(MSE)
    println("Root Mean Squared Error = " + RMSE)

The preceding code will print the Root Mean Squared Error:

    **Root Mean Squared Error = 0.2869137090247319**

## Mean average precision at K

**Mean average precision at K** ( **MAPK** ) is the mean of the **average precision at K** ( **APK** ) metric across all instances in the dataset. APK is a metric commonly used in information retrieval; it measures the average relevance of a set of _top-K_ documents presented in response to a query. For each query instance, we will compare the set of _top-K_ results with the set of actual relevant documents (that is, a ground truth set of relevant documents for the query).

In the APK metric, the order of the result set matters: the APK score is higher when the result documents are both relevant and presented nearer the top of the results. It is, thus, a good metric for recommender systems, in that we typically compute the _top-K_ recommended items for each user and present these to the user. Of course, we prefer models where the items with the highest predicted scores (which are presented at the top of the list of recommendations) are, in fact, the most relevant items for the user. APK and other ranking-based metrics are also more appropriate evaluation measures for implicit datasets; here, MSE makes less sense.

In order to evaluate our model, we can use APK, where each user is the equivalent of a query, and the set of _top-K_ recommended items is the document result set. The relevant documents (that is, the ground truth) in this case are the set of items that a user interacted with. Hence, APK attempts to measure how good our model is at predicting items that a user will find relevant and choose to interact with.

### Note

The code for the following average precision computation is based on .

More information on MAPK can be found at .

Our function to compute the APK is shown here:

    def avgPrecisionK(actual: Seq[Int], predicted: Seq[Int], k: Int): Double = {
      val predK = predicted.take(k)
      var score = 0.0
      var numHits = 0.0
      for ((p, i) <- predK.zipWithIndex) {
        if (actual.contains(p)) {
          numHits += 1.0
          score += numHits / (i.toDouble + 1.0)
        }
      }
      if (actual.isEmpty) {
        1.0
      } else {
        score / scala.math.min(actual.size, k).toDouble
      }
    }

As you can see, this takes as input a list of `actual` item IDs that are associated with the user, and another list of `predicted` item IDs, which are our model's estimates of what will be relevant to the user.

We can compute the APK metric for our example user `789` as follows.
First, we will extract the actual movie IDs for the user:

    val actualMovies = moviesForUser.map(_.product)

The output is as follows:

    **actualMovies: Seq[Int] = ArrayBuffer(1012, 127, 475, 93, 1161, 286, 293, 9, 50, 294, 181, 1, 1008, 508, 284, 1017, 137, 111, 742, 248, 249, 1007, 591, 150, 276, 151, 129, 100, 741, 288, 762, 628, 124)**

We will then use the movie recommendations we made previously to compute the APK score using `K = 10`:

    val predictedMovies = topKRecs.map(_.product)

Here is the output:

    **predictedMovies: Array[Int] = Array(27, 497, 633, 827, 602, 849, 401, 584, 1035, 1014)**

The following code will produce the average precision:

    val apk10 = avgPrecisionK(actualMovies, predictedMovies, 10)

The preceding code will print:

    **apk10: Double = 0.0**

In this case, we can see that our model is not doing a very good job of predicting relevant movies for this user, as the APK score is 0.

In order to compute the APK for each user and average them to compute the overall MAPK, we will need to generate the list of recommendations for each user in our dataset. While this can be fairly intensive on a large scale, we can distribute the computation using our Spark functionality. However, one limitation is that each worker must have the full item-factor matrix available so that it can compute the dot product between the relevant user vector and all item vectors. This can be a problem when the number of items is extremely high, as the item matrix must fit in the memory of one machine.

### Tip

There is actually no easy way around this limitation. One possible approach is to only compute recommendations for a subset of items from the total item set, using approximate techniques such as Locality Sensitive Hashing ().

We will now see how to go about this. First, we will collect the item factors and form a `DoubleMatrix` object from them:

    val itemFactors = model.productFeatures.map { case (id, factor) => factor }.collect()
    val itemMatrix = new DoubleMatrix(itemFactors)
    println(itemMatrix.rows, itemMatrix.columns)

The output of the preceding code is as follows:

    **(1682,50)**

This gives us a matrix with `1682` rows and `50` columns, as we would expect from `1682` movies with a factor dimension of `50`. Next, we will distribute the item matrix as a broadcast variable so that it is available on each worker node:

    val imBroadcast = sc.broadcast(itemMatrix)

You will see the output as follows:

    **14/04/13 21:02:01 INFO MemoryStore: ensureFreeSpace(672960) called with curMem=4006896, maxMem=311387750**
    **14/04/13 21:02:01 INFO MemoryStore: Block broadcast_21 stored as values to memory (estimated size 657.2 KB, free 292.5 MB)**
    **imBroadcast: org.apache.spark.broadcast.Broadcast[org.jblas.DoubleMatrix] = Broadcast(21)**

Now we are ready to compute the recommendations for each user. We will do this by applying a `map` function to each user factor, within which we will perform a matrix multiplication between the user-factor vector and the movie-factor matrix. The result is a vector (of length `1682`, that is, the number of movies we have) with the predicted rating for each movie.
We will then sort these predictions by the predicted rating:

    val allRecs = model.userFeatures.map{ case (userId, array) =>
      val userVector = new DoubleMatrix(array)
      val scores = imBroadcast.value.mmul(userVector)
      val sortedWithId = scores.data.zipWithIndex.sortBy(-_._1)
      val recommendedIds = sortedWithId.map(_._2 + 1).toSeq
      (userId, recommendedIds)
    }

You will see the following on the screen:

    **allRecs: org.apache.spark.rdd.RDD[(Int, Seq[Int])] = MappedRDD[269] at map at <console>:29**

As we can see, we now have an RDD that contains a list of movie IDs for each user ID. These movie IDs are sorted in order of the estimated rating.

### Tip

Note that we needed to add 1 to the returned movie IDs (the `_._2 + 1` expression in the preceding code snippet), as the item-factor matrix is 0-indexed, while our movie IDs start at `1`.

We also need the list of movie IDs for each user to pass into our APK function as the `actual` argument. We already have the `ratings` RDD ready, so we can extract just the user and movie IDs from it.

If we use Spark's `groupBy` operator, we will get an RDD that contains a list of `(userid, movieid)` pairs for each user ID (as the user ID is the key on which we perform the `groupBy` operation):

    val userMovies = ratings.map{ case Rating(user, product, rating) => (user, product) }.groupBy(_._1)

The output of the preceding code is as follows:

    **userMovies: org.apache.spark.rdd.RDD[(Int, Seq[(Int, Int)])] = MapPartitionsRDD[277] at groupBy at <console>:21**

Finally, we can use Spark's `join` operator to join these two RDDs together on the user ID key. Then, for each user, we have the list of actual and predicted movie IDs that we can pass to our APK function. In a manner similar to how we computed MSE, we will sum each of these APK scores using a `reduce` action and divide by the number of users (that is, the count of the `allRecs` RDD):

    val K = 10
    val MAPK = allRecs.join(userMovies).map{ case (userId, (predicted, actualWithIds)) =>
      val actual = actualWithIds.map(_._2).toSeq
      avgPrecisionK(actual, predicted, K)
    }.reduce(_ + _) / allRecs.count
    println("Mean Average Precision at K = " + MAPK)

The preceding code will print the mean average precision at K as follows:

    **Mean Average Precision at K = 0.030486963254725705**

Our model achieves a fairly low MAPK. However, note that typical values for recommendation tasks are usually relatively low, especially if the item set is extremely large.

Try out a few parameter settings for `lambda` and `rank` (and `alpha` if you are using the implicit version of ALS), and see whether you can find a model that performs better based on the RMSE and MAPK evaluation metrics.

## Using MLlib's built-in evaluation functions

While we have computed MSE, RMSE, and MAPK from scratch, and it is a useful learning exercise to do so, MLlib provides convenience functions to do this for us in the `RegressionMetrics` and `RankingMetrics` classes.

### RMSE and MSE

First, we will compute the MSE and RMSE metrics using `RegressionMetrics`. We will instantiate a `RegressionMetrics` instance by passing in an RDD of key-value pairs that represent the predicted and true values for each data point, as shown in the following code snippet.
Here, we will again use the `ratingsAndPredictions` RDD we computed in our earlier example:

    import org.apache.spark.mllib.evaluation.RegressionMetrics
    val predictedAndTrue = ratingsAndPredictions.map { case ((user, product), (predicted, actual)) => (predicted, actual) }
    val regressionMetrics = new RegressionMetrics(predictedAndTrue)

We can then access various metrics, including MSE and RMSE. We will print out these metrics here:

    println("Mean Squared Error = " + regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error = " + regressionMetrics.rootMeanSquaredError)

You will see that the output for MSE and RMSE is exactly the same as the metrics we computed earlier:

    **Mean Squared Error = 0.08231947642632852**
    **Root Mean Squared Error = 0.2869137090247319**

### MAP

As we did for MSE and RMSE, we can compute ranking-based evaluation metrics using MLlib's `RankingMetrics` class. Similar to our own average precision function, we need to pass in an RDD of key-value pairs, where the key is an `Array` of predicted item IDs for a user, while the value is an array of actual item IDs.

The implementation of the average precision at K function in `RankingMetrics` is slightly different from ours, so we will get different results. However, the computation of the overall mean average precision (MAP, which does not use a cutoff at K) is the same as our function if we select `K` to be very high (say, at least as high as the number of items in our item set).

First, we will calculate MAP using `RankingMetrics`:

    import org.apache.spark.mllib.evaluation.RankingMetrics
    val predictedAndTrueForRanking = allRecs.join(userMovies).map{ case (userId, (predicted, actualWithIds)) =>
      val actual = actualWithIds.map(_._2)
      (predicted.toArray, actual.toArray)
    }
    val rankingMetrics = new RankingMetrics(predictedAndTrueForRanking)
    println("Mean Average Precision = " + rankingMetrics.meanAveragePrecision)

You will see the following output:

    **Mean Average Precision = 0.07171412913757183**

Next, we will use our function to compute the MAP in exactly the same way as we did previously, except that we set `K` to a very high value, say `2000`:

    val MAPK2000 = allRecs.join(userMovies).map{ case (userId, (predicted, actualWithIds)) =>
      val actual = actualWithIds.map(_._2).toSeq
      avgPrecisionK(actual, predicted, 2000)
    }.reduce(_ + _) / allRecs.count
    println("Mean Average Precision = " + MAPK2000)

You will see that the MAP from our own function is the same as the one computed using `RankingMetrics`:

    **Mean Average Precision = 0.07171412913757186**

### Note

We will not cover cross-validation in this chapter, as we will provide a detailed treatment in the next few chapters. However, note that the same techniques for cross-validation that are explored in the upcoming chapters can be used to evaluate recommendation models, using performance metrics such as MSE, RMSE, and MAP, which we covered in this section.

# Summary

In this chapter, we used Spark's MLlib library to train a collaborative filtering recommendation model, and you learned how to use this model to make predictions for the items that a given user might have a preference for. We also used our model to find items that are similar or related to a given item. Finally, we explored common metrics to evaluate the predictive capability of our recommendation model.

In the next chapter, you will learn how to use Spark to train a model to classify your data and to use standard evaluation mechanisms to gauge the performance of your model.

# Chapter 5. Building a Classification Model with Spark

In this chapter, you will learn the basics of classification models and how they can be used in a variety of contexts. Classification generically refers to classifying things into distinct categories or classes. In the case of a classification model, we typically wish to assign classes based on a set of features. The features might represent variables related to an item or object, an event or context, or some combination of these.

The simplest form of classification is when we have two classes; this is referred to as binary classification. One of the classes is usually labeled as the positive class (assigned a label of 1), while the other is labeled as the negative class (assigned a label of -1 or, sometimes, 0).

A simple example with two classes is shown in the following figure. The input features in this case have two dimensions, and the feature values are represented on the _x_ and _y_ axes in the figure.

Our task is to train a model that can classify new data points in this two-dimensional space as either one class (red) or the other (blue).

A simple binary classification problem

If we have more than two classes, we refer to this as multiclass classification, and classes are typically labeled using integer numbers starting at 0 (for example, five different classes would range from label 0 to 4). An example is shown in the following figure. Again, the input features are assumed to be two-dimensional for ease of illustration.

A simple multiclass classification problem

Classification is a form of supervised learning where we train a model with training examples that include known targets or outcomes of interest (that is, the model is supervised with these example outcomes). Classification models can be used in many situations, but a few common examples include:

  * Predicting the probability of Internet users clicking on an online advert; here, the classes are binary in nature (that is, click or no click)
  * Detecting fraud; again, in this case, the classes are commonly binary (fraud or no fraud)
  * Predicting defaults on loans (binary)
  * Classifying images, video, or sounds (most often multiclass, with potentially very many different classes)
  * Assigning categories or tags to news articles, web pages, or other content (multiclass)
  * Discovering e-mail and web spam, network intrusions, and other malicious behavior (binary or multiclass)
  * Detecting failure situations, for example, in computer systems or networks
  * Ranking customers or users in order of the probability that they might purchase a product or use a service (this can be framed as classification by predicting probabilities and then ranking in descending order)
  * Predicting customers or users who might stop using a product, service, or provider (called churn)

These are just a few possible use cases. In fact, it is probably safe to say that classification is one of the most widely used machine learning and statistical techniques in modern businesses, especially online businesses.

In this chapter, we will:

  * Discuss the types of classification models available in MLlib
  * Use Spark to extract the appropriate features from raw input data
  * Train a number of classification models using MLlib
  * Make predictions with our classification models
  * Apply a number of standard evaluation techniques to assess the predictive performance of our models
  * Illustrate how to improve model performance using some of the feature-extraction approaches from Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_
  * Explore the impact of parameter tuning on model performance and learn how to use cross-validation to select the most optimal model parameters

# Types of classification models

We will explore three common classification models available in Spark: linear models, decision trees, and naive Bayes models. Linear models, while less complex, are relatively easy to scale to very large datasets. Decision trees are a powerful nonlinear technique that can be a little more difficult to scale up (fortunately, MLlib takes care of this for us!) and more computationally intensive to train, but delivers leading performance in many situations. Naive Bayes models are simpler, but are easy to train efficiently and parallelize (in fact, they require only one pass over the dataset). They can also give reasonable performance in many cases when appropriate feature engineering is used. A naive Bayes model also provides a good baseline model against which we can measure the performance of other models.

Currently, Spark's MLlib library supports binary classification for linear models, decision trees, and naive Bayes models, and multiclass classification for decision trees and naive Bayes models. In this book, for simplicity in illustrating the examples, we will focus on the binary case.

## Linear models

The core idea of linear models (or generalized linear models) is that we model the predicted outcome of interest (often called the target or dependent variable) as a function of a simple linear predictor applied to the input variables (also referred to as features or independent variables):

    y = f(w^T x)

Here, _y_ is the target variable, _w_ is the vector of parameters (known as the weight vector), and _x_ is the vector of input features.

_w^T x_ is the linear predictor (or vector dot product) of the weight vector _w_ and feature vector _x_. To this linear predictor, we apply a function _f_ (called the link function).

Linear models can, in fact, be used for both classification and regression, simply by changing the link function. Standard linear regression (covered in the next chapter) uses an identity link (that is, _y = w^T x_ directly), while binary classification uses alternative link functions, as discussed here.

Let's take a look at the example of online advertising. In this case, the target variable would be 0 (often assigned the class label of -1 in mathematical treatments) if no click was observed for a given advert displayed on a web page (called an impression). The target variable would be 1 if a click occurred. The feature vector for each impression would consist of variables related to the impression event (such as features relating to the user, web page, advert and advertiser, and various other factors relating to the context of the event, such as the type of device used, the time of the day, and geolocation). A small code sketch of such a linear predictor follows.
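
To make this concrete, here is a minimal, self-contained sketch of how a linear model turns a feature vector into a prediction using a logistic link; the weights, feature values, and variable names are made up purely for illustration and are not MLlib code:

    // Hypothetical weight vector w and feature vector x for one impression.
    val w = Array(0.5, -0.2, 0.1)
    val x = Array(1.0, 3.0, -1.0)
    // The linear predictor w^T x is a simple dot product.
    val linearPredictor = w.zip(x).map { case (wi, xi) => wi * xi }.sum
    // A logistic link function f maps the raw score to a probability in (0, 1).
    val f = (z: Double) => 1.0 / (1.0 + math.exp(-z))
    val predictedProbability = f(linearPredictor)

Applying a threshold (say, 0.5) to `predictedProbability` would then turn this probability into a binary class prediction.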

Thus, we would like to find a model that maps a given input feature vector (advert impression) to a predicted outcome (click or not). To make a prediction for a new data point, we will take the new feature vector (which is unseen, and hence, we do not know what the target variable is) and compute the dot product with our weight vector. We will then apply the relevant link function, and the result is our predicted outcome (after applying a threshold to the prediction, in the case of some models).

Given a set of input data in the form of feature vectors and target variables, we would like to find the weight vector that is the best fit for the data, in the sense that we minimize some error between what our model predicts and the actual outcomes observed. This process is called **model fitting** , **training** , or **optimization**.

More formally, we seek to find the weight vector that minimizes the sum, over all the training examples, of the loss (or error) computed from some loss function. The loss function takes the weight vector, feature vector, and the actual outcome for a given training example as input, and outputs the loss. In fact, the loss function itself is effectively specified by the link function; hence, for a given type of classification or regression (that is, a given link function), there is a corresponding loss function.

### Tip

For further details on linear models and loss functions, see the linear methods section related to binary classification in the _Spark Programming Guide_ at . Also, see the Wikipedia entry for generalized linear models at .

While a detailed treatment of linear models and loss functions is beyond the scope of this book, MLlib provides two loss functions suitable for binary classification (you can learn more about them from the Spark documentation). The first one is the logistic loss, which equates to a model known as **logistic regression** , while the second one is the hinge loss, which is equivalent to a linear **Support Vector Machine** ( **SVM** ). Note that the SVM does not strictly fall into the statistical framework of generalized linear models, but it can be used in the same way, as it essentially specifies a loss and link function.

In the following image, we show the logistic loss and hinge loss relative to the actual zero-one loss. The zero-one loss is the true loss for binary classification--it is either zero if the model predicts correctly or one if the model predicts incorrectly. The reason it is not actually used is that it is not a differentiable loss function, so it is not possible to easily compute a gradient, and it is thus very difficult to optimize.

The other loss functions are approximations to the zero-one loss that make optimization possible.

The logistic, hinge, and zero-one loss functions

### Note

The preceding loss diagram is adapted from the scikit-learn example at .

### Logistic regression

Logistic regression is a probabilistic model--that is, its predictions are bounded between 0 and 1, and for binary classification they equate to the model's estimate of the probability of the data point belonging to the positive class. Logistic regression is one of the most widely used linear classification models.

As mentioned earlier, the link function used in logistic regression is the logit link:

    1 / (1 + exp(-w^T x))

The related loss function for logistic regression is the logistic loss:

    log(1 + exp(-y w^T x))

Here, _y_ is the actual target variable (either _1_ for the positive class or _-1_ for the negative class).

### Linear support vector machines

SVM is a powerful and popular technique for regression and classification. Unlike logistic regression, it is not a probabilistic model, but predicts classes based on whether the model evaluation is positive or negative.

The SVM link function is the identity link, so the predicted outcome is:

    y = w^T x

Hence, if the evaluation of _w^T x_ is greater than or equal to a threshold of 0, the SVM will assign the data point to class 1; otherwise, the SVM will assign it to class 0 (this threshold is a model parameter of SVM and can be adjusted).

The loss function for SVM is known as the **hinge loss** and is defined as:

    max(0, 1 - y w^T x)

SVM is a maximum margin classifier--it tries to find a weight vector such that the classes are separated as much as possible. It has been shown to perform well on many classification tasks, and the linear variant can scale to very large datasets.

### Note

SVMs have a large amount of theory behind them, which is beyond the scope of this book, but you can visit and for more details.

In the following image, we have plotted the different decision functions for logistic regression (the blue line) and linear SVM (the red line), based on the simple binary classification example explained earlier.

You can see that the SVM effectively focuses on the points that lie closest to the decision function (the margin lines are shown with red dashes):

Decision functions for logistic regression and linear SVM for binary classification

## The naive Bayes model

Naive Bayes is a probabilistic model that makes predictions by computing the probability that a data point belongs to a given class. A naive Bayes model assumes that each feature makes an independent contribution to the probability assigned to a class (that is, it assumes conditional independence between features).

Due to this assumption, the probability of each class given the features becomes a function of the product of the conditional probabilities of each feature occurring given the class, multiplied by the prior probability of the class. This makes training the model tractable and relatively straightforward. The class prior probabilities and feature conditional probabilities are all estimated from the frequencies present in the dataset. Classification is performed by selecting the most probable class, given the features and class probabilities.

An assumption is also made about the feature distributions (the parameters of which are estimated from the data). MLlib implements multinomial naive Bayes, which assumes that the feature distribution is a multinomial distribution that represents non-negative frequency counts of the features.

It is suitable for binary features (for example, _1-of-k_ encoded categorical features) and is commonly used for text and document classification (where, as we have seen in Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_ , the bag-of-words vector is a typical feature representation).

### Note

Take a look at the _MLlib - Naive Bayes_ section in the Spark documentation at for more information.

The Wikipedia page at has a more detailed explanation of the mathematical formulation.
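
To express the preceding description in symbols (this is the standard naive Bayes formulation, written in our own notation), with class prior P(c) and per-feature conditional probabilities P(x_i | c), the score for a class given features x_1, ..., x_n is:

    P(c | x_1, ..., x_n) ∝ P(c) * P(x_1 | c) * P(x_2 | c) * ... * P(x_n | c)

Classification then simply selects the class c with the highest score.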

Here, we have shown the decision function of naive Bayes on our simple binary classification example:

Decision function of naive Bayes for binary classification

## Decision trees

The decision tree model is a powerful, nonprobabilistic technique that can capture more complex nonlinear patterns and feature interactions. Decision trees have been shown to perform well on many tasks, are relatively easy to understand and interpret, can handle categorical and numerical features, and do not require input data to be scaled or standardized. They are well suited to be included in ensemble methods (for example, ensembles of decision tree models, which are called decision forests).

The decision tree model constructs a tree where the leaves represent a class assignment to class 0 or 1, and the branches represent decisions based on feature values. In the following figure, we show a simple decision tree where the binary outcome is **Stay at home** or **Go to the beach**. The features are the weather conditions outside.

A simple decision tree

The decision tree algorithm is a top-down approach that begins at a root node (or feature), and then selects a feature at each step that gives the best split of the dataset, as measured by the information gain of this split. The information gain is computed from the node impurity (which is the extent to which the labels at the node are similar, or homogeneous) minus the weighted sum of the impurities of the two child nodes that would be created by the split. For classification tasks, there are two measures that can be used to select the best split: Gini impurity and entropy.

### Note

See the _MLlib - Decision Tree_ section in the _Spark Programming Guide_ at for further details on the decision tree algorithm and impurity measures for classification.

In the following screenshot, we have plotted the decision boundary for the decision tree model, as we did for the other models earlier. We can see that the decision tree is able to fit complex, nonlinear models.

Decision function for a decision tree for binary classification

# Extracting the right features from your data

You might recall from Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_ , that the majority of machine learning models operate on numerical data in the form of feature vectors. In addition, for supervised learning methods such as classification and regression, we need to provide the target variable (or variables, in the case of multiclass situations) together with the feature vector.

Classification models in MLlib operate on instances of `LabeledPoint`, which is a wrapper around the target variable (called the **label** ) and the **feature vector** :

    case class LabeledPoint(label: Double, features: Vector)

While in most classification examples you will come across existing datasets that are already in vector format, in practice, you will usually start with raw data that needs to be transformed into features. As we have already seen, this can involve preprocessing and transformation, such as binning numerical features, scaling and normalizing features, and using _1-of-k_ encodings for categorical features.

## Extracting features from the Kaggle/StumbleUpon evergreen classification dataset

In this chapter, we will use a different dataset from the one we used for our recommendation model, as the MovieLens data doesn't have much for us to work with in terms of a classification problem. We will use a dataset from a competition on Kaggle.
The dataset was provided by StumbleUpon, and the problem relates to classifying whether a given web page is ephemeral (that is, short lived and will cease being popular soon) or evergreen (that is, persistently popular) on their web content recommendation pages.

### Note

The dataset used here can be downloaded from .

Download the training data (`train.tsv`)--you will need to accept the terms and conditions before downloading the dataset.

You can find more information about the competition at .

Before we begin, it will be easier for us to work with the data in Spark if we remove the column name header from the first line of the file. Change to the directory in which you downloaded the data (referred to as `PATH` here) and run the following command to remove the first line and pipe the result to a new file called `train_noheader.tsv`:

    **> sed 1d train.tsv > train_noheader.tsv**

Now, we are ready to start up our Spark shell (remember to run this command from your Spark installation directory):

    **> ./bin/spark-shell --driver-memory 4g**

You can type in the code that follows for the remainder of this chapter directly into your Spark shell.

In a manner similar to what we did in the earlier chapters, we will load the raw training data into an RDD and inspect it:

    val rawData = sc.textFile("/PATH/train_noheader.tsv")
    val records = rawData.map(line => line.split("\t"))
    records.first()

You will see the following on the screen:

    **Array[String] = Array("http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html", "4042", ...**

You can check the fields that are available by reading through the overview on the dataset page above. The first two columns contain the URL and ID of the page. The next column contains some raw textual content, and the column after that contains the category assigned to the page. The next 22 columns contain numeric or categorical features of various kinds. The final column contains the target--1 is evergreen, while 0 is non-evergreen.

We'll start off with a simple approach of using only the available numeric features directly. As each categorical variable is binary, we already have a _1-of-k_ encoding for these variables, so we don't need to do any further feature extraction.

Due to the way the data is formatted, we will have to do a bit of data cleaning during our initial processing by trimming out the extra quotation characters (`"`). There are also missing values in the dataset; they are denoted by the `"?"` character. In this case, we will simply assign a zero value to these missing values:

    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.linalg.Vectors
    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

In the preceding code, we extracted the label variable from the last column and an array of features for columns 5 to 25, after cleaning and dealing with missing values. We converted the label to an `Int` value and the features to an `Array[Double]`. Finally, we wrapped the label and features in a `LabeledPoint` instance, converting the features into an MLlib `Vector`.

We will also cache the data and count the number of data points:

    data.cache
    val numData = data.count

You will see that the value of `numData` is 7395.
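
Before moving on, it can be worth a quick sanity check that the parsing worked as intended; the following snippet is our own addition, not part of the original walkthrough:

    // Inspect the first parsed record: the label should be 0 or 1, and the
    // feature vector should contain the 22 numeric feature values.
    val firstPoint = data.first()
    println(firstPoint.label)
    println(firstPoint.features.size)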

We will explore the dataset in more detail a little later, but we will tell you now that there are some negative feature values in the numeric data. As we saw earlier, the naive Bayes model requires non-negative features and will throw an error if it encounters negative values. So, for now, we will create a version of our input feature vectors for the naive Bayes model by setting any negative feature values to zero:

    val nbData = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble).map(d => if (d < 0) 0.0 else d)
      LabeledPoint(label, Vectors.dense(features))
    }

# Training classification models

Now that we have extracted some basic features from our dataset and created our input RDD, we are ready to train a number of models. To compare the performance and use of different models, we will train a model using logistic regression, SVM, naive Bayes, and a decision tree. You will notice that training each model looks nearly identical, although each has its own specific model parameters that can be set. MLlib sets sensible defaults in most cases, but in practice, the best parameter settings should be selected using evaluation techniques, which we will cover later in this chapter.

## Training a classification model on the Kaggle/StumbleUpon evergreen classification dataset

We can now apply the models from MLlib to our input data. First, we need to import the required classes and set up some minimal input parameters for each model. For logistic regression and SVM, this is the number of iterations, while for the decision tree model, it is the maximum tree depth:

    import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
    import org.apache.spark.mllib.classification.SVMWithSGD
    import org.apache.spark.mllib.classification.NaiveBayes
    import org.apache.spark.mllib.tree.DecisionTree
    import org.apache.spark.mllib.tree.configuration.Algo
    import org.apache.spark.mllib.tree.impurity.Entropy
    val numIterations = 10
    val maxTreeDepth = 5

Now, we will train each model in turn. First, we will train logistic regression:

    val lrModel = LogisticRegressionWithSGD.train(data, numIterations)

    **...**
    **14/12/06 13:41:47 INFO DAGScheduler: Job 81 finished: reduce at RDDFunctions.scala:112, took 0.011968 s**
    **14/12/06 13:41:47 INFO GradientDescent: GradientDescent.runMiniBatchSGD finished. Last 10 stochastic losses 0.6931471805599474, 1196521.395699124, Infinity, 1861127.002201189, Infinity, 2639638.049627607, Infinity, Infinity, Infinity, Infinity**
    **lrModel: org.apache.spark.mllib.classification.LogisticRegressionModel = (weights=[-0.11372778986947886,-0.511619752777837,**
    **...**

Next up, we will train an SVM model:

    val svmModel = SVMWithSGD.train(data, numIterations)

You will see the following output:

    **...**
    **14/12/06 13:43:08 INFO DAGScheduler: Job 94 finished: reduce at RDDFunctions.scala:112, took 0.007192 s**
    **14/12/06 13:43:08 INFO GradientDescent: GradientDescent.runMiniBatchSGD finished. Last 10 stochastic losses 1.0, 2398226.619666797, 2196192.9647478117, 3057987.2024311484, 271452.9038284356, 3158131.191895948, 1041799.350498323, 1507522.941537049, 1754560.9909073508, 136866.76745605646**
    **svmModel: org.apache.spark.mllib.classification.SVMModel = (weights=[-0.12218838697834929,-0.5275107581589767,**
    **...**

Then, we will train the naive Bayes model; remember to use your special non-negative feature dataset:

    val nbModel = NaiveBayes.train(nbData)

The following is the output:

    **...**
    **14/12/06 13:44:48 INFO DAGScheduler: Job 95 finished: collect at NaiveBayes.scala:120, took 0.441273 s**
    **nbModel: org.apache.spark.mllib.classification.NaiveBayesModel = org.apache.spark.mllib.classification.NaiveBayesModel@666ac612**
    **...**

Finally, we will train our decision tree:

    val dtModel = DecisionTree.train(data, Algo.Classification, Entropy, maxTreeDepth)

The output is as follows:

    **...**
    **14/12/06 13:46:03 INFO DAGScheduler: Job 104 finished: collectAsMap at DecisionTree.scala:653, took 0.031338 s**
    **...**
    **total: 0.343024**
    **findSplitsBins: 0.119499**
    **findBestSplits: 0.200352**
    **chooseSplits: 0.199705**
    **dtModel: org.apache.spark.mllib.tree.model.DecisionTreeModel = DecisionTreeModel classifier of depth 5 with 61 nodes**
    **...**

Notice that we set the mode, or `Algo`, of the decision tree to `Classification`, and we used the `Entropy` impurity measure.

# Using classification models

We now have four models trained on our input labels and features. Next, we will see how to use these models to make predictions on our dataset. For now, we will use the same training data to illustrate the `predict` method of each model.

## Generating predictions for the Kaggle/StumbleUpon evergreen classification dataset

We will use our logistic regression model as an example (the other models are used in the same way):

    val dataPoint = data.first
    val prediction = lrModel.predict(dataPoint.features)

The following is the output:

    **prediction: Double = 1.0**

We can see that for the first data point in our training dataset, the model predicted a label of `1` (that is, evergreen). Let's examine the true label for this data point:

    val trueLabel = dataPoint.label

You can see the following output:

    **trueLabel: Double = 0.0**

So, in this case, our model got it wrong!

We can also make predictions in bulk by passing in an `RDD[Vector]` as input:

    val predictions = lrModel.predict(data.map(lp => lp.features))
    predictions.take(5)

The following is the output:

    **Array[Double] = Array(1.0, 1.0, 1.0, 1.0, 1.0)**

# Evaluating the performance of classification models

When we make predictions using our model, as we did earlier, how do we know whether the predictions are good or not? We need to be able to evaluate how well our model performs. Evaluation metrics commonly used in binary classification include prediction accuracy and error, precision and recall, the area under the precision-recall curve, the **receiver operating characteristic** ( **ROC** ) curve, the **area under ROC curve** ( **AUC** ), and the F-measure.

## Accuracy and prediction error

The prediction error for binary classification is possibly the simplest measure available. It is the number of training examples that are misclassified, divided by the total number of examples. Similarly, accuracy is the number of correctly classified examples divided by the total number of examples.
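
In symbols, these two definitions are simply:

    error = #misclassified examples / #total examples
    accuracy = #correctly classified examples / #total examples = 1 - error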

We can calculate the accuracy of our models on our training data by making predictions on each input feature and comparing them to the true label. We will sum up the number of correctly classified instances and divide this by the total number of data points to get the average classification accuracy:

    val lrTotalCorrect = data.map { point =>
      if (lrModel.predict(point.features) == point.label) 1 else 0
    }.sum
    val lrAccuracy = lrTotalCorrect / data.count

The output is as follows:

    **lrAccuracy: Double = 0.5146720757268425**

This gives us 51.5 percent accuracy, which doesn't look particularly impressive! Our model got only half of the training examples correct, which seems to be about as good as random chance.

### Note

Note that the predictions made by the model are not naturally exactly 1 or 0. The output is usually a real number that must be turned into a class prediction. This is done through the use of a threshold in the classifier's decision or scoring function.

For example, binary logistic regression is a probabilistic model that returns the estimated probability of class 1 in its scoring function. Thus, a decision threshold of 0.5 is typical. That is, if the estimated probability of being in class 1 is higher than 50 percent, the model decides to classify the point as class 1; otherwise, it will be classified as class 0.

Note that the threshold itself is effectively a model parameter that can be tuned in some models. It also plays a role in evaluation measures, as we will see now.

What about the other models? Let's compute the accuracy for the other three:

    val svmTotalCorrect = data.map { point =>
      if (svmModel.predict(point.features) == point.label) 1 else 0
    }.sum
    val nbTotalCorrect = nbData.map { point =>
      if (nbModel.predict(point.features) == point.label) 1 else 0
    }.sum

Note that the decision tree prediction threshold needs to be applied explicitly, as shown here:

    val dtTotalCorrect = data.map { point =>
      val score = dtModel.predict(point.features)
      val predicted = if (score > 0.5) 1 else 0
      if (predicted == point.label) 1 else 0
    }.sum

We can now inspect the accuracy for the other three models.

First, the SVM model:

    val svmAccuracy = svmTotalCorrect / numData

Here is the output for the SVM model:

    **svmAccuracy: Double = 0.5146720757268425**

Next, our naive Bayes model:

    val nbAccuracy = nbTotalCorrect / numData

The output is as follows:

    **nbAccuracy: Double = 0.5803921568627451**

Finally, we compute the accuracy for the decision tree:

    val dtAccuracy = dtTotalCorrect / numData

And the output is:

    **dtAccuracy: Double = 0.6482758620689655**

We can see that both SVM and naive Bayes also performed quite poorly. The decision tree model is better, with 65 percent accuracy, but this is still not particularly high.

## Precision and recall

In information retrieval, precision is a commonly used measure of the quality of the results, while recall is a measure of the completeness of the results.

In the binary classification context, precision is defined as the number of true positives (that is, the number of examples correctly predicted as class 1) divided by the sum of true positives and false positives (that is, the number of examples that were incorrectly predicted as class 1). Thus, we can see that a precision of 1.0 (or 100 percent) is achieved if every example predicted by the classifier to be class 1 is, in fact, in class 1 (that is, there are no false positives).
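
Written as a formula, using TP and FP as shorthand for the number of true positives and false positives:

    precision = TP / (TP + FP)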

Recall is defined as the number of true positives divided by the sum of true positives and false negatives (that is, the number of examples that were in class 1, but were predicted as class 0 by the model). We can see that a recall of 1.0 (or 100 percent) is achieved if the model doesn't miss any examples that were in class 1 (that is, there are no false negatives).

Generally, precision and recall are inversely related; often, higher precision is related to lower recall, and vice versa. To illustrate this, assume that we built a model that always predicted class 1. In this case, the model predictions would have no false negatives, because the model always predicts 1; it will not miss any of class 1. Thus, the recall will be 1.0 for this model. On the other hand, the false positive rate could be very high, meaning the precision would be low (this depends on the exact distribution of the classes in the dataset).

Precision and recall are not particularly useful as standalone metrics, but are typically used together to form an aggregate or averaged metric. Precision and recall are also dependent on the threshold selected for the model.

Intuitively, with a threshold set low enough, a model will always predict class 1. Hence, it will have a recall of 1, but most likely, it will have low precision. At a high enough threshold, the model will always predict class 0. The model will then have a recall of 0, since it cannot achieve any true positives and will likely have many false negatives. Furthermore, its precision score will be undefined, as it will achieve zero true positives and zero false positives.

The **precision-recall** ( **PR** ) curve shown in the following figure plots precision against recall outcomes for a given model, as the decision threshold of the classifier is changed. The area under this PR curve is referred to as the average precision. Intuitively, an area under the PR curve of 1.0 will equate to a perfect classifier that will achieve 100 percent in both precision and recall.

Precision-recall curve

### Tip

See and for more details on precision, recall, and area under the PR curve.

## ROC curve and AUC

The **ROC** curve is a concept similar to the PR curve. It is a graphical illustration of the true positive rate against the false positive rate for a classifier.

The **true positive rate** ( **TPR** ) is the number of true positives divided by the sum of true positives and false negatives. In other words, it is the ratio of true positives to all positive examples. This is the same as the recall we saw earlier, and it is also commonly referred to as sensitivity.

The **false positive rate** ( **FPR** ) is the number of false positives divided by the sum of false positives and **true negatives** (that is, the number of examples correctly predicted as class 0). In other words, it is the ratio of false positives to all negative examples.

In a manner similar to precision and recall, the ROC curve (plotted in the following figure) represents the classifier's performance tradeoff of TPR against FPR, for different decision thresholds. Each point on the curve represents a different threshold in the decision function for the classifier.

The ROC curve

The area under the ROC curve (commonly referred to as AUC) summarizes this tradeoff in a single average value. Again, an AUC of 1.0 will represent a perfect classifier. An area of 0.5 is referred to as the random score. Thus, a model that achieves an AUC of 0.5 is no better than randomly guessing.
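
For reference, the rate definitions above can be written compactly as follows, with TN and FN denoting the number of true and false negatives:

    recall = TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)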
### Note

As both the area under the PR curve and the area under the ROC curve are effectively normalized (with a minimum of 0 and a maximum of 1), we can use these measures to compare models with differing parameter settings and even compare completely different models. Thus, these metrics are popular for model evaluation and selection purposes.

MLlib comes with a set of built-in routines to compute the area under the PR and ROC curves for binary classification. Here, we will compute these metrics for each of our models:

    import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
    val metrics = Seq(lrModel, svmModel).map { model =>
      val scoreAndLabels = data.map { point =>
        (model.predict(point.features), point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)
    }

As we did previously when training the naive Bayes model and computing its accuracy, we need to use the special `nbData` version of the dataset that we created to compute the classification metrics:

    val nbMetrics = Seq(nbModel).map { model =>
      val scoreAndLabels = nbData.map { point =>
        val score = model.predict(point.features)
        (if (score > 0.5) 1.0 else 0.0, point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)
    }

Note that because the `DecisionTreeModel` model does not implement the `ClassificationModel` interface that is implemented by the other three models, we need to compute the results separately for this model in the following code:

    val dtMetrics = Seq(dtModel).map { model =>
      val scoreAndLabels = data.map { point =>
        val score = model.predict(point.features)
        (if (score > 0.5) 1.0 else 0.0, point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)
    }
    val allMetrics = metrics ++ nbMetrics ++ dtMetrics
    allMetrics.foreach { case (m, pr, roc) =>
      println(f"$m, Area under PR: ${pr * 100.0}%2.4f%%, Area under ROC: ${roc * 100.0}%2.4f%%")
    }

Your output will look similar to the one here:

    **LogisticRegressionModel, Area under PR: 75.6759%, Area under ROC: 50.1418%**
    **SVMModel, Area under PR: 75.6759%, Area under ROC: 50.1418%**
    **NaiveBayesModel, Area under PR: 68.0851%, Area under ROC: 58.3559%**
    **DecisionTreeModel, Area under PR: 74.3081%, Area under ROC: 64.8837%**

We can see that all models achieve broadly similar results for the average precision metric.

Logistic regression and SVM achieve an AUC of around 0.5. This indicates that they do no better than random chance! Our naive Bayes and decision tree models fare a little better, achieving an AUC of 0.58 and 0.65, respectively. Still, this is not a very good result in terms of binary classification performance.

### Note

While we don't cover multiclass classification here, MLlib provides a similar evaluation class called `MulticlassMetrics`, which provides averaged versions of many common metrics.

# Improving model performance and tuning parameters

So, what went wrong? Why have our sophisticated models achieved nothing better than random chance? Is there a problem with our models?

Recall that we started out by just throwing the data at our model. In fact, we didn't even throw all our data at the model, just the numeric columns that were easy to use.
Furthermore, we didn't do a lot of analysis on these numeric features.

## Feature standardization

Many models that we employ make inherent assumptions about the distribution or scale of the input data. One of the most common assumptions is that the features are normally distributed. Let's take a deeper look at the distribution of our features.

To do this, we can represent the feature vectors as a distributed matrix in MLlib, using the `RowMatrix` class. `RowMatrix` is an RDD made up of vectors, where each vector is a row of our matrix.

The `RowMatrix` class comes with some useful methods to operate on the matrix, one of which is a utility to compute statistics on the columns of the matrix:

    import org.apache.spark.mllib.linalg.distributed.RowMatrix
    val vectors = data.map(lp => lp.features)
    val matrix = new RowMatrix(vectors)
    val matrixSummary = matrix.computeColumnSummaryStatistics()

The following code statement will print the mean of each column:

    println(matrixSummary.mean)

Here is the output:

    **[0.41225805299526636,2.761823191986623,0.46823047328614004, ...**

The following code statement will print the minimum value of each column:

    println(matrixSummary.min)

Here is the output:

    **[0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.045564223,-1.0, ...**

The following code statement will print the maximum value of each column:

    println(matrixSummary.max)

The output is as follows:

    **[0.999426,363.0,1.0,1.0,0.980392157,0.980392157,21.0,0.25,0.0,0.444444444, ...**

The following code statement will print the variance of each column:

    println(matrixSummary.variance)

The output of the variance is:

    **[0.1097424416755897,74.30082476809638,0.04126316989120246, ...**

The following code statement will print the number of nonzero entries in each column:

    println(matrixSummary.numNonzeros)

Here is the output:

    **[5053.0,7354.0,7172.0,6821.0,6160.0,5128.0,7350.0,1257.0,0.0, ...**

The `computeColumnSummaryStatistics` method computes a number of statistics over each column of features, including the mean and variance, storing each of these in a `Vector` with one entry per column (that is, one entry per feature in our case).

Looking at the preceding output for mean and variance, we can see quite clearly that the second feature has a much higher mean and variance than some of the other features (you will find a few other features that are similar and a few others that are more extreme). So, our data definitely does not conform to a standard Gaussian distribution in its raw form. To get the data in a more suitable form for our models, we can standardize each feature such that it has zero mean and unit standard deviation. We can do this by subtracting the column mean from each feature value and then scaling it by dividing it by the column standard deviation for the feature:

    (x - μ) / sqrt(variance)

Practically, for each feature vector in our input dataset, we can simply perform an element-wise subtraction of the preceding mean vector from the feature vector and then perform an element-wise division of the feature vector by the vector of feature standard deviations. The standard deviation vector itself can be obtained by performing an element-wise square root operation on the variance vector.

As we mentioned in Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_, we fortunately have access to a convenience method from Spark's `StandardScaler` to accomplish this.
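Before turning to the built-in scaler, it may help to see this arithmetic spelled out. The following is a small sketch of our own (not from the book's code bundle) that standardizes the first feature vector by hand, using the `matrixSummary` statistics computed above; the zero-variance guard is our own addition for features that happen to be constant:

    // Hedged sketch: element-wise (x - mean) / sqrt(variance) for one vector
    val means = matrixSummary.mean.toArray
    val stdevs = matrixSummary.variance.toArray.map(math.sqrt)
    val firstScaled = data.first.features.toArray.zip(means.zip(stdevs)).map {
      case (v, (m, s)) => if (s == 0.0) 0.0 else (v - m) / s
    }
    println(firstScaled.take(4).mkString("[", ",", ", ...]"))

The first element should agree, up to small numerical differences, with the output of the `StandardScaler` transformation that we compute next.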
`StandardScaler` works in much the same way as the `Normalizer` feature we used in that chapter. We will instantiate it by passing in two arguments that tell it whether to subtract the mean from the data and whether to apply standard deviation scaling. We will then fit `StandardScaler` on our input `vectors`. Finally, we will pass an input vector to the `transform` function, which will return a normalized vector. We will do this within the following `map` function to preserve the `label` from our dataset:

    import org.apache.spark.mllib.feature.StandardScaler
    val scaler = new StandardScaler(withMean = true, withStd = true).fit(vectors)
    val scaledData = data.map(lp => LabeledPoint(lp.label, scaler.transform(lp.features)))

Our data should now be standardized. Let's inspect the first row of the original and standardized features:

    println(data.first.features)

The output of the preceding line of code is as follows:

    **[0.789131,2.055555556,0.676470588,0.205882353,**

The following code will print the first row of the standardized features:

    println(scaledData.first.features)

The output is as follows:

    **[1.1376439023494747,-0.08193556218743517,1.025134766284205,-0.0558631837375738,**

As we can see, the first feature has been transformed by applying the standardization formula. We can check this by subtracting the mean (which we computed earlier) from the first feature value and dividing the result by the square root of the variance (which we also computed earlier):

    println((0.789131 - 0.41225805299526636) / math.sqrt(0.1097424416755897))

The result should be approximately equal to the first element of our scaled vector:

    **1.137647336497682**

We can now retrain our model using the standardized data. We will use only the logistic regression model to illustrate the impact of feature standardization (the decision tree is insensitive to feature scaling, while standardized features, which can be negative, are not valid input for the multinomial naive Bayes model):

    val lrModelScaled = LogisticRegressionWithSGD.train(scaledData, numIterations)
    val lrTotalCorrectScaled = scaledData.map { point =>
      if (lrModelScaled.predict(point.features) == point.label) 1 else 0
    }.sum
    val lrAccuracyScaled = lrTotalCorrectScaled / numData
    val lrPredictionsVsTrue = scaledData.map { point =>
      (lrModelScaled.predict(point.features), point.label)
    }
    val lrMetricsScaled = new BinaryClassificationMetrics(lrPredictionsVsTrue)
    val lrPr = lrMetricsScaled.areaUnderPR
    val lrRoc = lrMetricsScaled.areaUnderROC
    println(f"${lrModelScaled.getClass.getSimpleName}\nAccuracy: ${lrAccuracyScaled * 100}%2.4f%%\nArea under PR: ${lrPr * 100.0}%2.4f%%\nArea under ROC: ${lrRoc * 100.0}%2.4f%%")

The result should look similar to this:

    **LogisticRegressionModel**
    **Accuracy: 62.0419%**
    **Area under PR: 72.7254%**
    **Area under ROC: 61.9663%**

Simply by standardizing our features, we have improved the logistic regression performance for accuracy and AUC from 50 percent, no better than random, to 62 percent.

## Additional features

We have seen that we need to be careful about standardizing and potentially normalizing our features, and that the impact on model performance can be serious. In this case, we used only a portion of the features available. For example, we completely ignored the category variable and the textual content in the boilerplate variable column.

This was done for ease of illustration, but let's assess the impact of adding an additional feature such as the category feature.
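Before we do, it may help to see the _1-of-k_ encoding in isolation: a categorical variable with k possible values becomes a binary vector of length k with a single nonzero entry. The `oneOfK` helper below is a hypothetical sketch of our own; the pipeline that follows builds the same vectors inline rather than through a helper:

    // Hedged sketch: 1-of-k encode one value, given a category-to-index mapping
    def oneOfK(value: String, mapping: Map[String, Int]): Array[Double] = {
      val vec = Array.ofDim[Double](mapping.size)
      vec(mapping(value)) = 1.0
      vec
    }
    // For example, with Map("weather" -> 0, "sports" -> 1, "business" -> 2),
    // oneOfK("sports", mapping) yields Array(0.0, 1.0, 0.0)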
First, we will inspect the categories and form a mapping of index to category, which you might recognize as the basis for a _1-of-k_ encoding of this categorical feature:

    val categories = records.map(r => r(3)).distinct.collect.zipWithIndex.toMap
    val numCategories = categories.size
    println(categories)

The output of the different categories is as follows:

    **Map("weather" -> 0, "sports" -> 6, "unknown" -> 4, "computer_internet" -> 12, "?" -> 11, "culture_politics" -> 3, "religion" -> 8, "recreation" -> 2, "arts_entertainment" -> 9, "health" -> 5, "law_crime" -> 10, "gaming" -> 13, "business" -> 1, "science_technology" -> 7)**

The following code will print the number of categories:

    println(numCategories)

Here is the output:

    **14**

So, we will need to create a vector of length 14 to represent this feature and assign a value of 1 at the index of the relevant category for each data point. We can then prepend this new feature vector to the vector of other numerical features:

    val dataCategories = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val categoryIdx = categories(r(3))
      val categoryFeatures = Array.ofDim[Double](numCategories)
      categoryFeatures(categoryIdx) = 1.0
      val otherFeatures = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      val features = categoryFeatures ++ otherFeatures
      LabeledPoint(label, Vectors.dense(features))
    }
    println(dataCategories.first)

You should see output similar to what is shown here. You can see that the first part of our feature vector is now a vector of length 14 with one nonzero entry at the relevant category index:

    **LabeledPoint(0.0, [0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])**

Again, since our raw features are not standardized, we should perform this transformation using the same `StandardScaler` approach that we used earlier before training a new model on this expanded dataset:

    val scalerCats = new StandardScaler(withMean = true, withStd = true).fit(dataCategories.map(lp => lp.features))
    val scaledDataCats = dataCategories.map(lp => LabeledPoint(lp.label, scalerCats.transform(lp.features)))

We can inspect the features before and after scaling as we did earlier:

    println(dataCategories.first.features)

The output is as follows:

    **0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556 ...**

The following code will print the features after scaling:

    println(scaledDataCats.first.features)

You will see the following on the screen:

    **[-0.023261105535492967,2.720728254208072,-0.4464200056407091,-0.2205258360869135, ...**

### Tip

Note that while the original raw features were sparse (that is, many entries were zero), subtracting the mean from each entry leaves us with a non-sparse (dense) representation, as can be seen in the preceding example.

This is not a problem in this case, as our dataset is small, but large-scale real-world problems often have extremely sparse input data with many features (online advertising and text classification are good examples).
In such cases, it is not advisable to lose this sparsity, as the memory and processing requirements for the equivalent dense representation can quickly explode with many millions of features. We can use `StandardScaler` with `withMean` set to `false` to avoid this.

We're now ready to train a new logistic regression model with our expanded feature set, and then we will evaluate the performance:

    val lrModelScaledCats = LogisticRegressionWithSGD.train(scaledDataCats, numIterations)
    val lrTotalCorrectScaledCats = scaledDataCats.map { point =>
      if (lrModelScaledCats.predict(point.features) == point.label) 1 else 0
    }.sum
    val lrAccuracyScaledCats = lrTotalCorrectScaledCats / numData
    val lrPredictionsVsTrueCats = scaledDataCats.map { point =>
      (lrModelScaledCats.predict(point.features), point.label)
    }
    val lrMetricsScaledCats = new BinaryClassificationMetrics(lrPredictionsVsTrueCats)
    val lrPrCats = lrMetricsScaledCats.areaUnderPR
    val lrRocCats = lrMetricsScaledCats.areaUnderROC
    println(f"${lrModelScaledCats.getClass.getSimpleName}\nAccuracy: ${lrAccuracyScaledCats * 100}%2.4f%%\nArea under PR: ${lrPrCats * 100.0}%2.4f%%\nArea under ROC: ${lrRocCats * 100.0}%2.4f%%")

You should see output similar to this one:

    **LogisticRegressionModel**
    **Accuracy: 66.5720%**
    **Area under PR: 75.7964%**
    **Area under ROC: 66.5483%**

By applying feature standardization, we improved both the accuracy and AUC measures from 50 percent to 62 percent, and by then adding the category feature (standardized along with the rest of the new feature set), we achieved a further boost to 66 percent.

### Note

Note that the best model performance in the competition was an AUC of 0.88906 (see the competition's leaderboard).

One approach to achieving performance almost as high is outlined in the references.

Notice that there are still features that we have not yet used; most notably, the text features in the boilerplate variable. The leading competition submissions predominantly use the boilerplate features and features based on the raw textual content to achieve their performance. As we saw earlier, while adding the category feature improved performance, it appears that most of the other variables are not very useful as predictors, whereas the textual content turned out to be highly predictive.

Going through some of the best performing approaches for these competitions can give you a good idea as to how feature extraction and engineering play a critical role in model performance.

## Using the correct form of data

Another critical aspect of model performance is using the correct form of data for each model. Previously, we saw that applying a naive Bayes model to our numerical features resulted in very poor performance. Is this because the model itself is deficient?

In this case, recall that MLlib implements a multinomial naive Bayes model. This model works on input in the form of non-negative count data. This can include a binary representation of categorical features (such as the _1-of-k_ encoding covered previously) or frequency data (such as the frequency of occurrences of words in a document). The numerical features we used initially do not conform to this assumed input distribution, so it is probably unsurprising that the model did so poorly.

To illustrate this, we'll use only the category feature, which, when _1-of-k_ encoded, is of the correct form for the model.
We will create a new dataset as follows:

    val dataNB = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val categoryIdx = categories(r(3))
      val categoryFeatures = Array.ofDim[Double](numCategories)
      categoryFeatures(categoryIdx) = 1.0
      LabeledPoint(label, Vectors.dense(categoryFeatures))
    }

Next, we will train a new naive Bayes model and evaluate its performance:

    val nbModelCats = NaiveBayes.train(dataNB)
    val nbTotalCorrectCats = dataNB.map { point =>
      if (nbModelCats.predict(point.features) == point.label) 1 else 0
    }.sum
    val nbAccuracyCats = nbTotalCorrectCats / numData
    val nbPredictionsVsTrueCats = dataNB.map { point =>
      (nbModelCats.predict(point.features), point.label)
    }
    val nbMetricsCats = new BinaryClassificationMetrics(nbPredictionsVsTrueCats)
    val nbPrCats = nbMetricsCats.areaUnderPR
    val nbRocCats = nbMetricsCats.areaUnderROC
    println(f"${nbModelCats.getClass.getSimpleName}\nAccuracy: ${nbAccuracyCats * 100}%2.4f%%\nArea under PR: ${nbPrCats * 100.0}%2.4f%%\nArea under ROC: ${nbRocCats * 100.0}%2.4f%%")

You should see the following output:

    **NaiveBayesModel**
    **Accuracy: 60.9601%**
    **Area under PR: 74.0522%**
    **Area under ROC: 60.5138%**

So, by ensuring that we use the correct form of input, we have improved the performance of the naive Bayes model slightly, from 58 percent to roughly 61 percent accuracy.

## Tuning model parameters

The previous sections showed the impact on model performance of feature extraction and selection, as well as the form of the input data and a model's assumptions around the data distribution. So far, we have discussed model parameters only in passing, but they also play a significant role in model performance.

MLlib's default `train` methods use default values for the parameters of each model. Let's take a deeper look at them.

### Linear models

Both logistic regression and SVM share the same parameters, because they use the same underlying optimization technique of **stochastic gradient descent** (**SGD**). They differ only in the loss function applied. If we take a look at the class definition for logistic regression in MLlib, we will see the following definition:

    class LogisticRegressionWithSGD private (
      private var stepSize: Double,
      private var numIterations: Int,
      private var regParam: Double,
      private var miniBatchFraction: Double)
      extends GeneralizedLinearAlgorithm[LogisticRegressionModel] ...

We can see that the arguments that can be passed to the constructor are `stepSize`, `numIterations`, `regParam`, and `miniBatchFraction`. Of these, all except `regParam` relate to the underlying optimization technique.

The instantiation code for logistic regression initializes the `Gradient`, `Updater`, and `Optimizer` and sets the relevant arguments for the `Optimizer` (`GradientDescent` in this case):

    private val gradient = new LogisticGradient()
    private val updater = new SimpleUpdater()
    override val optimizer = new GradientDescent(gradient, updater)
      .setStepSize(stepSize)
      .setNumIterations(numIterations)
      .setRegParam(regParam)
      .setMiniBatchFraction(miniBatchFraction)

`LogisticGradient` sets up the logistic loss function that defines our logistic regression model.

### Tip

While a detailed treatment of optimization techniques is beyond the scope of this book, MLlib provides two optimizers for linear models: SGD and L-BFGS. L-BFGS is often more accurate and has fewer parameters to tune.
SGD is the default, while L-BFGS can currently only be used directly for logistic regression via `LogisticRegressionWithLBFGS`. Try it out yourself and compare the results to those found with SGD.

See the MLlib documentation for further details.

To investigate the impact of the remaining parameter settings, we will create a helper function that will train a logistic regression model, given a set of parameter inputs. First, we will import the required classes:

    import org.apache.spark.rdd.RDD
    import org.apache.spark.mllib.optimization.Updater
    import org.apache.spark.mllib.optimization.SimpleUpdater
    import org.apache.spark.mllib.optimization.L1Updater
    import org.apache.spark.mllib.optimization.SquaredL2Updater
    import org.apache.spark.mllib.classification.ClassificationModel

Next, we will define our helper function to train a model given a set of inputs:

    def trainWithParams(input: RDD[LabeledPoint], regParam: Double, numIterations: Int, updater: Updater, stepSize: Double) = {
      val lr = new LogisticRegressionWithSGD
      lr.optimizer.setNumIterations(numIterations).setUpdater(updater).setRegParam(regParam).setStepSize(stepSize)
      lr.run(input)
    }

Finally, we will create a second helper function that takes the input data and a classification model and generates the relevant AUC metric:

    def createMetrics(label: String, data: RDD[LabeledPoint], model: ClassificationModel) = {
      val scoreAndLabels = data.map { point =>
        (model.predict(point.features), point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (label, metrics.areaUnderROC)
    }

We will also cache our scaled dataset, including categories, to speed up the multiple model training runs that we will be using to explore these different parameter settings:

    scaledDataCats.cache

#### Iterations

Many machine learning methods are iterative in nature, converging to a solution (the optimal weight vector that minimizes the chosen loss function) over a number of iteration steps. SGD typically requires relatively few iterations to converge to a reasonable solution, but can be run for more iterations to improve the solution. We can see this by trying a few different settings for the `numIterations` parameter and comparing the AUC results:

    val iterResults = Seq(1, 5, 10, 50).map { param =>
      val model = trainWithParams(scaledDataCats, 0.0, param, new SimpleUpdater, 1.0)
      createMetrics(s"$param iterations", scaledDataCats, model)
    }
    iterResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }

Your output should look like this:

    **1 iterations, AUC = 64.97%**
    **5 iterations, AUC = 66.62%**
    **10 iterations, AUC = 66.55%**
    **50 iterations, AUC = 66.81%**

So, we can see that the number of iterations has only a minor impact on the results once a certain minimum number of iterations has been completed.

#### Step size

In SGD, the step size parameter controls how far in the direction of the steepest gradient the algorithm steps when updating the model weight vector after each training example. A larger step size might speed up convergence, but a step size that is too large might cause problems with convergence, as good solutions are overshot.
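One implementation detail worth noting (this is our reading of Spark's `GradientDescent` source, so treat it as an assumption to verify against your Spark version): the configured step size is decayed over iterations, with the effective step at iteration t being:

    stepSize_t = stepSize / sqrt(t)

This is worth keeping in mind when comparing runs with different iteration counts, since the same configured step size behaves differently early and late in training.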
We can see the impact of changing the step size here:

    val stepResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
      val model = trainWithParams(scaledDataCats, 0.0, numIterations, new SimpleUpdater, param)
      createMetrics(s"$param step size", scaledDataCats, model)
    }
    stepResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }

This will give us the following results, which show that increasing the step size too much can begin to negatively impact performance:

    **0.001 step size, AUC = 64.95%**
    **0.01 step size, AUC = 65.00%**
    **0.1 step size, AUC = 65.52%**
    **1.0 step size, AUC = 66.55%**
    **10.0 step size, AUC = 61.92%**

#### Regularization

We briefly touched on the `Updater` class in the preceding logistic regression code. An `Updater` class in MLlib implements regularization. Regularization can help avoid over-fitting of a model to training data by effectively penalizing model complexity. This is done by adding a penalty term to the loss function that grows with the size of the model weight vector.

Regularization is almost always required in real use cases, but it is of particular importance when the feature dimension is very high (that is, the effective number of variable weights that can be learned is high) relative to the number of training examples.

When regularization is absent or low, models tend to over-fit the training dataset. This is a key reason behind the use of cross-validation techniques for model fitting (which we will cover later in this chapter).

Conversely, since applying regularization encourages simpler models, model performance can suffer when regularization is high, through under-fitting the data.

The forms of regularization available in MLlib are:

  * `SimpleUpdater`: This equates to no regularization and is the default for logistic regression
  * `SquaredL2Updater`: This implements a regularizer based on the squared L2-norm of the weight vector; this is the default for SVM models
  * `L1Updater`: This applies a regularizer based on the L1-norm of the weight vector; this can lead to sparse solutions in the weight vector (as less important weights are pulled towards zero)

### Note

Regularization and its relation to optimization is a broad and heavily researched area. Some more information is available from the following references:

  * A general overview of regularization
  * L2 regularization
  * Over-fitting and under-fitting
  * A detailed overview of over-fitting and L1 versus L2 regularization: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.92.9860&rep=rep1&type=pdf

Let's explore the impact of a range of regularization parameters using `SquaredL2Updater`:

    val regResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
      val model = trainWithParams(scaledDataCats, param, numIterations, new SquaredL2Updater, 1.0)
      createMetrics(s"$param L2 regularization parameter", scaledDataCats, model)
    }
    regResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }

Your output should look like this:

    **0.001 L2 regularization parameter, AUC = 66.55%**
    **0.01 L2 regularization parameter, AUC = 66.55%**
    **0.1 L2 regularization parameter, AUC = 66.63%**
    **1.0 L2 regularization parameter, AUC = 66.04%**
    **10.0 L2 regularization parameter, AUC = 35.33%**

As we can see, at low levels of regularization, there is not much impact on model performance.
However, as we increase regularization, we can see the impact of under-fitting on our model evaluation.

### Tip

You will find similar results when using L1 regularization. Give it a try by performing the same evaluation of the regularization parameter against the AUC measure for `L1Updater`.

### Decision trees

The decision tree model we trained earlier was the best performer on the raw data that we first used. We set a parameter called `maxDepth`, which controls the maximum depth of the tree and, thus, the complexity of the model. Deeper trees result in more complex models that are able to fit the data better.

For classification problems, we can also select between two measures of impurity: `Gini` and `Entropy`.

#### Tuning tree depth and impurity

We will illustrate the impact of tree depth in a similar manner as we did for our logistic regression model.

First, we will need to create another helper function in the Spark shell:

    import org.apache.spark.mllib.tree.impurity.Impurity
    import org.apache.spark.mllib.tree.impurity.Entropy
    import org.apache.spark.mllib.tree.impurity.Gini

    def trainDTWithParams(input: RDD[LabeledPoint], maxDepth: Int, impurity: Impurity) = {
      DecisionTree.train(input, Algo.Classification, impurity, maxDepth)
    }

Now, we're ready to compute our AUC metric for different settings of tree depth. We will simply use our original dataset in this example, since we do not need the data to be standardized.

### Tip

Note that decision tree models generally do not require features to be standardized or normalized, nor do they require categorical features to be binary-encoded.

First, train the model using the `Entropy` impurity measure and varying tree depths:

    val dtResultsEntropy = Seq(1, 2, 3, 4, 5, 10, 20).map { param =>
      val model = trainDTWithParams(data, param, Entropy)
      val scoreAndLabels = data.map { point =>
        val score = model.predict(point.features)
        (if (score > 0.5) 1.0 else 0.0, point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (s"$param tree depth", metrics.areaUnderROC)
    }
    dtResultsEntropy.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }

This should output the results shown here:

    **1 tree depth, AUC = 59.33%**
    **2 tree depth, AUC = 61.68%**
    **3 tree depth, AUC = 62.61%**
    **4 tree depth, AUC = 63.63%**
    **5 tree depth, AUC = 64.88%**
    **10 tree depth, AUC = 76.26%**
    **20 tree depth, AUC = 98.45%**

Next, we will perform the same computation using the `Gini` impurity measure (we omitted the code as it is very similar, but it can be found in the code bundle). Your results should look something like this:

    **1 tree depth, AUC = 59.33%**
    **2 tree depth, AUC = 61.68%**
    **3 tree depth, AUC = 62.61%**
    **4 tree depth, AUC = 63.63%**
    **5 tree depth, AUC = 64.89%**
    **10 tree depth, AUC = 78.37%**
    **20 tree depth, AUC = 98.87%**

As you can see from the preceding results, increasing the tree depth parameter results in a more accurate model (as expected, since the model is allowed to get more complex with greater tree depth). However, it is very likely that at higher tree depths, the model will over-fit the dataset significantly.

There is very little difference in performance between the two impurity measures.

### The naive Bayes model

Finally, let's see the impact of changing the `lambda` parameter for naive Bayes.
This parameter controls additive smoothing, which handles the case when a class and feature value do not occur together in the dataset.

### Tip

See the references for more details on additive smoothing.

We will take the same approach as we did earlier, first creating a convenience training function and then training the model with varying levels of `lambda`:

    def trainNBWithParams(input: RDD[LabeledPoint], lambda: Double) = {
      val nb = new NaiveBayes
      nb.setLambda(lambda)
      nb.run(input)
    }
    val nbResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
      val model = trainNBWithParams(dataNB, param)
      val scoreAndLabels = dataNB.map { point =>
        (model.predict(point.features), point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (s"$param lambda", metrics.areaUnderROC)
    }
    nbResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }

The results of the training are as follows:

    **0.001 lambda, AUC = 60.51%**
    **0.01 lambda, AUC = 60.51%**
    **0.1 lambda, AUC = 60.51%**
    **1.0 lambda, AUC = 60.51%**
    **10.0 lambda, AUC = 60.51%**

We can see that `lambda` has no impact in this case, since smoothing only matters when some combination of feature value and class label never occurs together in the dataset, which does not happen here.

## Cross-validation

So far in this book, we have only briefly mentioned the idea of cross-validation and out-of-sample testing. Cross-validation is a critical part of real-world machine learning and is central to many model selection and parameter tuning pipelines.

The general idea behind cross-validation is that we want to know how our model will perform on unseen data. Evaluating this on real, live data (for example, in a production system) is risky, because we don't really know, ahead of time, whether the trained model will make accurate predictions on new data. As we saw previously with regard to regularization, our model might have over-fit the training data and be poor at making predictions on data it has not been trained on.

Cross-validation provides a mechanism where we use part of our available dataset to train our model and another part to evaluate the performance of this model. As the model is tested on data that it has not seen during the training phase, its performance, when evaluated on this part of the dataset, gives us an estimate of how well our model generalizes to new data points.

Here, we will implement a simple cross-validation evaluation approach using a train-test split. We will divide our dataset into two non-overlapping parts. The first dataset is used to train our model and is called the training set. The second dataset, called the test set or hold-out set, is used to evaluate the performance of our model using our chosen evaluation measure. Common splits used in practice include 50/50, 60/40, and 80/20 splits, but you can use any split as long as the training set is not too small for the model to learn from (generally, at least 50 percent is a practical minimum).

In many cases, three sets are created: a training set, an evaluation set (which is used like the preceding test set to tune the model parameters, such as lambda and step size), and a test set (which is never used to train a model or tune any parameters, but is only used to generate an estimate of the true performance on completely unseen data).

### Note

Here, we will explore a simple train-test split approach. There are many cross-validation techniques that are more exhaustive and complex.
One popular example is K-fold cross-validation, where the dataset is split into K non-overlapping folds. The model is trained on K-1 folds of data and tested on the remaining, held-out fold. This is repeated K times, and the results are averaged to give the cross-validation score. The train-test split is effectively like two-fold cross-validation.

Other approaches include leave-one-out cross-validation and random sampling. See the referenced article for further details.

First, we will split our dataset into a 60 percent training set and a 40 percent test set (we will use a constant random seed of 123 here to ensure that we get the same results for ease of illustration):

    val trainTestSplit = scaledDataCats.randomSplit(Array(0.6, 0.4), 123)
    val train = trainTestSplit(0)
    val test = trainTestSplit(1)

Next, we will compute the evaluation metric of interest (again, we will use AUC) for a range of regularization parameter settings. Note that here we will use a finer-grained grid of regularization parameters to better illustrate the differences in AUC, which are very small in this case:

    val regResultsTest = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
      val model = trainWithParams(**train**, param, numIterations, new SquaredL2Updater, 1.0)
      createMetrics(s"$param L2 regularization parameter", test, model)
    }
    regResultsTest.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.6f%%") }

This will compute the results of training on the training set and evaluating on the test set, as shown here:

    **0.0 L2 regularization parameter, AUC = 66.480874%**
    **0.001 L2 regularization parameter, AUC = 66.480874%**
    **0.0025 L2 regularization parameter, AUC = 66.515027%**
    **0.005 L2 regularization parameter, AUC = 66.515027%**
    **0.01 L2 regularization parameter, AUC = 66.549180%**

Now, let's compare this to the results of training and testing on the training set (this is what we were doing previously by training and testing on all the data). Again, we will omit the code as it is very similar (but it is available in the code bundle):

    **0.0 L2 regularization parameter, AUC = 66.260311%**
    **0.001 L2 regularization parameter, AUC = 66.260311%**
    **0.0025 L2 regularization parameter, AUC = 66.260311%**
    **0.005 L2 regularization parameter, AUC = 66.238294%**
    **0.01 L2 regularization parameter, AUC = 66.238294%**

So, we can see that when we train and evaluate our model on the same dataset, we generally achieve the highest performance when regularization is lower. This is because our model has seen all the data points, and with low levels of regularization, it can over-fit the dataset and achieve higher performance.

In contrast, when we train on one dataset and test on another, we see that generally a slightly higher level of regularization results in better test set performance.

In cross-validation, we would typically find the parameter settings (including regularization as well as the various other parameters, such as step size and so on) that result in the best test set performance. We would then use these parameter settings to retrain the model on all of our data in order to use it to make predictions on new data.

### Tip

Recall from Chapter 4, _Building a Recommendation Engine with Spark_, that we did not cover cross-validation. You can apply the same techniques we used earlier to split the ratings dataset from that chapter into a training and test dataset.
You can then try out different parameter settings on the training set while evaluating the MSE and MAP performance metrics on the test set in a manner similar to what we did earlier. Give it a try!

# Summary

In this chapter, we covered the various classification models available in Spark MLlib, and we saw how to train models on input data and how to evaluate their performance using standard metrics and measures. We also explored how to apply some of the techniques previously introduced to transform our features. Finally, we investigated the impact of using the correct input data format or distribution on model performance, and we also saw the impact of adding more features to our model, tuning model parameters, and implementing cross-validation.

In the next chapter, we will take a similar approach to delve into MLlib's regression models.

# Chapter 6. Building a Regression Model with Spark

In this chapter, we will build on what we covered in Chapter 5, _Building a Classification Model with Spark_. While classification models deal with outcomes that represent discrete classes, regression models are concerned with target variables that can take any real value. The underlying principle is very similar: we wish to find a model that maps input features to predicted target variables. Like classification, regression is also a form of supervised learning.

Regression models can be used to predict just about any variable of interest. A few examples include the following:

  * Predicting stock returns and other economic variables
  * Predicting loss amounts for loan defaults (this can be combined with a classification model that predicts the probability of default, while the regression model predicts the amount in the case of a default)
  * Recommendations (the Alternating Least Squares factorization model from Chapter 4, _Building a Recommendation Engine with Spark_, solves a least squares regression problem in each iteration)
  * Predicting **customer lifetime value** (**CLTV**) in a retail, mobile, or other business, based on user behavior and spending patterns

In the following sections, we will:

  * Introduce the various types of regression models available in MLlib
  * Explore feature extraction and target variable transformation for regression models
  * Train a number of regression models using MLlib
  * See how to make predictions using the trained models
  * Investigate the impact on performance of various parameter settings for regression using cross-validation

# Types of regression models

Spark's MLlib library offers two broad classes of regression models: linear models and decision tree regression models.

Linear models are essentially the same as their classification counterparts; the only difference is that linear regression models use a different loss function, related link function, and decision function. MLlib provides a standard least squares regression model (although other types of generalized linear models for regression are planned).

Decision trees can also be used for regression by changing the impurity measure.

## Least squares regression

You might recall from Chapter 5, _Building a Classification Model with Spark_, that there are a variety of loss functions that can be applied to generalized linear models.
The loss function used for least squares is the squared loss, which is defined as follows:

    ½ (w^T x - y)^2

Here, as for the classification setting, _y_ is the target variable (this time, real valued), _w_ is the weight vector, and _x_ is the feature vector.

The related link function is the identity link, and the decision function is also the identity function, as generally, no thresholding is applied in regression. So, the model's prediction is simply _y = w^T x_.

The standard least squares regression in MLlib does not use regularization. Looking at the squared loss function, we can see that the loss applied to incorrectly predicted points is magnified, since the loss is squared. This means that least squares regression is susceptible to outliers in the dataset and also to over-fitting. Generally, as for classification, we should apply some level of regularization in practice.

Linear regression with L2 regularization is commonly referred to as ridge regression, while applying L1 regularization is called the **lasso**.

### Tip

See the section on linear least squares in the Spark MLlib documentation for further information.

## Decision trees for regression

Just like using linear models for regression tasks involves changing the loss function used, using decision trees for regression involves changing the measure of node impurity used. The impurity metric is called **variance** and is defined in the same way as the squared loss for least squares linear regression.

### Note

See the _MLlib - Decision Tree_ section in the Spark documentation for further details on the decision tree algorithm and the impurity measure for regression.

Now, we will plot a simple example of a regression problem with only one input variable shown on the _x_ axis and the target variable on the _y_ axis. The linear model prediction function is shown by a red dashed line, while the decision tree prediction function is shown by a green dashed line. We can see that the decision tree allows a more complex, nonlinear model to be fitted to the data.

Linear model and decision tree prediction functions for regression

# Extracting the right features from your data

As the underlying models for regression are the same as those for the classification case, we can use the same approach to create input features. The only practical difference is that the target is now a real-valued variable, as opposed to a categorical one. The `LabeledPoint` class in MLlib already takes this into account, as the `label` field is of the `Double` type, so it can handle both cases.

## Extracting features from the bike sharing dataset

To illustrate the concepts in this chapter, we will be using the bike sharing dataset. This dataset contains hourly records of the number of bicycle rentals in the Capital Bikeshare system. It also contains variables related to date and time, weather, and seasonal and holiday information.

### Note

The dataset is available from the UCI Machine Learning Repository.

Click on the **Data Folder** link and then download the `Bike-Sharing-Dataset.zip` file.

The bike sharing data was enriched with weather and seasonal data by Hadi Fanaee-T at the University of Porto and used in the following paper:

Fanaee-T, Hadi and Gama, Joao, Event labeling combining ensemble detectors and background knowledge, _Progress in Artificial Intelligence_, pp. 1-15, Springer Berlin Heidelberg, 2013.

The paper is available online.

Once you have downloaded the `Bike-Sharing-Dataset.zip` file, unzip it.
This will create a directory called `Bike-Sharing-Dataset`, which contains the `day.csv`, `hour.csv`, and `Readme.txt` files.

The `Readme.txt` file contains information on the dataset, including the variable names and descriptions. Take a look at the file, and you will see that we have the following variables available:

  * `instant`: This is the record ID
  * `dteday`: This is the raw date
  * `season`: This is the season (spring, summer, fall, or winter)
  * `yr`: This is the year (2011 or 2012)
  * `mnth`: This is the month of the year
  * `hr`: This is the hour of the day
  * `holiday`: This is whether the day was a holiday or not
  * `weekday`: This is the day of the week
  * `workingday`: This is whether the day was a working day or not
  * `weathersit`: This is a categorical variable that describes the weather at a particular time
  * `temp`: This is the normalized temperature
  * `atemp`: This is the normalized apparent temperature
  * `hum`: This is the normalized humidity
  * `windspeed`: This is the normalized wind speed
  * `cnt`: This is the target variable, that is, the count of bike rentals for that hour

We will work with the hourly data contained in `hour.csv`. If you look at the first line of the dataset, you will see that it contains the column names as a header. You can view it by running the following command:

    **> head -1 hour.csv**

This should output the following result:

    **instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt**

Before we work with the data in Spark, we will again remove the header from the first line of the file using the same `sed` command that we used previously, creating a new file called `hour_noheader.csv`:

    **> sed 1d hour.csv > hour_noheader.csv**

Since we will be doing some plotting of our dataset later on, we will use the Python shell for this chapter. This also serves to illustrate how to use MLlib's linear model and decision tree functionality from PySpark.

Start up your PySpark shell from your Spark installation directory. If you want to use IPython, which we highly recommend, remember to include the `IPYTHON=1` environment variable together with the `pylab` functionality:

    **> IPYTHON=1 IPYTHON_OPTS="--pylab" ./bin/pyspark**

If you prefer to use IPython Notebook, you can start it with the following command:

    **> IPYTHON=1 IPYTHON_OPTS=notebook ./bin/pyspark**

You can type all the code that follows for the remainder of this chapter directly into your PySpark shell (or into IPython Notebook if you wish to use it).

### Tip

Recall that we used the IPython shell in Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_. Take a look at that chapter and the code bundle for instructions to install IPython.

We'll start as usual by loading the dataset and inspecting it:

    path = "/**PATH**/hour_noheader.csv"
    raw_data = sc.textFile(path)
    num_data = raw_data.count()
    records = raw_data.map(lambda x: x.split(","))
    first = records.first()
    print first
    print num_data

You should see the following output:

    **[u'1', u'2011-01-01', u'1', u'0', u'1', u'0', u'0', u'6', u'0', u'1', u'0.24', u'0.2879', u'0.81', u'0', u'3', u'13', u'16']**
    **17379**

So, we have `17,379` hourly records in our dataset. We have inspected the column names already. We will ignore the record ID and raw date columns.
We will also ignore the `casual` and `registered` count target variables and focus on the overall count variable, `cnt` (which is the sum of the other two counts). We are left with 12 variables. The first eight are categorical, while the last four are normalized real-valued variables.

To deal with the eight categorical variables, we will use the binary encoding approach with which you should be quite familiar by now. The four real-valued variables will be left as is.

We will first cache our dataset, since we will be reading from it many times:

    records.cache()

In order to extract each categorical feature into a binary vector form, we will need to know the feature mapping of each feature value to the index of the nonzero value in our binary vector. Let's define a function that will extract this mapping from our dataset for a given column:

    def get_mapping(rdd, idx):
        return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()

Our function first maps the field to its set of distinct values and then uses the `zipWithIndex` transformation to zip each value with a unique index, such that a key-value RDD is formed, where the key is the variable value and the value is the index. This index will be the index of the nonzero entry in the binary vector representation of the feature. We will finally collect this RDD back to the driver as a Python dictionary.

We can test our function on the third variable column (index 2):

    print "Mapping of first categorical feature column: %s" % get_mapping(records, 2)

The preceding line of code will give us the following output:

    **Mapping of first categorical feature column: {u'1': 0, u'3': 2, u'2': 1, u'4': 3}**

Now, we can apply this function to each categorical column (that is, for variable indices 2 to 9):

    mappings = [get_mapping(records, i) for i in range(2,10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[10:14])
    total_len = num_len + cat_len

We now have the mappings for each variable, and we can see how many values in total we need for our binary vector representation:

    print "Feature vector length for categorical features: %d" % cat_len
    print "Feature vector length for numerical features: %d" % num_len
    print "Total feature vector length: %d" % total_len

The output of the preceding code is as follows:

    **Feature vector length for categorical features: 57**
    **Feature vector length for numerical features: 4**
    **Total feature vector length: 61**

### Creating feature vectors for the linear model

The next step is to use our extracted mappings to convert the categorical features to binary-encoded features. Again, it will be helpful to create a function that we can apply to each record in our dataset for this purpose. We will also create a function to extract the target variable from each record.
We will need to import `numpy` for linear algebra utilities and MLlib's `LabeledPoint` class to wrap our feature vectors and target variables:

    from pyspark.mllib.regression import LabeledPoint
    import numpy as np

    def extract_features(record):
        cat_vec = np.zeros(cat_len)
        i = 0
        step = 0
        # iterate over all eight categorical columns (indices 2 to 9)
        for field in record[2:10]:
            m = mappings[i]
            idx = m[field]
            cat_vec[idx + step] = 1
            i = i + 1
            step = step + len(m)
        num_vec = np.array([float(field) for field in record[10:14]])
        return np.concatenate((cat_vec, num_vec))

    def extract_label(record):
        return float(record[-1])

In the preceding `extract_features` function, we ran through each categorical column in the row of data. We extracted the binary encoding for each variable in turn from the mappings we created previously. The `step` variable ensures that the nonzero feature index in the full feature vector is correct (and is somewhat more efficient than, say, creating many smaller binary vectors and concatenating them). The numeric vector is created directly by first converting the data to floating point numbers and wrapping these in a `numpy` array. The resulting two vectors are then concatenated. The `extract_label` function simply converts the last column variable (the count) into a float.

With our utility functions defined, we can proceed with extracting feature vectors and labels from our data records:

    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r)))

Let's inspect the first record in the extracted feature RDD:

    first_point = data.first()
    print "Raw data: " + str(first[2:])
    print "Label: " + str(first_point.label)
    print "Linear Model feature vector:\n" + str(first_point.features)
    print "Linear Model feature vector length: " + str(len(first_point.features))

You should see output similar to the following:

    **Raw data: [u'1', u'0', u'1', u'0', u'0', u'6', u'0', u'1', u'0.24', u'0.2879', u'0.81', u'0', u'3', u'13', u'16']**
    **Label: 16.0**
    **Linear Model feature vector: [1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.24,0.2879,0.81,0.0]**
    **Linear Model feature vector length: 61**

As we can see, we converted the raw data into a feature vector made up of the binary categorical and real numeric features, and we indeed have a total vector length of `61`.

### Creating feature vectors for the decision tree

As we have seen, decision tree models typically work on raw features (that is, it is not required to convert categorical features into a binary vector encoding; they can, instead, be used directly).
Therefore, we will create a separate function to extract the decision tree feature vector, which simply converts all the values to floats and wraps them in a `numpy` array:

    def extract_features_dt(record):
        return np.array(map(float, record[2:14]))
    data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
    first_point_dt = data_dt.first()
    print "Decision Tree feature vector: " + str(first_point_dt.features)
    print "Decision Tree feature vector length: " + str(len(first_point_dt.features))

The following output shows the extracted feature vector, and we can see that we have a vector length of `12`, which matches the number of raw variables we are using:

    **Decision Tree feature vector: [1.0,0.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0]**
    **Decision Tree feature vector length: 12**

# Training and using regression models

Training regression models using decision trees and linear models follows the same procedure as for classification models. We simply pass the training data contained in an RDD of `LabeledPoint` instances to the relevant `train` method. Note that in Scala, if we wanted to customize the various model parameters (such as regularization and step size for the SGD optimizer), we would be required to instantiate a new model instance and use the `optimizer` field to access the available parameter setters.

In Python, we are provided with a convenience method that gives us access to all the available model arguments, so we only have to use this one entry point for training. We can see the details of these convenience functions by importing the relevant modules and then calling the `help` function on the `train` methods:

    from pyspark.mllib.regression import LinearRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree
    help(LinearRegressionWithSGD.train)

Doing this for the linear model outputs the following documentation:

Linear regression help documentation

We can see from the linear regression documentation that we need to pass in the training data at a minimum, but we can set any of the other model parameters using this `train` method.

Similarly, the decision tree model has a `trainRegressor` method (in addition to a `trainClassifier` method for classification models):

    help(DecisionTree.trainRegressor)

The preceding code will display the following documentation:

Decision tree regression help documentation

## Training a regression model on the bike sharing dataset

We're ready to use the features we have extracted to train our models on the bike sharing data. First, we'll train the linear regression model and take a look at the first few predictions that the model makes on the data:

    linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
    true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
    print "Linear Model predictions: " + str(true_vs_predicted.take(5))

Note that we have not used the default settings for `iterations` and `step` here. We've changed the number of iterations so that the model does not take too long to train. As for the step size, you will see why this has been changed from the default a little later.
You will see the following output:

    **Linear Model predictions: [(16.0, 119.30920003093595), (40.0, 116.95463511937379), (32.0, 116.57294610647752), (13.0, 116.43535423855654), (1.0, 116.221247828503)]**

Next, we will train the decision tree model simply using the default arguments to the `trainRegressor` method (which equates to using a tree depth of 5). Note that we need to pass in the other form of the dataset, `data_dt`, that we created from the raw feature values (as opposed to the binary encoded features that we used for the preceding linear model).

We also need to pass in an argument for `categoricalFeaturesInfo`. This is a dictionary that maps the categorical feature index to the number of categories for the feature. If a feature is not in this mapping, it will be treated as continuous. For our purposes, we will leave this as is, passing in an empty mapping:

    dt_model = DecisionTree.trainRegressor(data_dt, {})
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)
    print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
    print "Decision Tree depth: " + str(dt_model.depth())
    print "Decision Tree number of nodes: " + str(dt_model.numNodes())

This should output these predictions:

    **Decision Tree predictions: [(16.0, 54.913223140495866), (40.0, 54.913223140495866), (32.0, 53.171052631578945), (13.0, 14.284023668639053), (1.0, 14.284023668639053)]**
    **Decision Tree depth: 5**
    **Decision Tree number of nodes: 63**

### Note

Treating the categorical features as continuous variables is not as bad as it sounds. While we do not cover it here, the Python code included with this chapter's code bundle includes an example of using `categoricalFeaturesInfo`. It does not make a large difference to performance in this case.

From a quick glance at these predictions, it appears that the decision tree might do better, as the linear model is quite a way off in its predictions. However, we will apply more stringent evaluation methods to find out.

# Evaluating the performance of regression models

We saw in Chapter 5, _Building a Classification Model with Spark_ , that evaluation methods for classification models typically focus on measurements related to predicted class memberships relative to the actual class memberships. These are binary outcomes (either the predicted class is correct or incorrect), and it is less important whether the model just barely predicted correctly or not; what we care most about is the number of correct and incorrect predictions.

When dealing with regression models, it is very unlikely that our model will precisely predict the target variable, because the target variable can take on any real value. However, we would naturally like to understand how far away our predicted values are from the true values, so we will utilize a metric that takes into account the overall deviation.

Some of the standard evaluation metrics used to measure the performance of regression models include the **Mean Squared Error** ( **MSE** ) and **Root Mean Squared Error** ( **RMSE** ), the **Mean Absolute Error** ( **MAE** ), the R-squared coefficient, and many others.

## Mean Squared Error and Root Mean Squared Error

MSE is the average of the squared error that is used as the loss function for least squares regression. It is the sum, over all the data points, of the square of the difference between the predicted and actual target variables, divided by the number of data points.
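In standard notation, where _n_ is the number of data points, _yi_ is the actual target value of the _i_ th record, and _ŷi_ is the corresponding predicted value, this is:

    \mathrm{MSE} = \frac{1}{n} \sum_{i=1}^{n} \left( \hat{y}_i - y_i \right)^2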
RMSE is the square root of MSE. MSE is measured in units that are the square of the target variable, while RMSE is measured in the same units as the target variable. Due to its formulation, MSE, just like the squared loss function that it derives from, effectively penalizes larger errors more severely.

In order to evaluate our model based on the mean of an error metric, we will first make predictions for each input feature vector in an RDD of `LabeledPoint` instances; we will then compute the error for each record using a function that takes the prediction and the true target value as inputs. This will return an RDD of `Double` values that contains the error values. We can then find the average using the `mean` method of RDDs that contain `Double` values.

Let's define our squared error function as follows:

    def squared_error(actual, pred):
        return (pred - actual)**2

## Mean Absolute Error

MAE is the average of the absolute differences between the predicted and actual targets. It is similar in principle to MSE, but it does not punish large deviations as much.

Our function to compute MAE is as follows:

    def abs_error(actual, pred):
        return np.abs(pred - actual)

## Root Mean Squared Log Error

This measurement is not as widely used as MSE and MAE, but it is used as the metric for the Kaggle competition that uses the bike sharing dataset. It is effectively the RMSE of the log-transformed predicted and target values. This measurement is useful when there is a wide range in the target variable, and you do not necessarily want to penalize large errors when the predicted and target values are themselves high. It is also effective when you care about percentage errors rather than the absolute value of errors.

### Note

The RMSLE metric is described in detail on the Kaggle competition's evaluation page.

The function to compute RMSLE is shown here:

    def squared_log_error(pred, actual):
        return (np.log(pred + 1) - np.log(actual + 1))**2

## The R-squared coefficient

The R-squared coefficient, also known as the coefficient of determination, is a measure of how well a model fits a dataset. It is commonly used in statistics. It measures the degree of variation in the target variable that is explained by the variation in the input features. An R-squared coefficient generally takes a value between 0 and 1, where 1 equates to a perfect fit of the model.

## Computing performance metrics on the bike sharing dataset

Given the functions we defined earlier, we can now compute the various evaluation metrics on our bike sharing data.
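For reference, the metrics implemented by the preceding functions can be written in standard notation as follows (these are the textbook definitions, matching the Python functions we just defined; _ȳ_ denotes the mean of the actual target values):

    \mathrm{MAE} = \frac{1}{n} \sum_{i=1}^{n} \left| \hat{y}_i - y_i \right|

    \mathrm{RMSLE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left( \log(\hat{y}_i + 1) - \log(y_i + 1) \right)^2}

    R^2 = 1 - \frac{\sum_{i=1}^{n} \left( y_i - \hat{y}_i \right)^2}{\sum_{i=1}^{n} \left( y_i - \bar{y} \right)^2}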
### Linear model

Our approach will be to apply the relevant error function to each record in the `RDD` we computed earlier, which is `true_vs_predicted` for our linear model:

    mse = true_vs_predicted.map(lambda (t, p): squared_error(t, p)).mean()
    mae = true_vs_predicted.map(lambda (t, p): abs_error(t, p)).mean()
    rmsle = np.sqrt(true_vs_predicted.map(lambda (t, p): squared_log_error(t, p)).mean())
    print "Linear Model - Mean Squared Error: %2.4f" % mse
    print "Linear Model - Mean Absolute Error: %2.4f" % mae
    print "Linear Model - Root Mean Squared Log Error: %2.4f" % rmsle

This outputs the following metrics:

    **Linear Model - Mean Squared Error: 28166.3824**
    **Linear Model - Mean Absolute Error: 129.4506**
    **Linear Model - Root Mean Squared Log Error: 1.4974**

### Decision tree

We will use the same approach for the decision tree model, using the `true_vs_predicted_dt` RDD:

    mse_dt = true_vs_predicted_dt.map(lambda (t, p): squared_error(t, p)).mean()
    mae_dt = true_vs_predicted_dt.map(lambda (t, p): abs_error(t, p)).mean()
    rmsle_dt = np.sqrt(true_vs_predicted_dt.map(lambda (t, p): squared_log_error(t, p)).mean())
    print "Decision Tree - Mean Squared Error: %2.4f" % mse_dt
    print "Decision Tree - Mean Absolute Error: %2.4f" % mae_dt
    print "Decision Tree - Root Mean Squared Log Error: %2.4f" % rmsle_dt

You should see output similar to this:

    **Decision Tree - Mean Squared Error: 11560.7978**
    **Decision Tree - Mean Absolute Error: 71.0969**
    **Decision Tree - Root Mean Squared Log Error: 0.6259**

Looking at the results, we can see that our initial guess about the decision tree model being the better performer is indeed true.

### Note

The Kaggle competition leaderboard lists the Mean Value Benchmark score on the test set at about 1.58. So, we see that our linear model performance is not much better. However, the decision tree with default settings achieves a performance of 0.63.

The winning score at the time of writing this book is listed as 0.29504.

# Improving model performance and tuning parameters

In Chapter 5, _Building a Classification Model with Spark_ , we showed how feature transformation and selection can make a large difference to the performance of a model. In this chapter, we will focus on another type of transformation that can be applied to a dataset: transforming the target variable itself.

## Transforming the target variable

Recall that many machine learning models, including linear models, make assumptions regarding the distribution of the input data as well as target variables. In particular, standard linear regression assumes that the target variable (more precisely, the model's error term) follows a normal distribution.

In many real-world cases, the distributional assumptions of linear regression do not hold. In this case, for example, we know that the number of bike rentals can never be negative. This alone should indicate that the assumption of normality might be problematic. To get a better idea of the target distribution, it is often a good idea to plot a histogram of the target values.

In this section, if you are using IPython Notebook, enter the magic function, `%pylab inline`, to import `pylab` (that is, the `numpy` and `matplotlib` plotting functions) into the workspace. This will also create any figures and plots inline within the notebook cell.

If you are using the standard IPython console, you can use `%pylab` to import the necessary functionality (your plots will appear in a separate window).
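If you would rather not rely on the `%pylab` magic at all, a minimal explicit setup is sketched here; it assumes `numpy` and `matplotlib` are installed and simply imports the plotting names (`hist`, `plot`, `bar`) that the following code uses from the `pylab` namespace:

    # A sketch of the imports that %pylab would otherwise provide.
    # Assumes numpy and matplotlib are installed.
    import numpy as np
    import matplotlib
    import matplotlib.pyplot
    from matplotlib.pyplot import hist, plot, bar, show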
We will now create a plot of the target variable distribution in the following piece of code:

    targets = records.map(lambda r: float(r[-1])).collect()
    hist(targets, bins=40, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

Looking at the histogram plot, we can see that the distribution is highly skewed and certainly does not follow a normal distribution:

Distribution of raw target variable values

One way in which we might deal with this situation is by applying a transformation to the target variable, such that we take the logarithm of the target value instead of the raw value. This is often referred to as log-transforming the target variable (this transformation can also be applied to feature values).

We will now apply a log transformation to the target variable and plot a histogram of the log-transformed values:

    log_targets = records.map(lambda r: np.log(float(r[-1]))).collect()
    hist(log_targets, bins=40, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

Distribution of log-transformed target variable values

A second type of transformation that is useful for target values that do not take on negative values and, in addition, might take on a very wide range of values, is to take the square root of the variable.

We will apply the square root transform in the following code, once more plotting the resulting target variable distribution:

    sqrt_targets = records.map(lambda r: np.sqrt(float(r[-1]))).collect()
    hist(sqrt_targets, bins=40, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

From the plots of the log and square root transformations, we can see that both result in a more even distribution relative to the raw values. While they are still not normally distributed, they are a lot closer to a normal distribution when compared to the original target variable.

Distribution of square-root-transformed target variable values

### Impact of training on log-transformed targets

So, does applying these transformations have any impact on model performance? Let's evaluate the various metrics we used previously on log-transformed data as an example.

We will do this first for the linear model by applying the `numpy log` function to the `label` field of each `LabeledPoint` RDD. Here, we will only transform the target variable, and we will not apply any transformations to the features:

    data_log = data.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))

We will then train a model on this transformed data and form the RDD of predicted versus true values:

    model_log = LinearRegressionWithSGD.train(data_log, iterations=10, step=0.1)

Note that now that we have transformed the target variable, the predictions of the model will be on the log scale, as will the target values of the transformed dataset. Therefore, in order to use our model and evaluate its performance, we must first transform the log data back into the original scale by taking the exponent of both the predicted and true values using the `numpy exp` function.
We will show you how to do this in the code here:

    true_vs_predicted_log = data_log.map(lambda p: (np.exp(p.label), np.exp(model_log.predict(p.features))))

Finally, we will compute the MSE, MAE, and RMSLE metrics for the model:

    mse_log = true_vs_predicted_log.map(lambda (t, p): squared_error(t, p)).mean()
    mae_log = true_vs_predicted_log.map(lambda (t, p): abs_error(t, p)).mean()
    rmsle_log = np.sqrt(true_vs_predicted_log.map(lambda (t, p): squared_log_error(t, p)).mean())
    print "Mean Squared Error: %2.4f" % mse_log
    print "Mean Absolute Error: %2.4f" % mae_log
    print "Root Mean Squared Log Error: %2.4f" % rmsle_log
    print "Non log-transformed predictions:\n" + str(true_vs_predicted.take(3))
    print "Log-transformed predictions:\n" + str(true_vs_predicted_log.take(3))

You should see output similar to the following:

    **Mean Squared Error: 38606.0875**
    **Mean Absolute Error: 135.2726**
    **Root Mean Squared Log Error: 1.3516**
    **Non log-transformed predictions:**
    **[(16.0, 119.30920003093594), (40.0, 116.95463511937378), (32.0, 116.57294610647752)]**
    **Log-transformed predictions:**
    **[(15.999999999999998, 45.860944832110015), (40.0, 43.255903592233274), (32.0, 42.311306147884252)]**

If we compare these results to the results on the raw target variable, we see that while we did not improve the MSE or MAE, we improved the RMSLE.

We will perform the same analysis for the decision tree model:

    data_dt_log = data_dt.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))
    dt_model_log = DecisionTree.trainRegressor(data_dt_log, {})

    preds_log = dt_model_log.predict(data_dt_log.map(lambda p: p.features))
    actual_log = data_dt_log.map(lambda p: p.label)
    true_vs_predicted_dt_log = actual_log.zip(preds_log).map(lambda (t, p): (np.exp(t), np.exp(p)))

    mse_log_dt = true_vs_predicted_dt_log.map(lambda (t, p): squared_error(t, p)).mean()
    mae_log_dt = true_vs_predicted_dt_log.map(lambda (t, p): abs_error(t, p)).mean()
    rmsle_log_dt = np.sqrt(true_vs_predicted_dt_log.map(lambda (t, p): squared_log_error(t, p)).mean())
    print "Mean Squared Error: %2.4f" % mse_log_dt
    print "Mean Absolute Error: %2.4f" % mae_log_dt
    print "Root Mean Squared Log Error: %2.4f" % rmsle_log_dt
    print "Non log-transformed predictions:\n" + str(true_vs_predicted_dt.take(3))
    print "Log-transformed predictions:\n" + str(true_vs_predicted_dt_log.take(3))

From the results here, we can see that we actually made our metrics slightly worse for the decision tree:

    **Mean Squared Error: 14781.5760**
    **Mean Absolute Error: 76.4131**
    **Root Mean Squared Log Error: 0.6406**
    **Non log-transformed predictions:**
    **[(16.0, 54.913223140495866), (40.0, 54.913223140495866), (32.0, 53.171052631578945)]**
    **Log-transformed predictions:**
    **[(15.999999999999998, 37.530779787154508), (40.0, 37.530779787154508), (32.0, 7.2797070993907287)]**

### Tip

It is probably not surprising that the log transformation results in a better RMSLE performance for the linear model. As we are minimizing the squared error, once we have transformed the target variable to log values, we are effectively minimizing a loss function that is very similar to the RMSLE.

This is good for Kaggle competition purposes, since we can more directly optimize against the competition-scoring metric.

It might or might not be as useful in a real-world situation.
This depends on how important larger absolute errors are (recall that RMSLE essentially penalizes relative errors rather than the absolute magnitude of errors).

## Tuning model parameters

So far in this chapter, we have illustrated the concepts of model training and evaluation for MLlib's regression models by training and testing on the same dataset. We will now use a cross-validation approach similar to the one we used previously to evaluate the effect of different parameter settings on the performance of our models.

### Creating training and testing sets to evaluate parameters

The first step is to create a test and training set for cross-validation purposes. Spark's Python API does not yet provide the `randomSplit` convenience method that is available in Scala. Hence, we will need to create a training and test dataset manually.

One relatively easy way to do this is by first taking a random sample of, say, 20 percent of our data as our test set. We will then define our training set as the elements of the original RDD that are not in the test set RDD.

We can achieve this using the `sample` method to take a random sample for our test set, followed by using the `subtractByKey` method, which takes care of returning the elements in one RDD where the keys do not overlap with those of the other RDD.

Note that `subtractByKey`, as the name suggests, works on the keys of the RDD elements that consist of key-value pairs. Therefore, here we will use `zipWithIndex` on our RDD of extracted training examples. This creates an RDD of `(LabeledPoint, index)` pairs.

We will then reverse the keys and values so that we can operate on the index keys:

    data_with_idx = data.zipWithIndex().map(lambda (k, v): (v, k))
    test = data_with_idx.sample(False, 0.2, 42)
    train = data_with_idx.subtractByKey(test)

Once we have the two RDDs, we will recover just the `LabeledPoint` instances we need for training and test data, using `map` to extract the value from the key-value pairs:

    train_data = train.map(lambda (idx, p): p)
    test_data = test.map(lambda (idx, p): p)
    train_size = train_data.count()
    test_size = test_data.count()
    print "Training data size: %d" % train_size
    print "Test data size: %d" % test_size
    print "Total data size: %d" % num_data
    print "Train + Test size : %d" % (train_size + test_size)

We can confirm that we now have two distinct datasets that add up to the original dataset in total:

    **Training data size: 13934**
    **Test data size: 3445**
    **Total data size: 17379**
    **Train + Test size : 17379**

The final step is to apply the same approach to the features extracted for the decision tree model:

    data_with_idx_dt = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
    test_dt = data_with_idx_dt.sample(False, 0.2, 42)
    train_dt = data_with_idx_dt.subtractByKey(test_dt)
    train_data_dt = train_dt.map(lambda (idx, p): p)
    test_data_dt = test_dt.map(lambda (idx, p): p)

### The impact of parameter settings for linear models

Now that we have prepared our training and test sets, we are ready to investigate the impact of different parameter settings on model performance. We will first carry out this evaluation for the linear model. We will create a convenience function to evaluate the relevant performance metric by training the model on the training set and evaluating it on the test set for different parameter settings.
We will use the RMSLE evaluation metric, as it is the one used in the Kaggle competition with this dataset, and this allows us to compare our model results against the competition leaderboard to see how we perform.

The evaluation function is defined here:

    def evaluate(train, test, iterations, step, regParam, regType, intercept):
        model = LinearRegressionWithSGD.train(train, iterations, step, regParam=regParam, regType=regType, intercept=intercept)
        tp = test.map(lambda p: (p.label, model.predict(p.features)))
        rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t, p)).mean())
        return rmsle

### Tip

Note that in the following sections, you might get slightly different results due to some random initialization for SGD. However, your results will be comparable.

#### Iterations

As we saw when evaluating our classification models, we generally expect that a model trained with SGD will achieve better performance as the number of iterations increases, although the increase in performance will slow down as the number of iterations goes above some minimum number. Note that here, we will set the step size to 0.01 to better illustrate the impact at higher iteration numbers:

    params = [1, 5, 10, 20, 50, 100]
    metrics = [evaluate(train_data, test_data, param, 0.01, 0.0, 'l2', False) for param in params]
    print params
    print metrics

The output shows that the error metric indeed decreases as the number of iterations increases. It also does so at a decreasing rate, again as expected. What is interesting is that eventually, the SGD optimization tends to overshoot the optimal solution, and the RMSLE eventually starts to increase slightly:

    **[1, 5, 10, 20, 50, 100]**
    **[2.3532904530306888, 1.6438528499254723, 1.4869656275309227, 1.4149741941240344, 1.4159641262731959, 1.4539667094611679]**

Here, we will use the `matplotlib` library to plot a graph of the RMSLE metric against the number of iterations. We will use a log scale for the _x_ axis to make the output easier to visualize:

    plot(params, metrics)
    fig = matplotlib.pyplot.gcf()
    pyplot.xscale('log')

Metrics for varying number of iterations

#### Step size

We will perform a similar analysis for step size in the following code:

    params = [0.01, 0.025, 0.05, 0.1, 1.0]
    metrics = [evaluate(train_data, test_data, 10, param, 0.0, 'l2', False) for param in params]
    print params
    print metrics

The output of the preceding code is as follows:

    **[0.01, 0.025, 0.05, 0.1, 1.0]**
    **[1.4869656275309227, 1.4189071944747715, 1.5027293911925559, 1.5384660954019973, nan]**

Now, we can see why we avoided using the default step size when training the linear model originally. The default is set to _1.0_ , which, in this case, results in a `nan` output for the RMSLE metric. This typically means that the SGD model has converged to a very poor local minimum in the error function that it is optimizing. This can happen when the step size is relatively large, as it is easier for the optimization algorithm to overshoot good solutions.

We can also see that for low step sizes and a relatively low number of iterations (we used 10 here), the model performance is slightly poorer. However, in the preceding _Iterations_ section, we saw that for the lower step-size setting, a higher number of iterations will generally converge to a better solution.

Generally speaking, setting step size and number of iterations involves a trade-off. A lower step size means that convergence is slower but slightly more assured.
However, it requires a higher number of iterations, which is more costly in terms of computation and time, in particular at a very large scale.

### Tip

Selecting the best parameter settings can be an intensive process that involves training a model on many combinations of parameter settings and selecting the best outcome. Each instance of model training involves a number of iterations, so this process can be very expensive and time consuming when performed on very large datasets.

The output is plotted here, again using a log scale for the step-size axis:

Metrics for varying values of step size

#### L2 regularization

In Chapter 5, _Building a Classification Model with Spark_ , we saw that regularization has the effect of penalizing model complexity in the form of an additional loss term that is a function of the model weight vector. L2 regularization penalizes the L2-norm of the weight vector, while L1 regularization penalizes the L1-norm.

We expect training set performance to deteriorate with increasing regularization, as the model cannot fit the dataset as well. However, we would also expect some amount of regularization to result in optimal generalization performance, as evidenced by the best performance on the test set.

We will evaluate the impact of different levels of L2 regularization in this code:

    params = [0.0, 0.01, 0.1, 1.0, 5.0, 10.0, 20.0]
    metrics = [evaluate(train_data, test_data, 10, 0.1, param, 'l2', False) for param in params]
    print params
    print metrics
    plot(params, metrics)
    fig = matplotlib.pyplot.gcf()
    pyplot.xscale('log')

As expected, there is an optimal setting of the regularization parameter with respect to the test set RMSLE:

    **[0.0, 0.01, 0.1, 1.0, 5.0, 10.0, 20.0]**
    **[1.5384660954019971, 1.5379108106882864, 1.5329809395123755, 1.4900275345312988, 1.4016676336981468, 1.40998359211149, 1.5381771283158705]**

This is easiest to see in the following plot (where we once more use the log scale for the regularization parameter axis):

Metrics for varying levels of L2 regularization

#### L1 regularization

We can apply the same approach for differing levels of L1 regularization:

    params = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
    metrics = [evaluate(train_data, test_data, 10, 0.1, param, 'l1', False) for param in params]
    print params
    print metrics
    plot(params, metrics)
    fig = matplotlib.pyplot.gcf()
    pyplot.xscale('log')

Again, the results are more clearly seen when plotted in the following graph. We see that there is a much more subtle decline in RMSLE, and it takes a very high value to cause a jump back up. Here, the level of L1 regularization required is much higher than that for the L2 form; however, the overall performance is poorer:

    **[0.0, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]**
    **[1.5384660954019971, 1.5384518080419873, 1.5383237472930684, 1.5372017600929164, 1.5303809928601677, 1.4352494587433793, 4.7551250073268614]**

Metrics for varying levels of L1 regularization

Using L1 regularization can encourage sparse weight vectors. Does this hold true in this case?
We can find out by examining the number of entries in the weight vector that are zero, with increasing levels of regularization:

    model_l1 = LinearRegressionWithSGD.train(train_data, 10, 0.1, regParam=1.0, regType='l1', intercept=False)
    model_l1_10 = LinearRegressionWithSGD.train(train_data, 10, 0.1, regParam=10.0, regType='l1', intercept=False)
    model_l1_100 = LinearRegressionWithSGD.train(train_data, 10, 0.1, regParam=100.0, regType='l1', intercept=False)
    print "L1 (1.0) number of zero weights: " + str(sum(model_l1.weights.array == 0))
    print "L1 (10.0) number of zero weights: " + str(sum(model_l1_10.weights.array == 0))
    print "L1 (100.0) number of zero weights: " + str(sum(model_l1_100.weights.array == 0))

We can see from the results that, as we might expect, the number of zero feature weights in the model weight vector increases as greater levels of L1 regularization are applied:

    **L1 (1.0) number of zero weights: 4**
    **L1 (10.0) number of zero weights: 20**
    **L1 (100.0) number of zero weights: 55**

#### Intercept

The final parameter option for the linear model is whether to use an intercept or not. An intercept is a constant term that is added to the model and effectively accounts for the mean value of the target variable. If the data is already centered or normalized, an intercept is not necessary; however, it often does not hurt to use one in any case.

We will evaluate the effect of adding an intercept term to the model here:

    params = [False, True]
    metrics = [evaluate(train_data, test_data, 10, 0.1, 1.0, 'l2', param) for param in params]
    print params
    print metrics
    bar(params, metrics, color='lightblue')
    fig = matplotlib.pyplot.gcf()

We can see from the result and plot that adding the intercept term results in a very slight increase in RMSLE:

    **[False, True]**
    **[1.4900275345312988, 1.506469812020645]**

Metrics without and with an intercept

### The impact of parameter settings for the decision tree

Decision trees provide two main parameters: maximum tree depth and the maximum number of bins. We will now perform the same evaluation of the effect of parameter settings for the decision tree model. Our starting point is to create an evaluation function for the model, similar to the one used for the linear regression earlier. This function is provided here:

    def evaluate_dt(train, test, maxDepth, maxBins):
        model = DecisionTree.trainRegressor(train, {}, impurity='variance', maxDepth=maxDepth, maxBins=maxBins)
        preds = model.predict(test.map(lambda p: p.features))
        actual = test.map(lambda p: p.label)
        tp = actual.zip(preds)
        rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t, p)).mean())
        return rmsle

#### Tree depth

We would generally expect performance to increase with more complex trees (that is, trees of greater depth). Having a lower tree depth acts as a form of regularization, and it might be the case that, as with L2 or L1 regularization in linear models, there is a tree depth that is optimal with respect to the test set performance.
Here, we will try increasing the tree depth to see what impact it has on test set RMSLE, keeping the number of bins at the default level of `32`:

    params = [1, 2, 3, 4, 5, 10, 20]
    metrics = [evaluate_dt(train_data_dt, test_data_dt, param, 32) for param in params]
    print params
    print metrics
    plot(params, metrics)
    fig = matplotlib.pyplot.gcf()

In this case, it appears that the decision tree starts over-fitting at deeper tree levels. An optimal tree depth appears to be around 10 on this dataset.

### Note

Notice that our best RMSLE of 0.42 is now quite close to the Kaggle winner of around 0.29!

The output of the tree depth evaluation is as follows:

    **[1, 2, 3, 4, 5, 10, 20]**
    **[1.0280339660196287, 0.92686672078778276, 0.81807794023407532, 0.74060228537329209, 0.63583503599563096, 0.42851360418692447, 0.45500008049779139]**

Metrics for different tree depths

#### Maximum bins

Finally, we will perform our evaluation on the impact of setting the number of bins for the decision tree. As with the tree depth, a larger number of bins should allow the model to become more complex and might help performance with larger feature dimensions. After a certain point, it is unlikely that it will help any more and might, in fact, hinder performance on the test set due to over-fitting:

    params = [2, 4, 8, 16, 32, 64, 100]
    metrics = [evaluate_dt(train_data_dt, test_data_dt, 5, param) for param in params]
    print params
    print metrics
    plot(params, metrics)
    fig = matplotlib.pyplot.gcf()

Here, we show the output and plot for varying the number of bins (while keeping the tree depth at the default level of 5). In this case, using a small number of bins hurts performance, while there is no impact when we use around 32 bins (the default setting) or more. There seems to be an optimal setting for test set performance at around 16-20 bins:

    **[2, 4, 8, 16, 32, 64, 100]**
    **[1.3069788763726049, 0.81923394899750324, 0.75745322513058744, 0.62328384445223795, 0.63583503599563096, 0.63583503599563096, 0.63583503599563096]**

Metrics for different maximum bins

# Summary

In this chapter, you saw how to use MLlib's linear model and decision tree functionality in Python within the context of regression models. We explored categorical feature extraction and the impact of applying transformations to the target variable in a regression problem. Finally, we implemented various performance-evaluation metrics and used them to implement a cross-validation exercise that explores the impact of the various parameter settings available in both linear models and decision trees on test set model performance.

In the next chapter, we will cover a different approach to machine learning, namely unsupervised learning, specifically clustering models.

# Chapter 7. Building a Clustering Model with Spark

In the last few chapters, we covered supervised learning methods, where the training data is labeled with the true outcome that we would like to predict (for example, a rating for recommendations, a class assignment for classification, or a real target variable in the case of regression).

Next, we will consider the case when we do not have labeled data available. This is called unsupervised learning, as the model is not supervised with the true target label.
The unsupervised case is very common in practice, since obtaining labeled training data can be very difficult or expensive in many real-world scenarios (for example, having humans label training data with class labels for classification). However, we would still like to learn some underlying structure in the data and use this structure to make predictions.

This is where unsupervised learning approaches can be useful. Unsupervised learning models are also often combined with supervised models, for example, applying unsupervised techniques to create new input features for supervised models.

Clustering models are, in many ways, the unsupervised equivalent of classification models. With classification, we tried to learn a model that would predict which class a given training example belonged to. The model was essentially a mapping from a set of features to the class.

In clustering, we would like to segment the data such that each training example is assigned to a segment called a **cluster**. The clusters act much like classes, except that the true class assignments are unknown.

Clustering models have many use cases that are the same as classification; these include the following:

  * Segmenting users or customers into different groups based on behavior characteristics and metadata
  * Grouping content on a website or products in a retail business
  * Finding clusters of similar genes
  * Segmenting communities in ecology
  * Creating image segments for use in image analysis applications such as object detection

In this chapter, we will:

  * Briefly explore a few types of clustering models
  * Extract features from data specifically using the output of one model as input features for our clustering model
  * Train a clustering model and use it to make predictions
  * Apply performance-evaluation and parameter-selection techniques to select the optimal number of clusters to use

# Types of clustering models

There are many different forms of clustering models available, ranging from simple to extremely complex ones. The MLlib library currently provides K-means clustering, which is among the simplest approaches available. However, it is often very effective, and its simplicity means it is relatively easy to understand and is scalable.

## K-means clustering

K-means attempts to partition a set of data points into K distinct clusters (where K is an input parameter for the model).

More formally, K-means tries to find clusters so as to minimize the sum of squared errors (or distances) within each cluster. This objective function is known as the **within cluster sum of squared errors** ( **WCSS** ). It is the sum, over each cluster, of the squared errors between each point and the cluster center.

Starting with a set of K initial cluster centers, the standard method for K-means iterates between the following two steps (a cluster center is simply the mean vector of all the data points assigned to that cluster):

 1. Assign each data point to the cluster that minimizes the WCSS. The sum of squares is equivalent to the squared Euclidean distance; therefore, this equates to assigning each point to the **closest** cluster center as measured by the Euclidean distance metric.
 2. Compute the new cluster centers based on the cluster assignments from the first step.

The algorithm proceeds until either a maximum number of iterations has been reached or **convergence** has been achieved.
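In standard notation, with clusters _C1, ..., CK_ and corresponding cluster centers _μ1, ..., μK_, the WCSS objective that these two steps minimize can be written as:

    \mathrm{WCSS} = \sum_{k=1}^{K} \sum_{x_i \in C_k} \left\lVert x_i - \mu_k \right\rVert^2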
Convergence means that the cluster assignments no longer change during the first step; therefore, the value of the WCSS objective function does not change either.

### Tip

For more details, refer to Spark's documentation on clustering.

To illustrate the basics of K-means, we will use the simple dataset we showed in our multiclass classification example in Chapter 5, _Building a Classification Model with Spark_. Recall that we have five classes, which are shown in the following figure:

Multiclass dataset

However, assume that we don't actually know the true classes. If we use K-means with five clusters, then after the first step, the model's cluster assignments might look like this:

Cluster assignments after the first K-means iteration

We can see that K-means has already picked out the centers of each cluster fairly well. After the next iteration, the assignments might look like those shown in the following figure:

Cluster assignments after the second K-means iteration

Things are starting to stabilize, but the overall cluster assignments are broadly the same as they were after the first iteration. Once the model has converged, the final assignments could look like this:

Final cluster assignments for K-means

As we can see, the model has done a decent job of separating the five clusters. The leftmost three are fairly accurate (with a few incorrect points). However, the two clusters in the bottom-right corner are less accurate.

This illustrates:

  * The iterative nature of K-means
  * The model's dependency on the method of initially selecting cluster centers (here, we will use a random approach)
  * That the final cluster assignments can be very good for well-separated data but can be poor for data that is more difficult

### Initialization methods

The standard initialization method for K-means, usually simply referred to as the random method, starts by randomly assigning each data point to a cluster before proceeding with the first update step.

An alternative initialization method, **K-means++**, selects each new initial cluster center with probability proportional to its squared distance from the centers already chosen. MLlib provides a parallel variant of this method, called **K-means ||**; this is the default initialization method used.

### Note

See the original K-means++ and scalable K-means (K-means ||) papers for more information.

The results of using K-means++ are shown here. Note that this time, the difficult lower-right points have been mostly correctly clustered.

Final cluster assignments for K-means++

### Variants

There are many other variants of K-means; they focus on initialization methods or the core model. One of the more common variants is fuzzy K-means. This model does not assign each point to one cluster as K-means does (a so-called hard assignment). Instead, it is a soft version of K-means, where each point can belong to many clusters, and is represented by the relative membership to each cluster. So, for K clusters, each point is represented as a K-dimensional membership vector, with each entry in this vector indicating the membership proportion in each cluster.

## Mixture models

A **mixture model** is essentially an extension of the idea behind fuzzy K-means; however, it makes an assumption that there is an underlying probability distribution that generates the data. For example, we might assume that the data points are drawn from a set of K independent Gaussian (normal) probability distributions.
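In standard notation, a Gaussian mixture models the density of a data point _x_ as a weighted sum of _K_ Gaussian components, where the mixing weights _πk_ are non-negative and sum to one:

    p(x) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x \mid \mu_k, \Sigma_k)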
The cluster assignments are also soft, so each point is represented by K membership weights in each of the K underlying probability distributions.

### Note

See the statistics literature on mixture models for further details and a mathematical treatment.

## Hierarchical clustering

 **Hierarchical clustering** is a structured clustering approach that results in a multilevel hierarchy of clusters, where each cluster might contain many subclusters (or child clusters). Each child cluster is, thus, linked to the parent cluster. This form of clustering is often also called tree clustering.

Agglomerative clustering is a bottom-up approach where:

  * Each data point begins in its own cluster
  * The similarity (or distance) between each pair of clusters is evaluated
  * The pair of clusters that are most similar are found; this pair is then merged to form a new cluster
  * The process is repeated until only one top-level cluster remains

 **Divisive** clustering is a top-down approach that works in reverse, starting with one cluster and at each stage, splitting a cluster into two, until all data points are allocated to their own bottom-level cluster.

### Note

The Wikipedia article on hierarchical clustering has more information.

# Extracting the right features from your data

Like most of the machine learning models we have encountered so far, K-means clustering requires numerical vectors as input. The same feature extraction and transformation approaches that we have seen for classification and regression are applicable for clustering.

As K-means, like least squares regression, uses a squared error function as the optimization objective, it tends to be impacted by outliers and features with large variance.

As for regression and classification cases, input data can be normalized and standardized to overcome this, which might improve accuracy. In some cases, however, it might be desirable not to standardize data, if, for example, the objective is to find segmentations according to certain specific features.

## Extracting features from the MovieLens dataset

For this example, we will return to the movie rating dataset we used in Chapter 4, _Building a Recommendation Engine with Spark_. Recall that we have three main datasets: one that contains the movie ratings (in the `u.data` file), a second one with user data (`u.user`), and a third one with movie data (`u.item`). We will also be using the genre data file to extract the genres for each movie (`u.genre`).

We will start by looking at the movie data:

    val movies = sc.textFile("/PATH/ml-100k/u.item")
    println(movies.first)

This should output the first line of the dataset:

    **1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0**

So, we have access to the movie title, and we already have the movies categorized into genres. Why do we need to apply a clustering model to the movies? Clustering the movies is a useful exercise for two reasons:

  * First, because we have access to the true genre labels, we can use these to evaluate the quality of the clusters that the model finds
  * Second, we might wish to segment the movies based on some other attributes or features, apart from their genres

For example, in this case, it seems that we don't have a lot of data to use for clustering, apart from the genres and title. However, this is not true--we also have the ratings data. Previously, we created a matrix factorization model from the ratings data.
The model is made up of a set of user and movie factor vectors.

We can think of the movie factors as representing each movie in a new latent feature space, where each latent feature, in turn, represents some form of structure in the ratings matrix. While it is not possible to directly interpret each latent feature, they might represent some hidden structure that influences the ratings behavior between users and movies. One factor could represent genre preference, another could refer to actors or directors, while yet another could represent the theme of the movie, and so on.

So, if we use these factor vector representations of each movie as inputs to our clustering model, we will end up with a clustering that is based on the actual rating behavior of users rather than manual genre assignments.

The same logic applies to the user factors--they represent users in the latent feature space of rating behavior, so clustering the user vectors should result in a clustering based on user rating behavior.

### Extracting movie genre labels

Before proceeding further, let's extract the genre mappings from the `u.genre` file. As you can see from the first line of the preceding dataset, we will need to map from the numerical genre assignments to the textual version so that they are readable.

Take a look at the first few lines of `u.genre`:

    val genres = sc.textFile("/PATH/ml-100k/u.genre")
    genres.take(5).foreach(println)

You should see the following output displayed:

    **unknown|0**
    **Action|1**
    **Adventure|2**
    **Animation|3**
    **Children's|4**

Here, `0` is the index of the relevant genre, while `unknown` is the genre assigned for this index. The indices correspond to the indices of the binary subvector that will represent the genres for each movie (that is, the 0s and 1s in the preceding movie data).

To extract the genre mappings, we will split each line and extract a key-value pair, where the key is the text genre and the value is the index. Note that we have to filter out an empty line at the end; this will, otherwise, throw an error when we try to split the line (see the `filter` call in the following code):

    val genreMap = genres.filter(!_.isEmpty).map(line => line.split("\\|")).map(array => (array(1), array(0))).collectAsMap
    println(genreMap)

The preceding code will provide the following output:

    **Map(2 -> Adventure, 5 -> Comedy, 12 -> Musical, 15 -> Sci-Fi, 8 -> Drama, 18 -> Western, ...**

Next, we'll create a new RDD from the movie data and our genre mapping; this RDD contains the movie ID, title, and genres. We will use this later to create a more readable output when we evaluate the clusters assigned to each movie by our clustering model.

In the following code section, we will map over each movie and extract the genres subvector (which will still contain `Strings` rather than `Int` indexes). We will then apply the `zipWithIndex` method to create a new collection that contains the indices of the genre subvector, and we will filter this collection so that we are left only with the positive assignments (that is, the 1s that denote a genre assignment for the relevant index). We can then use our extracted genre mapping to map these indices to the textual genres.
Finally, we will inspect the first record of the new `RDD` to see the result of these operations:

    val titlesAndGenres = movies.map(_.split("\\|")).map { array =>
      val genres = array.toSeq.slice(5, array.size)
      val genresAssigned = genres.zipWithIndex.filter { case (g, idx) =>
        g == "1"
      }.map { case (g, idx) =>
        genreMap(idx.toString)
      }
      (array(0).toInt, (array(1), genresAssigned))
    }
    println(titlesAndGenres.first)

This should output the following result:

    **(1,(Toy Story (1995),ArrayBuffer(Animation, Children's, Comedy)))**

### Training the recommendation model

To get the user and movie factor vectors, we first need to train another recommendation model. Fortunately, we have already done this in Chapter 4, _Building a Recommendation Engine with Spark_ , so we will follow the same procedure:

    import org.apache.spark.mllib.recommendation.ALS
    import org.apache.spark.mllib.recommendation.Rating
    val rawData = sc.textFile("/PATH/ml-100k/u.data")
    val rawRatings = rawData.map(_.split("\t").take(3))
    val ratings = rawRatings.map{ case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    ratings.cache
    val alsModel = ALS.train(ratings, 50, 10, 0.1)

Recall from Chapter 4, _Building a Recommendation Engine with Spark_ , that the ALS model returned contains the factors in two RDDs of key-value pairs (called `userFeatures` and `productFeatures`) with the user or movie ID as the key and the factor as the value. We will need to extract just the factors and transform each one of them into an MLlib `Vector` to use as training input for our clustering model.

We will do this for both users and movies as follows:

    import org.apache.spark.mllib.linalg.Vectors
    val movieFactors = alsModel.productFeatures.map { case (id, factor) => (id, Vectors.dense(factor)) }
    val movieVectors = movieFactors.map(_._2)
    val userFactors = alsModel.userFeatures.map { case (id, factor) => (id, Vectors.dense(factor)) }
    val userVectors = userFactors.map(_._2)

### Normalization

Before we train our clustering model, it might be useful to look into the distribution of the input data in the form of the factor vectors. This will tell us whether we need to normalize the training data.
We will follow the same approach as we did in Chapter 5, _Building a Classification Model with Spark_ , using MLlib's summary statistics available in the distributed `RowMatrix` class:

    import org.apache.spark.mllib.linalg.distributed.RowMatrix
    val movieMatrix = new RowMatrix(movieVectors)
    val movieMatrixSummary = movieMatrix.computeColumnSummaryStatistics()
    val userMatrix = new RowMatrix(userVectors)
    val userMatrixSummary = userMatrix.computeColumnSummaryStatistics()
    println("Movie factors mean: " + movieMatrixSummary.mean)
    println("Movie factors variance: " + movieMatrixSummary.variance)
    println("User factors mean: " + userMatrixSummary.mean)
    println("User factors variance: " + userMatrixSummary.variance)

You should see output similar to the one here:

    **Movie factors mean: [0.28047737659519767,0.26886479057520024,0.2935579964446398,0.27821738264113755, ...**
    **Movie factors variance: [0.038242041794064895,0.03742229118854288,0.044116961097355877,0.057116244055791986, ...**
    **User factors mean: [0.2043520841572601,0.22135773814655782,0.2149706318418221,0.23647602029329481, ...**
    **User factors variance: [0.037749421148850396,0.02831191551960241,0.032831876953314174,0.036775110657850954, ...**

If we look at the output, we will see that there do not appear to be any important outliers that might skew the clustering results, so normalization should not be required in this case.

# Training a clustering model

Training for K-means in MLlib takes an approach similar to the other models--we pass an RDD that contains our training data to the `train` method of the `KMeans` object. Note that here we do not use `LabeledPoint` instances, as clustering does not make use of labels; we work only with the feature vectors. Thus, we use an `RDD[Vector]` as input to the `train` method.

## Training a clustering model on the MovieLens dataset

We will train a model for both the movie and user factors that we generated by running our recommendation model. We need to pass in the number of clusters K and the maximum number of iterations for the algorithm to run. Model training might run for less than the maximum number of iterations if the change in the objective function from one iteration to the next is less than the tolerance level (the default for this tolerance is 0.0001).

MLlib's K-means provides random and K-means || initialization, with the default being K-means ||. As both of these initialization methods are based on random selection to some extent, each model training run will return a different result.

K-means does not generally converge to a global optimum model, so performing multiple training runs and selecting the most optimal model from these runs is a common practice. MLlib's training methods expose an option to complete multiple model training runs. The best training run, as measured by the evaluation of the loss function, is selected as the final model.
We will first set up the required imports, as well as model parameters: K, maximum iterations, and number of runs:

    import org.apache.spark.mllib.clustering.KMeans
    val numClusters = 5
    val numIterations = 10
    val numRuns = 3

We will then run K-means on the movie factor vectors:

    val movieClusterModel = KMeans.train(movieVectors, numClusters, numIterations, numRuns)

Once the model has completed training, we should see output that looks something like this:

    **...**
    **14/09/02 21:53:58 INFO SparkContext: Job finished: collectAsMap at KMeans.scala:193, took 0.02043 s**
    **14/09/02 21:53:58 INFO KMeans: Iterations took 0.331 seconds.**
    **14/09/02 21:53:58 INFO KMeans: KMeans reached the max number of iterations: 10.**
    **14/09/02 21:53:58 INFO KMeans: The cost for the best run is 2586.298785925147.**
    **...**
    **movieClusterModel: org.apache.spark.mllib.clustering.KMeansModel = org.apache.spark.mllib.clustering.KMeansModel@71c6f512**

As can be seen from the log output, the model training tells us that the maximum number of iterations was reached, so the training process did not stop early based on the convergence criterion. It also shows the training set error (that is, the value of the K-means objective function) for the best run.

We can try a much larger setting for the maximum iterations and use only one training run to see an example where the K-means model converges:

    val movieClusterModelConverged = KMeans.train(movieVectors, numClusters, 100)

You should be able to see the `KMeans converged in ... iterations` text in the model output; this text indicates that after so many iterations, the K-means objective function did not decrease more than the tolerance level:

    **...**
    **14/09/02 22:04:38 INFO SparkContext: Job finished: collectAsMap at KMeans.scala:193, took 0.040685 s**
    **14/09/02 22:04:38 INFO KMeans: Run 0 finished in 34 iterations**
    **14/09/02 22:04:38 INFO KMeans: Iterations took 0.812 seconds.**
    **14/09/02 22:04:38 INFO KMeans: KMeans converged in 34 iterations.**
    **14/09/02 22:04:38 INFO KMeans: The cost for the best run is 2584.9354332904104.**
    **...**
    **movieClusterModelConverged: org.apache.spark.mllib.clustering.KMeansModel = org.apache.spark.mllib.clustering.KMeansModel@6bb28fb5**

### Tip

Notice that when we use a lower number of iterations but use multiple training runs, we typically get a training error (reported as the cost in the preceding logs) that is very similar to the one we obtain by running the model to convergence. Using the multiple runs option can, therefore, be a very effective method to find the best possible model.

Finally, we will also train a K-means model on the user factor vectors:

    val userClusterModel = KMeans.train(userVectors, numClusters, numIterations, numRuns)

# Making predictions using a clustering model

Using the trained K-means model is straightforward and similar to the other models we have encountered so far, such as classification and regression.
We can make a prediction for a single `Vector` instance as follows:

    val movie1 = movieVectors.first
    val movieCluster = movieClusterModel.predict(movie1)
    println(movieCluster)

We can also make predictions for multiple inputs by passing an `RDD[Vector]` to the `predict` method of the model:

    val predictions = movieClusterModel.predict(movieVectors)
    println(predictions.take(10).mkString(","))

The resulting output is a cluster assignment for each data point:

    **0,0,1,1,2,1,0,1,1,1**

### Tip

Note that due to random initialization, the cluster assignments might change from one run of the model to another, so your results might differ from those shown earlier. The cluster IDs themselves have no inherent meaning; they are simply labeled arbitrarily, starting from 0.

## Interpreting cluster predictions on the MovieLens dataset

We have covered how to make predictions for a set of input vectors, but how do we evaluate how good the predictions are? We will cover performance metrics a little later; however, here, we will see how to manually inspect and interpret the cluster assignments made by our K-means model.

While unsupervised techniques have the advantage that they do not require us to provide labeled data for training, the disadvantage is that often, the results need to be manually interpreted. Often, we would like to further examine the clusters that are found and possibly try to interpret them and assign some sort of labeling or categorization to them.

For example, we can examine the clustering of movies we have found to try to see whether there is some meaningful interpretation of each cluster, such as a common genre or theme among the movies in the cluster. There are many approaches we can use, but we will start by taking a few movies in each cluster that are closest to the center of the cluster. These movies, we assume, would be the ones that are least likely to be marginal in terms of their cluster assignment, and so, they should be among the most representative of the movies in the cluster. By examining these sets of movies, we can see what attributes are shared by the movies in each cluster.

### Interpreting the movie clusters

To begin, we need to decide what we mean by "closest to the center of each cluster". The objective function that is minimized by K-means is the sum of Euclidean distances between each point and the cluster center, summed over all clusters. Therefore, it is natural to use the Euclidean distance as our measure. We will define this function here (note that it computes the squared Euclidean distance, which ranks points in the same order as the distance itself). Note that we will need access to certain imports from the **Breeze** library (a dependency of MLlib) for linear algebra and vector-based numerical functions:

    import breeze.linalg._
    import breeze.numerics.pow
    def computeDistance(v1: DenseVector[Double], v2: DenseVector[Double]) = pow(v1 - v2, 2).sum

### Tip

The preceding `pow` function is a Breeze universal function. This function is the same as the `pow` function from `scala.math`, except that it operates element-wise on the vector that is returned from the minus operation between the two input vectors.

Now, we will use this function to compute, for each movie, the distance of the relevant movie factor vector from the center vector of the assigned cluster.
Now, we will use this function to compute, for each movie, the distance of the relevant movie factor vector from the center vector of the assigned cluster. We will also join our cluster assignments and distances data with the movie titles and genres so that we can output the results in a more readable way:

    val titlesWithFactors = titlesAndGenres.join(movieFactors)
    val moviesAssigned = titlesWithFactors.map { case (id, ((title, genres), vector)) =>
      val pred = movieClusterModel.predict(vector)
      val clusterCentre = movieClusterModel.clusterCenters(pred)
      val dist = computeDistance(DenseVector(clusterCentre.toArray), DenseVector(vector.toArray))
      (id, title, genres.mkString(" "), pred, dist)
    }
    val clusterAssignments = moviesAssigned.groupBy { case (id, title, genres, cluster, dist) => cluster }.collectAsMap

After running the preceding code snippet, we have a local map (collected to the driver by `collectAsMap`) that contains a key-value pair for each cluster; the key is the numeric cluster identifier, and the value is a set of movies and related information. The movie information we have is the movie ID, title, genres, cluster index, and distance of the movie's factor vector from the cluster center.

Finally, we will iterate through each cluster and output the top 20 movies, ranked by distance from the cluster center, closest first:

    for ( (k, v) <- clusterAssignments.toSeq.sortBy(_._1)) {
      println(s"Cluster $k:")
      val m = v.toSeq.sortBy(_._5)
      println(m.take(20).map { case (_, title, genres, _, d) => (title, genres, d) }.mkString("\n"))
      println("=====\n")
    }

The following screenshot is an example output. Note that your output might differ due to the random initialization of both the recommendation and clustering models.

The first cluster

The first cluster, labeled 0, seems to contain a lot of old movies from the 1940s, 1950s, and 1960s, as well as a scattering of recent dramas.

The second cluster

The second cluster has a few horror movies in prominent positions, while the rest of the movies are less clear-cut, though dramas are common here too.

The third cluster

The third cluster is not clear-cut but has a fair number of comedy and drama movies.

The fourth cluster

The next cluster is more clearly associated with dramas and contains some foreign language films in particular.

The last cluster

The final cluster seems to be related predominantly to action movies, thrillers, and romances, and contains a number of relatively popular movies.

As you can see, it is not always straightforward to determine exactly what each cluster represents. However, there is some evidence here that the clustering is picking out attributes or commonalities between groups of movies, which might not be immediately obvious based only on the movie titles and genres (such as a foreign language segment, a classic movie segment, and so on). If we had more metadata available, such as directors, actors, and so on, we might be able to find out more details about the defining features of each cluster.

### Tip

We leave it as an exercise for you to perform a similar investigation into the clustering of the user factors. We have already created the input vectors in the `userVectors` variable, so you can train a K-means model on these vectors. After that, in order to evaluate the clusters, you would need to investigate the closest users for each cluster center (as we did for movies) and see if some common characteristics can be identified from the movies they have rated or the user metadata available. A minimal sketch of a possible starting point follows this tip.
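As a starting point, the assignment-and-distance step might look like the following sketch (this reuses the `userClusterModel` trained earlier; the `userAssignments` name is ours, and joining in user metadata is left as part of the exercise):

    // For each user factor vector: predict the cluster and compute the
    // squared distance to the assigned cluster center.
    val userAssignments = userVectors.map { v =>
      val cluster = userClusterModel.predict(v)
      val centre = userClusterModel.clusterCenters(cluster)
      (cluster, computeDistance(DenseVector(centre.toArray), DenseVector(v.toArray)))
    }
    // As a first look, count how many users fall into each cluster
    userAssignments.countByKey().toSeq.sortBy(_._1).foreach(println)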
# Evaluating the performance of clustering models

As with regression, classification, and recommendation models, there are many evaluation metrics that can be applied to clustering models to analyze their performance and the goodness of the clustering of the data points. Clustering evaluation is generally divided into internal and external evaluation. Internal evaluation refers to the case where the data used to train the model is also used for evaluation. External evaluation refers to using data external to the training data for evaluation purposes.

## Internal evaluation metrics

Common internal evaluation metrics include the WCSS we covered earlier (which is exactly the K-means objective function), the Davies-Bouldin index, the Dunn index, and the silhouette coefficient. All these measures tend to reward clusters where elements within a cluster are relatively close together, while elements in different clusters are relatively far away from each other.

### Note

The Wikipedia page on clustering evaluation has more details.

## External evaluation metrics

Since clustering can be thought of as unsupervised classification, if we have some form of labeled (or partially labeled) data available, we can use these labels to evaluate a clustering model. We can make predictions of clusters (that is, the class labels) using the model and evaluate the predictions against the true labels using metrics similar to some of those we saw for classification evaluation (that is, based on true positive and negative and false positive and negative rates).

These include the Rand measure, F-measure, Jaccard index, and others.

### Note

The Wikipedia page referenced earlier also covers external evaluation for clustering.

## Computing performance metrics on the MovieLens dataset

MLlib provides a convenient `computeCost` function to compute the WCSS objective function given an `RDD[Vector]`. We will compute this metric for our movie and user training data as follows:

    val movieCost = movieClusterModel.computeCost(movieVectors)
    val userCost = userClusterModel.computeCost(userVectors)
    println("WCSS for movies: " + movieCost)
    println("WCSS for users: " + userCost)

This should output a result similar to the following:

    **WCSS for movies: 2586.0777166339426**
    **WCSS for users: 1403.4137493396831**
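As a cross-check (a sketch of our own, not required by the pipeline), we can recompute the movie WCSS manually with the `computeDistance` function defined earlier; the result should match `computeCost` up to floating point precision:

    // Sum of squared distances from each point to its assigned cluster center
    val manualMovieCost = movieVectors.map { v =>
      val cluster = movieClusterModel.predict(v)
      val centre = movieClusterModel.clusterCenters(cluster)
      computeDistance(DenseVector(centre.toArray), DenseVector(v.toArray))
    }.sum
    println("Manually computed WCSS for movies: " + manualMovieCost)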
# Tuning parameters for clustering models

In contrast to many of the other models we have come across so far, K-means has only one parameter that can be tuned: K, the number of cluster centers.

## Selecting K through cross-validation

As we have done with classification and regression models, we can apply cross-validation techniques to select the optimal number of clusters for our model. This works in much the same way as it does for supervised learning methods. We will split the dataset into a training set and a test set. We will then train a model on the training set and compute the evaluation metric of interest on the test set.

We will do this for the movie clustering using the built-in WCSS evaluation metric provided by MLlib in the following code, using a 60 percent / 40 percent split between the training set and test set:

    val trainTestSplitMovies = movieVectors.randomSplit(Array(0.6, 0.4), 123)
    val trainMovies = trainTestSplitMovies(0)
    val testMovies = trainTestSplitMovies(1)
    // note the argument order: KMeans.train(data, k, maxIterations, runs)
    val costsMovies = Seq(2, 3, 4, 5, 10, 20).map { k => (k, KMeans.train(trainMovies, k, numIterations, numRuns).computeCost(testMovies)) }
    println("Movie clustering cross-validation:")
    costsMovies.foreach { case (k, cost) => println(f"WCSS for K=$k is $cost%2.2f") }

This should give results that look something like the ones shown here.

The output of movie clustering cross-validation is:

    **Movie clustering cross-validation**
    **WCSS for K=2 is 942.06**
    **WCSS for K=3 is 942.67**
    **WCSS for K=4 is 950.35**
    **WCSS for K=5 is 948.20**
    **WCSS for K=10 is 943.26**
    **WCSS for K=20 is 947.10**

We can observe that, in this case, the test set WCSS fluctuates within a fairly narrow band as the number of clusters increases. A common pattern observed in the cross-validation WCSS for K-means is that the metric continues to decrease as K increases, but at a certain point, the rate of decrease flattens out substantially. The value of K at which this occurs is generally selected as the optimal K parameter (this is sometimes called the elbow point, as this is where the line kinks when drawn as a graph).

In our case, based on the preceding results, we might select a value of 10 for K. Also, note that the clusters computed by the model are often used for purposes that require some human interpretation (such as the cases of movie and customer segmentation we mentioned earlier). Therefore, this consideration also impacts the choice of K: although a higher value of K might be more optimal from a mathematical point of view, a large number of clusters can be more difficult to understand and interpret.

For completeness, we will also compute the cross-validation metrics for user clustering:

    val trainTestSplitUsers = userVectors.randomSplit(Array(0.6, 0.4), 123)
    val trainUsers = trainTestSplitUsers(0)
    val testUsers = trainTestSplitUsers(1)
    val costsUsers = Seq(2, 3, 4, 5, 10, 20).map { k => (k, KMeans.train(trainUsers, k, numIterations, numRuns).computeCost(testUsers)) }
    println("User clustering cross-validation:")
    costsUsers.foreach { case (k, cost) => println(f"WCSS for K=$k is $cost%2.2f") }

We will see a pattern that is similar to the movie case:

    **User clustering cross-validation:**
    **WCSS for K=2 is 544.02**
    **WCSS for K=3 is 542.18**
    **WCSS for K=4 is 542.38**
    **WCSS for K=5 is 542.33**
    **WCSS for K=10 is 539.68**
    **WCSS for K=20 is 541.21**

### Tip

Note that your results may differ slightly due to the random initialization of the clustering models.

# Summary

In this chapter, we explored a new class of models that learn structure from unlabeled data: unsupervised learning. We worked through the required input data and feature extraction, and saw how to use the output of one model (a recommendation model, in our example) as the input to another model (our K-means clustering model). Finally, we evaluated the performance of the clustering model using both manual interpretation of the cluster assignments and mathematical performance metrics.
In the next chapter, we will cover another type of unsupervised learning used to reduce our data down to its most important features or components: dimensionality reduction models.

# Chapter 8. Dimensionality Reduction with Spark

Over the course of this chapter, we will continue our exploration of unsupervised learning models in the form of **dimensionality reduction**.

Unlike the models we have covered so far, such as regression, classification, and clustering, dimensionality reduction does not focus on making predictions. Instead, it tries to take a set of input data with a feature dimension _D_ (that is, the length of our feature vector) and extract a representation of the data of dimension _k_ , where _k_ is usually significantly smaller than _D_. It is, therefore, a form of preprocessing or feature transformation rather than a predictive model in its own right.

It is important that the extracted representation still captures a large proportion of the variability or structure of the original data. The idea behind this is that most data sources will contain some form of underlying structure. This structure is typically unknown (often called latent features or latent factors), but if we can uncover some of this structure, our models could learn this structure and make predictions from it rather than from the data in its raw form, which might be noisy or contain many irrelevant features. In other words, dimensionality reduction throws away some of the noise in the data and keeps the hidden structure that is present.

In some cases, the dimensionality of the raw data is far higher than the number of data points we have, so without dimensionality reduction, it would be difficult for other machine learning models, such as classification and regression, to learn anything, as they need to fit a number of parameters that is far larger than the number of training examples (in this sense, these methods bear some similarity to the regularization approaches that we have seen used in classification and regression).

A few use cases of dimensionality reduction techniques include:

 * Exploratory data analysis
 * Extracting features to train other machine learning models
 * Reducing storage and computation requirements for very large models in the prediction phase (for example, a production system that makes predictions)
 * Reducing a large group of text documents down to a set of hidden topics or concepts
 * Making learning and generalization of models easier when our data has a very large number of features (for example, when working with text, sound, images, or video data, which tends to be very high-dimensional)

In this chapter, we will:

 * Introduce the types of dimensionality reduction models available in MLlib
 * Work with images of faces to extract features suitable for dimensionality reduction
 * Train a dimensionality reduction model using MLlib
 * Visualize and evaluate the results
 * Perform parameter selection for our dimensionality reduction model

# Types of dimensionality reduction

MLlib provides two models for dimensionality reduction; these models are closely related to each other. They are **Principal Components Analysis** ( **PCA** ) and **Singular Value Decomposition** ( **SVD** ).

## Principal Components Analysis

PCA operates on a data matrix _X_ and seeks to extract a set of _k_ principal components from _X_. The principal components are each uncorrelated with one another and are computed such that the first principal component accounts for the largest variation in the input data. Each subsequent principal component is, in turn, computed such that it accounts for the largest remaining variation, provided that it is uncorrelated with the principal components computed so far.

In this way, the _k_ principal components returned are guaranteed to account for the highest amount of variation in the input data possible. Each principal component, in fact, has the same feature dimensionality as the original data matrix. Hence, a projection step is required in order to actually perform dimensionality reduction, where the original data is projected into the _k_ -dimensional space represented by the principal components.
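Concretely (using our own notation here, not MLlib's), if _W_ is the _D x k_ matrix whose columns are the _k_ principal components, and the rows of _X_ are our data points, the projection is a simple matrix multiplication:

    Xproj = X * W

Each row of _Xproj_ has dimension _k_ rather than the original feature dimension _D_. We will see this projection performed on real data later in this chapter.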
## Singular Value Decomposition

SVD seeks to decompose a matrix _X_ of dimension _m x n_ into three component matrices:

 * _U_ of dimension _m x m_
 * _S_ , a diagonal matrix of size _m x n_ ; the entries of _S_ are referred to as the **singular values**
 * _V^T_ (the transpose of _V_ ) of dimension _n x n_

    X = U * S * V^T

Looking at the preceding formula, it appears that we have not reduced the dimensionality of the problem at all, as by multiplying _U_ , _S_ , and _V^T_ , we reconstruct the original matrix. In practice, the truncated SVD is usually computed. That is, only the top _k_ singular values, which represent the most variation in the data, are kept, while the rest are discarded. The formula to reconstruct _X_ based on the component matrices is then approximate:

    X ~ Uk * Sk * Vk^T

An illustration of the truncated SVD is shown here:

The truncated Singular Value Decomposition

Keeping the top _k_ singular values is similar to keeping the top _k_ principal components in PCA. In fact, SVD and PCA are directly related, as we will see a little later in this chapter.

### Note

A detailed mathematical treatment of both PCA and SVD is beyond the scope of this book. An overview of dimensionality reduction can be found in the Spark documentation, and more in-depth mathematical treatments of PCA and SVD are widely available online.

## Relationship with matrix factorization

PCA and SVD are both matrix factorization techniques, in the sense that they decompose a data matrix into subcomponent matrices, each of which has a lower dimension (or rank) than the original matrix. Many other dimensionality reduction techniques are based on matrix factorization.

You might remember another example of matrix factorization: the collaborative filtering models that we have already seen in Chapter 4, _Building a Recommendation Engine with Spark_. Matrix factorization approaches to collaborative filtering work by factorizing the ratings matrix into two components: the user factor matrix and the item factor matrix. Each of these has a lower dimension than the original data, so these methods also act as dimensionality reduction models.

### Note

Many of the best performing approaches to collaborative filtering include models based on SVD. Simon Funk's approach to the Netflix Prize is a famous example.

## Clustering as dimensionality reduction

The clustering models we covered in the previous chapter can also be used for a form of dimensionality reduction. This works in the following way:

 * Assume that we cluster our high-dimensional feature vectors using a K-means clustering model, with _k_ clusters. The result is a set of _k_ cluster centers.
 * We can represent each of our original data points in terms of how far it is from each of these cluster centers. That is, we can compute the distance of a data point to each cluster center. The result is a set of _k_ distances for each data point.
 * These _k_ distances can form a new vector of dimension _k_. We can now represent our original data as a new vector of lower dimension, relative to the original feature dimension.

Depending on the distance metric used, this can result in both dimensionality reduction and a form of nonlinear transformation of the data, allowing us to learn a more complex model while still benefiting from the speed and scalability of a linear model. For example, using a Gaussian or exponential distance function can approximate a very complex nonlinear feature transformation.

# Extracting the right features from your data

As with all the machine learning models we have explored so far, dimensionality reduction models also operate on a feature vector representation of our data.

For this chapter, we will dive into the world of image processing, using the **Labeled Faces in the Wild** ( **LFW** ) dataset of facial images. This dataset contains over 13,000 images of faces, generally taken from the Internet and belonging to well-known public figures. The faces are labeled with the person's name.

## Extracting features from the LFW dataset

In order to avoid having to download and process a very large dataset, we will work with a subset of the images, using people whose names start with an "A". This subset can be downloaded from the LFW project page.

### Note

For more details and other variants of the data, visit the LFW project page.

The original research paper reference is:

 _Gary B. Huang_ , _Manu Ramesh_ , _Tamara Berg_ , and _Erik Learned-Miller_. _Labeled Faces in the Wild: A Database for Studying Face Recognition in Unconstrained Environments_. University of Massachusetts, Amherst, Technical Report 07-49, October, 2007.

Unzip the data using the following command:

    **> tar xfvz lfw-a.tgz**

This will create a folder called `lfw`, which contains a number of subfolders, one for each person.

### Exploring the face data

Start up your Spark Scala console, making sure to allocate sufficient memory, as dimensionality reduction methods can be quite computationally expensive:

    **> $SPARK_HOME/bin/spark-shell --driver-memory 2g**

Now that we've unzipped the data, we face a small challenge. Spark provides us with ways to read text files and custom Hadoop input data sources. However, there is no built-in functionality to allow us to read images.

Spark provides a method called `wholeTextFiles`, which allows us to operate on entire files at once, compared to the `textFile` method that we have been using so far, which operates on the individual lines within a text file (or multiple files).

We will use the `wholeTextFiles` method to access the location of each file. Using these file paths, we will write custom code to load and process the images. In the following example code, we will use `PATH` to refer to the directory in which you extracted the `lfw` subdirectory.
We can use a wildcard path specification (using the `*` character in the following code snippet) to tell Spark to look in each directory under the `lfw` directory for files:

    val path = "/PATH/lfw/*"
    val rdd = sc.wholeTextFiles(path)
    val first = rdd.first
    println(first)

Running the `first` command might take a little time, as Spark first scans the specified directory structure for all available files. Once completed, you should see output similar to the one shown here:

    **first: (String, String) = (file:/PATH/lfw/Aaron_Eckhart/Aaron_Eckhart_0001.jpg, ����??JFIF????? ...**

You will see that `wholeTextFiles` returns an RDD that contains key-value pairs, where the key is the file location and the value is the content of the entire text file. For our purposes, we only care about the file path, as we cannot work directly with the image data as a string (notice that it is displayed as "binary nonsense" in the shell output).

Let's extract the file paths from the RDD. Note that, as seen earlier, each file path starts with the `file:` prefix. This is used by Spark when reading files in order to differentiate between different filesystems (for example, `file://` for the local filesystem, `hdfs://` for HDFS, `s3n://` for Amazon S3, and so on).

In our case, we will be using custom code to read the images, so we don't need this part of the path. Thus, we will remove it with the following `map` function:

    val files = rdd.map { case (fileName, content) => fileName.replace("file:", "") }
    println(files.first)

This should display the file location with the `file:` prefix removed:

    **/PATH/lfw/Aaron_Eckhart/Aaron_Eckhart_0001.jpg**

Next, we will see how many files we are dealing with:

    println(files.count)

Running this command creates a lot of noisy output in the Spark shell, as it prints all the file paths that are read to the console. Ignore this part; after the command has completed, the output should look something like this:

    **..., /PATH/lfw/Azra_Akin/Azra_Akin_0003.jpg:0+19927, /PATH/lfw/Azra_Akin/Azra_Akin_0004.jpg:0+16030**
    **...**
    **14/09/18 20:36:25 INFO SparkContext: Job finished: count at :19, took 1.151955 s**
    **1055**

So, we can see that we have 1055 images to work with.

### Visualizing the face data

Although there are a few tools available in Scala or Java to display images, this is one area where Python and the matplotlib library shine. We will use Scala to process and extract the images and run our models, and IPython to display the actual images.

You can run a separate IPython Notebook by opening a new terminal window and launching a new notebook:

    **> ipython notebook**

### Note

Note that if you are using the IPython Notebook, you should first execute the following code snippet to ensure that the images are displayed inline after each notebook cell (including the `%` character): `%pylab inline`.

Alternatively, you can launch a plain IPython console without the web notebook, enabling the `pylab` plotting functionality, using the following command:

    **> ipython --pylab**

The dimensionality reduction techniques in MLlib are only available in Scala or Java at the time of writing this book, so we will continue to use the Scala Spark shell to run the models. Therefore, you won't need to run a PySpark console.

### Tip

We have provided the full Python code with this chapter as a Python script as well as in the IPython Notebook format. For instructions on installing IPython, see the code bundle.
Let's display the image given by the first path we extracted earlier using matplotlib's `imread` and `imshow` functions:

    path = "/PATH/lfw/Aaron_Eckhart/Aaron_Eckhart_0001.jpg"
    ae = imread(path)
    imshow(ae)

### Note

You should see the image displayed in your Notebook (or in a pop-up window if you are using the standard IPython shell). Note that we have not shown the image here.

### Extracting facial images as vectors

While a full treatment of image processing is beyond the scope of this book, we need to know a few basics to proceed. Each color image can be represented as a three-dimensional array, or matrix, of pixels. The first two dimensions, that is, the _x_ and _y_ axes, represent the position of each pixel, while the third dimension represents the **red, green, and blue** ( **RGB** ) color values of each pixel.

A grayscale image only requires one value per pixel (there are no RGB values), so it can be represented as a plain two-dimensional matrix. For many image-processing and machine learning tasks related to images, it is common to operate on grayscale images. We will do this here by converting the color images to grayscale first.

It is also a common practice in machine learning tasks to represent an image as a vector, instead of a matrix. We do this by concatenating each row (or alternatively, each column) of the matrix together to form a long vector (this is known as **reshaping** ). In this way, each raw, grayscale image matrix is transformed into a feature vector that is usable as input to a machine learning model.

Fortunately for us, the built-in Java **Abstract Window Toolkit** ( **AWT** ) contains various basic image-processing functions. We will define a few utility functions to perform this processing using the `java.awt` classes.

#### Loading images

The first of these is a function to read an image from a file:

    import java.awt.image.BufferedImage
    def loadImageFromFile(path: String): BufferedImage = {
      import javax.imageio.ImageIO
      import java.io.File
      ImageIO.read(new File(path))
    }

This returns an instance of the `java.awt.image.BufferedImage` class, which stores the image data and provides a number of useful methods. Let's test it out by loading the first image into our Spark shell:

    val aePath = "/PATH/lfw/Aaron_Eckhart/Aaron_Eckhart_0001.jpg"
    val aeImage = loadImageFromFile(aePath)

You should see the image details displayed in the shell:

    **aeImage: java.awt.image.BufferedImage = BufferedImage@f41266e: type = 5 ColorModel: #pixelBits = 24 numComponents = 3 color space = java.awt.color.ICC_ColorSpace@7e420794 transparency = 1 has alpha = false isAlphaPre = false ByteInterleavedRaster: width = 250 height = 250 #numDataElements 3 dataOff[0] = 2**

There is quite a lot of information here. Of particular interest to us is that the image width and height are 250 pixels each and, as highlighted in the preceding output, there are three components (that is, the RGB values).

#### Converting to grayscale and resizing the images

The next function we will define will take the image that we loaded with our preceding function, convert it from color to grayscale, and resize its width and height.

Neither of these steps is strictly necessary, but both are done in many cases for efficiency purposes. Using RGB color images instead of grayscale increases the amount of data to be processed by a factor of three.
Similarly, larger images increase the processing and storage overhead significantly. Our raw 250 x 250 images represent 187,500 data points per image, using three color components. For a set of 1055 images, this is 197,812,500 data points. Even if stored as integer values, each value takes 4 bytes of memory, so just 1055 images represent around 800 MB of memory! As you can see, image-processing tasks can quickly become extremely memory intensive.

If we convert to grayscale and resize the images to, say, 50 x 50 pixels, we only require 2500 data points per image. For our 1055 images, this equates to around 10 MB of memory, which is far more manageable for illustrative purposes.

### Tip

Another reason to resize is that MLlib's PCA model works best on _tall and skinny_ matrices with fewer than 10,000 columns. We will have 2500 columns (that is, each pixel becomes an entry in our feature vector), so we will come in well below this restriction.

Let's define our processing function. We will do the grayscale conversion and resizing in one step, using the `java.awt.image` package:

    def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
      val bwImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
      val g = bwImage.getGraphics()
      g.drawImage(image, 0, 0, width, height, null)
      g.dispose()
      bwImage
    }

The first line of the function creates a new image of the desired width and height and specifies a grayscale color model. The third line draws the original image onto this newly created image. The `drawImage` method takes care of the color conversion and resizing for us! Finally, we return the new, processed image.

Let's test this out on our sample image. We will convert it to grayscale and resize it to 100 x 100 pixels:

    val grayImage = processImage(aeImage, 100, 100)

You should see the following output in the console:

    **grayImage: java.awt.image.BufferedImage = BufferedImage@21f8ea3b: type = 10 ColorModel: #pixelBits = 8 numComponents = 1 color space = java.awt.color.ICC_ColorSpace@5cd9d8e9 transparency = 1 has alpha = false isAlphaPre = false ByteInterleavedRaster: width = 100 height = 100 #numDataElements 1 dataOff[0] = 0**

As you can see from the highlighted output, the image's width and height are indeed 100, and the number of color components is 1.

Next, we will save the processed image to a temporary location so that we can read it back and display it in our IPython console:

    import javax.imageio.ImageIO
    import java.io.File
    ImageIO.write(grayImage, "jpg", new File("/tmp/aeGray.jpg"))

You should see a result of `true` displayed in your console, indicating that we successfully saved the image to the `aeGray.jpg` file in our `/tmp` directory.

Finally, we will read the image in Python and use matplotlib to display it. Type the following code into your IPython Notebook or shell (remember that this should be open in a new terminal window):

    tmpPath = "/tmp/aeGray.jpg"
    aeGray = imread(tmpPath)
    imshow(aeGray, cmap=plt.cm.gray)

This should display the image (note again, we haven't shown the image here). You should see that it is grayscale and of slightly worse quality compared to the original image. Furthermore, you will notice that the scales of the axes are different, representing the new 100 x 100 dimensions instead of the original 250 x 250 size.
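As a quick sanity check of the memory arithmetic discussed at the start of this section (a toy calculation of our own, assuming 4 bytes per stored value), we can verify the savings in the Scala shell:

    val numImages = 1055
    val rawPerImage = 250 * 250 * 3 // 187,500 values per raw color image
    val grayPerImage = 50 * 50      // 2,500 values per resized grayscale image
    println(f"raw: ${numImages.toLong * rawPerImage * 4 / 1e6}%.0f MB")   // ~791 MB
    println(f"gray: ${numImages.toLong * grayPerImage * 4 / 1e6}%.1f MB") // ~10.6 MB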
#### Extracting feature vectors

The final step in the processing pipeline is to extract the actual feature vectors that will be the input to our dimensionality reduction model. As we mentioned earlier, the raw grayscale pixel data will be our features. We will form the vectors by flattening out the two-dimensional pixel matrix. The `BufferedImage` class provides a utility method to do just this, which we will use in our function:

    def getPixelsFromImage(image: BufferedImage): Array[Double] = {
      val width = image.getWidth
      val height = image.getHeight
      val pixels = Array.ofDim[Double](width * height)
      // fills (and returns) the pixels array with the grayscale pixel values
      image.getData.getPixels(0, 0, width, height, pixels)
    }

We can then combine these three functions into one utility function that takes a file location together with the desired image width and height, and returns the raw `Array[Double]` value that contains the pixel data:

    def extractPixels(path: String, width: Int, height: Int): Array[Double] = {
      val raw = loadImageFromFile(path)
      val processed = processImage(raw, width, height)
      getPixelsFromImage(processed)
    }

Applying this function to each element of the RDD that contains all the image file paths will give us a new RDD that contains the pixel data for each image. Let's do this and inspect the first few elements:

    val pixels = files.map(f => extractPixels(f, 50, 50))
    println(pixels.take(10).map(_.take(10).mkString("", ",", ", ...")).mkString("\n"))

You should see output similar to this:

    **0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0, ...**
    **241.0,243.0,245.0,244.0,231.0,205.0,177.0,160.0,150.0,147.0, ...**
    **253.0,253.0,253.0,253.0,253.0,253.0,254.0,254.0,253.0,253.0, ...**
    **244.0,244.0,243.0,242.0,241.0,240.0,239.0,239.0,237.0,236.0, ...**
    **44.0,47.0,47.0,49.0,62.0,116.0,173.0,223.0,232.0,233.0, ...**
    **0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, ...**
    **1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0, ...**
    **26.0,26.0,27.0,26.0,24.0,24.0,25.0,26.0,27.0,27.0, ...**
    **240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0, ...**
    **0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, ...**

The final step is to create an MLlib `Vector` instance for each image. We will cache the RDD to speed up our later computations:

    import org.apache.spark.mllib.linalg.Vectors
    val vectors = pixels.map(p => Vectors.dense(p))
    vectors.setName("image-vectors")
    vectors.cache

### Tip

We used the `setName` function in the preceding code to assign the RDD a name. In this case, we called it `image-vectors`. This is so that we can identify it more easily when looking at the Spark web interface later.

### Normalization

It is a common practice to standardize input data prior to running dimensionality reduction models, in particular for PCA. As we did in Chapter 5, _Building a Classification Model with Spark_ , we will do this using the built-in `StandardScaler` provided by MLlib's `feature` package. We will only subtract the mean from the data in this case:

    import org.apache.spark.mllib.linalg.Matrix
    import org.apache.spark.mllib.linalg.distributed.RowMatrix
    import org.apache.spark.mllib.feature.StandardScaler
    val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)

Calling `fit` triggers a computation on our `RDD[Vector]`.
You should see output similar to the one shown here:

    **...**
    **14/09/21 11:46:58 INFO SparkContext: Job finished: reduce at RDDFunctions.scala:111, took 0.495859 s**
    **scaler: org.apache.spark.mllib.feature.StandardScalerModel = org.apache.spark.mllib.feature.StandardScalerModel@6bb1a1a1**

### Tip

Note that subtracting the mean works for dense input data. However, for sparse vectors, subtracting the mean vector from each input will transform the sparse data into dense data. For very high-dimensional input, this will likely exhaust the available memory resources, so it is not advisable.

Finally, we will use the returned `scaler` to transform the raw image vectors into vectors with the column means subtracted:

    val scaledVectors = vectors.map(v => scaler.transform(v))

We mentioned earlier that the resized grayscale images would take up around 10 MB of memory. Indeed, you can take a look at the memory usage in the Spark application monitor storage page by going to `http://localhost:4040/storage/` in your web browser.

Since we gave our RDD of image vectors a friendly name of `image-vectors`, you should see something like the following screenshot (note that, as we are using `Vector[Double]`, each element takes up 8 bytes instead of 4; hence, we actually use around 20 MB of memory):

Size of image vectors in memory

# Training a dimensionality reduction model

Dimensionality reduction models in MLlib require vectors as inputs. However, unlike clustering, which operated on an `RDD[Vector]`, PCA and SVD computations are provided as methods on a distributed `RowMatrix` (this difference is largely down to syntax, as a `RowMatrix` is simply a wrapper around an `RDD[Vector]`).

## Running PCA on the LFW dataset

Now that we have extracted our image pixel data into vectors, we can instantiate a new `RowMatrix` and call the `computePrincipalComponents` method to compute the top `K` principal components of our distributed matrix:

    import org.apache.spark.mllib.linalg.Matrix
    import org.apache.spark.mllib.linalg.distributed.RowMatrix
    val matrix = new RowMatrix(scaledVectors)
    val K = 10
    val pc = matrix.computePrincipalComponents(K)

You will likely see quite a lot of output in your console while the model runs.

### Tip

If you see warnings such as **WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK** or **WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK** , you can safely ignore these.

This means that the underlying linear algebra libraries used by MLlib could not load native routines. In this case, a Java-based fallback will be used, which is slower, but there is nothing to worry about for the purposes of this example.

Once the model training is complete, you should see a result displayed in the console that looks similar to the following one:

    **pc: org.apache.spark.mllib.linalg.Matrix =**
    **-0.023183157256614906 -0.010622723054037303 ...
(10 total)**
    **-0.023960537953442107 -0.011495966728461177 ...**
    **-0.024397470862198022 -0.013512219690177352 ...**
    **-0.02463158818330343 -0.014758658113862178 ...**
    **-0.024941633606137027 -0.014878858729655142 ...**
    **-0.02525998879466241 -0.014602750644394844 ...**
    **-0.025494722450369593 -0.014678013626511024 ...**
    **-0.02604194423255582 -0.01439561589951032 ...**
    **-0.025942214214865228 -0.013907665261197633 ...**
    **-0.026151551334429365 -0.014707035797934148 ...**
    **-0.026106572186134578 -0.016701471378568943 ...**
    **-0.026242986173995755 -0.016254664123732318 ...**
    **-0.02573628754284022 -0.017185663918352894 ...**
    **-0.02545319635905169 -0.01653357295561698 ...**
    **-0.025325893980995124 -0.0157082218373399 ...**

### Visualizing the Eigenfaces

Now that we have trained our PCA model, what is the result? Let's inspect the dimensions of the resulting matrix:

    val rows = pc.numRows
    val cols = pc.numCols
    println(rows, cols)

As you can see from the console output, the matrix of principal components has 2500 rows and 10 columns:

    **(2500,10)**

Recall that the dimension of each image is 50 x 50, so here, we have the top 10 principal components, each with a dimension identical to that of the input images. These principal components can be thought of as the set of latent (or hidden) features that capture the greatest variation in the original data.

### Note

In facial recognition and image processing, these principal components are often referred to as **Eigenfaces** , as PCA is closely related to the eigenvalue decomposition of the covariance matrix of the original data.

Since each principal component is of the same dimension as the original images, each component can itself be thought of, and represented, as an image, making it possible to visualize the Eigenfaces as we would the input images.

As we have often done in this book, we will use functionality from the Breeze linear algebra library as well as Python's numpy and matplotlib to visualize the Eigenfaces.

First, we will extract the `pc` variable (an MLlib matrix) into a Breeze `DenseMatrix`:

    import breeze.linalg.DenseMatrix
    val pcBreeze = new DenseMatrix(rows, cols, pc.toArray)

Breeze provides a useful function within the `linalg` package to write the matrix out as a CSV file. We will use this to save the principal components to a temporary CSV file:

    import breeze.linalg.csvwrite
    csvwrite(new File("/tmp/pc.csv"), pcBreeze)

Next, we will load the matrix in IPython and visualize the principal components as images.
Fortunately, numpy provides a utility function to read the matrix from the CSV file we created:

    pcs = np.loadtxt("/tmp/pc.csv", delimiter=",")
    print(pcs.shape)

You should see the following output, confirming that the matrix we read has the same dimensions as the one we saved:

    **(2500, 10)**

We will need a utility function to display the images, which we define here:

    def plot_gallery(images, h, w, n_row=2, n_col=5):
        """Helper function to plot a gallery of portraits"""
        plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
        plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
        for i in range(n_row * n_col):
            plt.subplot(n_row, n_col, i + 1)
            plt.imshow(images[:, i].reshape((h, w)), cmap=plt.cm.gray)
            plt.title("Eigenface %d" % (i + 1), size=12)
            plt.xticks(())
            plt.yticks(())

### Note

This function is adapted from the LFW dataset example code in the scikit-learn documentation.

We will now use this function to plot the top 10 Eigenfaces:

    plot_gallery(pcs, 50, 50)

This should display the following plot:

Top 10 Eigenfaces

### Interpreting the Eigenfaces

Looking at the preceding images, we can see that the PCA model has effectively extracted recurring patterns of variation, which represent various features of the facial images. Each principal component can, as with clustering models, be interpreted. Again, like clustering, it is not always straightforward to interpret precisely what each principal component represents.

We can see from these images that some appear to pick up directional factors (for example, images 6 and 9), some home in on hair patterns (such as images 4, 5, 7, and 10), while others seem to be somewhat more related to facial features such as eyes, nose, and mouth (such as images 1, 7, and 9).

# Using a dimensionality reduction model

It is interesting to be able to visualize the outcome of a model in this way; however, the overall purpose of using dimensionality reduction is to create a more compact representation of the data that still captures the important features and variability in the raw dataset. To do this, we need to use a trained model to transform our raw data by projecting it into the new, lower-dimensional space represented by the principal components.

## Projecting data using PCA on the LFW dataset

We will illustrate this concept by projecting each LFW image into a ten-dimensional vector. This is done through a matrix multiplication of the image matrix with the matrix of principal components. As the image matrix is a distributed MLlib `RowMatrix`, Spark takes care of distributing this computation for us through the `multiply` function:

    val projected = matrix.multiply(pc)
    println(projected.numRows, projected.numCols)

This will give you the following output:

    **(1055,10)**

Observe that each image, which was of dimension 2500, has been transformed into a vector of size 10.
Let's take a look at the first few vectors: + + println(projected.rows.take(5).mkString("\n")) + +Here is the output: + + **[2648.9455749636277,1340.3713412351376,443.67380716760965,-353.0021423043161,52.53102289832631,423.39861446944354,413.8429065865399,-484.18122999722294,87.98862070273545,-104.62720604921965]** + **[172.67735747311974,663.9154866829355,261.0575622447282,-711.4857925259682,462.7663154755333,167.3082231097332,-71.44832640530836,624.4911488194524,892.3209964031695,-528.0056327351435]** + **[-1063.4562028554978,388.3510869550539,1508.2535609357597,361.2485590837186,282.08588829583596,-554.3804376922453,604.6680021092125,-224.16600191143075,-228.0771984153961,-110.21539201855907]** + **[-4690.549692385103,241.83448841252638,-153.58903325799685,-28.26215061165965,521.8908276360171,-442.0430200747375,-490.1602309367725,-456.78026845649435,-78.79837478503592,70.62925170688868]** + **[-2766.7960144161225,612.8408888724891,-405.76374113178616,-468.56458995613974,863.1136863614743,-925.0935452709143,69.24586949009642,-777.3348492244131,504.54033662376435,257.0263568009851]** + +As the projected data is in the form of vectors, we can use the projection as input to another machine learning model. For example, we could use these projected inputs together with a set of input data generated from various images without faces to train a facial recognition model. Alternatively, we could train a multiclass classifier where each person is a class, thus creating a model that learns to identify the particular person that a face belongs to. + +## The relationship between PCA and SVD + +We mentioned earlier that there is a close relationship between PCA and SVD. In fact, we can recover the same principal components and also apply the same projection into the space of principal components using SVD. + +In our example, the right singular vectors derived from computing the SVD will be equivalent to the principal components we have calculated. We can see that this is the case by first computing the SVD on our image matrix and comparing the right singular vectors to the result of PCA. As was the case with PCA, SVD computation is provided as a function on a distributed `RowMatrix`: + + val svd = matrix.computeSVD(10, computeU = true) + println(s"U dimension: (${svd.U.numRows}, ${svd.U.numCols})") + println(s"S dimension: (${svd.s.size}, )") + println(s"V dimension: (${svd.V.numRows}, ${svd.V.numCols})") + +We can see that SVD returns a matrix `U` of dimension 1055 x 10, a vector `S` of the singular values of length `10`, and a matrix `V` of the right singular vectors of dimension 2500 x 10: + + **U dimension: (1055, 10)** + **S dimension: (10, )** + **V dimension: (2500, 10)** + +The matrix `V` is exactly equivalent to the result of PCA (ignoring the sign of the values and floating point tolerance). 
We can verify this with a utility function that approximately compares the data arrays of the two matrices:

    def approxEqual(array1: Array[Double], array2: Array[Double], tolerance: Double = 1e-6): Boolean = {
      // note that we ignore the sign of the principal component / singular vector elements
      val bools = array1.zip(array2).map { case (v1, v2) => if (math.abs(math.abs(v1) - math.abs(v2)) > tolerance) false else true }
      bools.fold(true)(_ & _)
    }

We will test the function on some test data:

    println(approxEqual(Array(1.0, 2.0, 3.0), Array(1.0, 2.0, 3.0)))

This will give you the following output:

    **true**

Let's try another set of test data:

    println(approxEqual(Array(1.0, 2.0, 3.0), Array(3.0, 2.0, 1.0)))

This will give you the following output:

    **false**

Finally, we can apply our equality function as follows:

    println(approxEqual(svd.V.toArray, pc.toArray))

Here is the output:

    **true**

The other relationship that holds is that the multiplication of the matrix `U` and vector `S` (or, strictly speaking, the diagonal matrix `S`) is equivalent to the PCA projection of our original image data into the space of the top 10 principal components.

We will now show that this is indeed the case. We will first use Breeze to multiply each vector in `U` by `S`, element-wise. We will then compare each vector in our PCA projected vectors with the equivalent vector in our SVD projection, and count the number of equal cases:

    val breezeS = breeze.linalg.DenseVector(svd.s.toArray)
    val projectedSVD = svd.U.rows.map { v =>
      val breezeV = breeze.linalg.DenseVector(v.toArray)
      val multV = breezeV :* breezeS
      Vectors.dense(multV.data)
    }
    projected.rows.zip(projectedSVD).map { case (v1, v2) => approxEqual(v1.toArray, v2.toArray) }.filter(b => b).count

This should display a result of 1055, as we would expect, confirming that each row of `projected` is equal to each row of `projectedSVD`.

### Note

Note that the `:*` operator in the preceding code represents element-wise multiplication of the vectors.

# Evaluating dimensionality reduction models

Both PCA and SVD are deterministic models. That is, given a certain input dataset, they will always produce the same result. This is in contrast to many of the models we have seen so far, which depend on some random element (most often for the initialization of model weight vectors and so on).

Both models are also guaranteed to return the top principal components or singular values, and hence, the only parameter is _k_. Like clustering models, increasing _k_ always improves the model performance (for clustering, this is measured by the relevant error function, while for PCA and SVD, it is the total amount of variability explained by the _k_ components). Therefore, selecting a value for _k_ is a trade-off between capturing as much structure of the data as possible while keeping the dimensionality of the projected data low.
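As a small aid for this trade-off (a sketch of our own, not an MLlib API), we can estimate the fraction of variance captured by the top _k_ singular values from their squares. Note that this is relative to however many singular values we have actually computed, not the full decomposition; we will compute a larger set of singular values next, to which this helper could be applied:

    // Fraction of (computed) total variance captured by the top k singular values
    def varianceExplained(s: Array[Double], k: Int): Double = {
      val squared = s.map(x => x * x)
      squared.take(k).sum / squared.sum
    }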
## Evaluating k for SVD on the LFW dataset

We will examine the singular values obtained from computing the SVD on our image data. We can verify that the singular values are the same for each run and that they are returned in decreasing order, as follows:

    val sValues = (1 to 5).map { i => matrix.computeSVD(i, computeU = false).s }
    sValues.foreach(println)

This should show us output similar to the following:

    **[54091.00997110354]**
    **[54091.00997110358,33757.702867982436]**
    **[54091.00997110357,33757.70286798241,24541.193694775946]**
    **[54091.00997110358,33757.70286798242,24541.19369477593,23309.58418888302]**
    **[54091.00997110358,33757.70286798242,24541.19369477593,23309.584188882982,21803.09841158358]**

As with evaluating values of _k_ for clustering, in the case of SVD (and PCA), it is often useful to plot the singular values for a larger range of _k_ and identify the point on the graph where the amount of additional variance accounted for by each additional singular value starts to flatten out considerably.

We will do this by first computing the top 300 singular values:

    val svd300 = matrix.computeSVD(300, computeU = false)
    val sMatrix = new DenseMatrix(1, 300, svd300.s.toArray)
    csvwrite(new File("/tmp/s.csv"), sMatrix)

We will write out the vector `S` of singular values to a temporary CSV file (as we did for our matrix of Eigenfaces previously) and then read it back in our IPython console, plotting the singular values for each _k_ :

    s = np.loadtxt("/tmp/s.csv", delimiter=",")
    print(s.shape)
    plot(s)

You should see an image displayed similar to the one shown here:

Top 300 singular values

A similar pattern is seen in the cumulative variation accounted for by the top 300 singular values (which we will plot on a log scale for the _y_ axis):

    plot(cumsum(s))
    plt.yscale('log')

Cumulative sum of top 300 singular values

We can see that after a certain value range for _k_ (around 100 in this case), the graph flattens considerably. This indicates that a number of singular values (or principal components) equivalent to this value of _k_ probably explains enough of the variation of the original data.

### Tip

Of course, if we are using dimensionality reduction to help improve the performance of another model, we could use the same evaluation methods used for that model to help us choose a value for _k_.

For example, we could use the AUC metric, together with cross-validation, to choose both the model parameters for a classification model as well as the value of _k_ for our dimensionality reduction model. This does come at the expense of higher computation cost, however, as we would have to recompute the full model training and testing pipeline.

# Summary

In this chapter, we explored two new unsupervised learning methods, PCA and SVD, for dimensionality reduction. We saw how to extract features for and train these models using facial image data. We visualized the results of the model in the form of Eigenfaces, saw how to apply the models to transform our original data into a reduced dimensionality representation, and investigated the close link between PCA and SVD.

In the next chapter, we will delve more deeply into techniques for text processing and analysis with Spark.

# Chapter 9. Advanced Text Processing with Spark

In Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_ , we covered various topics related to feature extraction and data processing, including the basics of extracting features from text data.
In this chapter, we will introduce more advanced text processing techniques available in MLlib to work with large-scale text datasets.

In this chapter, we will:

 * Work through detailed examples that illustrate data processing, feature extraction, and the modeling pipeline, as they relate to text data
 * Evaluate the similarity between two documents based on the words in the documents
 * Use the extracted text features as inputs for a classification model
 * Cover a recent development in natural language processing to model words themselves as vectors and illustrate the use of Spark's **Word2Vec** model to evaluate the similarity between two words, based on their meaning

# What's so special about text data?

Text data can be complex to work with for two main reasons. First, text and language have an inherent structure that is not easily captured using the raw words as is (for example, meaning, context, different types of words, sentence structure, and different languages, to highlight a few). Therefore, naive feature extraction is usually relatively ineffective.

Second, the effective dimensionality of text data is extremely large and potentially limitless. Think about the number of words in the English language alone and add all kinds of special words, characters, slang, and so on to this. Then, throw in other languages and all the types of text one might find across the Internet. The dimension of text data can easily exceed tens or even hundreds of millions of words, even in relatively small datasets. For example, the Common Crawl dataset of billions of websites contains over 840 billion individual words.

To deal with these issues, we need ways of extracting more structured features and methods to handle the huge dimensionality of text data.

# Extracting the right features from your data

The field of **natural language processing** ( **NLP** ) covers a wide range of techniques to work with text, from text processing and feature extraction through to modeling and machine learning. In this chapter, we will focus on two feature extraction techniques available within MLlib: the TF-IDF term weighting scheme and feature hashing.

Working through an example of TF-IDF, we will also explore the ways in which processing, tokenization, and filtering during feature extraction can help reduce the dimensionality of our input data as well as improve the information content and usefulness of the features we extract.

## Term weighting schemes

In Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_ , we looked at a vector representation where text features are mapped to a simple binary vector, called the **bag-of-words** model. Another representation used commonly in practice is called **term frequency-inverse document frequency** ( **TF-IDF** ).

TF-IDF weights each term in a piece of text (referred to as a **document** ) based on its frequency in the document (the **term frequency** ). A global normalization, called the **inverse document frequency** , is then applied based on the frequency of this term among all documents (the set of documents in a dataset is commonly referred to as a **corpus** ).

The standard definition of TF-IDF is shown here:

    tf-idf(t,d) = tf(t,d) x idf(t)

Here, _tf(t,d)_ is the frequency (number of occurrences) of term _t_ in document _d_ and _idf(t)_ is the inverse document frequency of term _t_ in the corpus; this is defined as follows:

    idf(t) = log(N / d)

Here, _N_ is the total number of documents, and _d_ is the number of documents in which the term _t_ occurs.

The TF-IDF formulation means that terms occurring many times in a document receive a higher weighting in the vector representation relative to those that occur only a few times in the document. However, the IDF normalization has the effect of reducing the weight of terms that are very common across all documents. The end result is that truly rare or important terms should be assigned higher weighting, while more common terms (which are assumed to have less importance) should have less impact in terms of weighting.
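As a quick worked example (with made-up numbers of our own): suppose a corpus contains _N_ = 10 documents, the term _spark_ occurs 3 times in document _d_ , and 2 of the 10 documents contain the term. Using the natural logarithm, we get:

    tf-idf(spark,d) = 3 x log(10 / 2) = 3 x 1.609 ~ 4.83

By contrast, a term that occurs in every document has _idf(t) = log(10 / 10) = 0_ and, therefore, receives zero weight, no matter how frequently it occurs within a given document.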
### Note

A good resource to learn more about the bag-of-words model (or **vector space model** ) is the book _Introduction to Information Retrieval_ , _Christopher D. Manning_ , _Prabhakar Raghavan_ , and _Hinrich Schütze_ , _Cambridge University Press_ (available in HTML form online).

It contains sections on text processing techniques, including tokenization, stop word removal, stemming, and the vector space model, as well as weighting schemes such as TF-IDF.

## Feature hashing

 **Feature hashing** is a technique to deal with high-dimensional data and is often used with text and categorical datasets where the features can take on many unique values (often many millions of values). In the previous chapters, we often used the _1-of-K_ encoding approach for categorical features, including text. While this approach is simple and effective, it can break down in the face of extremely high-dimensional data.

Building and using _1-of-K_ feature encoding requires us to keep a mapping of each possible feature value to an index in a vector. Furthermore, the process of creating the mapping itself requires at least one additional pass through the dataset and can be tricky to do in parallel scenarios. Up until now, we have often used a simple approach of collecting the distinct feature values and zipping this collection with a set of indices to create a map of feature value to index. This mapping is then broadcast (either explicitly in our code or implicitly by Spark) to each worker.

However, when dealing with the huge feature dimensions in the tens of millions or more that are common when working with text, this approach can be slow, and it can require significant memory and network resources, both on the Spark master (to collect the unique values) and on the workers (to broadcast the resulting mapping to each worker, which keeps it in memory to allow it to apply the feature encoding to its local piece of the input data).

Feature hashing works by assigning the vector index for a feature based on the value obtained by hashing this feature to a number (usually, an integer value) using a hash function. For example, let's say the hash value of a categorical feature for the geolocation of `United States` is `342`. We will use the hashed value as the vector index, and the value at this index will be `1.0` to indicate the presence of the `United States` feature. The hash function used must be consistent (that is, for a given input, it returns the same output each time).

This encoding works the same way as mapping-based encoding, except that we choose a size for our feature vector upfront. As the most common hash functions return values in the entire range of integers, we will use a _modulo_ operation to restrict the index values to the size of our vector, which is typically much smaller (a few tens of thousands to a few million, depending on our requirements).
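The following is a toy sketch of the hash-then-modulo idea in Scala (our own illustration; MLlib's `HashingTF` implements this properly with its own hash function):

    // Map a raw feature (for example, a term) to an index in a fixed-size vector
    val numFeatures = 1 << 18 // choose the vector size upfront, here 262,144
    def featureIndex(feature: String): Int = {
      val hashed = feature.## // Scala's hashCode, used here purely for illustration
      ((hashed % numFeatures) + numFeatures) % numFeatures // keep the index non-negative
    }
    println(featureIndex("United States"))

Any two inputs that map to the same index produce a hash collision, one of the drawbacks discussed next.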
+ +This encoding works the same way as mapping-based encoding, except that we choose a size for our feature vector upfront. As the most common hash functions return values in the entire range of integers, we will use a _modulo_ operation to restrict the index values to the size of our vector, which is typically much smaller (a few tens of thousands to a few million, depending on our requirements). + +Feature hashing has the advantage that we do not need to build a mapping and keep it in memory. It is also easy to implement, very fast, and can be done online and in real time, thus not requiring a pass through our dataset first. Finally, because we selected a feature vector dimension that is significantly smaller than the raw dimensionality of our dataset, we bound the memory usage of our model both in training and production; hence, memory usage does not scale with the size and dimensionality of our data. + +However, there are two important drawbacks, which are as follows: + + * As we don't create a mapping of features to index values, we also cannot do the reverse mapping of feature index to value. This makes it harder to, for example, determine which features are most informative in our models. + * As we are restricting the size of our feature vectors, we might experience **hash collisions**. This happens when two different features are hashed into the same index in our feature vector. Surprisingly, this doesn't seem to have a severe impact on model performance as long as we choose a reasonable feature vector dimension relative to the dimension of the input data. + +### Note + +Further information on hashing can be found at . + +A key paper that introduced the use of hashing for feature extraction and machine learning is: + + _Kilian Weinberger_ , _Anirban Dasgupta_ , _John Langford_ , _Alex Smola_ , and _Josh Attenberg_. _Feature Hashing for Large Scale Multitask Learning_. _Proc. ICML 2009_ , which is available at . + +## Extracting the TF-IDF features from the 20 Newsgroups dataset + +To illustrate the concepts in this chapter, we will use a well-known text dataset called **20 Newsgroups** ; this dataset is commonly used for text-classification tasks. This is a collection of newsgroup messages posted across 20 different topics. There are various forms of data available. For our purposes, we will use the `bydate` version of the dataset, which is available at . + +This dataset splits up the available data into training and test sets that comprise 60 percent and 40 percent of the original data, respectively. Here, the messages in the test set occur after those in the training set. This dataset also excludes some of the message headers that identify the actual newsgroup; hence, it is an appropriate dataset to test the real-world performance of classification models. + +### Note + +Further information on the original dataset can be found in the _UCI Machine Learning Repository_ page at . + +To get started, download the data and unzip the file using the following command: + + **> tar xfvz 20news-bydate.tar.gz** + +This will create two folders: one called `20news-bydate-train` and another one called `20news-bydate-test`. 
Let's take a look at the directory structure under the training dataset folder:
+
+    **> cd 20news-bydate-train/**
+    **> ls**
+
+You will see that it contains a number of subfolders, one for each newsgroup:
+
+    **alt.atheism comp.windows.x rec.sport.hockey soc.religion.christian**
+    **comp.graphics misc.forsale sci.crypt talk.politics.guns**
+    **comp.os.ms-windows.misc rec.autos sci.electronics talk.politics.mideast**
+    **comp.sys.ibm.pc.hardware rec.motorcycles sci.med talk.politics.misc**
+    **comp.sys.mac.hardware rec.sport.baseball sci.space talk.religion.misc**
+
+There are a number of files under each newsgroup folder; each file contains an individual message posting:
+
+    **> ls rec.sport.hockey**
+    **52550 52580 52610 52640 53468 53550 53580 53610 53640 53670 53700 53731 53761 53791**
+    **...**
+
+We can take a look at a part of one of these messages to see the format:
+
+    **> head -20 rec.sport.hockey/52550**
+    **From: dchhabra@stpl.ists.ca (Deepak Chhabra)**
+    **Subject: Superstars and attendance (was Teemu Selanne, was +/- leaders)**
+    **Nntp-Posting-Host: stpl.ists.ca**
+    **Organization: Solar Terresterial Physics Laboratory, ISTS**
+    **Distribution: na**
+    **Lines: 115**
+
+    **Dean J. Falcione (posting from jrmst+8@pitt.edu) writes:**
+    **[I wrote:]**
+
+    **> >When the Pens got Mario, granted there was big publicity, etc, etc,**
+    **> >and interest was immediately generated. Gretzky did the same thing for LA.**
+    **> >However, imnsho, neither team would have seen a marked improvement in**
+    **> >attendance if the team record did not improve. In the year before Lemieux**
+    **> >came, Pittsburgh finished with 38 points. Following his arrival, the Pens**
+    **> >finished with 53, 76, 72, 81, 87, 72, 88, and 87 points, with a couple of**
+    **^^**
+    **> >Stanley Cups thrown in.**
+    **...**
+
+As we can see, each message contains some header fields that contain the sender, subject, and other metadata, followed by the raw content of the message.
+
+### Exploring the 20 Newsgroups data
+
+Now, we will start up our Spark Scala console, ensuring that we make enough memory available:
+
+    **>$SPARK_HOME/bin/spark-shell --driver-memory 4g**
+
+Looking at the directory structure, you might recognize that once again, we have data contained in individual text files (one text file per message). Therefore, we will again use Spark's `wholeTextFiles` method to read the content of each file into a record in our RDD.
+
+In the code that follows, `PATH` refers to the directory in which you extracted the `20news-bydate` archive:
+
+    val path = "/PATH/20news-bydate-train/*"
+    val rdd = sc.wholeTextFiles(path)
+    val text = rdd.map { case (file, text) => text }
+    println(text.count)
+
+The first time you run this command, it might take quite a bit of time, as Spark needs to scan the directory structure. You will also see quite a lot of console output, as Spark logs all the file paths that are being processed.
During the processing, you will see the following line displayed, indicating the total number of files that Spark has detected:
+
+    **...**
+    **14/10/12 14:27:54 INFO FileInputFormat: Total input paths to process : 11314**
+    **...**
+
+After the command has finished running, you will see the total record count, which should be the same as the preceding **Total input paths to process** screen output:
+
+    **11314**
+
+Next, we will take a look at the newsgroup topics available:
+
+    val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head }
+    val countByGroup = newsgroups.map(n => (n, 1)).reduceByKey(_ + _).collect.sortBy(-_._2).mkString("\n")
+    println(countByGroup)
+
+This will display the following result:
+
+    **(rec.sport.hockey,600)**
+    **(soc.religion.christian,599)**
+    **(rec.motorcycles,598)**
+    **(rec.sport.baseball,597)**
+    **(sci.crypt,595)**
+    **(rec.autos,594)**
+    **(sci.med,594)**
+    **(comp.windows.x,593)**
+    **(sci.space,593)**
+    **(sci.electronics,591)**
+    **(comp.os.ms-windows.misc,591)**
+    **(comp.sys.ibm.pc.hardware,590)**
+    **(misc.forsale,585)**
+    **(comp.graphics,584)**
+    **(comp.sys.mac.hardware,578)**
+    **(talk.politics.mideast,564)**
+    **(talk.politics.guns,546)**
+    **(alt.atheism,480)**
+    **(talk.politics.misc,465)**
+    **(talk.religion.misc,377)**
+
+We can see that the number of messages is roughly even between the topics.
+
+### Applying basic tokenization
+
+The first step in our text processing pipeline is to split up the raw text content in each document into a collection of terms (also referred to as **tokens** ). This is known as **tokenization**. We will start by applying a simple **whitespace** tokenization, together with converting each token to lowercase for each document:
+
+    val text = rdd.map { case (file, text) => text }
+    val whiteSpaceSplit = text.flatMap(t => t.split(" ").map(_.toLowerCase))
+    println(whiteSpaceSplit.distinct.count)
+
+### Tip
+
+In the preceding code, we used the `flatMap` function instead of `map`, as for now, we want to inspect all the tokens together for exploratory analysis. Later in this chapter, we will apply our tokenization scheme on a per-document basis, so we will use the `map` function.
+
+After running this code snippet, you will see the total number of unique tokens after applying our tokenization:
+
+    **402978**
+
+As you can see, for even a relatively small set of text, the number of raw tokens (and, therefore, the dimensionality of our feature vectors) can be very high.
+
+Let's take a look at a randomly selected document:
+
+    println(whiteSpaceSplit.sample(true, 0.3, 42).take(100).mkString(","))
+
+### Tip
+
+Note that the third parameter to the `sample` function is the random seed. We set it to `42` so that each call to `sample` returns the same results and your output matches that shown in this chapter.
+ +This will display the following result: + + **atheist,resources** + **summary:,addresses,,to,atheism** + **keywords:,music,,thu,,11:57:19,11:57:19,gmt** + **distribution:,cambridge.,290** + + **archive-name:,atheism/resources** + **alt-atheism-archive-name:,december,,,,,,,,,,,,,,,,,,,,,,addresses,addresses,,,,,,,religion,to:,to:,,p.o.,53701.** + **telephone:,sell,the,,fish,on,their,cars,,with,and,written** + **inside.,3d,plastic,plastic,,evolution,evolution,7119,,,,,san,san,san,mailing,net,who,to,atheist,press** + + **aap,various,bible,,and,on.,,,one,book,is:** + + **"the,w.p.,american,pp.,,1986.,bible,contains,ball,,based,based,james,of** + +### Improving our tokenization + +The preceding simple approach results in a lot of tokens and does not filter out many nonword characters (such as punctuation). Most tokenization schemes will remove these characters. We can do this by splitting each raw document on **nonword characters** using a regular expression pattern: + + val nonWordSplit = text.flatMap(t => t.split("""\W+""").map(_.toLowerCase)) + println(nonWordSplit.distinct.count) + +This reduces the number of unique tokens significantly: + + **130126** + +If we inspect the first few tokens, we will see that we have eliminated most of the less useful characters in the text: + + println(nonWordSplit.distinct.sample(true, 0.3, 42).take(100).mkString(",")) + +You will see the following result displayed: + + **bone,k29p,w1w3s1,odwyer,dnj33n,bruns,_congressional,mmejv5,mmejv5,artur,125215,entitlements,beleive,1pqd9hinnbmi,** + **jxicaijp,b0vp,underscored,believiing,qsins,1472,urtfi,nauseam,tohc4,kielbasa,ao,wargame,seetex,museum,typeset,pgva4,** + **dcbq,ja_jp,ww4ewa4g,animating,animating,10011100b,10011100b,413,wp3d,wp3d,cannibal,searflame,ets,1qjfnv,6jx,6jx,** + **detergent,yan,aanp,unaskable,9mf,bowdoin,chov,16mb,createwindow,kjznkh,df,classifieds,hour,cfsmo,santiago,santiago,** + **1r1d62,almanac_,almanac_,chq,nowadays,formac,formac,bacteriophage,barking,barking,barking,ipmgocj7b,monger,projector,** + **hama,65e90h8y,homewriter,cl5,1496,zysec,homerific,00ecgillespie,00ecgillespie,mqh0,suspects,steve_mullins,io21087,** + **funded,liberated,canonical,throng,0hnz,exxon,xtappcontext,mcdcup,mcdcup,5seg,biscuits** + +While our nonword pattern to split text works fairly well, we are still left with numbers and tokens that contain numeric characters. In some cases, numbers can be an important part of a corpus. For our purposes, the next step in our pipeline will be to filter out numbers and tokens that are words mixed with numbers. 
+
+We can do this by applying another regular expression pattern and using this to filter out tokens that _do not match_ the pattern:
+
+    val regex = """[^0-9]*""".r
+    val filterNumbers = nonWordSplit.filter(token => regex.pattern.matcher(token).matches)
+    println(filterNumbers.distinct.count)
+
+This further reduces the size of the token set:
+
+    **84912**
+
+Let's take a look at another random sample of the filtered tokens:
+
+    println(filterNumbers.distinct.sample(true, 0.3, 42).take(100).mkString(","))
+
+You will see output like the following:
+
+    **reunion,wuair,schwabam,eer,silikian,fuller,sloppiness,crying,crying,beckmans,leymarie,fowl,husky,rlhzrlhz,ignore,**
+    **loyalists,goofed,arius,isgal,dfuller,neurologists,robin,jxicaijp,majorly,nondiscriminatory,akl,sively,adultery,**
+    **urtfi,kielbasa,ao,instantaneous,subscriptions,collins,collins,za_,za_,jmckinney,nonmeasurable,nonmeasurable,**
+    **seetex,kjvar,dcbq,randall_clark,theoreticians,theoreticians,congresswoman,sparcstaton,diccon,nonnemacher,**
+    **arresed,ets,sganet,internship,bombay,keysym,newsserver,connecters,igpp,aichi,impute,impute,raffle,nixdorf,**
+    **nixdorf,amazement,butterfield,geosync,geosync,scoliosis,eng,eng,eng,kjznkh,explorers,antisemites,bombardments,**
+    **abba,caramate,tully,mishandles,wgtn,springer,nkm,nkm,alchoholic,chq,shutdown,bruncati,nowadays,mtearle,eastre,**
+    **discernible,bacteriophage,paradijs,systematically,rluap,rluap,blown,moderates**
+
+We can see that we have removed all the numeric characters. This still leaves us with a few strange _words_ , but we will not worry about these too much here.
+
+### Removing stop words
+
+ **Stop words** refer to common words that occur many times across almost all documents in a corpus (and across most corpuses). Examples of typical English stop words include and, but, the, of, and so on. It is a standard practice in text feature extraction to exclude stop words from the extracted tokens.
+
+When using TF-IDF weighting, the weighting scheme actually takes care of this for us. As stop words have a very low IDF score, they will tend to have very low TF-IDF weightings and thus less importance. In some cases, for information retrieval and search tasks, it might be desirable to include stop words. However, it can still be beneficial to exclude stop words during feature extraction, as it reduces the dimensionality of the final feature vectors as well as the size of the training data.
+
+We can take a look at some of the tokens in our corpus that have the highest occurrence across all documents to get an idea about some other stop words to exclude:
+
+    val tokenCounts = filterNumbers.map(t => (t, 1)).reduceByKey(_ + _)
+    val orderingDesc = Ordering.by[(String, Int), Int](_._2)
+    println(tokenCounts.top(20)(orderingDesc).mkString("\n"))
+
+In the preceding code, we took the tokens after filtering out numeric characters and generated a count of the occurrence of each token across the corpus. We can now use Spark's `top` function to retrieve the top 20 tokens by count. Notice that we need to provide the `top` function with an ordering that tells Spark how to order the elements of our RDD. In this case, we want to order by the count, so we will specify the second element of our key-value pair.
+
+Running the preceding code snippet will result in the following top tokens:
+
+    **(the,146532)**
+    **(to,75064)**
+    **(of,69034)**
+    **(a,64195)**
+    **(ax,62406)**
+    **(and,57957)**
+    **(i,53036)**
+    **(in,49402)**
+    **(is,43480)**
+    **(that,39264)**
+    **(it,33638)**
+    **(for,28600)**
+    **(you,26682)**
+    **(from,22670)**
+    **(s,22337)**
+    **(edu,21321)**
+    **(on,20493)**
+    **(this,20121)**
+    **(be,19285)**
+    **(t,18728)**
+
+As we might expect, there are a lot of common words in this list that we could potentially label as stop words. Let's create a set of stop words with some of these as well as other common words. We will then look at the tokens after filtering out these stop words:
+
+    val stopwords = Set(
+      "the","a","an","of","or","in","for","by","on","but", "is", "not", "with", "as", "was", "if",
+      "they", "are", "this", "and", "it", "have", "from", "at", "my", "be", "that", "to"
+    )
+    val tokenCountsFilteredStopwords = tokenCounts.filter { case (k, v) => !stopwords.contains(k) }
+    println(tokenCountsFilteredStopwords.top(20)(orderingDesc).mkString("\n"))
+
+You will see the following output:
+
+    **(ax,62406)**
+    **(i,53036)**
+    **(you,26682)**
+    **(s,22337)**
+    **(edu,21321)**
+    **(t,18728)**
+    **(m,12756)**
+    **(subject,12264)**
+    **(com,12133)**
+    **(lines,11835)**
+    **(can,11355)**
+    **(organization,11233)**
+    **(re,10534)**
+    **(what,9861)**
+    **(there,9689)**
+    **(x,9332)**
+    **(all,9310)**
+    **(will,9279)**
+    **(we,9227)**
+    **(one,9008)**
+
+You might notice that there are still quite a few common words in this top list. In practice, we might have a much larger set of stop words. However, we will keep a few (partly to illustrate the impact of common words when using TF-IDF weighting a little later).
+
+One other filtering step that we will use is removing any tokens that are only one character in length. The reasoning behind this is similar to removing stop words--these single-character tokens are unlikely to be informative in our text model and can further reduce the feature dimension and model size. We will do this with another filtering step:
+
+    val tokenCountsFilteredSize = tokenCountsFilteredStopwords.filter { case (k, v) => k.size >= 2 }
+    println(tokenCountsFilteredSize.top(20)(orderingDesc).mkString("\n"))
+
+Again, we will examine the tokens remaining after this filtering step:
+
+    **(ax,62406)**
+    **(you,26682)**
+    **(edu,21321)**
+    **(subject,12264)**
+    **(com,12133)**
+    **(lines,11835)**
+    **(can,11355)**
+    **(organization,11233)**
+    **(re,10534)**
+    **(what,9861)**
+    **(there,9689)**
+    **(all,9310)**
+    **(will,9279)**
+    **(we,9227)**
+    **(one,9008)**
+    **(would,8905)**
+    **(do,8674)**
+    **(he,8441)**
+    **(about,8336)**
+    **(writes,7844)**
+
+Apart from some of the common words that we have not excluded, we see that a few potentially more informative words are starting to appear.
+
+### Excluding terms based on frequency
+
+It is also a common practice to exclude terms during tokenization when their overall occurrence in the corpus is very low.
For example, let's examine the least occurring terms in the corpus (notice the different ordering we use here to return the results sorted in ascending order):
+
+    val orderingAsc = Ordering.by[(String, Int), Int](-_._2)
+    println(tokenCountsFilteredSize.top(20)(orderingAsc).mkString("\n"))
+
+You will get the following results:
+
+    **(lennips,1)**
+    **(bluffing,1)**
+    **(preload,1)**
+    **(altina,1)**
+    **(dan_jacobson,1)**
+    **(vno,1)**
+    **(actu,1)**
+    **(donnalyn,1)**
+    **(ydag,1)**
+    **(mirosoft,1)**
+    **(xiconfiywindow,1)**
+    **(harger,1)**
+    **(feh,1)**
+    **(bankruptcies,1)**
+    **(uncompression,1)**
+    **(d_nibby,1)**
+    **(bunuel,1)**
+    **(odf,1)**
+    **(swith,1)**
+    **(lantastic,1)**
+
+As we can see, there are many terms that only occur once in the entire corpus. Since typically we want to use our extracted features for other tasks such as document similarity or machine learning models, tokens that only occur once are not useful to learn from, as we will not have enough training data relative to these tokens. We can apply another filter to exclude these rare tokens:
+
+    val rareTokens = tokenCounts.filter{ case (k, v) => v < 2 }.map { case (k, v) => k }.collect.toSet
+    val tokenCountsFilteredAll = tokenCountsFilteredSize.filter { case (k, v) => !rareTokens.contains(k) }
+    println(tokenCountsFilteredAll.top(20)(orderingAsc).mkString("\n"))
+
+We can see that we are left with tokens that occur at least twice in the corpus:
+
+    **(sina,2)**
+    **(akachhy,2)**
+    **(mvd,2)**
+    **(hizbolah,2)**
+    **(wendel_clark,2)**
+    **(sarkis,2)**
+    **(purposeful,2)**
+    **(feagans,2)**
+    **(wout,2)**
+    **(uneven,2)**
+    **(senna,2)**
+    **(multimeters,2)**
+    **(bushy,2)**
+    **(subdivided,2)**
+    **(coretest,2)**
+    **(oww,2)**
+    **(historicity,2)**
+    **(mmg,2)**
+    **(margitan,2)**
+    **(defiance,2)**
+
+Now, let's count the number of unique tokens:
+
+    println(tokenCountsFilteredAll.count)
+
+You will see the following output:
+
+    **51801**
+
+As we can see, by applying all the filtering steps in our tokenization pipeline, we have reduced the feature dimension from 402,978 to 51,801.
+
+We can now combine all our filtering logic into one function, which we can apply to each document in our RDD:
+
+    def tokenize(line: String): Seq[String] = {
+      line.split("""\W+""")
+        .map(_.toLowerCase)
+        .filter(token => regex.pattern.matcher(token).matches)
+        .filterNot(token => stopwords.contains(token))
+        .filterNot(token => rareTokens.contains(token))
+        .filter(token => token.size >= 2)
+        .toSeq
+    }
+
+We can check whether this function gives us the same result with the following code snippet:
+
+    println(text.flatMap(doc => tokenize(doc)).distinct.count)
+
+This will output `51801`, giving us the same unique token count as our step-by-step pipeline.
+
+We can tokenize each document in our RDD as follows:
+
+    val tokens = text.map(doc => tokenize(doc))
+    println(tokens.first.take(20))
+
+You will see output similar to the following, showing the first part of the tokenized version of our first document:
+
+    **WrappedArray(mathew, mathew, mantis, co, uk, subject, alt, atheism, faq, atheist, resources, summary, books, addresses, music, anything, related, atheism, keywords, faq)**
+
+### A note about stemming
+
+A common step in text processing and tokenization is **stemming**. This is the conversion of whole words to a **base form** (called a **word stem** ). For example, plurals might be converted to singular ( _dogs_ becomes _dog_ ), and forms such as _walking_ and _walker_ might become _walk_.
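+
+For intuition only, a toy suffix-stripping rule might look like the following sketch (this is not a real stemming algorithm and would mangle many English words):
+
+    // a deliberately naive "stemmer" that strips a few common suffixes
+    def naiveStem(word: String): String = {
+      val suffixes = Seq("ing", "er", "s")
+      suffixes.find(s => word.endsWith(s)) match {
+        // only strip when enough of the word remains to form a stem
+        case Some(s) if word.length > s.length + 2 => word.dropRight(s.length)
+        case _ => word
+      }
+    }
+    println(Seq("dogs", "walking", "walker").map(naiveStem))
+    // List(dog, walk, walk)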
Stemming can become quite complex and is typically handled with specialized NLP or search engine software (such as NLTK, OpenNLP, and Lucene, for example). We will ignore stemming for the purpose of our example here.
+
+### Note
+
+A full treatment of stemming is beyond the scope of this book. You can find more details at .
+
+### Training a TF-IDF model
+
+We will now use MLlib to transform each document, in the form of processed tokens, into a vector representation. The first step will be to use the `HashingTF` implementation, which makes use of feature hashing to map each token in the input text to an index in the vector of term frequencies. Then, we will compute the global IDF and use it to transform the term frequency vectors into TF-IDF vectors.
+
+For each token, the index will thus be the hash of the token (mapped in turn onto the dimension of the feature vector). The value for each token will be the TF-IDF weighting for that token (that is, the term frequency multiplied by the inverse document frequency).
+
+First, we will import the classes we need and create our `HashingTF` instance, passing in a `dim` parameter that sets the feature dimension. While the default feature dimension is 2^20 (or around 1 million), we will choose 2^18 (or around 260,000), since with about 50,000 tokens, we should not experience a significant number of hash collisions, and a smaller dimension will be more memory and processing friendly for illustrative purposes:
+
+    import org.apache.spark.mllib.linalg.{ SparseVector => SV }
+    import org.apache.spark.mllib.feature.HashingTF
+    import org.apache.spark.mllib.feature.IDF
+    val dim = math.pow(2, 18).toInt
+    val hashingTF = new HashingTF(dim)
+    val tf = hashingTF.transform(tokens)
+    tf.cache
+
+### Tip
+
+Note that we imported MLlib's `SparseVector` using an alias of `SV`. This is because later, we will use Breeze's `linalg` module, which itself also imports `SparseVector`. This way, we will avoid namespace collisions.
+
+The `transform` function of `HashingTF` maps each input document (that is, a sequence of tokens) to an MLlib `Vector`. We will also call `cache` to pin the data in memory to speed up subsequent operations.
+
+Let's inspect the first element of our transformed dataset:
+
+### Tip
+
+Note that `HashingTF.transform` returns an `RDD[Vector]`, so we will cast the result returned to an instance of an MLlib `SparseVector`.
+
+The `transform` method can also work on an individual document by taking an `Iterable` argument (for example, a document as a `Seq[String]`). This returns a single vector.
+
+    val v = tf.first.asInstanceOf[SV]
+    println(v.size)
+    println(v.values.size)
+    println(v.values.take(10).toSeq)
+    println(v.indices.take(10).toSeq)
+
+You will see the following output displayed:
+
+    **262144**
+    **706**
+    **WrappedArray(1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0)**
+    **WrappedArray(313, 713, 871, 1202, 1203, 1209, 1795, 1862, 3115, 3166)**
+
+We can see that the dimension of each sparse vector of term frequencies is 262,144 (or 2^18 as we specified). However, the number of non-zero entries in the vector is only 706. The last two lines of the output show the frequency counts and vector indexes for the first few entries in the vector.
+
+We will now compute the inverse document frequency for each term in the corpus by creating a new `IDF` instance and calling `fit` with our RDD of term frequency vectors as the input.
We will then transform our term frequency vectors to TF-IDF vectors through the `transform` function of `IDF`: + + val idf = new IDF().fit(tf) + val tfidf = idf.transform(tf) + val v2 = tfidf.first.asInstanceOf[SV] + println(v2.values.size) + println(v2.values.take(10).toSeq) + println(v2.indices.take(10).toSeq) + +When you examine the first element in the RDD of TF-IDF transformed vectors, you will see output similar to the one shown here: + + **706** + **WrappedArray(2.3869085659322193, 4.670445463955571, 6.561295835827856, 4.597686109673142, ...** + **WrappedArray(313, 713, 871, 1202, 1203, 1209, 1795, 1862, 3115, 3166)** + +We can see that the number of non-zero entries hasn't changed (at 706), nor have the vector indices for the terms. What has changed are the values for each term. Earlier, these represented the frequency of each term in the document, but now, the new values represent the frequencies weighted by the `IDF`. + +### Analyzing the TF-IDF weightings + +Next, let's investigate the TF-IDF weighting for a few terms to illustrate the impact of the commonality or rarity of a term. + +First, we can compute the minimum and maximum TF-IDF weights across the entire corpus: + + val minMaxVals = tfidf.map { v => + val sv = v.asInstanceOf[SV] + (sv.values.min, sv.values.max) + } + val globalMinMax = minMaxVals.reduce { case ((min1, max1), (min2, max2)) => + (math.min(min1, min2), math.max(max1, max2)) + } + println(globalMinMax) + +As we can see, the minimum TF-IDF is zero, while the maximum is significantly larger: + + **(0.0,66155.39470409753)** + +We will now explore the TF-IDF weight attached to various terms. In the previous section on stop words, we filtered out many common terms that occur frequently. Recall that we did not remove all such potential stop words. Instead, we kept a few in the corpus so that we could illustrate the impact of applying the TF-IDF weighting scheme on these terms. + +TF-IDF weighting will tend to assign a lower weighting to common terms. To see this, we can compute the TF-IDF representation for a few of the terms that appear in the list of top occurrences that we previously computed, such as `you`, `do`, and `we`: + + val common = sc.parallelize(Seq(Seq("you", "do", "we"))) + val tfCommon = hashingTF.transform(common) + val tfidfCommon = idf.transform(tfCommon) + val commonVector = tfidfCommon.first.asInstanceOf[SV] + println(commonVector.values.toSeq) + +If we form a TF-IDF vector representation of this document, we would see the following values assigned to each term. Note that because of feature hashing, we are not sure exactly which term represents what. 
However, the values illustrate that the weighting applied to these terms is relatively low: + + **WrappedArray(0.9965359935704624, 1.3348773448236835, 0.5457486182039175)** + +Now, let's apply the same transformation to a few less common terms that we might intuitively associate with being more linked to specific topics or concepts: + + val uncommon = sc.parallelize(Seq(Seq("telescope", "legislation", "investment"))) + val tfUncommon = hashingTF.transform(uncommon) + val tfidfUncommon = idf.transform(tfUncommon) + val uncommonVector = tfidfUncommon.first.asInstanceOf[SV] + println(uncommonVector.values.toSeq) + +We can see from the following results that the TF-IDF weightings are indeed significantly higher than for the more common terms: + + **WrappedArray(5.3265513728351666, 5.308532867332488, 5.483736956357579)** + +# Using a TF-IDF model + +While we often refer to training a TF-IDF model, it is actually a feature extraction process or transformation rather than a machine learning model. TF-IDF weighting is often used as a preprocessing step for other models, such as dimensionality reduction, classification, or regression. + +To illustrate the potential uses of TF-IDF weighting, we will explore two examples. The first is using the TF-IDF vectors to compute document similarity, while the second involves training a multilabel classification model with the TF-IDF vectors as input features. + +## Document similarity with the 20 Newsgroups dataset and TF-IDF features + +You might recall from Chapter 4, _Building a Recommendation Engine with Spark_ , that the similarity between two vectors can be computed using a distance metric. The closer two vectors are (that is, the lower the distance metric), the more similar they are. One such metric that we used to compute similarity between movies is cosine similarity. + +Just like we did for movies, we can also compute the similarity between two documents. Using TF-IDF, we have transformed each document into a vector representation. Hence, we can use the same techniques as we used for movie vectors to compare two documents. + +Intuitively, we might expect two documents to be more similar to each other if they share many terms. Conversely, we might expect two documents to be less similar if they each contain many terms that are different from each other. As we compute cosine similarity by computing a dot product of the two vectors and each vector is made up of the terms in each document, we can see that documents with a high overlap of terms will tend to have a higher cosine similarity. + +Now, we can see TF-IDF at work. We might reasonably expect that even very different documents might contain many overlapping terms that are relatively common (for example, our stop words). However, due to a low TF-IDF weighting, these terms will not have a significant impact on the dot product and, therefore, will not have much impact on the similarity computed. + +For example, we might expect two randomly chosen messages from the `hockey` newsgroup to be relatively similar to each other. Let's see if this is the case: + + val hockeyText = rdd.filter { case (file, text) => file.contains("hockey") } + val hockeyTF = hockeyText.mapValues(doc => hashingTF.transform(tokenize(doc))) + val hockeyTfIdf = idf.transform(hockeyTF.map(_._2)) + +In the preceding code, we first filtered our raw input RDD to keep only the messages within the hockey topic. We then applied our tokenization and term frequency transformation functions. 
Note that the `transform` method used is the version that works on a single document (in the form of a `Seq[String]`) rather than the version that works on an RDD of documents. + +Finally, we applied the `IDF` transform (note that we use the same IDF that we have already computed on the whole corpus). + +Once we have our `hockey` document vectors, we can select two of these vectors at random and compute the cosine similarity between them (as we did earlier, we will use Breeze for the linear algebra functionality, in particular converting our MLlib vectors to Breeze `SparseVector` instances first): + + import breeze.linalg._ + val hockey1 = hockeyTfIdf.sample(true, 0.1, 42).first.asInstanceOf[SV] + val breeze1 = new SparseVector(hockey1.indices, hockey1.values, hockey1.size) + val hockey2 = hockeyTfIdf.sample(true, 0.1, 43).first.asInstanceOf[SV] + val breeze2 = new SparseVector(hockey2.indices, hockey2.values, hockey2.size) + val cosineSim = breeze1.dot(breeze2) / (norm(breeze1) * norm(breeze2)) + println(cosineSim) + +We can see that the cosine similarity between the documents is around 0.06: + + **0.060250114361164626** + +While this might seem quite low, recall that the effective dimensionality of our features is high due to the large number of unique terms that is typical when dealing with text data. Hence, we can expect that any two documents might have a relatively low overlap of terms even if they are about the same topic, and therefore would have a lower absolute similarity score. + +By contrast, we can compare this similarity score to the one computed between one of our `hockey` documents and another document chosen randomly from the `comp.graphics` newsgroup, using the same methodology: + + val graphicsText = rdd.filter { case (file, text) => file.contains("comp.graphics") } + val graphicsTF = graphicsText.mapValues(doc => hashingTF.transform(tokenize(doc))) + val graphicsTfIdf = idf.transform(graphicsTF.map(_._2)) + val graphics = graphicsTfIdf.sample(true, 0.1, 42).first.asInstanceOf[SV] + val breezeGraphics = new SparseVector(graphics.indices, graphics.values, graphics.size) + val cosineSim2 = breeze1.dot(breezeGraphics) / (norm(breeze1) * norm(breezeGraphics)) + println(cosineSim2) + +The cosine similarity is significantly lower at 0.0047: + + **0.004664850323792852** + +Finally, it is likely that a document from another sports-related topic might be more similar to our `hockey` document than one from a computer-related topic. However, we would probably expect a `baseball` document to not be as similar as our `hockey` document. 
Let's see whether this is the case by computing the similarity between a random message from the `baseball` newsgroup and our `hockey` document: + + val baseballText = rdd.filter { case (file, text) => file.contains("baseball") } + val baseballTF = baseballText.mapValues(doc => hashingTF.transform(tokenize(doc))) + val baseballTfIdf = idf.transform(baseballTF.map(_._2)) + val baseball = baseballTfIdf.sample(true, 0.1, 42).first.asInstanceOf[SV] + val breezeBaseball = new SparseVector(baseball.indices, baseball.values, baseball.size) + val cosineSim3 = breeze1.dot(breezeBaseball) / (norm(breeze1) * norm(breezeBaseball)) + println(cosineSim3) + +Indeed, as we expected, we found that the `baseball` and `hockey` documents have a cosine similarity of 0.05, which is significantly higher than the `comp.graphics` document, but also somewhat lower than the other `hockey` document: + + **0.05047395039466008** + +## Training a text classifier on the 20 Newsgroups dataset using TF-IDF + +When using TF-IDF vectors, we expected that the cosine similarity measure would capture the similarity between documents, based on the overlap of terms between them. In a similar way, we would expect that a machine learning model, such as a classifier, would be able to learn weightings for individual terms; this would allow it to distinguish between documents from different classes. That is, it should be possible to learn a mapping between the presence (and weighting) of certain terms and a specific topic. + +In the 20 Newsgroups example, each newsgroup topic is a class, and we can train a classifier using our TF-IDF transformed vectors as input. + +Since we are dealing with a multiclass classification problem, we will use the naive Bayes model in MLlib, which supports multiple classes. As the first step, we will import the Spark classes that we will be using: + + import org.apache.spark.mllib.regression.LabeledPoint + import org.apache.spark.mllib.classification.NaiveBayes + import org.apache.spark.mllib.evaluation.MulticlassMetrics + +Next, we will need to extract the 20 topics and convert them to class mappings. We can do this in exactly the same way as we might for 1-of-K feature encoding, by assigning a numeric index to each class: + + val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap + val zipped = newsgroups.zip(tfidf) + val train = zipped.map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) } + train.cache + +In the preceding code snippet, we took the `newsgroups` RDD, where each element is the topic, and used the `zip` function to combine it with each element in our `tfidf` RDD of TF-IDF vectors. We then mapped over each key-value element in our new `zipped` RDD and created a `LabeledPoint` instance, where `label` is the class index and `features` is the TF-IDF vector. + +### Tip + +Note that the `zip` operator assumes that each RDD has the same number of partitions as well as the same number of elements in each partition. It will fail if this is not the case. We can make this assumption here because we have effectively created both our `tfidf` RDD and `newsgroups` RDD from a series of `map` transformations on the same original RDD that preserved the partitioning structure. + +Now that we have an input RDD in the correct form, we can simply pass it to the naive Bayes `train` function: + + val model = NaiveBayes.train(train, lambda = 0.1) + +Let's evaluate the performance of the model on the test dataset. 
We will load the raw test data from the `20news-bydate-test` directory, again using `wholeTextFiles` to read each message into an RDD element. We will then extract the class labels from the file paths in the same way as we did for the `newsgroups` RDD:
+
+    val testPath = "/PATH/20news-bydate-test/*"
+    val testRDD = sc.wholeTextFiles(testPath)
+    val testLabels = testRDD.map { case (file, text) =>
+      val topic = file.split("/").takeRight(2).head
+      newsgroupsMap(topic)
+    }
+
+Transforming the text in the test dataset follows the same procedure as for the training data--we will apply our `tokenize` function followed by the term frequency transformation, and we will again use the same IDF computed from the training data to transform the TF vectors into TF-IDF vectors. Finally, we will zip the test class labels with the TF-IDF vectors and create our test `RDD[LabeledPoint]`:
+
+    val testTf = testRDD.map { case (file, text) => hashingTF.transform(tokenize(text)) }
+    val testTfIdf = idf.transform(testTf)
+    val zippedTest = testLabels.zip(testTfIdf)
+    val test = zippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }
+
+### Tip
+
+Note that it is important that we use the training set IDF to transform the test data, as this creates a more realistic estimation of model performance on new data, which might potentially contain terms that the model has not yet been trained on. It would be "cheating" to recompute the IDF vector based on the test dataset and, more importantly, would potentially lead to incorrect estimates of optimal model parameters selected through cross-validation.
+
+Now, we're ready to compute the predictions and true class labels for our model. We will use this RDD to compute accuracy and the multiclass weighted F-measure for our model:
+
+    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
+    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
+    val metrics = new MulticlassMetrics(predictionAndLabel)
+    println(accuracy)
+    println(metrics.weightedFMeasure)
+
+### Tip
+
+The weighted F-measure is an overall measure of precision and recall performance (where, like area under an ROC curve, values closer to 1.0 indicate better performance), which is then combined through a weighted average across the classes.
+
+We can see that our simple multiclass naive Bayes model has achieved close to 80 percent for both accuracy and F-measure:
+
+    **0.7915560276155071**
+    **0.7810675969031116**
+
+# Evaluating the impact of text processing
+
+Text processing and TF-IDF weighting are examples of feature extraction techniques designed to both reduce the dimensionality of and extract some structure from raw text data. We can see the impact of applying these processing techniques by comparing the performance of a model trained on raw text data with one trained on processed and TF-IDF weighted text data.
+
+## Comparing raw features with processed TF-IDF features on the 20 Newsgroups dataset
+
+In this example, we will simply apply the hashing term frequency transformation to the raw text tokens obtained using a simple whitespace splitting of the document text.
We will train a model on this data and evaluate the performance on the test set as we did for the model trained with TF-IDF features:
+
+    val rawTokens = rdd.map { case (file, text) => text.split(" ") }
+    val rawTF = rawTokens.map(doc => hashingTF.transform(doc))
+    val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) }
+    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)
+    val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) }
+    val rawZippedTest = testLabels.zip(rawTestTF)
+    val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }
+    val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label))
+    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count()
+    println(rawAccuracy)
+    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
+    println(rawMetrics.weightedFMeasure)
+
+Perhaps surprisingly, the raw model does quite well, although both accuracy and F-measure are a few percentage points lower than those of the TF-IDF model. This is also partly a reflection of the fact that the naive Bayes model is well suited to data in the form of raw frequency counts:
+
+    **0.7661975570897503**
+    **0.7628947184990661**
+
+# Word2Vec models
+
+Until now, we have used a bag-of-words vector, optionally with some weighting scheme such as TF-IDF to represent the text in a document. Another recent class of models that has become popular is related to representing individual words as vectors.
+
+These are generally based in some way on the co-occurrence statistics between the words in a corpus. Once the vector representation is computed, we can use these vectors in ways similar to how we might use TF-IDF vectors (such as using them as features for other machine learning models). One such common use case is computing the similarity between two words with respect to their meanings, based on their vector representations.
+
+Word2Vec refers to a specific implementation of one of these models, often referred to as **distributed vector representations**. The MLlib model uses a **skip-gram** model, which seeks to learn vector representations that take into account the contexts in which words occur.
+
+### Note
+
+While a detailed treatment of Word2Vec is beyond the scope of this book, Spark's documentation at contains some further details on the algorithm as well as links to the reference implementation.
+
+One of the main academic papers underlying Word2Vec is _Tomas Mikolov_ , _Kai Chen_ , _Greg Corrado_ , and _Jeffrey Dean_. _Efficient Estimation of Word Representations in Vector Space_. _In Proceedings of Workshop at ICLR_ , _2013_.
+
+It is available at .
+
+Another recent model in the area of word vector representations is GloVe at .
+
+## Word2Vec on the 20 Newsgroups dataset
+
+Training a Word2Vec model in Spark is relatively simple. We will pass in an RDD where each element is a sequence of terms. We can use the RDD of tokenized documents we have already created as input to the model:
+
+    import org.apache.spark.mllib.feature.Word2Vec
+    val word2vec = new Word2Vec()
+    word2vec.setSeed(42)
+    val word2vecModel = word2vec.fit(tokens)
+
+### Tip
+
+Note that we used `setSeed` to set the random seed for model training so that you can see the same results each time the model is trained.
+
+You will see some output similar to the following while the model is being trained:
+
+    **...**
+    **14/10/25 14:21:59 INFO Word2Vec: wordCount = 2133172, alpha = 0.0011868763094487506**
+    **14/10/25 14:21:59 INFO Word2Vec: wordCount = 2144172, alpha = 0.0010640806039941193**
+    **14/10/25 14:21:59 INFO Word2Vec: wordCount = 2155172, alpha = 9.412848985394907E-4**
+    **14/10/25 14:21:59 INFO Word2Vec: wordCount = 2166172, alpha = 8.184891930848592E-4**
+    **14/10/25 14:22:00 INFO Word2Vec: wordCount = 2177172, alpha = 6.956934876302307E-4**
+    **14/10/25 14:22:00 INFO Word2Vec: wordCount = 2188172, alpha = 5.728977821755993E-4**
+    **14/10/25 14:22:00 INFO Word2Vec: wordCount = 2199172, alpha = 4.501020767209707E-4**
+    **14/10/25 14:22:00 INFO Word2Vec: wordCount = 2210172, alpha = 3.2730637126634213E-4**
+    **14/10/25 14:22:01 INFO Word2Vec: wordCount = 2221172, alpha = 2.0451066581171076E-4**
+    **14/10/25 14:22:01 INFO Word2Vec: wordCount = 2232172, alpha = 8.171496035708214E-5**
+    **...**
+    **14/10/25 14:22:02 INFO SparkContext: Job finished: collect at Word2Vec.scala:368, took 56.585983 s**
+    **14/10/25 14:22:02 INFO MappedRDD: Removing RDD 200 from persistence list**
+    **14/10/25 14:22:02 INFO BlockManager: Removing RDD 200**
+    **14/10/25 14:22:02 INFO BlockManager: Removing block rdd_200_0**
+    **14/10/25 14:22:02 INFO MemoryStore: Block rdd_200_0 of size 9008840 dropped from memory (free 1755596828)**
+    **word2vecModel: org.apache.spark.mllib.feature.Word2VecModel = org.apache.spark.mllib.feature.Word2VecModel@2b94e480**
+
+Once trained, we can easily find the top 20 synonyms for a given term (that is, the most similar terms to the input term, computed by cosine similarity between the word vectors). For example, to find the 20 most similar terms to _hockey_ , use the following lines of code:
+
+    word2vecModel.findSynonyms("hockey", 20).foreach(println)
+
+As we can see from the following output, most of the terms relate to hockey or other sports topics:
+
+    **(sport,0.6828256249427795)**
+    **(ecac,0.6718048453330994)**
+    **(hispanic,0.6519884467124939)**
+    **(glens,0.6447514891624451)**
+    **(woofers,0.6351765394210815)**
+    **(boxscores,0.6009076237678528)**
+    **(tournament,0.6006366014480591)**
+    **(champs,0.5957855582237244)**
+    **(aargh,0.584071934223175)**
+    **(playoff,0.5834275484085083)**
+    **(ahl,0.5784651637077332)**
+    **(ncaa,0.5680188536643982)**
+    **(pool,0.5612311959266663)**
+    **(olympic,0.5552600026130676)**
+    **(champion,0.5549421310424805)**
+    **(filinuk,0.5528956651687622)**
+    **(yankees,0.5502706170082092)**
+    **(motorcycles,0.5484763979911804)**
+    **(calder,0.5481109023094177)**
+    **(rec,0.5432182550430298)**
+
+As another example, we can find 20 synonyms for the term _legislation_ as follows:
+
+    word2vecModel.findSynonyms("legislation", 20).foreach(println)
+
+In this case, we observe that terms related to _regulation_ , _politics_ , and _business_ feature prominently:
+
+    **(accommodates,0.8149217963218689)**
+    **(briefed,0.7582570314407349)**
+    **(amended,0.7310371994972229)**
+    **(telephony,0.7139414548873901)**
+    **(aclu,0.7080780863761902)**
+    **(pitted,0.7062571048736572)**
+    **(licensee,0.6981208324432373)**
+    **(agency,0.6880651712417603)**
+    **(policies,0.6828961372375488)**
+    **(senate,0.6821110844612122)**
+    **(businesses,0.6814320087432861)**
+    **(permit,0.6797110438346863)**
+    **(cpsr,0.6764014959335327)**
+    **(cooperation,0.6733141541481018)**
+    **(surveillance,0.6670728325843811)**
+    **(restricted,0.6666574478149414)**
+    **(congress,0.6661365628242493)**
+
+    **(procure,0.6655452251434326)**
+    **(industry,0.6650314927101135)**
+    **(inquiry,0.6644254922866821)**
+
+# Summary
+
+In this chapter, we took a deeper look into more complex text processing and explored MLlib's text feature extraction capabilities, in particular the TF-IDF term weighting schemes. We covered examples of using the resulting TF-IDF feature vectors to compute document similarity and train a newsgroup topic classification model. Finally, you learned how to use MLlib's cutting-edge Word2Vec model to compute a vector representation of words in a corpus of text and to find words whose contextual meaning is similar to that of a given word.
+
+In the next chapter, we will take a look at online learning, and you will learn how Spark Streaming relates to online learning models.
+
+# Chapter 10. Real-time Machine Learning with Spark Streaming
+
+So far in this book, we have focused on **batch** data processing. That is, all our analysis, feature extraction, and model training has been applied to a fixed set of data that does not change. This fits neatly into Spark's core abstraction of RDDs, which are immutable distributed datasets. Once created, the data underlying the RDD does not change, although we might create new RDDs from the original RDD through Spark's transformation and action operators.
+
+Our attention has also been on batch machine learning models where we train a model on a fixed batch of training data that is usually represented as an RDD of feature vectors (and labels, in the case of supervised learning models).
+
+In this chapter, we will:
+
+ * Introduce the concept of online learning, where models are trained and updated on new data as it becomes available
+ * Explore stream processing using Spark Streaming
+ * See how Spark Streaming fits together with the online learning approach
+
+# Online learning
+
+The batch machine learning methods that we have applied in this book focus on processing an existing fixed set of training data. Typically, these techniques are also iterative, and we have performed multiple passes over our training data in order to converge to an optimal model.
+
+By contrast, online learning is based on performing only one sequential pass through the training data in a fully incremental fashion (that is, one training example at a time). After seeing each training example, the model makes a prediction for this example and then receives the true outcome (for example, the label for classification or real target for regression). The idea behind online learning is that the model continually updates as new information is received instead of being retrained periodically in batch training.
+
+In some settings, when data volume is very large or the process that generates the data is changing rapidly, online learning methods can adapt more quickly and in near real time, without needing to be retrained in an expensive batch process.
+
+However, online learning methods do not have to be used in a purely online manner. In fact, we have already seen an example of using an online learning model in the batch setting when we used **stochastic gradient descent** optimization to train our classification and regression models. SGD updates the model after each training example. However, we still made use of multiple passes over the training data in order to converge to a better result.
+
+In the pure online setting, we do not (or perhaps cannot) make multiple passes over the training data; hence, we need to process each input as it arrives.
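+
+To make the pure online setting concrete, the following sketch performs a single sequential pass of stochastic gradient descent for a linear model with a squared-error loss (plain Scala; the step size and the two toy examples are hypothetical, and this is not MLlib's implementation):
+
+    val stepSize = 0.01
+    var weights = Array(0.0, 0.0)
+    // a "stream" of (features, target) examples, seen one at a time
+    val examples = Seq((Array(1.0, 2.0), 3.0), (Array(2.0, 1.0), 3.0))
+    for ((features, target) <- examples) {
+      val prediction = (weights, features).zipped.map(_ * _).sum
+      val error = prediction - target
+      // the model is updated immediately after each example is seen
+      weights = (weights, features).zipped.map((w, x) => w - stepSize * error * x)
+    }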
Online methods also include mini-batch methods where, instead of processing one input at a time, we process a small batch of training data.
+
+Online and batch methods can also be combined in real-world situations. For example, we can periodically retrain our models offline (say, every day) using batch methods. We can then deploy the trained model to production and update it using online methods in real time (that is, during the day, in between batch retraining) to adapt to any changes in the environment.
+
+As we will see in this chapter, the online learning setting can fit neatly into stream processing and the Spark Streaming framework.
+
+### Note
+
+See for more details on online machine learning.
+
+# Stream processing
+
+Before covering online learning with Spark, we will first explore the basics of stream processing and introduce the Spark Streaming library.
+
+In addition to the core Spark API and functionality, the Spark project contains another major library (in the same way as MLlib is a major project library) called **Spark Streaming** , which focuses on processing data streams in real time.
+
+A data stream is a continuous sequence of records. Common examples include activity stream data from a web or mobile application, time-stamped log data, transactional data, and event streams from sensor or device networks.
+
+The batch processing approach typically involves saving the data stream to an intermediate storage system (for example, HDFS or a database) and running a batch process on the saved data. In order to generate up-to-date results, the batch process must be run periodically (for example, daily, hourly, or even every few minutes) on the latest data available.
+
+By contrast, the stream-based approach applies processing to the data stream as it is generated. This allows near real-time processing (with sub-second time frames, on the order of a few tenths of a second, rather than minutes, hours, days, or even weeks with typical batch processing).
+
+## An introduction to Spark Streaming
+
+There are a few different general techniques to deal with stream processing. Two of the most common ones are as follows:
+
+ * Treat each record individually and process it as soon as it is seen.
+ * Combine multiple records into **mini-batches**. These mini-batches can be delineated either by time or by the number of records in a batch.
+
+Spark Streaming takes the second approach. The core primitive in Spark Streaming is the **discretized stream** , or **DStream**. A DStream is a sequence of mini-batches, where each mini-batch is represented as a Spark RDD:
+
+The discretized stream abstraction
+
+A DStream is defined by its input source and a time window called the **batch interval**. The stream is broken up into time periods equal to the batch interval (beginning from the starting time of the application). Each RDD in the stream will contain the records that are received by the Spark Streaming application during a given batch interval. If no data is present in a given interval, the RDD will simply be empty.
+
+### Input sources
+
+Spark Streaming **receivers** are responsible for receiving data from an **input source** and converting the raw data into a DStream made up of Spark RDDs.
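+
+For example, assuming the `sc` instance available in the Spark shell, a socket-based receiver can be attached as follows (the host and port are hypothetical; we will build on this stream in a later sketch):
+
+    import org.apache.spark.streaming.{Seconds, StreamingContext}
+    // a streaming context with a 10-second batch interval
+    val ssc = new StreamingContext(sc, Seconds(10))
+    // a receiver-based input DStream of text lines read from a socket
+    val lines = ssc.socketTextStream("localhost", 9999)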
+
+Spark Streaming supports various input sources, including file-based sources (where the receiver watches for new files arriving at the input location and creates the DStream from the contents read from each new file) and network-based sources (such as receivers that communicate with socket-based sources, the Twitter API stream, Akka actors, or message queues and distributed stream and log transfer frameworks, such as Flume, Kafka, and Amazon Kinesis).
+
+### Note
+
+See the documentation on input sources at for more details and for links to various advanced sources.
+
+### Transformations
+
+As we saw in Chapter 1, _Getting Up and Running with Spark_ , and throughout this book, Spark allows us to apply powerful transformations to RDDs. As DStreams are made up of RDDs, Spark Streaming provides a set of transformations available on DStreams; these transformations are similar to those available on RDDs. These include `map`, `flatMap`, `filter`, `join`, and `reduceByKey`.
+
+Spark Streaming transformations, such as those applicable to RDDs, operate on each element of a DStream's underlying data. That is, the transformations are effectively applied to each RDD in the DStream, which, in turn, applies the transformation to the elements of the RDD.
+
+Spark Streaming also provides operators such as `reduce` and `count`. These operators return a DStream made up of a single element (for example, the count value for each batch). Unlike the equivalent operators on RDDs, these do not trigger computation on DStreams directly. That is, they are not **actions** , but they are still transformations, as they return another DStream.
+
+#### Keeping track of state
+
+When we were dealing with batch processing of RDDs, keeping and updating a state variable was relatively straightforward. We could start with a certain state (for example, a count or sum of values) and then use broadcast variables or accumulators to update this state in parallel. Usually, we would then use an RDD action to collect the updated state to the driver and, in turn, update the global state.
+
+With DStreams, this is a little more complex, as we need to keep track of states across batches in a fault-tolerant manner. Conveniently, Spark Streaming provides the `updateStateByKey` function on a DStream of key-value pairs, which takes care of this for us, allowing us to create a stream of arbitrary state information and update it with each batch of data seen. For example, the state could be a global count of the number of times each key has been seen. The state could, thus, represent the number of visits per web page, clicks per advert, tweets per user, or purchases per product, for example.
+
+#### General transformations
+
+The Spark Streaming API also exposes a general `transform` function that gives us access to the underlying RDD for each batch in the stream. That is, where the higher level functions such as `map` transform a DStream to another DStream, `transform` allows us to apply functions from an RDD to another RDD. For example, we can use the RDD `join` operator to join each batch of the stream to an existing RDD that we computed separately from our streaming application (perhaps, in Spark or some other system).
+
+### Note
+
+The full list of transformations and further information on each of them is provided in the Spark documentation at .
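+
+Continuing the socket stream sketch from earlier, the following hypothetical word-count example illustrates these operators: per-batch transformations, running state with `updateStateByKey`, and a `transform`-based join against a static RDD (the checkpoint path and data are placeholders):
+
+    // per-batch word counts using RDD-like transformations
+    val wordCounts = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
+    // stateful operators require a checkpoint directory
+    ssc.checkpoint("/tmp/streaming-checkpoint")
+    // a running total per word across all batches seen so far
+    val totals = wordCounts.updateStateByKey[Int] { (counts: Seq[Int], state: Option[Int]) =>
+      Some(counts.sum + state.getOrElse(0))
+    }
+    // join each batch against an RDD computed outside the streaming application
+    val whitelist = sc.parallelize(Seq(("spark", true), ("streaming", true)))
+    val joined = wordCounts.transform(rdd => rdd.join(whitelist))
+
+Note that none of these declarations triggers any computation by itself; that requires one of the output operators covered in the next section.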
### Actions

While some of the operators we have seen in Spark Streaming, such as `count`, are not actions as in the batch RDD case, Spark Streaming has the concept of **actions** on DStreams. Actions are **output** operators that, when invoked, trigger computation on the DStream. They are as follows:

  * `print`: This prints the first 10 elements of each batch to the console and is typically used for debugging and testing.
  * `saveAsObjectFiles`, `saveAsTextFiles`, and `saveAsHadoopFiles`: These functions output each batch to a Hadoop-compatible filesystem with a filename (if applicable) derived from the batch start timestamp.
  * `foreachRDD`: This operator is the most generic and allows us to apply any arbitrary processing to the RDDs within each batch of a DStream. It is used to apply _side effects_, such as saving data to an external system, printing it for testing, exporting it to a dashboard, and so on.

### Tip

Note that, like batch processing with Spark, DStream operators are **lazy**. In the same way in which we need to call an action, such as `count`, on an RDD to ensure that processing takes place, we need to call one of the preceding output operators in order to trigger computation on a DStream. Otherwise, our streaming application will not actually perform any computation.

### Window operators

As Spark Streaming operates on time-ordered batched streams of data, it introduces a new concept, which is that of **windowing**. A `window` function computes a transformation over a sliding window applied to the stream.

A window is defined by the length of the window and the sliding interval. For example, with a 10-second window and a 5-second sliding interval, we will compute results every 5 seconds, based on the latest 10 seconds of data in the DStream. For example, we might wish to calculate the top websites by page view numbers over the last 10 seconds and recompute this metric every 5 seconds using a sliding window.

The following figure illustrates a windowed DStream:

A windowed DStream
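To make the windowing example concrete, a sketch along the following lines (assuming a `DStream[String]` of page URLs named `pageViews`; the names are illustrative, and `Seconds` comes from `org.apache.spark.streaming`) computes the top pages over the last 10 seconds, recomputed every 5 seconds:

    // count views per page over a 10-second window, sliding every 5 seconds;
    // window and slide durations must be multiples of the batch interval
    val viewCounts = pageViews
      .map(url => (url, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(10), Seconds(5))

    // sort each windowed batch by count, descending, using transform
    val topPages = viewCounts.transform(rdd => rdd.sortBy(-_._2))
    topPages.print()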
## Caching and fault tolerance with Spark Streaming

Like Spark RDDs, DStreams can be cached in memory. The use cases for caching are similar to those for RDDs--if we expect to access the data in a DStream multiple times (perhaps performing multiple types of analysis or aggregation, or outputting the data to multiple external systems), we will benefit from caching the data. Stateful operators, which include the `window` functions and `updateStateByKey`, do this automatically for efficiency.

Recall that RDDs are immutable datasets and are defined by their input data source and **lineage**--that is, the set of transformations and actions that are applied to the RDD. Fault tolerance in RDDs works by recreating the RDD (or the partition of an RDD) that is lost due to the failure of a worker node.

As DStreams are themselves batches of RDDs, they can also be recomputed as required to deal with worker node failure. However, this depends on the input data still being available. If the data source itself is fault-tolerant and persistent (such as HDFS or some other fault-tolerant data store), then the DStream can be recomputed.

If data stream sources are delivered over a network (which is a common case with stream processing), Spark Streaming's default persistence behavior is to replicate the data to two worker nodes. This allows network DStreams to be recomputed in the case of failure. Note, however, that any data received by a node but _not yet replicated_ might be lost when a node fails.

Spark Streaming also supports recovery of the driver node in the event of failure. However, currently, for network-based sources, data in the memory of the worker nodes will be lost in this case. Hence, Spark Streaming is not fully fault-tolerant in the face of failure of the driver node or application.

### Note

See http://spark.apache.org/docs/latest/streaming-programming-guide.html#caching--persistence for more details.

# Creating a Spark Streaming application

We will now work through creating our first Spark Streaming application to illustrate some of the basic concepts around Spark Streaming that we introduced earlier.

We will expand on the example applications used in Chapter 1, _Getting Up and Running with Spark_, where we used a small example dataset of product purchase events. For this example, instead of using a static set of data, we will create a simple producer application that will randomly generate events and send them over a network connection. We will then create a few Spark Streaming consumer applications that will process this event stream.

The sample project for this chapter contains the code you will need. It is called `scala-spark-streaming-app`. It consists of a Scala SBT project definition file, the example application source code, and a `src/main/resources` directory that contains a file called `names.csv`.

The `build.sbt` file for the project contains the following project definition:

    name := "scala-spark-streaming-app"

    version := "1.0"

    scalaVersion := "2.10.4"

    libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.1.0"

    libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.1.0"

Note that we added a dependency on Spark MLlib and Spark Streaming, both of which include the dependency on the Spark core.

The `names.csv` file contains a set of 20 randomly generated user names. We will use these names as part of our data generation function in our producer application:

    **Miguel,Eric,James,Juan,Shawn,James,Doug,Gary,Frank,Janet,Michael,James,Malinda,Mike,Elaine,Kevin,Janet,Richard,Saul,Manuela**

## The producer application

Our producer needs to create a network connection and generate some random purchase event data to send over this connection. First, we will define our object and the main method definition.
We will then read the random names from the `names.csv` resource and create a set of products with prices, from which we will generate our random product events:

    /**
     * A producer application that generates random "product events", up to 5 per second, and sends them over a
     * network connection
     */
    object StreamingProducer {

      def main(args: Array[String]) {

        val random = new Random()

        // Upper bound (exclusive) for the number of events per second,
        // so nextInt(MaxEvents) yields 0 to 5 events
        val MaxEvents = 6

        // Read the list of possible names
        val namesResource = this.getClass.getResourceAsStream("/names.csv")
        val names = scala.io.Source.fromInputStream(namesResource)
          .getLines()
          .toList
          .head
          .split(",")
          .toSeq

        // Generate a sequence of possible products
        val products = Seq(
          "iPhone Cover" -> 9.99,
          "Headphones" -> 5.49,
          "Samsung Galaxy Cover" -> 8.95,
          "iPad Cover" -> 7.49
        )

Using the list of names and the map of product names to prices, we will create a function that will randomly pick a product and name from these sources, generating a specified number of product events:

        /** Generate a number of random product events */
        def generateProductEvents(n: Int) = {
          (1 to n).map { i =>
            val (product, price) = products(random.nextInt(products.size))
            val user = random.shuffle(names).head
            (user, product, price)
          }
        }

Finally, we will create a network socket and set our producer to listen on this socket. As soon as a connection is made (which will come from our consumer streaming application), the producer will start generating random events at a random rate between 0 and 5 per second:

        // create a network producer
        val listener = new ServerSocket(9999)
        println("Listening on port: 9999")

        while (true) {
          val socket = listener.accept()
          new Thread() {
            override def run = {
              println("Got client connected from: " + socket.getInetAddress)
              val out = new PrintWriter(socket.getOutputStream(), true)

              while (true) {
                Thread.sleep(1000)
                val num = random.nextInt(MaxEvents)
                val productEvents = generateProductEvents(num)
                productEvents.foreach { event =>
                  out.write(event.productIterator.mkString(","))
                  out.write("\n")
                }
                out.flush()
                println(s"Created $num events...")
              }
              socket.close()
            }
          }.start()
        }
      }
    }

### Note

This producer example is based on the `PageViewGenerator` example in the Spark Streaming examples.

The producer can be run by changing into the base directory of `scala-spark-streaming-app` and using SBT to run the application, as we did in Chapter 1, _Getting Up and Running with Spark_:

    **> cd scala-spark-streaming-app**
    **> sbt**
    **[info] ...**
    **>**

Use the `run` command to execute the application:

    **> run**

You should see output similar to the following:

    **...**
    **Multiple main classes detected, select one to run:**

    **[1] StreamingProducer**
    **[2] SimpleStreamingApp**
    **[3] StreamingAnalyticsApp**
    **[4] StreamingStateApp**
    **[5] StreamingModelProducer**
    **[6] SimpleStreamingModel**
    **[7] MonitoringStreamingModel**

    **Enter number:**

Select the `StreamingProducer` option. The application will start running, and you should see the following output:

    **[info] Running StreamingProducer**
    **Listening on port: 9999**

We can see that the producer is listening on port `9999`, waiting for our consumer application to connect.

## Creating a basic streaming application

Next, we will create our first streaming program. We will simply connect to the producer and print out the contents of each batch.
Our streaming code looks like this:

    /**
     * A simple Spark Streaming app in Scala
     */
    object SimpleStreamingApp {

      def main(args: Array[String]) {

        val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
        val stream = ssc.socketTextStream("localhost", 9999)

        // here we simply print out the first few elements of each batch
        stream.print()
        ssc.start()
        ssc.awaitTermination()

      }
    }

This looks fairly simple, mostly because Spark Streaming takes care of all the complexity for us. First, we initialized a `StreamingContext` (which is the streaming equivalent of the `SparkContext` we have used so far), specifying configuration options similar to those used to create a `SparkContext`. Notice, however, that here we are required to provide the batch interval, which we set to 10 seconds.

We then created our data stream using a predefined streaming source, `socketTextStream`, which reads text from a socket host and port and creates a `DStream[String]`. We then called the `print` function on the DStream; this function prints out the first few elements of each batch.

### Tip

Calling `print` on a DStream is similar to calling `take` on an RDD. It displays only the first few elements.

We can run this program using SBT. Open a second terminal window, leaving the producer program running, and run `sbt`:

    **> sbt**
    **[info] ...**
    **> run**
    **....**

Again, you should see a few options to select:

    **Multiple main classes detected, select one to run:**

    **[1] StreamingProducer**
    **[2] SimpleStreamingApp**
    **[3] StreamingAnalyticsApp**
    **[4] StreamingStateApp**
    **[5] StreamingModelProducer**
    **[6] SimpleStreamingModel**
    **[7] MonitoringStreamingModel**

Run the `SimpleStreamingApp` main class.
You should see the streaming program start up, displaying output similar to that shown here:

    **...**
    **14/11/15 21:02:23 INFO scheduler.ReceiverTracker: ReceiverTracker started**
    **14/11/15 21:02:23 INFO dstream.ForEachDStream: metadataCleanupDelay = -1**
    **14/11/15 21:02:23 INFO dstream.SocketInputDStream: metadataCleanupDelay = -1**
    **14/11/15 21:02:23 INFO dstream.SocketInputDStream: Slide time = 10000 ms**
    **14/11/15 21:02:23 INFO dstream.SocketInputDStream: Storage level = StorageLevel(false, false, false, false, 1)**
    **14/11/15 21:02:23 INFO dstream.SocketInputDStream: Checkpoint interval = null**
    **14/11/15 21:02:23 INFO dstream.SocketInputDStream: Remember duration = 10000 ms**
    **14/11/15 21:02:23 INFO dstream.SocketInputDStream: Initialized and validated org.apache.spark.streaming.dstream.SocketInputDStream@ff3436d**
    **14/11/15 21:02:23 INFO dstream.ForEachDStream: Slide time = 10000 ms**
    **14/11/15 21:02:23 INFO dstream.ForEachDStream: Storage level = StorageLevel(false, false, false, false, 1)**
    **14/11/15 21:02:23 INFO dstream.ForEachDStream: Checkpoint interval = null**
    **14/11/15 21:02:23 INFO dstream.ForEachDStream: Remember duration = 10000 ms**
    **14/11/15 21:02:23 INFO dstream.ForEachDStream: Initialized and validated org.apache.spark.streaming.dstream.ForEachDStream@5a10b6e8**
    **14/11/15 21:02:23 INFO scheduler.ReceiverTracker: Starting 1 receivers**
    **14/11/15 21:02:23 INFO spark.SparkContext: Starting job: runJob at ReceiverTracker.scala:275**
    **...**

At the same time, you should see that the terminal window running the producer displays something like the following:

    **...**
    **Got client connected from: /127.0.0.1**
    **Created 2 events...**
    **Created 2 events...**
    **Created 3 events...**
    **Created 1 events...**
    **Created 5 events...**
    **...**

After about 10 seconds, which is the length of our streaming batch interval, Spark Streaming will trigger a computation on the stream due to our use of the `print` operator. This should display the first few events in the batch, which will look something like the following output:

    **...**
    **14/11/15 21:02:30 INFO spark.SparkContext: Job finished: take at DStream.scala:608, took 0.05596 s**
    **-------------------------------------------**
    **Time: 1416078150000 ms**
    **-------------------------------------------**
    **Michael,Headphones,5.49**
    **Frank,Samsung Galaxy Cover,8.95**
    **Eric,Headphones,5.49**
    **Malinda,iPad Cover,7.49**
    **James,iPhone Cover,9.99**
    **James,Headphones,5.49**
    **Doug,iPhone Cover,9.99**
    **Juan,Headphones,5.49**
    **James,iPhone Cover,9.99**
    **Richard,iPad Cover,7.49**
    **...**

### Tip

Note that you might see different results, as the producer generates a random number of random events each second.

You can terminate the streaming app by pressing _Ctrl_ + _C_. If you want to, you can also terminate the producer (if you do, you will need to restart it before starting the next streaming programs that we will create).

## Streaming analytics

Next, we will create a slightly more complex streaming program. In Chapter 1, _Getting Up and Running with Spark_, we calculated a few metrics on our dataset of product purchases. These included the total number of purchases, the number of unique users, the total revenue, and the most popular product (together with its number of purchases and total revenue).

In this example, we will compute the same metrics on our stream of purchase events.
The key difference is that these metrics will be computed per batch and printed out.

We will define our streaming application code here:

    /**
     * A more complex streaming app that computes statistics and prints the results for each batch in a DStream
     */
    object StreamingAnalyticsApp {

      def main(args: Array[String]) {

        val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
        val stream = ssc.socketTextStream("localhost", 9999)

        // create a stream of events from the raw text elements
        val events = stream.map { record =>
          val event = record.split(",")
          (event(0), event(1), event(2))
        }

First, we created exactly the same `StreamingContext` and socket stream as we did earlier. Our next step is to apply a `map` transformation to the raw text, where each record is a comma-separated string representing the purchase event. The `map` function splits the text and creates a tuple of `(user, product, price)`. This illustrates the use of `map` on a DStream and how it works exactly as if we had been operating on an RDD.

Next, we will use `foreachRDD` to apply arbitrary processing on each RDD in the stream to compute our desired metrics and print them to the console:

        /*
          We compute and print out stats for each batch.
          Since each batch is an RDD, we call foreachRDD on the DStream and apply the usual RDD functions
          we used in Chapter 1.
        */
        events.foreachRDD { (rdd, time) =>
          val numPurchases = rdd.count()
          val uniqueUsers = rdd.map { case (user, _, _) => user }.distinct().count()
          val totalRevenue = rdd.map { case (_, _, price) => price.toDouble }.sum()
          val productsByPopularity = rdd
            .map { case (user, product, price) => (product, 1) }
            .reduceByKey(_ + _)
            .collect()
            .sortBy(-_._2)
          val mostPopular = productsByPopularity(0)

          // SimpleDateFormat and Date come from java.text and java.util, respectively
          val formatter = new SimpleDateFormat
          val dateStr = formatter.format(new Date(time.milliseconds))
          println(s"== Batch start time: $dateStr ==")
          println("Total purchases: " + numPurchases)
          println("Unique users: " + uniqueUsers)
          println("Total revenue: " + totalRevenue)
          println("Most popular product: %s with %d purchases".format(mostPopular._1, mostPopular._2))
        }

        // start the context
        ssc.start()
        ssc.awaitTermination()

      }

    }

If you compare the code operating on the RDDs inside the preceding `foreachRDD` block with that used in Chapter 1, _Getting Up and Running with Spark_, you will notice that it is virtually the same code. This shows that we can apply any RDD-related processing we wish within the streaming setting by operating on the underlying RDDs, as well as using the built-in higher-level streaming operations.

Let's run the streaming program again by calling `sbt run` and selecting `StreamingAnalyticsApp`.

### Tip

Remember that you might also need to restart the producer if you previously terminated the program. This should be done before starting the streaming application.

After about 10 seconds, you should see output from the streaming program similar to the following:

    **...**
    **14/11/15 21:27:30 INFO spark.SparkContext: Job finished: collect at Streaming.scala:125, took 0.071145 s**
    **== Batch start time: 2014/11/15 9:27 PM ==**
    **Total purchases: 16**
    **Unique users: 10**
    **Total revenue: 123.72**
    **Most popular product: iPad Cover with 6 purchases**
    **...**

You can again terminate the streaming program using _Ctrl_ + _C_.
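As an aside, some of these per-batch statistics can also be computed with the built-in DStream operators introduced earlier, rather than dropping down to `foreachRDD`. For example, a small sketch reusing the `events` DStream defined in the preceding listing:

    // number of purchases per batch, as a single-element DStream
    val purchasesPerBatch = events.count()

    // total revenue per batch via reduce
    val revenuePerBatch = events
      .map { case (_, _, price) => price.toDouble }
      .reduce(_ + _)

    purchasesPerBatch.print()
    revenuePerBatch.print()

Remember that these operators are still transformations; nothing is computed until an output operator such as `print` is called.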
## Stateful streaming

As a final example, we will apply the concept of **stateful** streaming using the `updateStateByKey` function to compute a global state of revenue and number of purchases per user, which will be updated with new data from each 10-second batch. Our `StreamingStateApp` app is shown here:

    object StreamingStateApp {
      import org.apache.spark.streaming.StreamingContext._

We will first define an `updateState` function that will compute the new state from the running state value and the new data in the current batch. Our state, in this case, is a `(number of purchases, revenue)` pair, which we will keep for each user. We will compute the new state given the set of `(product, price)` pairs for the current batch and the accumulated state at the current time.

Notice that we will deal with an `Option` value for the current state, as it might be empty (which will be the case for the first batch), and we need to define a default value, which we will do using `getOrElse` as shown here:

      def updateState(prices: Seq[(String, Double)], currentTotal: Option[(Int, Double)]) = {
        val currentRevenue = prices.map(_._2).sum
        val currentNumberPurchases = prices.size
        val state = currentTotal.getOrElse((0, 0.0))
        Some((currentNumberPurchases + state._1, currentRevenue + state._2))
      }

      def main(args: Array[String]) {

        val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
        // for stateful operations, we need to set a checkpoint location
        ssc.checkpoint("/tmp/sparkstreaming/")
        val stream = ssc.socketTextStream("localhost", 9999)

        // create a stream of events from the raw text elements
        val events = stream.map { record =>
          val event = record.split(",")
          (event(0), event(1), event(2).toDouble)
        }

        val users = events.map { case (user, product, price) => (user, (product, price)) }
        val revenuePerUser = users.updateStateByKey(updateState)
        revenuePerUser.print()

        // start the context
        ssc.start()
        ssc.awaitTermination()

      }
    }

After applying the same string split transformation we used in our previous example, we called `updateStateByKey` on our DStream, passing in our defined `updateState` function. We then printed the results to the console.

Start the streaming example using `sbt run` and selecting `StreamingStateApp` (also restart the producer program if necessary).

After around 10 seconds, you will start to see the first set of state output. If we wait another 10 seconds, we will see the next set of output, with the overall global state being updated:

    **...**
    **-------------------------------------------**
    **Time: 1416080440000 ms**
    **-------------------------------------------**
    **(Janet,(2,10.98))**
    **(Frank,(1,5.49))**
    **(James,(2,12.98))**
    **(Malinda,(1,9.99))**
    **(Elaine,(3,29.97))**
    **(Gary,(2,12.98))**
    **(Miguel,(3,20.47))**
    **(Saul,(1,5.49))**
    **(Manuela,(2,18.939999999999998))**
    **(Eric,(2,18.939999999999998))**
    **...**
    **-------------------------------------------**
    **Time: 1416080450000 ms**
    **-------------------------------------------**
    **(Janet,(6,34.94))**
    **(Juan,(4,33.92))**
    **(Frank,(2,14.44))**
    **(James,(7,48.93000000000001))**
    **(Malinda,(1,9.99))**
    **(Elaine,(7,61.89))**
    **(Gary,(4,28.46))**
    **(Michael,(1,8.95))**
    **(Richard,(2,16.439999999999998))**
    **(Miguel,(5,35.95))**
    **...**

We can see that the number of purchases and revenue totals for each user are updated with each batch of data.
### Tip

Now, see if you can adapt this example to use Spark Streaming's `window` functions. For example, you can compute similar statistics per user over the past minute, sliding every 30 seconds.

# Online learning with Spark Streaming

As we have seen, Spark Streaming makes it easy to work with data streams in a way that should be familiar to us from working with RDDs. Using Spark's stream processing primitives combined with the online learning capabilities of MLlib's SGD-based methods, we can create real-time machine learning models that we can update on new data in the stream as it arrives.

## Streaming regression

Spark provides a built-in streaming machine learning model in the `StreamingLinearAlgorithm` class. Currently, only a linear regression implementation is available--`StreamingLinearRegressionWithSGD`--but future versions will include classification.

The streaming regression model provides two methods for usage:

  * `trainOn`: This takes `DStream[LabeledPoint]` as its argument. This tells the model to train on every batch in the input DStream. It can be called multiple times to train on different streams.
  * `predictOn`: This also takes `DStream[LabeledPoint]`. This tells the model to make predictions on the input DStream, returning a new `DStream[Double]` that contains the model predictions.

Under the hood, the streaming regression model uses `foreachRDD` and `map` to accomplish this. It also updates the model variable after each batch and exposes the latest trained model, which allows us to use this model in other applications or save it to an external location.

The streaming regression model can be configured with parameters for the step size and number of iterations in the same way as standard batch regression--the model class used is the same. We can also set the initial model weight vector.

When we first start training a model, we can set the initial weights to a zero vector or a random vector, or perhaps load the latest model from the result of an offline batch process. We can also decide to save the model periodically to an external system and use the latest model state as the starting point (for example, in the case of a restart after a node or application failure).

## A simple streaming regression program

To illustrate the use of streaming regression, we will create a simple example similar to the preceding one, which uses simulated data. We will write a producer program that generates random feature vectors and target variables, given a fixed, known weight vector, and writes each training example to a network stream. Our consumer application will run a streaming regression model, training and then testing on our simulated data stream. Our first example consumer will simply print its predictions to the console.

### Creating a streaming data producer

The data producer operates in a manner similar to our product event producer example. Recall from Chapter 5, _Building a Classification Model with Spark_, that a linear model is a linear combination (or vector dot product) of a weight vector, _w_, and a feature vector, _x_ (that is, _w_ᵀ _x_). Our producer will generate synthetic data using a fixed, known weight vector and randomly generated feature vectors. This data fits the linear model formulation exactly, so we will expect our regression model to learn the true weight vector fairly easily.
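Concretely, each training example emitted by the producer is generated as:

$$y = \mathbf{w}^\top \mathbf{x} + b$$

where $\mathbf{w}$ is the fixed, randomly generated weight vector, $\mathbf{x}$ is a random feature vector, and $b$ is the fixed intercept. This is exactly the computation performed in the producer code that follows.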
First, we will set up a maximum number of events per second (say, 100) and the number of features in our feature vector (also 100 in this example):

    /**
     * A producer application that generates random linear regression data.
     */
    object StreamingModelProducer {
      import breeze.linalg._

      def main(args: Array[String]) {

        // Maximum number of events per second
        val MaxEvents = 100
        val NumFeatures = 100

        val random = new Random()

The `generateRandomArray` function creates an array of the specified size, where the entries are randomly generated from a standard normal distribution. We will use this function initially to generate our known weight vector, `w`, which will be fixed throughout the life of the producer. We will also create a random `intercept` value, which will likewise be fixed. The weight vector and `intercept` will be used to generate each data point in our stream:

        /** Function to generate a normally distributed dense vector */
        def generateRandomArray(n: Int) = Array.tabulate(n)(_ => random.nextGaussian())

        // Generate a fixed random model weight vector
        val w = new DenseVector(generateRandomArray(NumFeatures))
        val intercept = random.nextGaussian() * 10

We will also need a function to generate a specified number of random data points. Each event is made up of a random feature vector and the target, which we get by computing the dot product of our known weight vector with the random feature vector and adding the `intercept` value:

        /** Generate a number of random data events */
        def generateNoisyData(n: Int) = {
          (1 to n).map { i =>
            val x = new DenseVector(generateRandomArray(NumFeatures))
            val y: Double = w.dot(x)
            // despite the name, no random noise is added here; the target is a
            // deterministic function of the feature vector
            val noisy = y + intercept
            (noisy, x)
          }
        }

Finally, we will use code similar to our previous producer to instantiate a network connection and send a random number of data points (between 0 and 99) in text format over the network each second:

        // create a network producer
        val listener = new ServerSocket(9999)
        println("Listening on port: 9999")

        while (true) {
          val socket = listener.accept()
          new Thread() {
            override def run = {
              println("Got client connected from: " + socket.getInetAddress)
              val out = new PrintWriter(socket.getOutputStream(), true)

              while (true) {
                Thread.sleep(1000)
                val num = random.nextInt(MaxEvents)
                val data = generateNoisyData(num)
                data.foreach { case (y, x) =>
                  val xStr = x.data.mkString(",")
                  val eventStr = s"$y\t$xStr"
                  out.write(eventStr)
                  out.write("\n")
                }
                out.flush()
                println(s"Created $num events...")
              }
              socket.close()
            }
          }.start()
        }
      }
    }

You can start the producer using `sbt run`, followed by choosing to execute the `StreamingModelProducer` main method. This should result in the following output, thus indicating that the producer program is waiting for connections from our streaming regression application:

    **[info] Running StreamingModelProducer**
    **Listening on port: 9999**

### Creating a streaming regression model

In the next step in our example, we will create a streaming regression program.
The basic layout and setup are the same as in our previous streaming analytics examples:

    /**
     * A simple streaming linear regression that prints out the predicted value for each batch
     */
    object SimpleStreamingModel {

      def main(args: Array[String]) {

        val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
        val stream = ssc.socketTextStream("localhost", 9999)

Here, we will set the number of features to match that of the records in our input data stream. We will then create a zero vector to use as the initial weight vector of our streaming regression model. Finally, we will select the number of iterations and the step size:

        val NumFeatures = 100
        val zeroVector = DenseVector.zeros[Double](NumFeatures)
        val model = new StreamingLinearRegressionWithSGD()
          .setInitialWeights(Vectors.dense(zeroVector.data))
          .setNumIterations(1)
          .setStepSize(0.01)

Next, we will again use the `map` function to transform the input DStream, where each record is a string representation of our input data, into a `LabeledPoint` instance that contains the target value and feature vector:

        // create a stream of labeled points
        val labeledStream = stream.map { event =>
          val split = event.split("\t")
          val y = split(0).toDouble
          val features = split(1).split(",").map(_.toDouble)
          LabeledPoint(label = y, features = Vectors.dense(features))
        }

The final step is to tell the model to train and test on our transformed DStream and also to print out the first few elements of each batch in the DStream of predicted values:

        // train and test the model on the stream, and print predictions
        // for illustrative purposes
        model.trainOn(labeledStream)
        model.predictOn(labeledStream).print()

        ssc.start()
        ssc.awaitTermination()

      }
    }

### Tip

Note that because we are using the same MLlib model classes for streaming as we did for batch processing, we can, if we choose, perform multiple iterations over the training data in each batch (which is just an RDD of `LabeledPoint` instances).

Here, we will set the number of iterations to `1` to simulate purely online learning. In practice, you can set the number of iterations higher, but note that the training time per batch will go up. If the training time per batch is much higher than the batch interval, the streaming model will start to lag behind the velocity of the data stream.

This can be handled by decreasing the number of iterations, increasing the batch interval, or increasing the parallelism of our streaming program by adding more Spark workers.

Now, we're ready to run `SimpleStreamingModel` in our second terminal window using `sbt run` in the same way as we did for the producer (remember to select the correct main method for SBT to execute).
Once the streaming program starts running, you should see the following output in the producer console:

    **Got client connected from: /127.0.0.1**
    **...**
    **Created 10 events...**
    **Created 83 events...**
    **Created 75 events...**
    **...**

After about 10 seconds, you should start seeing the model predictions being printed to the streaming application console, similar to those shown here:

    **14/11/16 14:54:00 INFO StreamingLinearRegressionWithSGD: Model updated at time 1416142440000 ms**
    **14/11/16 14:54:00 INFO StreamingLinearRegressionWithSGD: Current model: weights, [0.05160959387864821,0.05122747155689144,-0.17224086785756998,0.05822993392274008,0.07848094246845688,-0.1298315806501979,0.006059323642394124, ...**
    **...**
    **14/11/16 14:54:00 INFO JobScheduler: Finished job streaming job 1416142440000 ms.0 from job set of time 1416142440000 ms**
    **14/11/16 14:54:00 INFO JobScheduler: Starting job streaming job 1416142440000 ms.1 from job set of time 1416142440000 ms**
    **14/11/16 14:54:00 INFO SparkContext: Starting job: take at DStream.scala:608**
    **14/11/16 14:54:00 INFO DAGScheduler: Got job 3 (take at DStream.scala:608) with 1 output partitions (allowLocal=true)**
    **14/11/16 14:54:00 INFO DAGScheduler: Final stage: Stage 3(take at DStream.scala:608)**
    **14/11/16 14:54:00 INFO DAGScheduler: Parents of final stage: List()**
    **14/11/16 14:54:00 INFO DAGScheduler: Missing parents: List()**
    **14/11/16 14:54:00 INFO DAGScheduler: Computing the requested partition locally**
    **14/11/16 14:54:00 INFO SparkContext: Job finished: take at DStream.scala:608, took 0.014064 s**
    **-------------------------------------------**
    **Time: 1416142440000 ms**
    **-------------------------------------------**
    **-2.0851430248312526**
    **4.609405228401022**
    **2.817934589675725**
    **3.3526557917118813**
    **4.624236379848475**
    **-2.3509098272485156**
    **-0.7228551577759544**
    **2.914231548990703**
    **0.896926579927631**
    **1.1968162940541283**
    **...**

Congratulations! You've created your first streaming online learning model!

You can shut down the streaming application (and, optionally, the producer) by pressing _Ctrl_ + _C_ in each terminal window.

## Streaming K-means

MLlib also includes a streaming version of K-means clustering; this is called `StreamingKMeans`. This model is an extension of the mini-batch K-means algorithm, where the model is updated with each batch based on a combination of the cluster centers computed from previous batches and the cluster centers computed for the current batch.

`StreamingKMeans` supports a _forgetfulness_ parameter _alpha_ (set using the `setDecayFactor` method); this controls how aggressively the model gives weight to newer data. An alpha value of `0` means the model will only use new data, while with an alpha value of `1`, all data since the beginning of the streaming application will be used.

We will not cover streaming K-means further here (the Spark documentation contains further detail and an example). However, perhaps you could try to adapt the preceding streaming regression data producer to generate input data for a `StreamingKMeans` model. You could also adapt the streaming regression application to use `StreamingKMeans`.

You can create the clustering data producer by first selecting a number of clusters, _K_, and then generating each data point by:

  * Randomly selecting a cluster index.
  * Generating a random vector using specific normal distribution parameters for each cluster. That is, each of the _K_ clusters will have a mean and variance parameter, from which the random vectors will be generated using an approach similar to our preceding `generateRandomArray` function.

In this way, each data point that belongs to the same cluster will be drawn from the same distribution, so our streaming clustering model should be able to learn the correct cluster centers over time (a sketch of such a generator is shown below).
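As a starting point for this exercise, the data-generation function might look something like the following sketch (reusing the `random`, `generateRandomArray`, and Breeze `DenseVector` definitions from our `StreamingModelProducer`; the cluster count, dimensionality, and spread are illustrative):

    // fix K cluster centres up front; the means are spread out by a factor of 10
    val NumClusters = 3
    val NumFeatures = 2
    val centers = Array.fill(NumClusters)(generateRandomArray(NumFeatures).map(_ * 10.0))

    /** Generate a point by picking a random cluster index and adding
      * unit-variance Gaussian noise around that cluster's centre */
    def generateClusterPoint = {
      val k = random.nextInt(NumClusters)
      val point = centers(k).map(_ + random.nextGaussian())
      (k, new DenseVector(point))
    }

Each generated point could then be written over the socket in the same comma-separated format we used earlier and parsed into MLlib vectors on the consumer side for the model's `trainOn` method.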
# Online model evaluation

Combining machine learning with Spark Streaming has many potential applications and use cases, including keeping a model or set of models up to date on new training data as it arrives, thus enabling them to adapt quickly to changing situations or contexts.

Another useful application is to track and compare the performance of multiple models in an online manner and, possibly, also perform model selection in real time so that the best-performing model is always used to generate predictions on live data.

This can be used to do real-time "A/B testing" of models, or it can be combined with more advanced online selection and learning techniques, such as Bayesian update approaches and bandit algorithms. It can also be used simply to monitor model performance in real time, making it possible to respond or adapt if performance degrades for some reason.

In this section, we will walk through a simple extension to our streaming regression example. In this example, we will compare the evolving error rate of two models with different parameters as they see more and more data in our input stream.

## Comparing model performance with Spark Streaming

As we used a known weight vector and intercept to generate the training data in our producer application, we expect our model to eventually learn this underlying weight vector (in the absence of random noise, which we do not add for this example).

Therefore, we should see the model's error rate decrease over time as it sees more and more data. We can also use standard regression error metrics to compare the performance of multiple models.

In this example, we will create two models with different learning rates, training them both on the same data stream. We will then make predictions for each model and measure the **mean-squared error** (**MSE**) and **root mean-squared error** (**RMSE**) metrics for each batch.
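For reference, over a batch of $n$ examples with predictions $\hat{y}_i$ and true targets $y_i$, these metrics are defined as:

$$\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(\hat{y}_i - y_i\right)^2, \qquad \mathrm{RMSE} = \sqrt{\mathrm{MSE}}$$

This is exactly what we will compute per batch in the `foreachRDD` block that follows: the mean of the squared prediction errors, followed by its square root.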
Our new monitored streaming model code is shown here:

    /**
     * A streaming regression app that compares the performance of two models, printing out metrics for
     * each batch
     */
    object MonitoringStreamingModel {
      import org.apache.spark.SparkContext._

      def main(args: Array[String]) {

        val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
        val stream = ssc.socketTextStream("localhost", 9999)

        val NumFeatures = 100
        val zeroVector = DenseVector.zeros[Double](NumFeatures)
        val model1 = new StreamingLinearRegressionWithSGD()
          .setInitialWeights(Vectors.dense(zeroVector.data))
          .setNumIterations(1)
          .setStepSize(0.01)

        val model2 = new StreamingLinearRegressionWithSGD()
          .setInitialWeights(Vectors.dense(zeroVector.data))
          .setNumIterations(1)
          .setStepSize(1.0)

        // create a stream of labeled points
        val labeledStream = stream.map { event =>
          val split = event.split("\t")
          val y = split(0).toDouble
          val features = split(1).split(",").map(_.toDouble)
          LabeledPoint(label = y, features = Vectors.dense(features))
        }

Note that most of the preceding setup code is the same as in our simple streaming model example. However, we created two instances of `StreamingLinearRegressionWithSGD`: one with a learning rate of `0.01` and one with the learning rate set to `1.0`.

Next, we will train each model on our input stream, and using Spark Streaming's `transform` function, we will create a new DStream that contains the prediction errors for each model:

        // train both models on the same stream
        model1.trainOn(labeledStream)
        model2.trainOn(labeledStream)

        // use transform to create a stream of prediction errors for each model
        val predsAndTrue = labeledStream.transform { rdd =>
          val latest1 = model1.latestModel()
          val latest2 = model2.latestModel()
          rdd.map { point =>
            val pred1 = latest1.predict(point.features)
            val pred2 = latest2.predict(point.features)
            (pred1 - point.label, pred2 - point.label)
          }
        }

Finally, we will use `foreachRDD` to compute the MSE and RMSE metrics for each model and print them to the console:

        // print out the MSE and RMSE metrics for each model per batch
        predsAndTrue.foreachRDD { (rdd, time) =>
          val mse1 = rdd.map { case (err1, err2) => err1 * err1 }.mean()
          val rmse1 = math.sqrt(mse1)
          val mse2 = rdd.map { case (err1, err2) => err2 * err2 }.mean()
          val rmse2 = math.sqrt(mse2)
          println(
            s"""
               |-------------------------------------------
               |Time: $time
               |-------------------------------------------
             """.stripMargin)
          println(s"MSE current batch: Model 1: $mse1; Model 2: $mse2")
          println(s"RMSE current batch: Model 1: $rmse1; Model 2: $rmse2")
          println("...\n")
        }

        ssc.start()
        ssc.awaitTermination()

      }
    }

If you terminated the producer earlier, start it again by executing `sbt run` and selecting `StreamingModelProducer`. Once the producer is running again, in your second terminal window, execute `sbt run` and choose the main class for `MonitoringStreamingModel`.
You should see the streaming program start up, and after about 10 seconds, the first batch will be processed, printing output similar to the following:

    **...**
    **14/11/16 14:56:11 INFO SparkContext: Job finished: mean at StreamingModel.scala:159, took 0.09122 s**

    **-------------------------------------------**
    **Time: 1416142570000 ms**
    **-------------------------------------------**

    **MSE current batch: Model 1: 97.9475827857361; Model 2: 97.9475827857361**
    **RMSE current batch: Model 1: 9.896847113385965; Model 2: 9.896847113385965**
    **...**

Since both models start with the same initial weight vector, we see that they both make the same predictions on this first batch and, therefore, have the same error.

If we leave the streaming program running for a few minutes, we should eventually see that one of the models has started converging, leading to a lower and lower error, while the other model has begun to diverge due to its overly high learning rate:

    **...**
    **14/11/16 14:57:30 INFO SparkContext: Job finished: mean at StreamingModel.scala:159, took 0.069175 s**

    **-------------------------------------------**
    **Time: 1416142650000 ms**
    **-------------------------------------------**

    **MSE current batch: Model 1: 75.54543031658632; Model 2: 10318.213926882852**
    **RMSE current batch: Model 1: 8.691687426304878; Model 2: 101.57860959317593**
    **...**

If you leave the program running for a number of minutes, you should eventually see the first model's error rate getting quite small:

    **...**
    **14/11/16 17:27:00 INFO SparkContext: Job finished: mean at StreamingModel.scala:159, took 0.037856 s**

    **-------------------------------------------**
    **Time: 1416151620000 ms**
    **-------------------------------------------**

    **MSE current batch: Model 1: 6.551475362521364; Model 2: 1.057088005456417E26**
    **RMSE current batch: Model 1: 2.559584998104451; Model 2: 1.0281478519436867E13**
    **...**

### Tip

Note again that due to the random data generation, you might see different results, but the overall outcome should be the same--in the first batch, the models will have the same error, and subsequently, the first model's error should become smaller and smaller.

# Summary

In this chapter, we connected some of the dots between online machine learning and streaming data analysis. We introduced the Spark Streaming library and API for continuous processing of data streams based on familiar RDD functionality, and we worked through examples of streaming analytics applications that illustrate this functionality.

Finally, we used MLlib's streaming regression model in a streaming application that involves computing and comparing model performance on a stream of input feature vectors.
the 20 Newsgroups dataset + * document similarity, used with / Document similarity with the 20 Newsgroups dataset and TF-IDF features + * text classifier, training on / Training a text classifier on the 20 Newsgroups dataset using TF-IDF + * Word2Vec models, used on / Word2Vec on the 20 Newsgroups dataset + * natural language processing (NLP) + * about / Extracting the right features from your data + * naïve Bayes model / The naïve Bayes model, The naïve Bayes model + * nominal variables + * about / Categorical features + * nonword characters / Improving our tokenization + * normalization + * normalize a feature / Normalizing features + * normalize a feature vector / Normalizing features + * normalization, LFW dataset / Normalization + * normalization, MovieLens dataset / Normalization + * normalizing features + * about / Normalizing features + * MLlib, used for / Using MLlib for feature normalization + * numerical features / Numerical features + +## O + + * 1-of-k encoding + * about / Categorical features + * online learning / Batch versus real time + * about / Online learning + * online learning, with Spark Streaming + * about / Online learning with Spark Streaming + * streaming regression model / Streaming regression + * streaming regression program / A simple streaming regression program + * K-means, streaming / Streaming K-means + * online machine learning + * URL / Online learning + * online model evaluation + * about / Online model evaluation + * model performance, comparing with Spark Streaming / Comparing model performance with Spark Streaming + * optimization + * about / Linear models + * options, data transformation + * about / Processing and transforming your data + * ordinal variables + * about / Categorical features + * Oryx + * URL / Explicit matrix factorization + * over-fitting and under-fitting + * URL / Regularization + +## P + + * packages + * used, for feature extraction / Using packages for feature extraction + * parameters + * tuning / Improving model performance and tuning parameters, Improving model performance and tuning parameters + * tuning, for clustering models / Tuning parameters for clustering models + * parameter settings impact, for decision tree + * about / The impact of parameter settings for the decision tree + * tree depth / Tree depth + * maximum bins / Maximum bins + * parameter settings impact, for linear models + * about / The impact of parameter settings for linear models + * iterations / Iterations + * step size / Step size + * L2 regularization / L2 regularization + * L1 regularization / L1 regularization + * intercept, using / Intercept + * PCA / Principal Components Analysis + * running, on LFW dataset / Running PCA on the LFW dataset + * and SVD, relationship between / The relationship between PCA and SVD + * performance, classification models + * evaluating / Evaluating the performance of classification models + * accuracy, calculating / Accuracy and prediction error + * prediction error / Accuracy and prediction error + * precision / Precision and recall + * recall / Precision and recall + * ROC curve / ROC curve and AUC + * AUC / ROC curve and AUC + * performance, clustering models + * evaluating / Evaluating the performance of clustering models + * internal evaluation metrics / Internal evaluation metrics + * external evaluation metrics / External evaluation metrics + * performance metrics, computing on MovieLens dataset / Computing performance metrics on the MovieLens dataset + * performance, recommendation models + * evaluating / 
Evaluating the performance of recommendation models + * Mean Squared Error (MSE) / Mean Squared Error + * Mean average precision at K (MAPK) / Mean average precision at K + * built-in evaluation functions, using / Using MLlib's built-in evaluation functions + * performance, regression models + * evaluating / Evaluating the performance of regression models + * MSE / Mean Squared Error and Root Mean Squared Error + * RMSE / Mean Squared Error and Root Mean Squared Error + * MAE / Mean Absolute Error + * Root Mean Squared Log Error / Root Mean Squared Log Error + * R-squared coefficient / The R-squared coefficient + * performance metrics, computing on bike sharing dataset / Computing performance metrics on the bike sharing dataset + * performance metrics + * computing, on bike sharing dataset / Computing performance metrics on the bike sharing dataset + * linear model / Linear model + * decision tree / Decision tree + * computing, on MovieLens dataset / Computing performance metrics on the MovieLens dataset + * personalization / Personalization + * precision, classification models / Precision and recall + * precision-recall (PR) curve / Precision and recall + * Prediction.io + * URL / Explicit matrix factorization + * prediction error, classification models / Accuracy and prediction error + * predictions + * generating, for Kaggle/StumbleUpon evergreen / Generating predictions for the Kaggle/StumbleUpon evergreen classification dataset + * generating, for Kaggle/StumbleUpon evergreen classification dataset / Generating predictions for the Kaggle/StumbleUpon evergreen classification dataset + * making, clustering model used / Making predictions using a clustering model + * predictive modeling + * about / Predictive modeling and analytics + * producer application / The producer application + * pylab + * about / Exploring and visualizing your data + * Python + * Spark program, writing in / The first step to a Spark program in Python + +## R + + * R-squared coefficient / The R-squared coefficient + * rating dataset + * exploring / Exploring the rating dataset + * RDD caching + * URL / Caching RDDs + * RDDs + * about / Resilient Distributed Datasets + * creating / Creating RDDs + * Spark operations / Spark operations + * caching / Caching RDDs + * Readme.txt file + * about / Extracting features from the bike sharing dataset + * variables / Extracting features from the bike sharing dataset + * recall, classification models / Precision and recall + * receiver operating characteristic (ROC) / Evaluating the performance of classification models + * recommendation model + * training / Training the recommendation model, Training the recommendation model + * model, training on MovieLens 100k dataset / Training a model on the MovieLens 100k dataset + * using / Using the recommendation model + * user recommendations / User recommendations + * item recommendations / Item recommendations + * recommendation models + * about / Types of recommendation models + * types / Types of recommendation models + * content-based filtering / Content-based filtering + * collaborative filtering / Collaborative filtering + * recommendations / Personalization + * inspecting / Inspecting the recommendations + * red, blue, and green (RGB) / Extracting facial images as vectors + * regression model + * about / Predictive modeling and analytics + * regression models + * types / Types of regression models + * Least Squares Regression / Least squares regression + * decision trees, for regression / Decision trees for regression + * 
training / Training and using regression models + * using / Training and using regression models + * training, on bike sharing dataset / Training a regression model on the bike sharing dataset + * regularization forms + * SimpleUpdater / Regularization + * SquaredL2Updater / Regularization + * L1Updater / Regularization + * REPL (Read-Eval-Print-Loop) + * about / The Spark shell + * reshaping / Extracting facial images as vectors + * RMSE + * about / Mean Squared Error +/ RMSE and MSE, Mean Squared Error and Root Mean Squared Error + * ROC curve + * URL / ROC curve and AUC + * ROC curve, classification models / ROC curve and AUC + * root mean-squared error (RMSE) / Comparing model performance with Spark Streaming + * Root Mean Squared Log Error / Root Mean Squared Log Error + +## S + + * Scala + * Spark program, writing in / The first step to a Spark program in Scala + * Scala Build Tool (sbt) / The first step to a Spark program in Scala + * similar items + * inspecting / Inspecting the similar items + * singular values + * about / Singular Value Decomposition + * skip-gram model + * about / Word2Vec models + * Spark + * installing / Installing and setting up Spark locally + * setting up / Installing and setting up Spark locally + * running, on Amazon EC2 / Getting Spark running on Amazon EC2 + * Spark clusters + * about / Spark clusters + * URL / Spark clusters + * SparkConf / SparkContext and SparkConf + * SparkContext / SparkContext and SparkConf + * Spark documentation + * URL / Linear models, Decision trees for regression, General transformations + * Spark documentation, for EC2 + * URL / Getting Spark running on Amazon EC2 + * Spark operations / Spark operations + * Spark program + * in Scala / The first step to a Spark program in Scala + * in Java / The first step to a Spark program in Java + * in Python / The first step to a Spark program in Python + * Spark programming guide + * URL / The first step to a Spark program in Python + * Spark Programming Guide + * URL / Broadcast variables and accumulators + * Spark programming model + * about / The Spark programming model + * SparkContext / SparkContext and SparkConf + * SparkConf / SparkContext and SparkConf + * Spark shell / The Spark shell + * RDDs / Resilient Distributed Datasets + * broadcast variable / Broadcast variables and accumulators + * accumulators / Broadcast variables and accumulators + * Spark project documentation website + * URL / Installing and setting up Spark locally + * Spark project website + * URL / Installing and setting up Spark locally + * Spark Quick Start + * URL / The Spark programming model + * Spark shell / The Spark shell + * Spark Streaming + * about / Batch versus real time, An introduction to Spark Streaming + * input sources / Input sources + * transformations / Transformations + * actions / Actions + * window operators / Window operators + * model performance, comparing with / Comparing model performance with Spark Streaming + * Spark Streaming application + * creating / Creating a Spark Streaming application + * producer application / The producer application + * basic streaming application, creating / Creating a basic streaming application + * analytics, streaming / Streaming analytics + * stateful streaming / Stateful streaming + * stateful streaming / Stateful streaming + * stemming + * about / A note about stemming + * URL / A note about stemming + * stochastic gradient descent + * about / Online learning + * Stochastic Gradient Descent (SGD) / Linear models + * stop words + * removing / 
Removing stop words + * streaming data producer + * creating / Creating a streaming data producer + * streaming regression model / Streaming regression + * trainOn method / Streaming regression + * predictOn method / Streaming regression + * creating / Creating a streaming regression model + * streaming regression program + * about / A simple streaming regression program + * streaming data producer, creating / Creating a streaming data producer + * streaming regression model, creating / Creating a streaming regression model + * Stream processing + * about / Stream processing + * Spark Streaming / An introduction to Spark Streaming + * caching, with Spark Streaming / Caching and fault tolerance with Spark Streaming + * fault tolerance, with Spark Streaming / Caching and fault tolerance with Spark Streaming + * supervised learning + * about / Types of machine learning models + * Support Vector Machine (SVM) + * about / Linear models + * SVD + * about / Singular Value Decomposition + * and PCA, relationship between / The relationship between PCA and SVD + +## T + + * targeted marketing + * about / Targeted marketing and customer segmentation + * target variable + * transforming / Transforming the target variable + * training on log-transformed targets, impact / Impact of training on log-transformed targets + * term frequency + * about / Term weighting schemes + * term frequency-inverse document frequency (TF-IDF) + * about / Term weighting schemes + * terms based on frequency + * excluding / Excluding terms based on frequency + * term weighting schemes / Term weighting schemes + * testing loop / Model training and testing loop + * testing set + * creating, to evaluate parameters / Creating training and testing sets to evaluate parameters + * text classifier + * training, on 20 Newsgroups dataset / Training a text classifier on the 20 Newsgroups dataset using TF-IDF + * text data + * about / What's so special about text data? 
+ * text features + * about / Text features + * extraction / Simple text feature extraction + * text processing impact + * evaluating / Evaluating the impact of text processing + * raw features, comparing / Comparing raw features with processed TF-IDF features on the 20 Newsgroups dataset + * TF-IDF + * used, for training text classifier / Training a text classifier on the 20 Newsgroups dataset using TF-IDF + * TF-IDF features + * extracting, from 20 Newsgroups dataset / Extracting the TF-IDF features from the 20 Newsgroups dataset + * document similarity, used with / Document similarity with the 20 Newsgroups dataset and TF-IDF features + * TF-IDF model + * training / Training a TF-IDF model + * using / Using a TF-IDF model + * document similarity, with 20 Newsgroups dataset / Document similarity with the 20 Newsgroups dataset and TF-IDF features + * document similarity, with TF-IDF features / Document similarity with the 20 Newsgroups dataset and TF-IDF features + * text classifier, training on 20 Newsgroups dataset / Training a text classifier on the 20 Newsgroups dataset using TF-IDF + * TF-IDF weightings + * analyzing / Analyzing the TF-IDF weightings + * timestamps + * transforming, into categorical features / Transforming timestamps into categorical features + * tokenization + * applying / Applying basic tokenization + * improving / Improving our tokenization + * training + * about / Linear models + * training set + * creating, to evaluate parameters / Creating training and testing sets to evaluate parameters + * transformations + * about / Transformations + * state, tracking / Keeping track of state + * general transformations / General transformations + * true positive rate (TPR) / ROC curve and AUC + +## U + + * UCI Machine Learning Repository + * about / Accessing publicly available datasets + * URL / Accessing publicly available datasets + * unsupervised learning + * about / Types of machine learning models + * user dataset + * exploring / Exploring the user dataset + * user recommendations + * about / User recommendations + * movie recommendations, generating / Generating movie recommendations from the MovieLens 100k dataset + +## V + + * variance + * about / Decision trees for regression + * variants, K-means clustering / Variants + * vector + * about / Extracting useful features from your data + * vector space model + * about / Term weighting schemes + +## W + + * whitespace tokenization + * URL / Applying basic tokenization + * window + * about / Window operators + * windowing + * about / Window operators + * window operators / Window operators + * within cluster sum of squared errors (WCSS) / K-means clustering + * Word2Vec models + * about / Word2Vec models + * on 20 Newsgroups dataset / Word2Vec on the 20 Newsgroups dataset + * word stem / A note about stemming + + diff --git a/kag/examples/csqa/builder/data/mastering_vba_for_microsoft_office.txt b/kag/examples/csqa/builder/data/mastering_vba_for_microsoft_office.txt new file mode 100644 index 00000000..088069a6 --- /dev/null +++ b/kag/examples/csqa/builder/data/mastering_vba_for_microsoft_office.txt @@ -0,0 +1,27264 @@ +Mastering VBA for Microsoft Office 2013 + +Table of Contents + +Acknowledgments + +About the Author + +Introduction + +Part 1: Recording Macros and Getting Started with VBA + +Chapter 1: Recording and Running Macros in the Office Applications + +What Is VBA and What Can You Do with It? 
+ +Understanding Macro Basics + +Recording a Macro + +Running a Macro + +Recording a Sample Word Macro + +Recording a Sample Excel Macro + +Specifying How to Trigger an Existing Macro + +Deleting a Macro + +The Bottom Line + +Chapter 2: Getting Started with the Visual Basic Editor + +Opening the Visual Basic Editor + +Using the Visual Basic Editor's Main Windows + +Setting Properties for a Project + +Customizing the Visual Basic Editor + +The Bottom Line + +Chapter 3: Editing Recorded Macros + +Testing a Macro in the Visual Basic Editor + +Editing the Word Macro + +Editing the Excel Macro + +Editing a PowerPoint Macro + +The Bottom Line + +Chapter 4: Creating Code from Scratch in the Visual Basic Editor + +Setting Up the Visual Basic Editor for Creating the Procedures + +Creating a Procedure for Word + +Creating a Procedure for Excel + +Creating a Procedure for PowerPoint + +Creating a Procedure for Access + +The Bottom Line + +Part 2: Learning How to Work with VBA + +Chapter 5: Understanding the Essentials of VBA Syntax + +Getting Ready + +Procedures + +Statements + +Keywords + +Expressions + +Operators + +Variables + +Constants + +Arguments + +Objects + +Collections + +Properties + +Methods + +Events + +The Bottom Line + +Chapter 6: Working with Variables, Constants, and Enumerations + +Working with Variables + +Working with Constants + +Working with Enumerations + +The Bottom Line + +Chapter 7: Using Array Variables + +What Is an Array? + +Declaring an Array + +Storing Values in an Array + +Multidimensional Arrays + +Declaring a Dynamic Array + +Redimensioning an Array + +Returning Information from an Array + +Erasing an Array + +Finding Out Whether a Variable Is an Array + +Finding the Bounds of an Array + +Sorting an Array + +Searching through an Array + +The Bottom Line + +Chapter 8: Finding the Objects, Methods, and Properties You Need + +What Is an Object? + +Working with Collections + +Finding the Objects You Need + +Using Object Variables to Represent Objects + +Team Programming and OOP + +The Bottom Line + +Part 3: Making Decisions and Using Loops and Functions + +Chapter 9: Using Built-in Functions + +What Is a Function? + +Using Functions + +Using Functions to Convert Data + +Using Functions to Manipulate Strings + +Using VBA's Mathematical Functions + +Using VBA's Date and Time Functions + +Using File-Management Functions + +The Bottom Line + +Chapter 10: Creating Your Own Functions + +Components of a Function + +Creating a Function + +Examples of Functions for Any VBA-Enabled Office Application + +Creating a Function for Word + +Creating a Function for Excel + +Creating a Function for PowerPoint + +Creating a Function for Access + +The Bottom Line + +Chapter 11: Making Decisions in Your Code + +How Do You Compare Things in VBA? + +Testing Multiple Conditions by Using Logical Operators + +_Select Case_ Blocks + +The Bottom Line + +Chapter 12: Using Loops to Repeat Actions + +When Should You Use a Loop? + +Understanding the Basics of Loops + +Using _For..._ Loops for Fixed Repetitions + +Using _Do..._ Loops for Variable Numbers of Repetitions + +_While... 
Wend_ Loops + +Nesting Loops + +Avoiding Infinite Loops + +The Bottom Line + +Part 4: Using Message Boxes, Input Boxes, and Dialog Boxes + +Chapter 13: Getting User Input with Message Boxes and Input Boxes + +Opening a Procedure to Work On + +Displaying Status-Bar Messages in Word and Excel + +Message Boxes + +Input Boxes + +Forms: When Message Boxes and Input Boxes Won't Suffice + +The Bottom Line + +Chapter 14: Creating Simple Custom Dialog Boxes + +When Should You Use a Custom Dialog Box? + +Creating a Custom Dialog Box + +Linking a Form to a Procedure + +Retrieving the User's Choices from a Dialog Box + +Examples of Connecting Forms to Procedures + +Using an Application's Built-in Dialog Boxes from VBA + +The Bottom Line + +Chapter 15: Creating Complex Forms + +Creating and Working with Complex Dialog Boxes + +Using Events to Control Forms + +The Bottom Line + +Part 5: Creating Effective Code + +Chapter 16: Building Modular Code and Using Classes + +Creating Modular Code + +Creating and Using Classes + +The Bottom Line + +Chapter 17: Debugging Your Code and Handling Errors + +Principles of Debugging + +The Different Types of Errors + +VBA's Debugging Tools + +Dealing with Infinite Loops + +Dealing with Runtime Errors + +Suppressing Alerts + +Handling User Interrupts in Word, Excel, and Project + +Documenting Your Code + +The Bottom Line + +Chapter 18: Building Well-Behaved Code + +What Is a Well-Behaved Procedure? + +Retaining or Restoring the User Environment + +Leaving the User in the Best Position to Continue Working + +Keeping the User Informed during the Procedure + +Making Sure a Procedure Is Running under Suitable Conditions + +Cleaning Up after a Procedure + +The Bottom Line + +Chapter 19: Securing Your Code with VBA's Security Features + +Understanding How VBA Implements Security + +Signing Your Macro Projects with Digital Signatures + +Choosing a Suitable Level of Security + +Locking Your Code + +The Bottom Line + +Part 6: Programming the Office Applications + +Chapter 20: Understanding the Word Object Model and Key Objects + +Examining the Word Object Model + +Working with the Documents Collection and the Document Object + +Working with the Selection Object + +Creating and Using Ranges + +Manipulating Options + +The Bottom Line + +Chapter 21: Working with Widely Used Objects in Word + +Using Find and Replace via VBA + +Working with Headers, Footers, and Page Numbers + +Working with Sections, Page Setup, Windows, and Views + +Working with Tables + +The Bottom Line + +Chapter 22: Understanding the Excel Object Model and Key Objects + +Getting an Overview of the Excel Object Model + +Understanding Excel's Creatable Objects + +Managing Workbooks + +Working with Worksheets + +Working with the Active Cell or Selection + +Working with Ranges + +Setting Options + +The Bottom Line + +Chapter 23: Working with Widely Used Objects in Excel + +Working with Charts + +Working with Windows + +Working with Find and Replace + +Adding Shapes + +The Bottom Line + +Chapter 24: Understanding the PowerPoint Object Model and Key Objects + +Getting an Overview of the PowerPoint Object Model + +Understanding PowerPoint's Creatable Objects + +Working with Presentations + +Working with Windows and Views + +Working with Slides + +Working with Masters + +The Bottom Line + +Chapter 25: Working with Shapes and Running Slide Shows + +Working with Shapes + +Working with Headers and Footers + +Setting Up and Running a Slide Show + +The Bottom Line + +Chapter 26: Understanding the Outlook 
Object Model and Key Objects + +Getting an Overview of the Outlook Object Model + +Working with the Application Object + +Understanding General Methods for Working with Outlook Objects + +Working with Messages + +Working with Calendar Items + +Working with Tasks and Task Requests + +Searching for Items + +The Bottom Line + +Chapter 27: Working with Events in Outlook + +Working with Application-Level Events + +Working with Item-Level Events + +Understanding Quick Steps + +The Bottom Line + +Chapter 28: Understanding the Access Object Model and Key Objects + +Getting Started with VBA in Access + +Getting an Overview of the Access Object Model + +Understanding Creatable Objects in Access + +Opening and Closing Databases + +Working with the _Screen_ Object + +Using the _DoCmd_ Object to Run Access Commands + +The Bottom Line + +Chapter 29: Manipulating the Data in an Access Database via VBA + +Understanding How to Proceed + +Preparing to Manage the Data in a Database + +Opening a Recordset + +Accessing a Particular Record in a Recordset + +Searching for a Record + +Returning the Fields in a Record + +Editing a Record + +Inserting and Deleting Records + +Closing a Recordset + +Saving a Recordset to the Cloud + +The Bottom Line + +Chapter 30: Accessing One Application from Another Application + +Understanding the Tools Used to Communicate between Applications + +Using Automation to Transfer Information + +Using the _Shell_ Function to Run an Application + +Using Data Objects to Store and Retrieve Information + +Communicating via DDE + +Communicating via _SendKeys_ + +Going beyond VBA + +The Bottom Line + +Chapter 31: Programming the Office 2013 Ribbon + +What Is XML? + +Hiding the Editing Group on the Word Ribbon + +Working with Excel and PowerPoint + +Undoing Ribbon Modifications + +Selecting the Scope of Your Ribbon Customization + +Adding a New Group + +Adding Callbacks + +Adding Attributes + +Using Menus and Lists + +Toggling with a Toggle-Button Control + +Modifying the Ribbon in Access + +Adding a Callback in Access + +What to Look For If Things Go Wrong + +Where to Go from Here + +The Bottom Line + +Appendix: The Bottom Line + +Chapter 1: Recording and Running Macros in the Office Applications + +Chapter 2: Getting Started with the Visual Basic Editor + +Chapter 3: Editing Recorded Macros + +Chapter 4: Creating Code from Scratch in the Visual Basic Editor + +Chapter 5: Understanding the Essentials of VBA Syntax + +Chapter 6: Working with Variables, Constants, and Enumerations + +Chapter 7: Using Array Variables + +Chapter 8: Finding the Objects, Methods, and Properties You Need + +Chapter 9: Using Built-in Functions + +Chapter 10: Creating Your Own Functions + +Chapter 11: Making Decisions in Your Code + +Chapter 12: Using Loops to Repeat Actions + +Chapter 13: Getting User Input with Message Boxes and Input Boxes + +Chapter 14: Creating Simple Custom Dialog Boxes + +Chapter 15: Creating Complex Forms + +Chapter 16: Building Modular Code and Using Classes + +Chapter 17: Debugging Your Code and Handling Errors + +Chapter 18: Building Well-Behaved Code + +Chapter 19: Securing Your Code with VBA's Security Features + +Chapter 20: Understanding the Word Object Model and Key Objects + +Chapter 21: Working with Widely Used Objects in Word + +Chapter 22: Understanding the Excel Object Model and Key Objects + +Chapter 23: Working with Widely Used Objects in Excel + +Chapter 24: Understanding the PowerPoint Object Model and Key Objects + +Chapter 25: Working with Shapes and Running Slide Shows + 
+Chapter 26: Understanding the Outlook Object Model and Key Objects + +Chapter 27: Working with Events in Outlook + +Chapter 28: Understanding the Access Object Model and Key Objects + +Chapter 29: Manipulating the Data in an Access Database via VBA + +Chapter 30: Accessing One Application from Another Application + +Chapter 31: Programming the Office 2013 Ribbon + +Acquisitions Editor: Mariann Barsolo + +Development Editor: David Clark + +Technical Editor: Russ Mullen + +Production Editor: Eric Charbonneau + +Copy Editor: Judy Flynn + +Editorial Manager: Pete Gaughan + +Production Manager: Tim Tate + +Vice President and Executive Group Publisher: Richard Swadley + +Vice President and Publisher: Neil Edde + +Book Designers: Maureen Forys and Judy Fung + +Proofreader: Candace Cunningham + +Indexer: Ted Laux + +Project Coordinator, Cover: Katherine Crocker + +Cover Designer: Ryan Sneed + +Cover Image: ©iStockphoto.com/pic4you + +Copyright © 2013 by John Wiley & Sons, Inc., Indianapolis, Indiana +Published simultaneously in Canada + +ISBN: 978-1-118-69512-8 +ISBN: 978-1-118-75022-3 (ebk.) +ISBN: 978-1-118-78630-7 (ebk.) + +No part of this publication may be reproduced, stored in a retrieval system or transmitted in any form or by any means, electronic, mechanical, photocopying, recording, scanning or otherwise, except as permitted under Sections 107 or 108 of the 1976 United States Copyright Act, without either the prior written permission of the Publisher, or authorization through payment of the appropriate per-copy fee to the Copyright Clearance Center, 222 Rosewood Drive, Danvers, MA 01923, (978) 750-8400, fax (978) 646-8600. Requests to the Publisher for permission should be addressed to the Permissions Department, John Wiley & Sons, Inc., 111 River Street, Hoboken, NJ 07030, (201) 748-6011, fax (201) 748-6008, or online at www.wiley.com/go/permissions. + +Limit of Liability/Disclaimer of Warranty: The publisher and the author make no representations or warranties with respect to the accuracy or completeness of the contents of this work and specifically disclaim all warranties, including without limitation warranties of fitness for a particular purpose. No warranty may be created or extended by sales or promotional materials. The advice and strategies contained herein may not be suitable for every situation. This work is sold with the understanding that the publisher is not engaged in rendering legal, accounting, or other professional services. If professional assistance is required, the services of a competent professional person should be sought. Neither the publisher nor the author shall be liable for damages arising herefrom. The fact that an organization or Web site is referred to in this work as a citation and/or a potential source of further information does not mean that the author or the publisher endorses the information the organization or Web site may provide or recommendations it may make. Further, readers should be aware that Internet Web sites listed in this work may have changed or disappeared between when this work was written and when it is read. + +For general information on our other products and services or to obtain technical support, please contact our Customer Care Department within the U.S. at (877) 762-2974, outside the U.S. at (317) 572-3993 or fax (317) 572-4002. + +Wiley publishes in a variety of print and electronic formats and by print-on-demand. 
Some material included with standard print versions of this book may not be included in e-books or in print-on-demand. If this book refers to media such as a CD or DVD that is not included in the version you purchased, you may download this material at . For more information about Wiley products, visit www.wiley.com. + +**Library of Congress Control Number:** 2013945361 + +TRADEMARKS: Wiley, the Wiley logo, and the Sybex logo are trademarks or registered trademarks of John Wiley & Sons, Inc. and/or its affiliates, in the United States and other countries, and may not be used without written permission. Microsoft is a registered trademark of Microsoft Corporation. All other trademarks are the property of their respective owners. John Wiley & Sons, Inc. is not associated with any product or vendor mentioned in this book. +Dear Reader, + +Thank you for choosing _Mastering VBA for Microsoft Office 2013_. This book is part of a family of premium-quality Sybex books, all of which are written by outstanding authors who combine practical experience with a gift for teaching. + +Sybex was founded in 1976. More than 30 years later, we're still committed to producing consistently exceptional books. With each of our titles, we're working hard to set a new standard for the industry. From the paper we print on to the authors we work with, our goal is to bring you the best books available. + +I hope you see all that reflected in these pages. I'd be very interested to hear your comments and get your feedback on how we're doing. Feel free to let me know what you think about this or any other Sybex book by sending me an email at nedde@wiley.com. If you think you've found a technical error in this book, please visit . Customer feedback is critical to our efforts at Sybex. + +I dedicate this book to my good friend + +Leroy Fincham. +Acknowledgments + +I'd like to thank all the good people at Sybex who contributed to this book. Mariann Barsolo's encouragement made this book possible in the first place, and Pete Gaughan provided thoughtful guidance while launching the project. I am also indebted to development editor David Clark, whose valuable suggestions contributed to this book's tone and organization. Technical editor Russ Mullen carefully checked the book for accuracy and ensured that all the code examples work without any errors. Finally, thanks to Eric Charbonneau, production editor, the book went smoothly through its final stages—author review, design, and assembly. My gratitude also goes to copyeditor Judy Flynn, who, via a very close read, polished this book in many ways; she is truly an exceptional copy editor. Candace Cunningham is also great at her job, and she flagged important issues during her proofreading. +About the Author + +_Mastering VBA for Microsoft Office 2013_ is **Richard Mansfield's** 45th book. His recent titles include _CSS Web Design for Dummies_ (Wiley), _Office Application Development All-in-One Desk Reference for Dummies_ (Wiley), _How to Do Everything with Second Life_ (McGraw-Hill), and _Programming: A Beginner's Guide_ (McGraw-Hill). Overall, his books have sold more than 500,000 copies worldwide and have been translated into 12 languages. +Introduction + +Visual Basic for Applications (VBA) is a powerful tool that enables you to automate tasks in Microsoft Office applications. + +Automating can save you and your colleagues considerable time and effort. 
Getting more work done in less time is usually good for your self-esteem, and it can do wonderful things for your job security and your career. + +# Where to Get This Book's Example Code + +Throughout this book you'll find many code (programming) examples. Rather than type in the code, you'll save yourself time (and typo-debugging headaches) if you just copy the code from this book's web page, then paste it into the Visual Basic Editor. You can find all the code from this book—accurate, fully tested, and bug-free—at this book's web page: + +www.sybex.com/go/masteringvbaoffice2013 + +# If You Have Questions + +I'm happy to hear from readers, so if you have any difficulty while using this book, write me at earth@triad.rr.com. + +I'll try to respond the same day. We've all been beginners at some point, so don't feel your question is silly. If you're embarrassed, sign your email _Connie_ and I'll think you're Connie. + +# What Can I Do with VBA? + +You can use VBA to automate almost any action that you can perform interactively (manually) with an Office 2013 application. For example, in Word, VBA can create a document, add text to it, format it, edit it, and save it. + +In Excel, you can automatically integrate data from multiple workbooks into a single workbook. PowerPoint's VBA can create a custom presentation, including the latest data drawn from a variety of sources with no human intervention. And in Access you can create new tables, populate them with data, and send the table up to the cloud. + +VBA performs actions faster, more accurately, more reliably, and far more cheaply than any human. You can specify conditions for making a decision, then let VBA make those decisions for you in the future. By adding decision-making structures and loops (repetitions) to your code, you can go far beyond the range of actions that any human user can perform and finish the job in less than a second. + +Beyond automating actions you would otherwise perform manually, VBA gives you the tools to create user interfaces for your code—message boxes, input boxes, and _user forms_ —windows containing graphical objects that you can use to create forms and custom dialog boxes to display to the user. + +Using VBA, you can also create custom applications that run within the host application. For example, you could build within PowerPoint a custom application that automatically creates presentations for you. + +VBA can also communicate between applications. For example, Word can't do much in the way of mathematical calculations on sets of data: that's Excel's specialty. So, you can make Word start Excel running, perform some calculations, and then put the results into a Word document. Similarly, you could send graphs from Excel to PowerPoint or Outlook. You get the picture. + +Because VBA provides a standard set of tools that differ only in the specializations of the host applications, once you've learned to use VBA in one application, you'll be able to apply that knowledge quickly to using VBA in another application. For example, you might start by learning VBA in order to manipulate Excel and then move on to using your VBA skills with Outlook. You'll need to learn the components particular to Outlook, because they're different from Excel's features, but you'll be up to speed rapidly. It's like shopping. Once you understand the basics, going to a hardware store differs from going to a bookstore only in the particulars. 
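+
+To make that concrete, here's a minimal sketch of the Word scenario just described—create a document, add text to it, format it, and save it. (This listing is only illustrative, not one of the book's examples; the text and the file path are placeholders to replace with your own.)
+
+ Sub CreateAndSaveDocument()
+     'Create a new document, add text, format it, and save it
+     Dim doc As Document
+     Set doc = Documents.Add
+     doc.Range.Text = "Quarterly summary" 'placeholder text
+     doc.Range.Font.Bold = True
+     doc.SaveAs2 FileName:="C:\Temp\Summary.docx" 'placeholder path
+ End Sub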
+ +As with any programming language, getting started with VBA involves a learning curve—but you'll be surprised how many tools VBA provides to help you quickly learn the fundamentals. + +The VBA Editor is among the best programming environments available. It includes help features that list programming options while you're typing, that instantly point out problems (and suggest solutions), that prevent you from making some kinds of mistakes, that offer context-sensitive help (with example programming), that even automatically complete your lines (sentences) of programming code. + +What's more, you can create some kinds of VBA programs without even writing a single line of code! You use the Macro Recorder tool built into Word and Excel—a great way to learn VBA more quickly. You turn on the Recorder and do what you want with Word or Excel manually via keyboard and mouse while the Recorder translates all your actions into programming code for you. Can't remember the programming code for saving a document? Just turn on the Recorder (click the icon on the lower left of Word's or Excel's status bar), save a document, then you've got the code it recorded: + + ActiveDocument.Save + +Another truly cool thing about VBA: Its words—most of the programming commands that make the language do what you want—are English words. Unlike less efficient programming languages, Basic strives to be human-friendly, understandable, readable. The programming code that saves Word's current document is ActiveDocument.Save. For Excel, you use ActiveWorkbook.Save. + +For fun, search "save a document in c++" in Google, and you'll find lots of puzzling explanations attempting to accomplish this straightforward task in unfortunately unstraightforward ways, using often-puzzling diction. If you've tried programming in other languages, you'll find the simplicity and plain English of VBA a great relief. It's easy to learn, easy to use, yet no less powerful than any other programming language. + +This book uses the Macro Recorder as the jumping-off point for you to start creating code. You first explore how to record macros (small programs) and then learn to edit this recorded code to make it do other things. After that easy introduction, you go on to explore the essentials of VBA diction and syntax. The book concludes with ambitious topics. + +Word, because it's the most popular Office application and because it has the most sophisticated and efficient programming tools, is used for many of the examples in this book. But there are plenty of examples showing how to program Excel, PowerPoint, Outlook, and even Access. And code that works in one Office 2013 application will generally work with other applications in the suite—with little or sometimes no modification. + +# What's in This Book? + +This book teaches you how to use VBA to automate your work in Office 2013 applications. For its general examples, the book focuses on Word, Excel, Outlook, and PowerPoint, because those are the Microsoft Office applications that you're most likely to have, and because they have less eccentric programming tools and strategies than Access. The last part of the book continues the discussion of how to program these four applications, but also increases coverage of Access. + +Part 1 of the book, "Recording Macros and Getting Started with VBA," comprises the following chapters: + + * Chapter 1 shows you how to record a macro using the Macro Recorder in Word and Excel. You also learn several ways to run macros and how to delete them. 
+ * Chapter 2 introduces you to the powerful VBA Editor, the application in which you create VBA code (either by editing recorded code or by writing code from scratch) and user forms. The second half of this chapter discusses how you can customize the Visual Basic Editor so that you can work in it more efficiently. + * Chapter 3 shows you how to edit recorded macros, using the macros you recorded in Chapter 1. You learn how to step through and test a macro in the Visual Basic Editor. + * Chapter 4 teaches you how to start writing code from scratch in the Visual Basic Editor. You create a procedure (a small program called a macro) for Word, one for Excel, and a third for PowerPoint. + +Part 2, "Learning How to Work with VBA," contains the following chapters: + + * Chapter 5 explains the essentials of VBA syntax, giving you a brief overview of the concepts you need to know. You also practice creating statements in the Visual Basic Editor. + * Chapter 6 shows you how to work with variables and constants, which are used to store information for your procedures to work on. + * Chapter 7 discusses how to use arrays. Arrays are like super-variables that can store multiple pieces of information at the same time. + * Chapter 8 teaches you how to find the objects you need to create your procedures. You learn how to correctly write code involving objects by employing the Macro Recorder, the Object Browser, and the Help system. And you see how to use object variables to represent objects. Finally, you explore the uses of object models. + +Part 3, "Making Decisions and Using Loops and Functions," consists of the following chapters: + + * Chapter 9 describes how to use VBA's built-in functions—everything from string-conversion functions through mathematical and date functions to file-management functions. + * Chapter 10 shows you how to create functions of your own to supplement the built-in libraries of functions. You create functions that work in any VBA-enabled application, together with application-specific functions for Word, Excel, and PowerPoint. + * Chapter 11 shows you how to use conditional statements (such as If statements) to make decisions in your code. Conditional statements are key to making your code flexible and intelligent. + * Chapter 12 covers how you can use loops to repeat actions in your procedures: fixed-iteration loops for fixed numbers of repetitions, and indefinite loops that repeat until they satisfy a condition you specify. You also learn how to avoid creating infinite loops, which can cause your code to run either forever or until your computer crashes. + +Part 4, "Using Message Boxes, Input Boxes, and Dialog Boxes," has the following chapters: + + * Chapter 13 shows you how to use message boxes to communicate with the users of your procedures and let them make simple decisions about how the procedures run. You also explore input boxes, which are dialog boxes that give the users a way to supply information the procedures need. + * Chapter 14 discusses how to employ VBA's user forms to create custom dialog boxes that enable the users to supply information, make choices, and otherwise interact with your macros. + * Chapter 15 discusses how to build more-complex dialog boxes. These include dynamic dialog boxes that update themselves when the user clicks a button, dialog boxes with hidden zones that the user can reveal to access infrequently used options, dialog boxes with multiple pages of information, and dialog boxes with controls that respond to actions the user takes. 
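+
+As a small taste of what Part 4 covers: Chapter 13's message boxes and input boxes revolve around just two built-in functions, MsgBox and InputBox. Here is a minimal sketch (the prompt and greeting are arbitrary placeholders, not code from the chapter):
+
+ Sub GreetUser()
+     'Ask the user for input, then report back in a message box
+     Dim userName As String
+     userName = InputBox("What's your name?")
+     If userName <> "" Then 'InputBox returns "" if the user cancels
+         MsgBox "Hello, " & userName & "!", vbInformation
+     End If
+ End Sub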
+ +Part 5, "Creating Effective Code," contains the following chapters: + + * Chapter 16 illustrates the benefits of reusable modular code rather than single-purpose procedures and then shows you how to write this reusable code. + * Chapter 17 explains the principles of debugging VBA code, examines the different kinds of errors that occur, and discusses how to deal with them. + * Chapter 18 explores how to build well-behaved code that's stable enough to withstand being run under the wrong circumstances and civilized enough to leave the user in the best possible state to continue their work after it finishes running. + * Chapter 19 discusses the security mechanisms that Windows and VBA provide for safeguarding VBA code and ensuring that you or your users do not run malevolent code (viruses, trojans, worms, and so on). The chapter discusses digital certificates and digital signatures, how to choose an appropriate security setting for the application you're using, and how to manage passwords. + +Part 6, "Programming the Office Applications," consists of these 12 chapters: + + * Chapter 20 explains the Word object model and shows you how to work with key objects in Word, including the Document object, the Selection object, and Range objects. You also learn how to set options in Word and manage cloud storage via such systems as Dropbox or Microsoft's SkyDrive. + * Chapter 21 discusses how to work with widely used objects in Word, including the objects for Find and Replace; headers, footers, and page numbers; sections, page setup, windows, and views; and tables. + * Chapter 22 introduces you to the Excel object model and shows you how to work with key objects in Excel, including the Workbook object, the Worksheet object, the ActiveCell object, and Range objects. You also learn how to set options in Excel. + * Chapter 23 shows you how to work with charts, windows, and Find and Replace in Excel via VBA. + * Chapter 24 gets you started working with the PowerPoint object model and the key objects that it contains. You work with Presentation objects, Window objects, Slide objects, and Master objects. + * Chapter 25 teaches you how to go further with VBA in PowerPoint by working with shapes, headers and footers, and the VBA objects that enable you to set up and run a slide show automatically. + * Chapter 26 introduces you to Outlook's object model and the key objects that it contains. You meet Outlook's creatable objects and main interface items; learn general methods for working with Outlook objects; and work with messages, calendar items, tasks and task requests, and searches. + * Chapter 27 shows you how to work with events in Outlook. There are two types of events, application-level events and item-level events, which you can program to respond to both Outlook actions (such as new mail arriving) and user actions (such as creating a new contact). + * Chapter 28 familiarizes you with the Access object model and demonstrates how to perform key tasks with some of its main objects. + * Chapter 29 shows you how to manipulate the data in an Access database via VBA. + * Chapter 30 shows you how to communicate between applications via VBA. You learn which tools are available, how to use Automation, how to work with the Shell function, and how to use data objects, DDE, and SendKeys. + * Chapter 31 explores the various ways you can customize the Ribbon programmatically. It's not possible to customize it by VBA code alone. 
Instead, you must write XML code to modify what the user sees on the Ribbon and write _callbacks_ (event-handler procedures in VBA) to respond when the user clicks one of the buttons or other controls you've added to the Ribbon. You see how to modify tabs, groups, and individual controls—in Word, PowerPoint, Excel, and, using different techniques, in Access. + +# How Should I Use This Book? + +This book tries to present material in a sensible and logical way. To avoid repeating information unnecessarily, the chapters build on each other, so the later chapters generally assume that you've read the earlier chapters. + +The first five parts of the book offer a variety of code samples using Word, Excel, PowerPoint, and, to a lesser extent, Access. If you have these applications (or some of them), work through these examples as far as possible to get the most benefit from them. While you may be able to apply some of the examples directly to your work, mostly you'll find them illustrative of general VBA techniques and principles, and you'll need to customize them to suit your own needs. + +The sixth and last part of this book shows you some more-advanced techniques that are useful when using VBA to program Word, Excel, PowerPoint, Outlook, and Access. Work through the chapters that cover the application or applications that you want to program with VBA. + +Chapters 30 and 31 are specialized, but quite useful. Chapter 30 shows you how to use one application to control another application; for example, you might use Word to contact Excel and exploit its special mathematical or graphing capabilities. And Chapter 31 shows you many different ways to program the Ribbon—the primary user interface in Office 2013 applications. + +# Is This Book Suitable for Me? + +Yes. + +This book is for anyone who wants to learn to use VBA to automate their work in a VBA-enabled application. Automating your work could involve anything from creating a few simple procedures that would enable you to perform some complex and tedious operations via a single keystroke to building a custom application with a complete interface that looks nothing like the host application's regular interface. + +This book attempts to present theoretical material in as practical a context as possible by including lots of examples of the theory in action. For example, when you learn about loops, you execute short procedures that illustrate the use of each kind of loop so that you can see how and why they work and when to use them. And you'll also find many step-throughs—numbered lists that take you through a task, one step at a time. Above all, I've tried to make this book clear and understandable, even to readers who've never written any programming in their life. + +# Conventions Used in This Book + +This book uses several conventions to convey information succinctly: + + * ⇒ designates choosing a command from a menu. For example, "choose File ⇒ Open" means that you should pull down the File menu and choose the Open command from it. + * + signs indicate key combinations. For example, "press Ctrl+Shift+F9" means that you should simultaneously hold down the Ctrl, Shift, and F9 keys. Also, you'll sometimes see this: Press Ctrl+F, I. That means simultaneously press Ctrl and F, then release them and press I. + * Some of these key combinations can be confusing at first (for example, "Ctrl++" means that you hold down Ctrl and press the + key—in other words, hold down Ctrl and Shift together and press the = key, because the + key is the shifted =.). 
 * Likewise, "Shift+click" means that you should hold down the Shift key as you click with the mouse, and "Ctrl+click" means that you should hold down the Ctrl key as you click.
 * ↑→↓← represent the arrow keys on your keyboard. These arrows are also represented in the text as "up-arrow," "down-arrow," etc. The important thing to note is that ← does not mean the Backspace key (which on many keyboards bears a similar arrow). The Backspace key is indicated simply by the words "Backspace" or "the Backspace key."
 * **Boldface** indicates that you are to type something.
 * Program font indicates program items, or text derived from program lines. Complete program lines appear offset in separate paragraphs like the example below, while shorter expressions appear as part of the main text.

    Sub Sample_Listing()
    'lines of program code look like this.
    End Sub

 * _Italics_ usually indicate either new terms being introduced or variable information (such as a drive letter that will vary from computer to computer and for which you'll need to substitute your own).
 * _ (a continuation underline character) indicates that a single line of code has been broken onto a second or subsequent line in the book (because of the limitations of page size). In the VBA Editor, you should enter these "broken" lines of code as a single line. For example, in this code sample, a single line of VBA Editor code has been broken into three lines when printed in this book:

    MsgBox System.PrivateProfileString("", _
    "HKEY_CURRENT_USER\Software\Microsoft\ _
    Office\15.0\Common\AutoCorrect", "Path")

 * You'll also see sidebars throughout the book. These include asides, notes, tips, and warnings. They're a bit like footnotes, though less tedious. Each sidebar, no matter how small, has a headline—so you can quickly see whether you want to read it.
 * Finally, each chapter includes one longer _Real World Scenario_ sidebar: a case study, an important practical technique, or some other useful advice.

# The Mastering Series

The Mastering series from Sybex provides outstanding instruction for readers with intermediate and advanced skills, in the form of top-notch training and development for those already working in their field and clear, serious education for those aspiring to become pros. Every Mastering book includes the following:

 * Real World Scenarios, ranging from case studies to interviews, that show how the tool, technique, or knowledge presented is applied in actual practice
 * Skill-based instruction with chapters organized around real tasks rather than abstract concepts or subjects
 * Self-review test questions so you can be certain you're equipped to do the job right

# For More Information

Sybex strives to keep you supplied with the latest tools and information you need for your work. Please check the website at www.sybex.com/go/masteringvbaoffice2013, where we'll post additional content and updates that supplement this book if the need arises.
Part 1

Recording Macros and Getting Started with VBA

 * **Chapter 1: Recording and Running Macros in the Office Applications**
 * **Chapter 2: Getting Started with the Visual Basic Editor**
 * **Chapter 3: Editing Recorded Macros**
 * **Chapter 4: Creating Code from Scratch in the Visual Basic Editor**

Chapter 1

Recording and Running Macros in the Office Applications

In this chapter, you'll learn the easiest way to get started with Visual Basic for Applications (VBA): recording simple _macros_ using the Macro Recorder that is built into the Office applications. Then you'll see how to run your macros to perform useful tasks.

I'll define the term _macro_ in a moment. For now, just note that by recording macros, you can automate straightforward but tediously repetitive tasks and speed up your regular work. You can also use the Macro Recorder to create VBA code that performs the actions you need and then edit the code to customize it—adding flexibility and power. In fact, VBA is a real powerhouse if you know how to use it. This book shows you how to tap into that power.

In this chapter you will learn to do the following:

 * Record a macro
 * Assign a macro to a button or keyboard shortcut
 * Run a macro
 * Delete a macro

# What Is VBA and What Can You Do with It?

Visual Basic for Applications is a programming language created by Microsoft that can be built into applications. You use VBA to automate operations in applications that support it. All the main Office applications—Word, Excel, Outlook, Access, and PowerPoint—include VBA, so you can automate operations in most Office applications.

And please don't be put off by the notion that you'll be _programming_: As you'll see shortly, working with VBA is nearly always quite easy. In fact, quite often you need not actually write any VBA yourself; you can merely _record_ it—letting the Office application write all the VBA "code." The phrase _automate operations in applications_ is perhaps a bit abstract. VBA allows you to streamline many tasks, avoid burdensome repetition, and improve your efficiency. Here are some examples:

 * You can record a macro that automatically carries out a series of actions that you frequently perform. Let's say that you often edit Word documents written by a co-worker, but she sets the zoom level to 100. You prefer a zoom level of 150. All you need to fix this automatically is this line of VBA code:

    ActiveWindow.ActivePane.View.Zoom.Percentage = 150

And don't worry: you need not even know programming terms like ActiveWindow or View.Zoom. When you turn on the Macro Recorder and then perform these actions (clicking View, then clicking Zoom, then setting the percentage), all your actions are translated into the necessary VBA code. You write no code at all.

 * You can write code that performs actions a certain number of times and that makes decisions depending on the situation in which it is running. For example, you could write code that takes a series of actions on every presentation that's open in PowerPoint.
 * You can have your macros interact with the user by displaying _forms_, or custom dialog boxes, that enable the user to make choices and specify settings while the macro is running. For example, you might display a set of formatting options—showing captioned controls such as check boxes and option buttons—that the user can select. Then when the user closes the dialog box, your macro takes appropriate actions based on the user's input.
 * You can take actions via VBA that you can't take (or take easily) by directly manipulating the user interface. For example, when you're working interactively in most applications, you're limited to working with the active file—the active document in Word, the active workbook in Excel, and so on. By using VBA, you can manipulate files that aren't active.
 * You can make one application manipulate another application. For example, you can make Word place a table from a Word document into an Excel worksheet.

## The Difference between Visual Basic and Visual Basic for Applications

VBA is based on Visual Basic, a programming language derived from BASIC. _BASIC_ stands for Beginner's All-Purpose Symbolic Instruction Code. BASIC is designed to be user-friendly because it employs recognizable English words (or variations on them) rather than the abstruse and incomprehensible programming terms found in languages like COBOL. In addition to its English-like diction, BASIC's designers endeavored to keep its punctuation and syntax as simple and familiar as possible.

Visual Basic is _visual_ in that it offers efficient shortcuts such as drag-and-drop programming techniques and many graphical elements.

Visual Basic for Applications is a version of Visual Basic tailored to the Microsoft Office applications. The set of _objects_ (features and behaviors) available in each application differs because no two applications share the same features and commands.

For example, some VBA objects available in Word are not available in Excel (and vice versa) because some of Word's features, like the Table of Contents generator, are not appropriate in Excel.

However, the large set of primary commands, the fundamental structure, and the core programming techniques of VBA in Word and VBA in Excel are the same. So you'll find that it's often quite easy to translate your knowledge of VBA in Word to VBA in Excel (or indeed in any VBA-enabled application).

For example, you'd use the Save method (a _method_ is essentially an action that can be carried out) to save a file in Excel VBA, Word VBA, or PowerPoint VBA. What differs is the _object_ involved. In Excel VBA, the command would be ActiveWorkbook.Save, whereas in Word VBA it would be ActiveDocument.Save and in PowerPoint it would be ActivePresentation.Save.

VBA always works with a host application (such as Access or Word). With the exception of some stand-alone programs that are usually best created with Visual Studio Tools for Office, a host application always needs to be open for VBA to run. This means that you can't build stand-alone applications with VBA the way you can with Visual Basic .NET or Visual Studio Tools for Office (VSTO). If you wish, you can _hide_ the host application from the user so that all they see is the interface (typically user forms) that you give to your VBA procedures. By doing this, you can create the illusion of a stand-alone application. Whether you need to employ this technique will depend on the type of programming you do.

* * *

What Are Visual Basic .NET and Visual Basic Express?

Visual Basic .NET (VB .NET) is just one version in Microsoft's long history of BASIC language implementations. VB .NET contains a vast set of libraries of prewritten code that allow you to do pretty much anything that Windows is capable of. Although VB .NET is generally employed to write stand-alone applications, you can tap into its libraries from within a VBA macro.
Just remember, each Office application has its own object library, but the .NET libraries themselves contain many additional capabilities (often to manipulate the Windows operating system). So, if you need a capability that you can't find within VBA or an Office application's object library, the resources of the entire .NET library are also available to you. Visual Basic Express is a free version of VB .NET. After you've worked with VBA in this book, you might want to explore VB .NET at

www.microsoft.com/visualstudio/eng/products/visual-studio-express-products

You'll find versions for both traditional desktop Windows as well as Windows 8.

* * *

# Understanding Macro Basics

A _macro_ is a sequence of commands you or a user can repeat at will. That's exactly the definition of a _computer program_. Macros, however, are generally short programs—dedicated to a single task. Think of it like this: A normal computer program, such as Photoshop or Internet Explorer (IE), has many capabilities. IE can prevent pop-up ads, block websites, display full-screen when you press F11, and so on. A macro is smaller, dedicated to accomplishing just one of these tasks, such as displaying full-screen.

In some applications, you can set a macro to run itself automatically. For instance, you might create a macro in Word to automate basic formatting tasks on a type of document you regularly receive incorrectly formatted. As you'll see in Chapter 6, "Working with Variables, Constants, and Enumerations," in a discussion of the AutoExec feature, you can specify that a macro run automatically upon opening a document of that type.

A macro is a type of _subroutine_ (sometimes also called a _subprocedure_). Generally, people tend to use the shorter, more informal terms _sub_, _procedure_, and _routine_. In the Visual Basic Editor, each of your macros starts with the word _Sub_. Note that a macro is a single procedure, whereas a computer program like IE is a collection of many procedures.

A macro used to be defined as recorded code rather than written code, but most people today use the word in its wider sense, so it can include written code as well. For example, if you record a macro and then edit it to make it more efficient, or to add commands to make it take further actions, most people still consider it a macro.

In an Office application that supports the VBA Macro Recorder (such as Word or Excel), you can create macros in two ways:

 * Turn on the Macro Recorder and just perform the sequence of actions you want the macro to perform. Clicks, typing, dragging, dropping—whatever you do is recorded.
 * Open the Visual Basic Editor and type the VBA commands into it.

There's also a useful hybrid approach that combines recording with editing. First record the sequence of actions; then later, in the Visual Basic Editor, you can view and edit your macro. You could delete any unneeded commands. Or type in new commands. Or use the editor's Toolbox feature to drag and drop user-interface elements (such as message boxes and dialog boxes) into your macro so users can make decisions and choose options for how to run it. Macros are marvelously flexible, and the VBA Editor is famously powerful yet easy to use.

Once you've created a macro, you specify how you want the user to trigger it. In most applications, you can assign a macro to the Ribbon, to the Quick Access Toolbar, or to a shortcut key combination.
This makes it very easy to run the macro by merely clicking an icon or pressing a shortcut key (such as Alt+R). You can also optionally assign your macro to a Quick Access Toolbar button or keyboard shortcut when you first record the macro, via a dialog box that automatically appears when you begin a recording. You'll see how all this works shortly. It's simple. (To assign a macro to the Ribbon, first record it, then right-click the Ribbon and choose Customize The Ribbon. Locate and click the Choose Commands From drop-down box, then click the Macros entry to display all your macros.)

# Recording a Macro

The easiest way to create VBA code is to record a macro using the Macro Recorder. Only Word and Excel include a Macro Recorder.

You switch on the Macro Recorder, optionally assign a trigger that will later run the macro (a toolbar button or a shortcut key combination), perform the actions you want in the macro, and then switch off the Macro Recorder. As you perform the actions, the Macro Recorder translates them into commands—_code_—in the VBA programming language.

Once you finish recording the macro, you can view the code in the Visual Basic Editor and change it if you wish. If the code works perfectly as you recorded it, you never have to look at it—you can just run the macro at any time by clicking the toolbar button or pressing the key combination you assigned to the macro.

## Displaying the Developer Tab on the Ribbon

Before going any further, ensure that the Developer (programmer) tab is visible in your Ribbon. This tab is your gateway to macros, VBA, and the VBA Editor. By default, the Office applications do not display this tab. (Access doesn't even _have_ this tab. Word, Excel, PowerPoint, and Outlook do.) To add this tab to your Ribbon, click the File tab, and then click Options. Click Customize Ribbon. In the list box on the right, click Developer to select it. Click the OK button to close the Options dialog box.

In the following sections, you'll look at the stages involved in recording a macro. The process is easy, but you need to be familiar with some background if you haven't recorded macros before. After the general explanations, you'll record example macros in Word and Excel. (Later in the book you'll examine and modify those macros, after you learn how to use the Visual Basic Editor. So don't delete them.)

## Planning the Macro

Before you even start the Macro Recorder, it's sometimes a good idea to do a little planning. Think about what you will do in the macro. In most cases, you can just record a macro and not worry about the context. You can just record it with a document open and some text visible. But in some situations you need to ensure that a special context is set up before you start the recording. For example, you might want to create a macro in Word that does some kind of editing, such as italicizing and underlining a word. To do this, you'll want to first have the blinking "insertion" cursor on a word _that's not italicized or underlined_. You don't want to record the actions of moving the insertion cursor to a particular word. That would make your macro specific to this document and to that particular word. You usually want a macro to work well with more than just one particular document.

Your macro is intended simply to italicize and underline whatever word is currently under the blinking cursor, in any document. Nevertheless, most simple macros can be recorded without any special planning. Just record whatever you want the macro to do.
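To make this concrete, here's a sketch of the kind of procedure such a recording produces in Word (the macro name is illustrative, and the exact code the Recorder generates depends on the actions you perform):

    Sub Italicize_Underline_Word()
    ' Select the word at the insertion point, then toggle italics
    ' and apply a single underline to it.
    Selection.Words(1).Select
    Selection.Font.Italic = wdToggle
    Selection.Font.Underline = wdUnderlineSingle
    End Sub

Because the code refers only to the current Selection, never to a particular document or a particular word, the macro works wherever the insertion point happens to be.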
* * *

Pausing a Macro

Word (but not Excel) lets you pause the Macro Recorder if you need to stop while recording to do something that you do not want to record. This capability allows you to deal with problems you hadn't anticipated when planning the macro—for example, having to open a document that should have been open before you started recording the macro.

* * *

Some macros should perform any necessary setup themselves; the setup then becomes part of the macro. In these cases, don't put the application into the state the macro expects before you start recording. For example, if, to do its job, a macro needs a blank active workbook in Excel, the macro itself should create that blank workbook rather than using whichever workbook happens to be active at the time. This saves a step when the macro runs. So start recording first, and then create the blank workbook, so that the setup action is captured as part of the macro.

* * *

A Warning about Security

Macros are computer programs, albeit usually small ones. You can even tap into all the features of the Windows operating system itself from within a macro. The result is that viruses and other harmful code can be contained within macros (and such code can execute automatically when the user merely opens an infected document, via the AutoExec feature discussed in Chapter 6 and via other techniques, such as employing the application's Startup folder). For example, a virus embedded in a macro could delete files on the hard drive if the user opened an infected Word document. This is obviously dangerous.

Office 2013 applications, not to mention the Windows operating systems, contain multiple layers of security to protect against such viruses and harmful code. Specific to macros is a macro "trust" technology that's built into Office applications. To see or modify these trust settings, open the Trust Center dialog box by clicking the Developer tab on the Ribbon and then clicking the Macro Security icon (in the Code section of the Ribbon) in Word, Excel, Outlook, or PowerPoint. (Access, as is often the case, does things a bit differently than the other Office applications. Access has no Developer tab. To manage macro security in Access, you click the File tab, click the Options link on the left side, click Trust Center, click the Trust Center Settings button, then click Macro Settings.)

The main point here is that you might have to make some adjustments if you can't run macros or if you get mysterious error messages such as "The Macro Could Not Be Created" or "Access is denied." If this happens, your first step should be to look at the Trust Center and choose Disable All Macros With Notification. This setting asks the user for permission to run macros. Or, while you're working with macros in this book, you might want to just select Enable All Macros in the Trust Center. Then deselect this option before closing a document that you worked on in this book. The idea is that you can trust your own macros, but you don't want to trust _all_ macros from _all_ documents you might get from outside sources.

If you are working on a document that you created and it contains macros that you wrote, you can trust that document and agree to activate the macros. However, if you open a document from someone else, you have to be careful.

Additional security issues can be solved by managing the various strata of security that now, out of necessity, are embedded within operating systems and applications.
One way to deal with security issues is to explore the security topics in the Help features of Windows 7 or 8 and the Office applications. You can also sometimes get good answers by posting questions in online user groups or searching expert websites such as _Wikipedia_. Also, you can find a good overview of Office 2013 security here:

Chapter 19, "Securing Your Code with VBA's Security Features," covers Office 2013 security issues in depth.

* * *

## Starting the Macro Recorder

Start the Macro Recorder by clicking the Developer tab on the Ribbon and then clicking the Record Macro button. You can also click the Macro Record button on the status bar at the bottom of the application. (With this approach, you don't have to open the Developer tab. Just click the button on the status bar.)

As soon as you start the Macro Recorder, the Record Macro dialog box opens. You see that this new macro has been given a default macro name (Macro1, Macro2, and so on). You can accept that default name or change it. There's also an optional description to fill in if you wish.

To stop the Macro Recorder, you can click the Stop Recording button in the Developer tab. You can alternatively stop the recording by clicking the square button that appears during recording on the status bar, down on the bottom left of the application's window. Once the Recorder is stopped, the square button is replaced with an icon that you can click to start recording a new macro. In Word for the Mac, click the REC indicator rather than double-clicking it.

The appearance of the Record Macro dialog box varies somewhat from one application to another because the dialog box must offer suitable options to accommodate the varying capabilities particular to each application. In each case, you get to name the macro and add a description of it. In most cases, you can also specify where to save the macro—for example, Word offers two options. For global use (making the macro available to all Word documents), store it in the file named Normal.dotm. Or, if it is merely to be used in the currently active document, choose to store it in that document, which must then be saved as a macro-enabled file with the .docm filename extension. Similarly, an ordinary Word template has a .dotx filename extension, but a template that stores macros has the filename extension .dotm.

Other applications differ somewhat in how the dialog works when you begin recording a macro. For example, Excel allows you three options: to store macros in the current workbook, in a new workbook, or, for use with _all_ Excel workbooks, in the Personal Macro Workbook. That's the equivalent of Word's Normal.dotm file, and Excel's Personal Macro Workbook is saved in a file named Personal.xlsb.

* * *

Where to Store Macros in PowerPoint

You can't record macros in the 2013 version of PowerPoint, but you can create them by writing programming code using the Visual Basic Editor. You can then store macros in the currently active presentation or in any other open presentation or template. PowerPoint also provides a global macro storage container (similar to Word's Normal.dotm file). In PowerPoint, choose the All Open Presentations option in the Macro list box, which is found by clicking the Macros icon in the Code section of the Ribbon's Developer tab.

* * *

The Record Macro dialog box also lets you specify how you want the macro triggered.
Word displays buttons you can click to either open a dialog for entering a shortcut key combination or open the Word Options dialog, where you can create a button for this macro that will appear on the Quick Access Toolbar. Excel limits you to Ctrl+ shortcut key combinations as a way of launching macros, so there is no button to display a full keyboard-shortcut dialog like the one in Word. Excel has only a small text box where you can enter the key that will be paired with Ctrl as the shortcut.

Most of the Microsoft applications that host VBA have the Developer tab, from which you control macro recording, launch the Visual Basic Editor, and otherwise manage macros. Access, however, groups several of its macro-related tools in a Database Tools tab (which is visible by default) and also has a Macro option on its Create tab.

Figure 1.1 shows the Record Macro dialog box for Word with a custom name and description entered. Figure 1.2 shows Word's version of the Developer tab on the Ribbon.

Figure 1.1 In the Record Macro dialog box, enter a name for the macro you're about to record. Type a concise but helpful description in the Description box. This is the Record Macro dialog box for Word.

Figure 1.2 You can use the Developer tab on the Ribbon to work with macros.

Here's what the primary Visual Basic features on the Ribbon's Developer tab (or Access's Database Tools tab) do:

**Run Macro button**

Only Access has this Ribbon button. It displays a Run Macro dialog box, in which you can choose the macro to run. Many aspects of VBA in Access are unique to Access, and Chapter 28, "Understanding the Access Object Model and Key Objects," covers this topic in depth.

**Record Macro button**

Displays the Record Macro dialog box in Word or Excel.

**Macro Security button**

Displays the Trust Center macro settings dialog. You'll examine this in detail in Chapter 19. This button allows you to specify whether and how you want macros enabled.

**Visual Basic button**

Starts or switches to the Visual Basic Editor. You'll begin working in the Visual Basic Editor in Chapter 2, "Getting Started with the Visual Basic Editor" (and you'll spend most of the rest of the book employing it).

**Macros button**

Opens the classic Macros dialog, from which you can run, step into (start the Visual Basic Editor in _Break mode_; more about this in Chapter 3, "Editing Recorded Macros"), edit, create, or delete macros, or open the macro project organizer dialog. (Not all of these options are available in all applications. For example, PowerPoint has no organizer.) Word and Excel have a similar Macros button in the Ribbon's View tab. This button can open the Macros dialog and can also start recording a macro. Note that Break mode is also referred to as Step mode.

**Add-Ins**

This is where you can access templates, styles, and specialized code libraries.

**Controls**

A set of control buttons that, when clicked, insert user-interface components—such as a drop-down list box—into an open document. Similar components can also be added to macros that you create in the VBA Editor. Chapters 14 and 15 explore this user-interface topic.

**Design Mode button**

Toggles between _Design mode_ and _Regular mode_. In Design mode, you can add or edit embedded controls in documents. In Regular mode, you can interact normally with controls (controls can accept information from the user via typing or mouse clicks).
**Properties button**

This button is enabled only if you're in Design mode. It allows you to edit the properties of the document (such as removing personal information).

* * *

The Emergence of XML

XML has become an industry standard for storing and transmitting data. With Office 2007, the Office applications' documents began to employ XML extensively. This switch to XML is the primary reason that documents created in Office 2007, 2010, and 2013 are not compatible with earlier versions of Office, such as Office 2003. Thus, you must _convert_ old Office documents to the newer Office formats. And people still using older versions of Office must install the Microsoft Office Compatibility Pack for Word, Excel, and PowerPoint File Formats. Note that Word 2010 and 2013 document files are saved with a .docx filename extension, the x reflecting the underlying XML format on which Office 2007, 2010, and 2013 rest.

* * *

## Naming the Macro

Next, enter a name for the new macro in the Macro Name text box in the Record Macro dialog box. The name must comply with the following conventions:

 * It must start with a letter; after that, it can contain both letters and numbers.
 * It can be up to 80 characters long.
 * It can contain underscores, which are useful for separating words, such as File_Save.
 * It cannot contain spaces, punctuation, or special characters, such as ! or *.

* * *

Name and Describe Your Macros

Some people insist that to properly manage your set of macros, you must follow some clerical procedures that involve giving your macros descriptive names and also typing in a narrative description of each macro's purpose. They claim that if you create many macros, you should organize them carefully. Recording macros is so easy, and you can create code so quickly, that you can end up with a _pile_ of macros—as Southerners say—making it easy to get confused about which macro does what.

You may be tempted not to assign a macro description when you're in a hurry or when you're playing with different ways to approach a problem and you're not sure which (if any) of your test macros you'll keep. And for simple, obvious code, perhaps using the Macro12, Macro13 default names and not typing in a description isn't a problem. If you find it easy to read VBA code, you can usually just look at a macro and read what it does.

Even so, for more complex macros, and for people who find code hard to read, go ahead and enter a few notes for each macro that you record. Otherwise, you can end up with that pile of recorded macros that have the cryptic default names and no descriptions. To figure out what each macro does and which ones you can safely delete, you'll have to plow through the code—and a recorded macro's code can be surprisingly long, even if the macro does nothing more than adjust a few options in a couple of dialog boxes.

You might also want to employ a macro-naming convention to indicate which are test macros that you can delete without remorse. Start the name with a word like _Temp_, then add numeric values sequentially to keep track of the versions—for example, Scratch (Scratch01, Scratch02, and so on) and Temp (Temp01, Temp02, and so on).

Each new macro you record is by default placed at the bottom of the set of macros in the VBA Editor. You can, however, always open the Visual Basic Editor and rename or add a description anytime you want because macros are fully editable.
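For example, a throwaway macro following this convention might look like the sketch below (the name and comment are illustrative, not required by VBA):

    Sub Temp01()
    ' Scratch macro: trying out an approach to formatting.
    ' Safe to delete once the experiment is done.
    End Sub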
Personally, I like to put a little descriptive note inside more complicated macros' code, right at the top, under the Sub line. It looks like this:

    Sub AltH()
    ' Applies Heading 1 style
    Selection.Style = ActiveDocument.Styles("Heading 1")
    End Sub

Any text following a single-quote symbol (') on a line of code is ignored by VBA. The single quote indicates that what follows is a _comment_ to assist the programmer in understanding the code rather than actual code that should be executed. (VBA would not know what to make of the words Applies Heading 1 style. They are not part of VBA's dictionary.)

Note that if you type a description in the Description field of the Record Macro dialog when you first start recording, that comment is automatically inserted into your code—complete with the single-quote symbol.

Also, my preferred way to name any macros that are triggered by keyboard shortcuts is to use the name of the keyboard shortcut itself. Thus, Sub AltH tells me that this macro is triggered by the Alt+H keyboard shortcut.

But whatever system you adopt, it's generally better to err on the side of more description and commenting within the code rather than too little. It takes only a moment to provide an expressive, meaningful name and a clear description of the purpose of the macro.

* * *

### Invalid Macro Names

Word and Excel, the two Office applications that permit macro recording, raise objections to invalid macro names when you click the OK button to start recording the macro. If you enter an invalid macro name in the Record Macro dialog box, these applications let you know—in their own way. Word displays a brief, rather cursory message, while Excel gives more helpful info. Figure 1.3 shows how these applications respond to an invalid macro name once it's entered.

Figure 1.3 The dialog boxes supplied by Word and Excel showing invalid macro names.

### Describing Your Macros

Type a description for the macro in the Description text box. Recall that this description is to help you (and anyone you share the macro with) identify the macro and understand when to use it. If the macro runs successfully only under particular conditions, you can note them briefly in the Description text box. For example, if the user must make a selection in the document before running the macro in Word, mention that.

You now need to choose where to store the macro. Your choices with Word and Excel are as follows:

**Word**

Recall that in Word, if you want to restrict availability of the macro to just the current template (.dotm file) or document (.docm file), choose that template or document from the Store Macro In drop-down list in the Record Macro dialog box shown in Figure 1.1. If you want the macro to be available no matter which template you're working in, make sure the default setting—All Documents (Normal.dotm)—appears in the Store Macro In combo box. (If you're not clear on what Word's templates are and what they do, see the sidebar "Understanding Word's Normal.dotm, Templates, and Documents" later in this chapter.)

**Excel**

In Excel, you can choose to store the macro in This Workbook (the active workbook), a new workbook, or the Personal Macro Workbook. The Personal Macro Workbook is a special workbook named Personal.xlsb. Excel creates this Personal Macro Workbook the first time you choose to store a macro in it.
By keeping your macros and other customizations in the Personal Macro Workbook, you can make them available to any of your procedures. Recall that the Personal Macro Workbook is similar to Word's global macro storage file, Normal.dotm. If you choose New Workbook, Excel creates a new workbook for you and creates the macro in it.

### Storing Your Macros

Word and Excel automatically store recorded macros in a default location in the specified document, template, workbook, or presentation:

**Word**

Word stores each recorded macro in a _module_ named NewMacros in the selected template or document, so you'll always know where to find a macro after you've recorded it. This can be a bit confusing because there can be multiple NewMacros folders visible in the Project Explorer pane in the Visual Basic Editor. (This happens because there can be more than one project open—such as several documents open simultaneously, each with its own NewMacros folder holding the macros embedded within each document.) Think of NewMacros as merely a holding area for macros—until you move them to another module with a more descriptive name. (Of course, if you create only a handful of macros, you don't need to go to the trouble of creating various special modules to subdivide them into categories. You can just leave everything in a NewMacros module. As always, how clerical you need to be depends on how organized your mind and memory are.)

If a NewMacros module doesn't yet exist, the Macro Recorder creates it. Because it receives each macro recorded into its document or template, a NewMacros module can soon grow large if you record many macros. The NewMacros module in the default global template, Normal.dotm, is especially likely to grow bloated, because it receives each macro you record unless you specify another document or template prior to recording. Some people like to clear out the NewMacros module from time to time, putting the recorded macros they want to keep into other modules and disposing of any useless or temporary recorded macros. I don't have _that_ many macros, so I find no problem simply leaving them within the NewMacros module.

**Excel**

Excel stores each recorded macro for any given session in a new module named Module _n_, where _n_ is the lowest unused number in ascending sequence (Module1, Module2, and so on). Any macros you create in the next session go into a new module with the next available number. If you record macros frequently with Excel, you'll most likely need to consolidate the macros you want to keep so that they're not scattered across many modules like this.

* * *

Understanding Word's Normal.dotm, Templates, and Documents

Word 2007, 2010, and 2013 store data differently than previous versions of Word. For one thing, in Word 2003 you could create custom menus and toolbars that you stored in templates. Later versions of Word do not permit menus, nor do they permit any toolbars other than the Quick Access Toolbar. What's more, customizing that toolbar has a _global_ impact. Custom toolbar buttons are not stored in templates. In other words, any modifications you make to the Quick Access Toolbar will be visible in any Word document, no matter which template(s) is currently active.

Word 2007, 2010, and 2013 feature three kinds of templates:

 * Legacy templates from Word 2003 and earlier versions. These have a .dot filename extension. If you are working with one of these templates, [Compatibility Mode] appears on the Word title bar.
 * Word 2010 and 2013 templates that contain no macros (.dotx filename extension). You can save macros in a document that employs a .dotx template, but they will not be saved within the template.
 * Templates with a .dotm filename extension can contain macros. Recall that because macros written by malicious people can do damage just like a virus, recent versions of Word segregate macros into this special kind of template with a .dotm filename extension. A .dotm template can do anything that a .dotx template can do, but the .dotm template features the additional capability of hosting macros.

Word has a four-layer architecture. Starting from the bottom, these layers are the application itself, the global template (Normal.dotm), the active document's template, and, finally, the active document itself (the text and formatting). Each of the four layers can affect how Word appears and how it behaves, but all four layers are not necessarily active at any given time.

The bottom layer, which is always active, is the Word application itself. This layer contains all the Word objects and built-in commands, such as _Open_. Also always active are objects such as Word's Quick Access Toolbar, the Ribbon, and so on. This layer is the most difficult to picture because usually you don't see it directly. Normal.dotm, the global template, forms the second layer and is also always active.

When you start Word, it loads Normal.dotm automatically, and Normal.dotm stays loaded until you exit Word. (There's a special switch you can use—winword /a—to prevent the macros in Normal.dotm from being active if you need to troubleshoot it. In Windows 8, press the Start key [the Windows key], type **Run** to open the Run dialog, and then enter the command to launch Word in this special way.)

Normal.dotm contains styles (such as the default paragraph style), AutoText entries, formatted AutoCorrect entries, and customizations. These customizations show up in the other layers unless specifically excluded.

Default blank documents (such as the document that Word normally creates when you start it and any document you create by pressing Ctrl+N or by clicking the Ribbon's File tab and then choosing New and Blank Document) are based on Normal.dotm. So when you're working in a default blank document, you see the Word interface as it is specified in Normal.dotm.

The currently active template sits on top of the Word application and Normal.dotm. This template can contain styles, macro modules (if it is a macro-enabled .dotm file type), and settings for the template, along with any boilerplate text needed for this particular type of document. This is the third layer, but it is used only if the current document (or _active document_) is attached to a template other than Normal.dotm.

On top of the current template sits the current document, which contains the text and graphics in the document, its formatting, and its layout. Documents can also contain macro modules and custom keyboard shortcuts, so the document itself can act as a fourth layer. This layer is always present when a document is open, but it has no effect on Word's interface or behavior unless the document contains customizations.

Because these layers might contain conflicting information (such as two different font styles with the same name), there has to be an order of precedence that defines which layer "wins" in any such conflict. Customized settings work from the top layer downward. So customized settings in the active document take precedence over those in the active template.
Likewise, any settings in the current template take precedence over those in any global templates (templates that automatically apply to all Word documents) or add-ins other than Normal.dotm. Customized settings in those global templates or add-ins take precedence over those in Normal.dotm.

As another example, say you have the key combination Ctrl+Shift+K assigned to different actions in Normal.dotm, in a loaded global template, in a document's template, and in the document itself. When you press that key combination, only the procedure assigned in the document runs because that is the topmost layer. If you remove the key-combination assignment from the document, the template then becomes the topmost layer containing a definition of this key combination, so the procedure assigned in the template runs. If you remove the key combination from the template as well, the procedure in the loaded global template runs. Finally, if you remove that template's key combination too, the procedure in Normal.dotm runs.

* * *

## Choosing How to Run a New Macro

Continuing our exploration of the Record Macro dialog box shown in Figure 1.1: at this point, after you've named the macro, typed a description, and chosen where to store it, it's time to choose how to trigger the macro. In other words, which way do you want to _run_ the macro: via a shortcut key or a Quick Access Toolbar button? Good typists generally prefer shortcut keys, but buttons provide at least a visual hint of the macro's purpose, and hovering your mouse over the button also displays the name of the macro.

Shortcut keys and buttons are handy for people who record a moderate number of macros and don't organize them in complex ways—moving them from one module to another. If you create a great number of macros and feel the need to move them into other modules, assigning a shortcut key or button prior to recording becomes less useful. This is because moving a macro from one module to another disconnects any trigger you've assigned for running the macro.

This limitation means that it makes sense to assign a way of running a macro—prior to recording—only if you're planning to use the macro in its recorded form (as opposed to, say, using part of it to create another macro) _and_ from its default location. If you plan to move the macro or rename it, don't assign a way of running it now. Instead, wait until the macro is in its final form and location, and then assign the means of running it. See "Specifying How to Trigger an Existing Macro," later in this chapter, for details.

Personally, I don't have more than a couple dozen macros that I use all the time, so I avoid the complications described in the previous paragraph and the sidebar on managing your macros. Instead, I just add shortcut keys when I first create the macros, and I leave them all in a single version of Normal.dotm. However, if you face more complicated situations—such as managing a big set of macros for a company—you might want to manage your macros with modules.

* * *

Manage Your Macros with Modules

By moving your recorded macros into different modules, you can group related macros so you can compare the code, adjust them, or distribute them easily.

* * *

To assign a way to run the macro, follow the instructions in the next sections.

You don't have to assign a button or keyboard shortcut prior to recording a macro. You can do it later, or at any time.
In Word, Access, Excel, and other Office 2013 applications, you use the Options dialog box to assign a button on the Quick Access Toolbar to a macro. PowerPoint and Access do not permit you to assign keyboard shortcuts to macros, but in applications that do permit this—such as Word and Excel—you use the Customize Keyboard dialog box to assign a shortcut key to a macro. Excel limits you to Ctrl+ or Ctrl+Shift key combinations.

### Running a Macro from the Ribbon

Although it's not available in the Record Macro dialog box, you can add a macro to the Ribbon, like this:

1. Right-click anywhere on the Ribbon.

2. Click Customize The Ribbon on the menu. The Word Options dialog box appears.

3. In the Choose Commands From drop-down list, select Macros.

4. Click a macro's name to select it in the list.

5. In the list of tabs on the right side of the dialog box, click the existing tab where you want to locate your macro.

6. Click the New Group button to create a custom group on that tab.

7. Click the Rename button to give your new group a name.

8. Click OK to close the Rename dialog box.

9. Click the Add button to add your macro.

10. Click the Rename button to give your macro an easily understood name and, optionally, an icon.

11. Click OK to close the Rename dialog box.

12. Click OK to close the Word Options dialog box.

### Running a Macro from the Quick Access Toolbar

Here's how to use the Word Options dialog box to assign a macro to a button on the Quick Access Toolbar:

1. Right-click anywhere on the Quick Access Toolbar (it's the set of icons in the upper-left corner, above the Ribbon), and a menu will appear. (This toolbar will be just below the Ribbon if you've previously selected the Show Quick Access Toolbar Below The Ribbon option from this menu.)

2. Click Customize Quick Access Toolbar on the menu. The Word Options dialog box appears.

3. In the Choose Commands From drop-down list, select Macros.

4. Click a macro's name to select it in the list, as shown in Figure 1.4.

5. Click the Add button to insert this macro's name in the Customize Quick Access Toolbar list, as shown in Figure 1.4.

Figure 1.4 Choose a way to run the macro in Word's Options dialog box.

6. Word adds a button to the toolbar for the macro, giving it the macro's fully qualified name (its location plus its name), such as Normal.NewMacros.CreateDailyReport. This name consists of the name of the template or document in which the macro is stored, the name of the module that contains the macro, and the macro's name, respectively. You don't need all this information displayed when you hover your mouse pointer over the button.

7. So rename the button or menu item: Click the Modify button at the bottom of the Customize Quick Access Toolbar list (see Figure 1.5). Whatever macro is highlighted (currently selected) in the list of toolbar items will be the one you're modifying.

Figure 1.5 Word gives the menu item or toolbar button the full name of the macro. Use this Modify Button dialog to change the name to something shorter and better.

* * *

Macro Button Labels Need Not Match Their Official Names

Notice that a macro's button name (displayed as its tooltip caption when you hover your mouse over it) doesn't have to bear any relation to the macro's actual name as it appears in the Visual Basic Editor or the Macro dialog.

* * *

8.
While you're modifying the macro's name, you might also want to choose a different button icon that visually cues you about the macro's purpose. To do that, just double-click whatever icon you want to use, then click OK.

### Running a Macro via a Shortcut Key Combination

To assign the macro to a key combination, follow these steps:

1. Right-click the Ribbon and choose Customize The Ribbon from the menu that appears. This opens the Word Options dialog.

2. Click the Customize button next to Keyboard Shortcuts in the bottom left of the Word Options dialog box.

3. Scroll down the Categories list box until you see Macros, then click Macros to select it.

4. Click to select the name of the macro you want to assign a shortcut key combination to.

5. Check the Current Keys list box to see if a key combination is already assigned. If it is, you can press the Backspace key to clear the key combination if you wish, or you can employ multiple key combinations to launch the macro.

6. In the Press New Shortcut Key field, type the key combination you want to use to trigger the macro (see Figure 1.6).

7. Check to see if this key combination is already used for another purpose. If so, you can reassign it, or you can choose a different combination by pressing the Backspace key in the Press New Shortcut Key field.

Figure 1.6 Set a shortcut key combination for the macro in the Customize Keyboard dialog box.

8. Be sure to click the Assign button when you're finished. Just closing this dialog does _not_ assign the key combination.

* * *

You Can Postpone Assigning a Shortcut Key Combination

Remember that, as with the other ways of running a macro, you can assign a key combination to run a macro either at the time you record the macro or at any point after you finish recording it. If you intend to move the macro from the NewMacros module to another module, remember that you need not assign the key combination until the macro has reached its ultimate destination.

* * *

A key combination in Word can be any of the following:

 * Alt plus either a function key or a regular key not used as a menu-access key.
 * Ctrl plus a function key or a regular key.
 * Shift plus a function key.
 * Ctrl+Alt, Ctrl+Shift, Alt+Shift, or even Ctrl+Alt+Shift plus a regular key or function key. (Pressing Ctrl+Alt+Shift and another key tends to be too awkward for practical use.)

* * *

Specify Two-Step Key Combinations

You can set up shortcut keys that have two steps—for example, Ctrl+Alt+F, 1 and Ctrl+Alt+F, 2—by pressing the second key (in this case, the 1 or the 2) after pressing the key combination. However, these shortcuts tend to be more trouble than they're worth unless you're assigning literally hundreds of extra shortcut keys.

* * *

### Running a Macro the Old-Fashioned Way

A clumsy, rarely used way to run a macro is via the Developer tab in the Ribbon. To see how this works, follow these steps:

1. Click the Macros icon.

2. Click the name of the macro in the displayed list.

3. Finally, click the Run button.

By the way, you can also run a macro from within the Visual Basic Editor by pressing F5. This is how you test macros while you're editing them.

### Assigning a Way to Run a Macro in Excel

When you're recording a macro, Excel allows you to assign only a Ctrl shortcut key, not a button, to run it.
If you want to assign a Quick Access Toolbar button to the macro, you need to do so _after_ recording the macro (using the Customize feature as described shortly).

To assign a Ctrl shortcut key to run the macro you're recording, follow these steps:

1. Start recording the macro, then click the Shortcut Key text box to display the blinking insertion cursor. Press the shortcut key you want to use. (Press the Shift key at the same time if you want to include Shift in the shortcut.)

2. In the Store Macro In drop-down list, specify where you want the Macro Recorder to store the macro. Your choices are as follows:

 * _This Workbook_ stores the macro in the active workbook. This option is useful for macros that belong to a particular workbook and do not need to be used elsewhere.
 * _New Workbook_ causes Excel to create a new workbook for you and store the macro in it. This option is useful for experimental macros that you'll need to edit before unleashing them on actual work.
 * _Personal Macro Workbook_ stores the macro in the Personal Macro Workbook, a special workbook named PERSONAL.XLSB. By keeping your macros and other customizations in the Personal Macro Workbook, you can make them available to any of your procedures—in that way, the Personal Macro Workbook is similar to Word's Normal.dotm. If the Personal Macro Workbook does not exist yet, the Macro Recorder creates it automatically.

3. Click the OK button to start recording the macro.

### Assigning a Way to Run a Macro in PowerPoint

PowerPoint does not let you record macros, but you can assign a way to run macros written in the Visual Basic Editor, as discussed in the section "Specifying How to Trigger an Existing Macro" later in this chapter.

### Assigning a Way to Run a Macro in Outlook

Outlook doesn't let you record macros, and by default macros are disabled. To enable macros in Outlook, click the Developer tab on the Ribbon, then click the Macro Security icon (it's on the left in the Code section of the Ribbon). The Trust Center dialog box opens. Click the Notification For All Macros option or the Enable All Macros option. To see how to assign a way to run macros, see the section "Specifying How to Trigger an Existing Macro" later in this chapter.

### Recording the Actions in a Macro

When you close the Record Macro dialog box, the Macro Recorder begins recording the macro. The Macro Recorder displays the Stop Recording icon (a white square) in the status bar at the bottom left of the screen (and a Stop Recording button in the Developer tab on the Ribbon). In addition, a small symbol of a cassette tape appears attached to the mouse pointer (these tapes were used in the old days, prior to the invention of the CD).

Now you should perform the sequence of actions you want to record. What exactly you can do varies from application to application, but in general, you can use the mouse to select items, make choices in dialog boxes, and select defined items in documents (such as cells in spreadsheets). You'll find a number of things that you can't do with the mouse, such as select items within a document window in Word. To select items in a Word document window, you have to use the keyboard (Shift+arrow keys, for example). You can, however, select cells with the mouse in Excel during recording.
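In Word, for instance, extending the selection one word to the right with Ctrl+Shift+right arrow while recording produces code along these lines (a sketch of typical Recorder output; the exact parameters depend on the keys you press):

    ' Extend the selection one word to the right, as recorded
    ' when you press Ctrl+Shift+right arrow:
    Selection.MoveRight Unit:=wdWord, Count:=1, Extend:=wdExtend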
* * *

The Macro Recorder Records Everything—The Complete Current Status

When you make choices in a dialog box and click the OK button, the Macro Recorder records the current settings for all the options on that page of the dialog box. So, for example, when you change the left indentation of a paragraph in the Paragraph dialog box in Word, the Macro Recorder records _all the other settings_ on the Indents And Spacing page as well (Alignment, Before and After spacing, and so forth).

* * *

In Word, if you need to perform any actions that you don't want recorded, pause the Macro Recorder by clicking the Pause Recording button on the Ribbon. The button changes to Resume Recording. Click the button again to start recording again.

To stop recording, click either the Stop Recording button on the Ribbon or the Stop Recording button on the status bar.

The Macro Recorder has now recorded your macro and assigned it to a key combination or button, if you made that choice.

# Running a Macro

You can use any of four methods to run a macro you've recorded within the application:

 * If you assigned a Quick Access Toolbar button, use that.
 * If you added your macro to the Ribbon, you can use that.
 * If you assigned a shortcut key combination, use it.
 * A less convenient approach is to press Alt+F8 to display the Macros dialog box, select the macro, and then click the Run button. (Alternatively, you could double-click the macro name in the list box.)

* * *

Running in the Editor

You can also run a macro from the Visual Basic Editor, which is useful when you're working in the Editor. Just press F5.

* * *

The macro runs, performing the actions in the sequence in which you recorded them. For example, suppose you create a macro in Excel that selects cell A2 in the current worksheet, boldfaces that cell, enters the text **Yearly Sales**, selects cell B2, and enters the number **100000** in it. The Macro Recorder recognizes and saves those five actions. VBA then performs all five actions, step by step, each time you run the macro—albeit quite rapidly.

* * *

How to Stop an Executing Macro

To stop a running macro, press Ctrl+Break (Break is usually the unshifted Pause key on the keyboard). VBA stops running the code and displays a dialog box telling you that code execution has been interrupted. Click the End button to dismiss this dialog box.

* * *

Some applications (such as Word) let you undo most actions executed via VBA after the macro stops running (by pressing Ctrl+Z or clicking the Undo button on the Quick Access Toolbar, undoing one command at a time); other applications do not.

* * *

Macro Errors Are Often Caused by Incorrect Contexts

If running the macro results in an error, this often means that the macro is trying to do something to a file or an object that isn't available. For example, if you record a macro in Excel that works on the active workbook, the macro causes an error if you run it when no workbook is open (because there is then no active workbook). Likewise, if you write a macro in PowerPoint that works with the third shape on the active slide, that macro fails if you run it on a slide that has no third shape. To get the macro to run properly, re-create the conditions it needs, and then try it again.

* * *

# Recording a Sample Word Macro

In this section, you'll record a sample macro in Word that you can work with later in the book.
# Recording a Sample Word Macro

In this section, you'll record a sample macro in Word that you can work with later in the book. This macro selects the current word, cuts it, moves the insertion point one word to the right, and pastes the word back in. This is a straightforward sequence of actions that you'll later view and edit in the Visual Basic Editor.

Follow these steps to record the macro:

1. Create a new document by pressing Ctrl+N.

2. Start the Macro Recorder by clicking the Developer tab on the Ribbon, then clicking the Record Macro button. Or click the Record Macro button on the status bar at the bottom of the application. (With this approach, you don't have to open the Developer tab; just click the button on the status bar.)

3. In the Macro Name text box, enter **Transpose_Word_Right**.

4. In the Store Macro In drop-down list, make sure All Documents (Normal.dotm) is selected, unless you want to assign the macro to a different template. (This and future examples in this book assume this macro is located in Normal.dotm, so do store it there.)

5. In the Description box, enter a description for the macro (see Figure 1.7). Be fairly explicit and enter a description such as **Transposes the current word with the word to its right. Created 5/5/13 by Nanci Selest-Gomes**.

Figure 1.7 Creating the sample macro in Word

6. Assign a method of running the macro, as described in the previous section, if you want to. Create a toolbar button or assign a keyboard shortcut. (The method or methods you choose are strictly a matter of personal preference.) If you'll need to move the macro to a different module (or a different template or document) later, don't assign a method of running the macro at this point.

7. Click the OK button to dismiss the Word Options dialog box or the Customize Keyboard dialog box (or just click the OK button to dismiss the Record Macro dialog box if you chose not to assign a way of running the macro). Now you're ready to record the macro. The Stop Recording option appears on the Ribbon and on the status bar, and the mouse pointer has a cassette-tape icon attached to it.

8. As a quick demonstration of how you can pause recording, click the Pause Recording button on the Ribbon. The cassette-tape icon disappears from the mouse pointer, and the Pause Recording button changes into a Resume Recording button. Enter a line of text in the document: **The quick brown fox jumped over the lazy dog.** Position the insertion point anywhere in the word _quick_, and then click the Resume Recording button on the Ribbon to reactivate the Macro Recorder.

9. Record the actions for the macro as follows:

a. Use Word's extend-selection feature to select the word _quick_ by pressing the F8 key twice.

b. Press the Esc key to cancel Extend mode.

c. Press Shift+Delete to cut the selected word to the Clipboard.

d. The insertion point is now at the beginning of the word _brown_. Press Ctrl+right arrow to move the insertion point right by one word so that it's at the beginning of the word _fox_.

e. Press Shift+Insert to paste in the cut word from the Clipboard.

f. Press Ctrl+left arrow to move the insertion point one word to the left. This restores the cursor to its original position.

10. Click the Stop Recording button on the Ribbon or status bar. Your sentence now reads, "The brown quick fox jumped over the lazy dog."
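When you later open this macro in the Visual Basic Editor (as you'll do in Chapter 2), the recorded code should look something like the following. The exact lines can vary slightly depending on how you performed the actions:

    Sub Transpose_Word_Right()
    '
    ' Transpose_Word_Right Macro
    ' Transposes the current word with the word to its right.
    '
        Selection.Extend                            ' first press of F8
        Selection.Extend                            ' second press of F8 selects the word
        Selection.EscapeKey                         ' Esc cancels Extend mode
        Selection.Cut                               ' Shift+Delete
        Selection.MoveRight Unit:=wdWord, Count:=1  ' Ctrl+right arrow
        Selection.Paste                             ' Shift+Insert
        Selection.MoveLeft Unit:=wdWord, Count:=1   ' Ctrl+left arrow
    End Sub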
* * *

Finding Built-In Keyboard Shortcuts

You can find a complete list of the built-in keyboard shortcuts (such as Ctrl+left arrow) by searching an application's Help system for "Keyboard Shortcuts." If available, click the Show All option to expand the complete list, then use Ctrl+F to search for whatever you're interested in.

* * *

You can now run this macro by using the toolbar button or keyboard shortcut that you assigned (if you chose to assign one). Alternatively, click the Macros button in the Developer tab and run the macro from the Macros dialog box. Try positioning the insertion point in the word _brown_ and running the macro to restore the words in the sentence to their original order.

At this point, Word has stored the macro in Normal.dotm. By default, Word doesn't prompt you to save new macros when you exit (or when an automated backup takes place); it simply saves them automatically. Even so, it's best to click the Save button in the File tab to store Normal now. That way, if Word or Windows crashes, you won't lose the macro.

* * *

You Can Force Word to Prompt You to Save the Normal Template

Word, by default, automatically saves new macros added to the Normal template. But if you prefer to have Word prompt you to save any changes to the Normal template, choose Options on the File tab, click the Advanced category, and scroll down to the Save section. Select the Prompt Before Saving Normal Template check box, and then click the OK button. This option was selected by default in early versions of Office, but ever since Office 2007 it has been turned off by default.

* * *

# Recording a Sample Excel Macro

In the following sections, you'll record a sample Excel macro. This macro creates a new workbook, enters a sequence of months into it, and then saves it. You'll work with this macro again in Chapter 3, so don't delete it.

## Create a Personal Macro Workbook If You Don't Have One Yet

If you don't already have a Personal Macro Workbook in Excel, you'll need to create one before you can create this procedure. (If you do have a Personal Macro Workbook, skip to the next section.) Follow these steps:

1. Click the Developer tab in the Ribbon, then click the Record Macro button on the Ribbon (or just click the Record Macro button on the status bar) to display the Record Macro dialog box.

2. Accept the default name for the macro because you'll be deleting it momentarily.

3. In the Store Macro In drop-down list, choose Personal Macro Workbook.

4. Click the OK button to close the Record Macro dialog box and start recording the macro.

5. Type a single character in whichever cell is active, and press the Enter key.

6. Click the Stop Recording button on the Ribbon or status bar to stop recording the macro.

7. Click the Unhide button on the View tab to display the Unhide dialog box. Select PERSONAL.XLSB and click the OK button.

8. Click the Developer tab in the Ribbon, then click the Macros button on the Ribbon to display the Macros dialog box.

9. Select the macro you recorded and click the Delete button to delete it. Click the Yes button in the confirmation message box.

Excel has now generated a Personal Macro Workbook that you can use from now on to hold your global macros.

## Record the Macro

To create this macro, start Excel and follow these steps:

1. Click the Developer tab in the Ribbon, then click the Record Macro button on the Ribbon (or just click the Record Macro button on the status bar). This displays the Record Macro dialog box, shown in Figure 1.8, with information entered.
Figure 1.8 Display the Record Macro dialog box for Excel and make your choices in it.

2. Enter the name for the macro in the Macro Name text box: **New_Workbook_with_Months**.

3. In the Shortcut Key text box, enter a shortcut key if you want to. (Remember that you can always change the shortcut key later, so you're not forced to enter one at this point.)

4. In the Store Macro In drop-down list, choose whether to store the macro in your Personal Macro Workbook, in a new workbook, or in the active workbook. As discussed a little earlier in this chapter, storing the macro in the Personal Macro Workbook gives you the most flexibility because it is Excel's global macro container. For this example, don't store the macro in the active workbook, because you're going to delete the active workbook almost immediately. Instead, store it in your Personal Macro Workbook. Remember, we'll use this macro in future examples.

5. Type a description for the macro in the Description text box.

6. Click the OK button to dismiss the Record Macro dialog box and start recording the macro.

7. Click the File tab on the Ribbon and click New to display the available templates for a new workbook.

8. Double-click the Blank Workbook icon. Excel creates a new workbook and selects the first sheet in it.

9. Click cell A1 to select it. (It may already be selected; click it anyway because you need to record this click instruction.)

10. Enter **January 2014** and press the right arrow key to select cell B1. Excel automatically changes the date to your default date format. That's fine.

11. Enter **February 2014** and press the left arrow key to select cell A1 again.

12. Drag from cell A1 to cell B1 so that the two cells are selected.

13. Drag the fill handle from cell B1 to cell L1 so that Excel's AutoFill feature enters the months March 2014 through December 2014 in the cells. (The fill handle is the small black dot in the lower-right corner of the selection frame. You'll know you're on it when the cursor changes from a white to a black cross.)

14. Click the File tab on the Ribbon, then click the Save As option to display the Save As dialog box. Save the workbook in a convenient folder (for example, the My Documents folder) under a name such as **Sample Workbook.xlsx**.

15. Click the Stop Recording button on the Ribbon or status bar to stop recording the macro.

Close the sample workbook, use Windows Explorer to navigate to the new .xlsx file you just saved, and delete the file. Then run the macro and watch what happens. (If you don't delete the existing workbook first, Excel prompts you to decide whether to overwrite it when the macro reaches the Save As operation from step 14 and tries to save the new workbook under the same name.)
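For comparison, the heart of the recorded macro should resemble the following sketch. The exact date formats, fill type, and SaveAs arguments (including the full file path, omitted here) will differ on your system, so treat this as an approximation rather than the exact code:

    Sub New_Workbook_with_Months()
    '
    ' New_Workbook_with_Months Macro
    ' Creates a workbook, fills in the months, and saves it.
    '
        Workbooks.Add                            ' steps 7-8
        Range("A1").Select                       ' step 9
        ActiveCell.FormulaR1C1 = "Jan-2014"      ' step 10
        Range("B1").Select
        ActiveCell.FormulaR1C1 = "Feb-2014"      ' step 11
        Range("A1").Select
        Range("A1:B1").Select                    ' step 12
        Selection.AutoFill Destination:=Range("A1:L1"), Type:=xlFillDefault
        ActiveWorkbook.SaveAs Filename:="Sample Workbook.xlsx", _
            FileFormat:=xlOpenXMLWorkbook, CreateBackup:=False
    End Sub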
# Specifying How to Trigger an Existing Macro

If you didn't assign a way of running the macro when you recorded it, you can assign a way of running it as described here.

## Assigning a Macro to a Quick Access Toolbar Button in Word

To assign a macro to the Quick Access Toolbar, follow these steps:

1. Right-click anywhere on the Quick Access Toolbar (it's the set of icons in the upper-left corner, above the Ribbon). A menu appears.

2. Click Customize Quick Access Toolbar on the menu. The Word Options dialog box appears.

3. In the Choose Commands From drop-down list, select Macros.

4. Click the name of the macro you want to assign a button to.

5. Click the Add button to copy the macro name into the list of buttons on the right.

6. Click the Modify button if you want to assign a different icon or modify the button's name.

7. Click OK to close the dialog box.

## Assigning a Macro to a Shortcut Key Combination

The section "Running a Macro via a Shortcut Key Combination," earlier in this chapter, explained how to do this in Word. PowerPoint and Access do not let you assign a macro to a key combination. Excel uses a slightly different approach than Word, limiting you to Ctrl and Shift combinations, as described earlier in this chapter in the section "Assigning a Way to Run a Macro in Excel."

# Deleting a Macro

To delete a macro you no longer need, follow these steps:

1. Press Alt+F8 to display the Macros dialog box.

2. Choose the macro in the Macro Name list box.

3. Click the Delete button.

4. In the warning message box that appears, click the Yes button. Figure 1.9 shows Excel's variation of this warning message box.

Figure 1.9 When you delete a macro, the application checks to make sure you mean to do so.

5. Click the Close button or the Cancel button to close the Macros dialog box.

* * *

Organizing Macros in Word with the Organizer Dialog Box

Most VBA-enabled applications require you to use the Visual Basic Editor (which is discussed in the next chapter) to move code modules, user forms, and other code items from one file to another. (A _code module_ is a virtual container used for storing macros. A _user form_ is a custom dialog box displayed to the user for input.) But Word provides a useful tool called the Organizer dialog box that you can use to copy, move, rename, and delete code modules, user forms, and other code items directly in the Word interface without opening the Visual Basic Editor.

To use the Organizer dialog box, follow these steps:

1. In Word, press Alt+F8.

2. Click the Organizer button to display the Organizer dialog box, and click the Macro Project Items tab if the Macro Project Items page (shown here) isn't automatically displayed.

3. Look at the two documents or templates listed in the readouts above the two list boxes. Usually, the left list box shows the active document, and the right one shows Normal.dotm. Change these so that one list box shows the document or template that contains the code you want to copy or move and the other shows the destination document or template. (If you want only to delete or rename code items, you need only make the Organizer dialog box list the document or template that contains the items.) To change the document or template listed, click the Close File button underneath the list box on the corresponding side. The Close File button changes to an Open File button. Click this button to display the Open dialog box, navigate to and select the document or template you want, and then click the Open button. The Open dialog box defaults to displaying the Templates folder.

4. You can then delete, rename, copy, and move macro project items. The following list details how to do this:

 * To delete one or more macro project items from a template, choose the item or items in either panel of the Organizer dialog box and click the Delete button. Click the Yes button in the confirmation message box. Any copies of the items in other templates are unaffected.
 * To rename a macro project item, select it in either panel and click the Rename button to open the Rename dialog box. Enter the new name and click the OK button. Any copies of the same item in other templates are unaffected.
+ * To copy one or more macro project items from one template to another, open the templates in the Organizer dialog box. Select the item or items to copy in either panel of the dialog box (the arrows on the Copy button change direction to point to the other panel). Then click the Copy button. If the recipient template contains a macro project item of the same name as one you're copying, Word displays a warning message box telling you that it can't copy the item. If you still want to copy the item, rename either the item you're copying or the item with the same name in the destination template, and then perform the copy operation. + * To move a macro project item from one template to another, copy it as described in the previous paragraph, and then delete the macro project item from the source template. + +5. Once you've deleted, renamed, copied, or moved macro project items, click the Close button to close the Organizer dialog box. If Word prompts you to save any changes to affected documents or templates that aren't open in your Word session, click the Yes button. + +* * * + +# The Bottom Line + +**Record a macro.** + +The easiest way to create a macro is to simply record it. Whatever you type or click—all your behaviors—are translated into VBA automatically and saved as a macro. + +Master It + +Turn on the macro recorder in Word and create a macro that moves the insertion cursor up three lines. Then turn off the macro recorder and view the code in the Visual Basic Editor. + +**Assign a macro to a button or keyboard shortcut.** + +You can trigger a macro using three convenient methods: clicking an entry on the Ribbon, clicking a button in the Quick Access Toolbar, or using a keyboard shortcut. You are responsible for assigning a macro to any or all of these methods. + +Master It + +Assign an existing macro to a new Quick Access Toolbar button. + +**Run a macro.** + +Macros are most efficiently triggered via a Ribbon entry, by clicking a button on the Quick Access Toolbar, or by pressing a shortcut key combination such as Alt+N or Ctrl+Alt+F. When you begin recording a macro, the Record Macro dialog has buttons that allow you to assign the new macro to a shortcut key or toolbar button. However, if you are using the Visual Basic Editor, you can run a macro by simply pressing F5. + +Master It + +Execute a macro from within the Visual Basic Editor. + +**Delete a macro.** + +It's useful to keep your collection of macros current and manageable. If you no longer need a macro, remove it. Macros can be directly deleted from the Visual Basic Editor or by clicking the Delete button in the Macros dialog (opened by pressing Alt+F8). + +Master It + +Temporarily remove a macro, then restore it, using the Visual Basic Editor. +Chapter 2 + +Getting Started with the Visual Basic Editor + +In this chapter, you'll start learning how to use the Visual Basic Editor, a powerful tool bundled with Office 2013 for working with VBA. This programming editor is the culmination of more than 18 years of modifications and improvements. It is highly effective. + +All applications that host VBA use the Visual Basic Editor, so the environment looks much the same no matter which application you're using. + +This chapter covers the fundamentals of the Visual Basic Editor: its components, what they do, and how you use them. You'll learn more advanced maneuvers as you work with VBA later in this book. 
+ +This chapter also shows you how to customize the Visual Basic Editor to make it more comfortable, more in tune with your preferences. This customization doesn't take long, and you'll find the resulting ease of use more than worth the amount of time you invest. + +In this chapter you will learn to do the following: + + * Open the Visual Basic Editor + * Open a macro in the Visual Basic Editor + * Understand the Visual Basic Editor's main windows + * Set properties for a project + * Customize the Visual Basic Editor + +# Opening the Visual Basic Editor + +You open the Visual Basic Editor from the host application you're using. For example, if you're working in Word, you open the Visual Basic Editor from Word. The instance of the Visual Basic Editor that you open is then associated with Word. + +However, you can open two or more instances of the Visual Basic Editor. For example, if you've already opened an instance of the Visual Basic Editor in Word, you could open another instance in Excel, and then another in Access. + +You can open the Visual Basic Editor in two ways: + + * Select a macro that you want to edit. The host application then opens the Visual Basic Editor and displays that macro so that you're ready to work with it. + * Open the editor directly, and then locate the macro code you want to work with. + +The next two sections demonstrate the two ways of opening the Visual Basic Editor, and the third section shows you how to navigate to a macro. + +## Opening the Visual Basic Editor with a Macro Selected + +If you know the name of the macro you want to work with, use this method to open the Visual Basic Editor and the macro at the same time. This example uses Word to open the Transpose_Word_Right macro that you recorded in Chapter 1, "Recording and Running Macros in the Office Applications": + +1. Open Word if it's not already running. + +2. Press Alt+F8 to display the Macros dialog box. + +3. Select the Transpose_Word_Right macro and click the Edit button. Word opens the Visual Basic Editor with the macro displayed and ready for editing, as shown in Figure 2.1. + +Figure 2.1 The Visual Basic Editor with the Transpose_Word_Right macro open in the Code window + +4. Choose File ⇒ Close and return to Microsoft Word to close the Visual Basic Editor for the moment so that you can open it using the method described in the next section. + +## Opening the Visual Basic Editor Directly + +To open the Visual Basic Editor directly, follow these steps: + +1. Open or activate the host application. In this case, open or switch to Word. + +2. Press Alt+F11. The Visual Basic Editor opens. + +* * * + +The Visual Basic Editor Remembers Its Code Window + +Depending on the state of the Visual Basic Editor the last time it was closed, you may see one or more Code windows open. For example, if you left the Code window for the NewMacros module open in the previous section, the Visual Basic Editor will display this Code window again. + +* * * + +If you don't see the Properties window (see Figure 2.1), press F4. More on this important window shortly. + +## Navigating to a Macro + +After opening the Visual Basic Editor directly, use the Project Explorer pane (shown on the left side in Figure 2.1) to navigate to your macro. You also use the Project Explorer to navigate among open projects and modules when you're working in the Visual Basic Editor. 
+ +* * * + +The Project Explorer Resembles Windows Explorer Folder View + +The Project Explorer pane works like a standard Windows Explorer tree when you're viewing folders and subfolders. Depending on the application you're using, you'll see different projects displayed in the tree (more on this later in the chapter). + +* * * + +To navigate to the Transpose_Word_Right macro, follow these steps: + +1. In the Project Explorer pane in the upper-left corner of the Visual Basic Editor, expand the entry for Normal (which represents Normal.dotm, the Normal template) by clicking the + sign to the left of its name. (If the Normal entry is already expanded, skip this step.) + +2. Double-click the Modules entry to expand it. + +3. Double-click the NewMacros module. (This is the global module in which Word automatically stores the macros you record unless you specify a different location in the Record Macro dialog box.) The Visual Basic Editor displays the contents of the module in the Code window on the right side, as you can see in Figure 2.1. + +If the module contains more than one macro, you'll also need to select the macro you want to work with—in this case, the Transpose_Word_Right macro. (If you've recorded only the Transpose_Word_Right macro, only this macro appears in the Code window.) To select a macro, use one of these methods: + + * In the Code window, select the macro from the Procedure drop-down list, as shown in Figure 2.2. (If you hover the mouse pointer over the list before dropping it down, you'll see a tooltip that gives its name: Procedure.) + * Use the scroll bar to scroll to the macro you want to edit, which is identified by the word _Sub_ , the name you gave it, and a pair of parentheses—in this case, Sub Transpose_Word_Right(). + +Figure 2.2 If the module contains two or more macros, scroll to the macro you want to edit, or select it from this Procedure drop-down list. + +* * * + +Maximize Your Code Window + +Eagle-eyed readers will notice a difference between Figures 2.1 and 2.2. By default, the Code window is displayed in "normal" window size. In other words, there is a gray background around it, as you can see in Figure 2.1. This allows you to open other code windows in the same area. However, that's a bit too much micro-multitasking for me, so from now on, I'll display the Code window maximized, as shown in Figure 2.2. This makes it easier to see your code. To do this, click the Code window's Maximize button, just to the left of the red X button that closes the window. + +* * * + +# Using the Visual Basic Editor's Main Windows + +In the following sections, you'll learn how to use the main windows of the Visual Basic Editor to get your work done. + +## The Project Explorer + +The Project Explorer is the tool for navigating among the various objects in the Visual Basic Editor. Figure 2.3 shows the Project Explorer for a Visual Basic Editor session with Word as the host application. + +Depending on the host application and its capabilities, each project can contain some or all of the following elements. (But don't worry about such items as class modules, link libraries, and so on—we'll explore them in later chapters.) + +Figure 2.3 Use the Project Explorer to navigate to the module you want to work with. + + * User forms (windows that make up part of the macro's user interface, such as a custom dialog box that accepts user input). + * Modules containing macros, procedures, and functions. + * Class modules (modules that define objects, their properties, and their values). 
+ * References to other projects or to library files (such as DLLs—Dynamic Link Libraries). + * Objects related to the application. For example, each Word document and template contains a Microsoft Word Objects folder that holds a class object named ThisDocument. ThisDocument gives you access to the properties and _events_ (actions the object can react to, such as a click event) for the document or template. Each Excel workbook contains a class object named ThisWorkbook that gives you access to the properties and events for the workbook and a Sheet object (named Sheet1, Sheet2, and so on) for each worksheet. + +For most host applications, each open document and template is considered a separate project and is displayed as a root in the project tree. The project tree also contains any global macro storage container—such as the Normal.dotm template in Word or the Personal Macro Workbook in Excel—and any add-ins that are loaded. + +As an example, in Figure 2.3, Normal.dotm is identified as Normal, and the active document is identified as Project (C02): a document named C02. + +* * * + +Change a Project's Name at Any Time + +You can change the name of a project by using the Project Properties dialog box (discussed later in this chapter) or by selecting the project and entering a new name in the Properties pane, shown directly below the Project Explorer pane (as seen earlier in Figure 2.1). + +Once you change the name, the project is identified by that name in the Project Explorer, followed by the name of the document or template. For example, if you change the project name of document 2 to Testing, the document project is identified as Testing(2) in the Project Explorer rather than Project(2). + +* * * + +You navigate the Project Explorer in the same way that you navigate the Windows Explorer folder tree: Click the boxed plus sign to the left of a project item to expand the view and display the items contained in the project, and click the resulting boxed minus sign to collapse the view and hide the items again. Double-click a module to display its code in the Code window. Double-click a user form to display it in the Code window. + +The Visual Basic Editor displays the Project Explorer by default, and because the Project Explorer provides fast and efficient navigation among the various elements of your VBA projects, it's usually easiest to keep it displayed unless you're short on screen space or you're working for long periods in the Code window and don't need to switch to other elements. However, most people don't create document-specific macros or large, complicated programs spanning multiple projects. As a result, they just leave all their macros in the NewMacros module. + +To close the Project Explorer, click its close button (the x button in its title bar). To display the Project Explorer again, press Ctrl+R or choose View ⇒ Project Explorer. As you'll see later in this chapter, you can also undock the Project Explorer. This lets you push it aside when you need more room. But it doesn't take up much room, so, again, many people just leave it tucked up there in the upper left. + +In Figure 2.3, three buttons appear on a toolbar at the top of the Project Explorer: + +**View Code** + +Displays the Code window for the selected object. For example, if you select a user form in the Project Explorer and click the View Code button, the Visual Basic Editor displays a Code window containing any code attached to the user form. 
If you select a module or a class module in the Project Explorer and click the View Code button, the Visual Basic Editor displays a Code window containing the code in the module. You can also right-click an item in the Project Explorer and choose View Code from the context menu. + +_Code_ is merely a synonym for programming — the series of commands you type in (or record) to make the computer behave a certain way. Code is sometimes called _programming code_ or _source code._ + +Note that the words used in programming—the terms such as Selection or End Sub employed by a computer-programming language such as VBA—are referred to by a variety of synonyms: statements, keywords, commands, and so on. In this book, I'll frequently simply use the generic term _commands_. + +* * * + +Double-Click Modules to View Their Code + +For a module or a class module, you can also double-click the object to view its code. This is usually faster than selecting it and then clicking the View Code button. For a user form or a file, however, double-clicking displays the View Object option (discussed next) rather than the View Code option. + +* * * + +**View Object** + +Displays a window containing the selected object. The View Object button remains dimmed and unavailable until you select an object (such as a user form or a file or object within a file) that can be displayed. If the selected object is a user form, clicking the View Object button displays the user form; if the selected object is a file or an object within a file, clicking the View Object button displays that object in the host application's window. + +For example, selecting the ThisDocument object for a Word document and clicking the View Object button displays the actual Word document in the Word window. Selecting the Sheet1 object in an Excel workbook and clicking the View Object button displays that worksheet in the Excel workbook in the Excel window. + +* * * + +Viewing an Object + +You can also trigger the View Object mode by right-clicking an object and choosing View Object from the shortcut menu or by double-clicking an object that supports the View Object feature. (If the object doesn't support the View Object feature, double-clicking it triggers the View Code mode instead.) + +* * * + +**Toggle Folders** + +Toggles the view of the objects in the Project Explorer between _folder view_ (a view that shows the objects grouped within their projects and folders) and _contents view_ (which displays only the objects within their projects—no folders are shown). + +The left part of Figure 2.4 shows the Project Explorer for an application session sorted by folder view, and the right part shows the Project Explorer for the same situation in contents view. Whether you spend more time in folder view or contents view will depend on the size of your screen, the number of objects you put in any given project, and the way your mind works, not necessarily in that order. For many purposes, you'll want to toggle between folder view and contents view to locate objects most easily. + +Figure 2.4 Folder view (left) displays the objects separated into folders beneath the projects that contain them. Contents view (right) displays only the objects and the projects that contain them. + +The Project Explorer has several uses, which is another reason to keep it open all the time. Apart from navigating to the items you need to work with, you can perform the following additional tasks with the Project Explorer: + + * Add components to or remove them from a project. 
For example, you can use the Project Explorer to add a module or a user form to a project. + * Compare the components of one project to the components of another project. Such a comparison can be useful when you need to establish the differences between two or more projects quickly (for example, your reference copy of a company template and the copies users have been adding to). + * Move or copy items from one project to another. You can drag a code module, class module, or user form from one project to another in the Project Explorer to copy it or from the Project Explorer in one instance of the Visual Basic Editor to a project in the Project Explorer in another instance. For example, you could drag a user form from a Visual Basic Editor instance hosted by Excel to a Visual Basic Editor session hosted by PowerPoint to copy the user form. You can't, however, copy or move objects that are specific to a particular application's object model; for example, you can't drop an Excel sheet into Word's Project Explorer because Word doesn't support that type of object. + * Import or export a code module or a user form to or from a project. + +* * * + +The Project Explorer Is Your Best View + +Many actions that you can perform through the Project Explorer you can also perform through the Visual Basic Editor's menu items. In general, though, the Project Explorer provides the easiest way to navigate from module to module in the Visual Basic Editor, especially if you ever have several complex projects open at the same time. You can access the most commonly used features for an object by right-clicking it in the Project Explorer to display the shortcut menu. + +* * * + +## The Object Browser + +The Visual Basic Editor provides a full Object Browser for working with objects in VBA. You'll look at the Object Browser in detail in Chapter 8, "Finding the Objects, Methods, and Properties You Need," and when you examine the object models for the various Office applications in the final part of the book. But in the meantime take a quick look at Figure 2.5, which shows the Object Browser for a Word VBA session. The Document object is selected in the left-hand panel, and a list of its properties appears in the right-hand panel. (To see this in your VBA Editor, press F2.) + +Figure 2.5 The Object Browser provides a quick way to look up objects and their properties. Here, you can see the properties contained in the Document object. + +You'll find that a number of these properties immediately make sense from your general knowledge of Word documents. For example, as you would expect, the AttachedTemplate property tells you which template the document is currently attached to. Likewise, the Bookmarks property contains information on any bookmarks in the document. The property information is displayed at the bottom of the Object Browser. One of the great things about the BASIC language, of which VBA is a variant, and the libraries of objects underlying the Office applications is that they generally use ordinary English terminology. + +## The Code Window + +You'll do most of the actual work of testing and editing your macros in the Visual Basic Editor's Code window. (Since code is written in plain text, you could simply write it in Notepad, then paste it into the code editor for testing and debugging. But the Visual Basic Editor offers so many useful programming tools that only the brilliant few can easily get good results by trying to wing it without any assistance from the editor.) 
The Visual Basic Editor provides an individual Code window for each open project, for each document section within the project that can contain code, and for each code module and user form in the project. Each Code window is identified by the project name, the name of the module within the project, and the word _Code_ in parentheses. Figure 2.6 shows the Visual Basic Editor Code window with the Transpose_Word_Right macro open in it.

Figure 2.6 You edit macros in the Code window.

As you can see from the figure, two drop-down list boxes appear just below the title bar of the Code window:

 * The Object drop-down list box at the upper-left corner of the Code window provides a quick way of navigating between different objects.
 * The Procedure drop-down list box at the upper-right corner of the Code window lets you move quickly from procedure to procedure within the current module. Click the down arrow button to display the drop-down list of procedures. You'll see that the first procedure is (Declarations). Clicking this item in the list takes you to the Declarations area at the top of the current code sheet, which is where you declare public variables and other VBA information that multiple procedures need to know.

The Visual Basic Editor Code window provides a half-dozen features that help you edit code efficiently and accurately, as discussed in the following sections.

### Complete Word

The Complete Word feature completes the word you're typing into the Code window, once you've typed enough letters to distinguish that word from any other. If you haven't typed enough letters to distinguish the word, the Visual Basic Editor gives you the closest possibilities (see Figure 2.7). You can either "type down" (continue typing to narrow the selection) or scroll through the displayed list to find the one you want.

Figure 2.7 The Complete Word feature automatically completes a term when you've typed enough to identify it. If you haven't typed enough, you can choose from a short list.

The easiest way to activate Complete Word when you're typing code is to press Ctrl+spacebar. You can also choose Edit ⇒ Complete Word or click the Complete Word button on the Edit toolbar (see Figure 2.8). Note that the Edit toolbar isn't visible by default. Open it by choosing View ⇒ Toolbars ⇒ Edit or by right-clicking the toolbar area in the editor, then choosing Edit from the shortcut menu that appears.

Figure 2.8 The Edit toolbar contains features used when working in the Code window.

### Quick Info

The Quick Info feature displays a ScreenTip showing syntax information about the currently selected variable, function, method, statement, or procedure. (_Selected_ here just means the word in the code that's under or adjacent to the blinking insertion point.) If you type a command like MsgBox and then press the spacebar, the ScreenTip pops up to help you finish typing the command. The tip shows both the required and optional elements of that command. Optional elements are enclosed in square brackets.

Figure 2.9 shows an example of a Quick Info ScreenTip.

Figure 2.9 Use the Editor's Quick Info feature to see a VB language command's syntax or a quick readout of status.

To display Quick Info, use one of these methods:

 * Just type a space following a VB command. For example, type **msgbox** followed by a space.
 * Click the Quick Info icon on the Edit toolbar.
 * Right-click a VB command and choose Quick Info from the shortcut menu.
 * Position the insertion point in the command and press Ctrl+I.
 * Position the insertion point in the term and choose Edit ⇒ Quick Info.
 * If you're typing in actual commands from the VBA language (as opposed to, say, variables or objects), the easiest way to see Quick Info is just to type the command's name (such as **MsgBox**), then press the spacebar. Note that VB doesn't pay any attention to capitalization, so you can type **msgbox** or **MsgBox** or whatever variation you wish. Once you finish typing the line of code (by pressing Enter), the editor automatically capitalizes the command the standard way: MsgBox.

### Auto List Members

Many VB commands have properties (qualities) and methods (behaviors). Taken together, the properties and methods of an object are called its _members_.

For example, a message box can display various icons (such as a question mark, an exclamation point, and so on) to cue the user about the purpose of the message (question, warning, etc.). This icon is controlled by the _Buttons_ argument of the message-box command, which is specified right after the text message in the line of code. Therefore, when I type a comma to indicate that I'm now going to specify the icon for my message box, the Auto List Members feature opens a drop-down list of the choices available. As you can see in Figure 2.10, I'm choosing vbOKOnly, but there are a number of other possible choices, such as vbOKCancel, vbQuestion, and so on.

Figure 2.10 Use the Auto List Members command to enter code items quickly and accurately.

The Auto List Members list allows you to quickly complete the line of code. Auto List Members is switched on by default and is automatically displayed when you type a period in an object description or a comma, parenthesis, or other punctuation in a line of code. Notice in Figure 2.10 that I've typed a message-box command followed by the text Hello, Marvin! and then a comma. As soon as I typed the comma, the list of settings for the Buttons argument appeared. (These settings are called _constants_.)

Alternatively, you can display the list box by clicking the List Properties/Methods button on the Edit toolbar.

To use Auto List Members to insert your choice into your code, follow these steps:

1. Press the down arrow key to scroll down to the property or method, or scroll down with the mouse (see Figure 2.10). You can also type the first few letters of the property or method's name to jump to it.

2. Enter the property or method into the code by doing one of the following:

a. Press Tab, or double-click the property or method, if you want to continue adding to this line of code after entering the property or method. (There might be additional optional arguments you want to specify on this line.)

b. Press Enter if you want to start a new line after entering the property or method.
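To make this concrete, the line being built in Figure 2.10 ends up reading as follows once you pick a constant from the list (here vbOKOnly, as in the figure):

    Sub ShowGreeting()
        ' Auto List Members popped up the list of Buttons constants
        ' as soon as the comma after the message text was typed.
        MsgBox "Hello, Marvin!", vbOKOnly
    End Sub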
### List Constants

The List Constants feature displays a pop-up list box containing the constants available for a property you've typed so that you can quickly complete the expression. List Constants is switched on by default. Alternatively, you can display the list box by clicking the List Constants button on the Edit toolbar.

To use List Constants (see Figure 2.11), follow these steps:

1. Press the down arrow key to scroll down to the constant, type its first letter (or first few letters), or scroll down with the mouse.

2. Enter the constant in the code by doing one of the following:

a. Press Tab, or double-click the constant, if you want to continue working on the same line after entering the constant.

b. Press Enter if you want to start a new line after entering the constant.

Figure 2.11 The List Constants feature saves you time and effort, especially when typing complex constant names.

### Data Tips

The Data Tips feature displays a ScreenTip containing the value of a variable that the mouse pointer moves over when the Visual Basic Editor is in Break mode (a mode you use for testing and debugging macros, described later in this book). Figure 2.12 shows an example. The Data Tips feature is switched on by default, but you can switch it, and other features, off by choosing Tools ⇒ Options.

Figure 2.12 Use the Data Tips feature to check the value of a variable when you're running or stepping through code.

### Margin Indicators

The Margin Indicators feature lets you quickly set a breakpoint, the next statement, or a bookmark by clicking in the margin of the Code window. You'll look at setting breakpoints, setting the next statement, and setting bookmarks later. (You can right-click the gray margin on the left side of the Code window, then choose Toggle from the shortcut menu to manipulate breakpoints or bookmarks. You can also just left-click in the margin to toggle breakpoints.)

### Other Editing Features

Apart from these features, the Code window includes standard Office editing features such as copy and move, cut and paste, and drag and drop. You can drag code from one procedure or module to another.

## The Properties Window

The Visual Basic Editor provides a Properties window you can use to view and modify the properties of an object in VBA, such as a project, a module or class module, a user form, or a _control_ (a button or check box in a dialog box, for example). If the Properties window isn't visible in the Editor, press F4.

In the drop-down list at the top of the Properties window, you can select the object whose properties you want to view or modify. The Alphabetic option displays an alphabetical list of the properties of the item, and the Categorized option presents a list of the properties broken down into categories. Generally, I find the categorization less than useful because many properties don't really fit neatly into any particular category.

Figure 2.13 shows the Alphabetic option with the properties for an Excel workbook on the left and the Categorized page on the right. (Showing the Categorized page for the Excel workbook or worksheet isn't very helpful because all of the properties belong to a Misc category—miscellaneous. There's no categorization here at all.)

Figure 2.13 Use the Properties window to view the properties of a project, user form, module, class module, or control.

The purpose of most of the workbook properties is easy to grasp. For example, if the HasRoutingSlip property is set to False, the workbook does not have an email routing slip attached to it, and if the Saved property is set to True, the workbook does not contain any unsaved changes. You'll learn about the properties for user forms in Chapter 14, "Creating Simple Custom Dialog Boxes," and Chapter 15, "Creating Complex Forms."

* * *

Understanding Design Mode, Run Mode, and Break Mode

The Visual Basic Editor can be in one of three modes, reflecting three fundamental phases of programming—writing code, locating a bug, and fixing a bug:

**Design mode**

Also known as _design time_.
Anytime you're working in the Visual Basic Editor on your code, you're in Design mode. You don't have to be actively designing anything visually—such as a user control or form—although you often will be. You will also often just be typing in _source code_ —the commands that Visual Basic will execute when you switch to Run mode. Or you might be editing code you've recorded. + +**Run mode** + +Also known as _runtime_. When code is running, you're in Run mode. The macro will be executed just as if it had been launched from within an application like Word (using a shortcut key combination or via clicking a Quick Access Toolbar button). The purpose of Run mode in the Visual Basic Editor is to allow you to test and observe the code's behavior and interact with it if necessary, to see that it works as it's supposed to. This is known as _debugging_. If you do find any problem during runtime testing, you can stop the execution by pressing Ctrl+Break and then check the values in variables or otherwise attempt to track down _where_ in your code the error is located. VBA itself can also throw you into Break mode if it detects an error condition. + +**Break mode** + +When code is running but execution is temporarily suspended, you're in Break mode. Among other things, Break mode lets you step through your code one command or one procedure at a time (rather than running all the commands at once at full speed). Stepping is a very handy tool when you're debugging or otherwise critiquing your code. You'll explore debugging techniques in detail in Chapter 17, "Debugging Your Code and Handling Errors." + +* * * + +The Visual Basic Editor displays the Properties window by default, but you can close it by clicking its close button (the x button). To display the Properties window again, press F4 or choose View ⇒ Properties Window. + +To change a property, click the cell containing the property's name. If a down arrow button appears in the value cell, click it to choose a new value from a drop-down list. If no button appears, click the value cell to display the blinking insertion cursor and type in a new value. + +You'll be able to choose different values from drop-down lists, depending on the type of property. For a True/False property, you'll be limited to those two choices in the drop-down list. For a text property such as Name, you can enter any valid VBA name. + +By default, the Properties window is docked below the Project Explorer. You can adjust the relative heights of the Properties window or the Project Explorer window by dragging the border between them. Or you can widen both at once by dragging the border to their right. If you undock the Properties window (drag it), you can resize it by dragging its borders or corners to display more properties or to shrink the window so it takes up less space in the Visual Basic Editor. Undock interior windows (also called _panes_ , such as the Properties pane) by dragging them by their title bar or by double-clicking their title bar. Redock by double-clicking their title bar or dragging them back into position. + +## The Immediate Window + +Beyond the Project Explorer, the Code window, and the Properties window, the Visual Basic Editor includes a number of other windows that it doesn't display by default. Two of the key windows are the Object Browser (described earlier in this chapter) and the Immediate window, which you'll use during the discussion of the VBA language in Chapter 5, "Understanding the Essentials of VBA Syntax." 
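You'll meet the Immediate window properly in a moment, but as a one-line taste: assuming a document is open in Word, typing the following into the Immediate window and pressing Enter prints the document's name (the question mark is the editor's shorthand for Print):

    ? ActiveDocument.Name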
+ +The Immediate window, shown in Figure 2.14, is a small, unadorned window you can use as a virtual scratch pad to enter lines of code you want to test without entering them in an actual macro. When you type a line of code into the Immediate window and press the Enter key, the Visual Basic Editor executes that code. + +Figure 2.14 Use the Immediate window for on-the-fly work and information. + +To display the Immediate window, press Ctrl+G or choose View ⇒ Immediate Window. + +* * * + +Display Variables' Status during Debugging + +You can also use the Immediate window to display information to help you check the values of variables and expressions while code is executing. That is done by using the Debug.Print command, as in this example, which displays the value of the variable _x_ in the Immediate window: + + Sub ShowDebug() + Dim x As Integer + x = 12 + **Debug.Print x** + End Sub + +* * * + +# Setting Properties for a Project + +Each VBA project has several properties of its own that you can set, including its project name, its description, and whether it is locked against viewing. To examine or set the properties for a project, right-click the project or one of its components in the Project Explorer and choose the Properties item in the context menu to display the Project Properties dialog box. + +Both the menu item and the resulting dialog box are identified by the description of the project—for example, the properties dialog box for a template in Word is identified as TemplateProject – Project Properties, and the properties dialog box for an Excel workbook is identified as VBAProject – Project Properties. Figure 2.15 shows the Project Properties dialog box for an Excel workbook project. + +Figure 2.15 Use the Project Properties dialog box to view and set the properties for a project and to lock a project against change. + +Here's what you can do on the General tab of the Project Properties dialog box: + + * Set the project name in the Project Name text box. This name identifies the project in the Object Browser and, when necessary, in the Windows Registry. Make sure the name is unique to avoid confusion with any other project. Technically, the project name is the name of the type library for the project (a _type library_ describes the objects—such as modules and user forms—that the project contains); it is used to build the fully qualified class name of classes in the project (more on this later in the book). The project name can contain underscores but cannot contain spaces. + * Enter a description of the project in the Project Description text box. This description appears in the Description pane in the Object Browser to help the user understand what the project is. So be as concise, yet descriptive, as possible. + * Designate the Help file for the project by entering the name and path of the Help file in the Help File Name text box. Click the button marked with the ellipsis (...) to the right of the Help File Name text box to display the Help File dialog box. Then select the file and click the Open button to enter the name of the Help file in the text box. (Alternatively, you can type or paste in the name and path.) + * Specify the Help context for the project in the Project Help Context ID text box. The _Help context_ refers to a location in the Help file. The default Help context is 0, which causes the Help file to display its opening screen (the same screen you'll see if you run the Help file from the Run dialog box or by double-clicking the file in Explorer). 
You can specify a different help context to take the user to a particular topic—for example, one more relevant to the project on which they're seeking help. + * Specify any conditional compilation arguments needed for the project. + +Here's what you can do on the Protection tab of the Project Properties dialog box, shown in Figure 2.16: + + * Select the Lock Project For Viewing check box to prevent other people from opening the project, viewing it, and changing it without knowing the password. + * In the Password To View Project Properties group box, enter a password for the project in the Password text box, and then enter the same password in the Confirm Password text box. Click the OK button and then close the project. Now nobody can open and view (let alone change) the project if they don't know the password. That said, Office's password security has been weak and was easily cracked prior to Office 2007. Now superior encryption techniques are used, but the password is still crackable, albeit with far greater difficulty. More on this in Chapter 19, "Securing Your Code with VBA's Security Features." + +Figure 2.16 The Protection page of the Project Properties dialog box lets you lock your project with a password so that nobody can view or edit it + +* * * + +Select Lock Project For Viewing If You Want to Prevent Others from Opening It + +If you enter a password in the Password text box and the Confirm Password text box but you don't select the Lock Project For Viewing check box, the Visual Basic Editor will prompt you for the password the next time you try to display the Project Properties dialog box. However, you'll be able to open and view the project and its contents without supplying the password. + +* * * + +# Customizing the Visual Basic Editor + +Given how much time you're likely to spend in the Visual Basic Editor, you ought to customize it so you can work as efficiently and comfortably as possible. You can customize it as follows: + + * Choose editor and view preference settings in the Visual Basic Editor to control how it interacts with you + * Choose which windows to display in the Visual Basic Editor, and organize their layout so you can use your workspace as effectively as possible + * Customize the toolbar and menus in the Visual Basic Editor so the commands you need are at hand (without cluttering up your workspace) + * Customize the Toolbox so it contains the tools you need to build your user forms + +The following sections explain your options. + +* * * + +Customization Is Global across Applications + +Any customizing you do to the VBA Editor applies across all Office applications using the version of VBA you are customizing. For example, if you change the font in an instance of the Visual Basic Editor hosted by Excel, the font also changes for Editor instances hosted by Word, PowerPoint, Outlook, and so on. + +* * * + +## Choosing Editor and View Preferences + +To begin choosing editor and view preferences, choose Tools ⇒ Options to open the Options dialog box (see Figure 2.17). + +Figure 2.17 The Editor page of the Options dialog box + +### Editor Page Options + +The Editor page of the Options dialog box includes the following settings: + +**Auto Syntax Check** + +Controls whether VBA displays warning message boxes when it discovers errors while automatically checking your syntax as you type lines of code. Some people find this feature helpful because VBA instantly points out errors that could otherwise remain unnoticed until you tried to run or debug your code. 
But if your style is to move from one unfinished line of code to another (and ultimately finish all the lines at your convenience), you may want to turn off this feature to prevent the Visual Basic Editor from bombarding you with message boxes for errors you're aware of but prefer to fix later. This choice is similar to the difference between writers who like to fix spelling errors while they're typing (and thus leave Word's Check Spelling As You Type option active) and those who prefer to keep their eye on the ball and deal with minutia such as spelling after finishing their thoughts. + +* * * + +You'll Always Get a Code Red on Lines with Errors + +Even if you turn off Auto Syntax Check, the Visual Basic Editor still turns any offending lines of code red to draw your attention to them. It simply stops interrupting you with message boxes displaying error warnings each time you mistype something. + +* * * + +**Require Variable Declaration** + +Governs whether you must declare variables explicitly. Declaring variables explicitly is a little more work than declaring them implicitly, but many people believe that it's a good practice and will save you time down the road—so make sure that this check box is selected unless you have a strong preference otherwise. (Chapter 6, "Working with Variables, Constants, and Enumerations," discusses how to work with variables.) + +**Auto List Members** + +Described earlier in this chapter, this option controls whether the Auto List Members and List Constants features automatically suggest properties, methods, and constants as you work in the Code window. Most people find these features helpful, but some experienced programmers turn these features off because they know pretty much all the properties, methods, and constants they need and prefer not to be distracted by a busy interface. + +**Auto Quick Info** + +This option controls whether the Quick Info feature automatically displays information about functions and their parameters as you work with functions in the Code window. + +**Auto Data Tips** + +This option controls whether the Visual Basic Editor displays ScreenTips when you hover the mouse pointer over a variable or expression in Break mode, enabling you to check the value of a variable or expression quickly. (Alternatively, you can use the Locals, Immediate, or Watch window, but these take up more screen space.) + +**Auto Indent** + +Determines whether the Visual Basic Editor automatically indents subsequent lines of code after you've indented a line. When Auto Indent is switched on, the Visual Basic Editor starts each new line of code indented to the same level (the same number of tabs or spaces or the same combination of the two) as the previous line. When Auto Indent is switched off, the Visual Basic Editor starts each new line of code at the left margin of the Code window. Usually, automatic indentation is a time-saver, although it means that each time you need to decrease a new line's level of indentation, you must press Shift+Tab, click the Outdent button on the Edit toolbar, or delete the tabs or spaces. + +**Tab Width** + +Sets the number of spaces in a tab. You can adjust this setting from 1 to 32 spaces. The default setting is 4 spaces, which works well for the default font. If you choose to use a proportional font (such as Times or Arial) rather than a monospaced font (such as the default New Courier) for your code, you may want to increase the number of spaces a tab represents in order to clarify the levels of indentation in your code. 
+ +**Drag-And-Drop Text Editing** + +Controls whether the Visual Basic Editor supports drag-and-drop. Most people find this feature helpful. You can drag portions of your code around the Code window or from one Code window to another. You can also drag code into the Immediate window or drag an expression into the Watch window. + +**Default To Full Module View** + +Controls whether the Visual Basic Editor displays all the procedures in a module in one list (Full Module view) or displays them one at a time (Procedure view). If you're working with short procedures, you may find Full Module view useful. However, the individual view can provide a less cluttered and more workable context for lengthy procedures. When working in Procedure view, you open the procedure you want to work with by choosing it from the Procedure drop-down list at the top of the Code window. To toggle between Full Module view and Procedure view, click the Full Module View button or the Procedure View button in the lower-left corner of any Code window. + +* * * + +Use a Drop-Down List to Quickly Move Procedures + +You can also use the Procedures drop-down list when working in Full Module view to quickly move to a procedure by name. + +* * * + +**Procedure Separator** + +Controls whether the Visual Basic Editor displays horizontal lines to separate the procedures within a module shown in Full Module view in the Code window. Usually these lines are helpful, providing a quick visual cue showing where one procedure ends and the next begins. (If you're using Procedure view, this check box has no effect.) + +### Editor Format Page Options + +The Editor Format page of the Options dialog box, shown in Figure 2.18, controls how code appears in the Visual Basic Editor. + +Figure 2.18 The Editor Format page of the Options dialog box + +By default, comments in your code are rendered in green. This helps you easily recognize that type of text in the code window. You can change the default colors for various types of text by choosing a type of text in the Code Colors list box and then specifying its colors and typeface (font). You have control over Foreground, Background, and Indicator options via drop-down lists. However, I find the default choices sensible, so I don't change them. + +Here's what the Code Colors choices mean: + +**Normal Text** + +Takes care of much of the text in a typical procedure. You'll probably want to make this a conventional color (such as black, the default). + +**Selection Text** + +Affects the color of selected (highlighted) text. + +**Syntax Error Text** + +Affects the color VBA uses for offending lines. The default color is red. + +**Execution Point Text** + +Affects the color VBA uses for the line currently being executed in Break mode. You'll usually want to make this a highlighter color (like the fluorescent yellow the Visual Basic Editor uses as the default) so you can immediately see the current line. + +**Breakpoint Text** + +Affects the color in which VBA displays breakpoints (points where code execution is forced to stop). + +**Comment Text** + +Affects the color of comment lines. The default color is dark green. + +**Keyword Text** + +Affects the color of keywords (words recognized as part of the VBA language). Recall that in this book I'm using the term _command_ for the words in the VBA language. + +Such text accounts for a sizable portion of each procedure. 
You may want to display keywords in a different color than normal text because some people find it helpful to be able to distinguish keywords without needing to read the entire code. The default color is dark blue, which is a good choice—not so intrusive that the characters look like confetti, yet not so hard to see that you can't quickly visualize the underlying syntax of a line of code. + +**Identifier Text** + +Affects the color VBA uses for identifiers. Identifiers include the names of variables, constants, and procedures you define. + +**Bookmark Text** + +Affects the color VBA uses for the bookmarks in your code. + +**Call Return Text** + +Affects the color VBA uses for calls to other procedures. By default, the Visual Basic Editor uses lime green for call return text. + +You can change the font and size of all the types of text in the Code window by using the Font and Size drop-down lists on the Editor Format page. You can also prevent the display of the margin indicator bar (the zone in which items such as the Next Statement and Breakpoint icons appear) by clearing the Margin Indicator Bar check box. (Usually, these icons are helpful, but removing this bar slightly increases the code area onscreen.) + +### General Page Options + +The General page of the Options dialog box contains several categories of settings. The following sections discuss them in groups. I always leave these options set to the default settings, which are shown in Figure 2.19. + +Figure 2.19 The General page of the Options dialog box + +#### _Form Grid Settings Group Box_ + +The Form Grid Settings options control how the Visual Basic Editor handles user forms: + + * The Show Grid check box controls whether the Visual Basic Editor displays a grid pattern of dots on the user form in Design mode to help you place and align controls. This check box is selected by default. + * The Width and Height text boxes set the spacing of the dots that make up the grid. You can set any value from 2 points to 60 points (the default setting is 6 points). If you display the grid onscreen, you'll see the dots; if you don't display the grid, it still affects the Align Controls To Grid feature, discussed next. Experiment and find the coarseness of grid that you find easiest to work with. + * The Align Controls To Grid check box governs whether the Visual Basic Editor automatically snaps the edges of controls you place or move to the nearest grid line. This option lets you place controls in approximately the right positions rapidly and easily, but it prevents you from making extremely fine positional adjustments. The grid enforces certain positions, and you might find it frustrating when trying to improve the layout of controls you've already placed on a user form. (If so, one option is to clear the Align Controls To Grid check box; another is to leave it selected but to decrease the size of the grid—to allow finer adjustments.) + +#### _The Edit and Continue Group Box_ + +The Edit And Continue group box contains only one control—the Notify Before State Loss check box. This option controls whether the Visual Basic Editor warns you, when you're running code, if you try to take an action that requires VBA to reset the values of all variables in the module. 
+ +#### _Error Trapping Group Box_ + +The Error Trapping group box contains three option buttons you use to specify how VBA handles errors that occur when you're running code: + +**Break On All Errors** + +Tells VBA to enter Break mode when it encounters any error, no matter whether an error handler (a section of code designed to handle errors) is active or whether the code is in a class module. Break On All Errors is useful for pinpointing where errors occur, which helps you track them down and remove them. But if you've included an error handler in your code, you probably won't need this option. + +**Break In Class Module** + +This is arguably the most useful option for general use. When VBA encounters an unhandled error in a class module (a module that defines a type of object), VBA enters Break mode at the offending line of code. + +**Break On Unhandled Errors** + +The default setting, this is useful when you've constructed an error handler to deal with predictable errors in the current module. If there is an error handler, VBA allows the handler to trap the error and doesn't enter Break mode, but if there is no handler for the error generated, VBA enters Break mode on the offending line of code. An unhandled error in a class module, however, causes the project to enter Break mode on the line of code that invoked the offending procedure of the class, thus enabling you to identify (and alter) the line that caused the problem. + +#### _Compile Group Box_ + +The Compile group box controls when VBA compiles the code for a project into executable code. Before any code can be executed, it needs to be compiled, but not all the code in a project must necessarily be compiled before the Visual Basic Editor can start executing the first parts of the code. + +You can select the Compile On Demand check box if you want VBA to compile the code only as needed. VBA compiles the code in the procedure you're running before starting to execute that procedure, but it doesn't compile code in other procedures in the same module unless the procedure you're running calls them (transfers execution to them, a technique you'll learn later in this book). + +As a result, execution of the procedure you run first in a module can begin as soon as VBA finishes compiling the code for that procedure. If the procedure then calls another procedure in the module, VBA compiles the code for the second procedure when the first procedure calls it, not when you begin running the first procedure. + +Compile On Demand is usually a good choice. It's especially useful when you're building a number of procedures in a module and have unfinished code lying around in some of them. In contrast, if you clear the Compile On Demand check box, VBA compiles _all_ the code in _all_ the procedures in the module before starting to execute the procedure you want to run. This means that not only does the procedure start a little later (more code takes more time to compile, though most computers today are so fast you won't notice), but any language error or compile error in _any_ procedure in the entire module prevents you from running and testing the current procedure, even if the code in that procedure contains no errors. This is a problem when you've only sketched in some of the procedures, so they remain unfinished. + +Suppose you have a module named Compilation that contains two procedures, GoodCode and BadCode, which look like this: + + Sub GoodCode() + MsgBox "This code is working." 
+ End Sub + + Sub BadCode() + Application.Delete + End Sub + +GoodCode simply displays a message box to indicate that it's working, whereas BadCode contains an invalid statement (Application objects don't have a Delete method). GoodCode runs without causing a problem, but BadCode causes an error every time. + +If you try to run GoodCode with Compile On Demand switched on, the procedure runs fine: VBA compiles only the programming in the GoodCode procedure, finds no errors, and runs it. But if you try to run GoodCode with Compile On Demand switched off, VBA also compiles the code in BadCode before starting to run GoodCode—and VBA stops with a compile error at the bogus Application.Delete statement. This thorough checking before running any code is good for finished modules that work together, but it can slow you down and be annoying when you're just "sketching" code—experimenting with code in a module. + +On the other hand, you can see the advantage of compiling all the code in the module when GoodCode calls BadCode, as in the third line of this version of the procedure: + + Sub GoodCode() + MsgBox "This code is working." + BadCode + End Sub + +Here, compiling the code in BadCode before starting to run GoodCode is a good idea because doing so prevents GoodCode from running if BadCode contains an error. If you run this version of GoodCode with Compile On Demand switched on, VBA compiles GoodCode and starts to run it, displaying the message box in the second line. The BadCode call in the third line then causes VBA to compile BadCode, at which point VBA stops with the compile error. You don't want this to happen in the middle of a complex procedure; in such a case, you'd want Compile On Demand switched off. + +The Background Compile check box, which is enabled only when the Compile On Demand check box is selected, controls whether the Visual Basic Editor uses idle CPU time to compile further code while it's running the code that it has already compiled. Keep Background Compile switched on unless you notice and are bothered by any slowing of the execution of your code. With current computer speeds, and if your projects aren't huge, you'll likely be unaware of any bothersome difference in execution rate. + +#### _Show ToolTips and Collapse Proj. Hides Windows_ + +The final two options on the General page of the Options dialog box are Show ToolTips and Collapse Proj. Hides Windows. Also known as ScreenTips, ToolTips are text descriptions that appear when you hover the mouse pointer over a button or icon. The Show ToolTips check box controls whether the Visual Basic Editor displays ToolTips for its toolbar buttons. ToolTips tend to be useful unless you're desperate to save the memory and processor cycles they consume—which is very unlikely. + +The Collapse Proj. Hides Windows check box controls whether the Visual Basic Editor hides the Code window and other project windows that you collapse in the Project Explorer's tree. This check box is selected by default, and in general it's a useful choice. When you collapse a project in the Project Explorer, the Visual Basic Editor hides any Code windows or user form windows belonging to that project and removes them from the list that appears on the Window menu. When you expand the project again, the Visual Basic Editor displays the windows in their previous positions and restores them to the Window menu's list. 
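
Before we leave the General page, a minimal sketch may help you visualize the Error Trapping options described above. The procedure and its names here are hypothetical:

    Sub SafeDivide()
        On Error GoTo ErrHandler        ' activate the error handler
        Dim divisor As Double
        divisor = 0
        MsgBox 10 / divisor             ' raises a run-time error (division by zero)
        Exit Sub                        ' skip the handler when no error occurs
    ErrHandler:
        MsgBox "Error " & Err.Number & ": " & Err.Description
    End Sub

With Break On Unhandled Errors (the default) selected, the ErrHandler section traps the error and displays a friendly message. With Break On All Errors selected, the Visual Basic Editor enters Break mode on the division line even though a handler is active, which is exactly what you want when you're trying to pinpoint where a problem occurs.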
+ +### Docking Page Options + +The Docking page of the Options dialog box, shown in Figure 2.20, controls whether the various windows in the Visual Basic Editor are dockable—that is, whether they snap automatically and magnetically to a side of the window when you move them there. Keeping windows dockable usually makes for a more organized interface. However, you may want to make the windows undockable so you can drag them off the edge of the Visual Basic Editor if necessary and arrange them as you like on the screen. Contemporary monitors are becoming quite large, so you might have plenty of room to display various windows outside the primary editor window. + +Figure 2.20 The Docking page of the Options dialog box + +## Choosing and Laying Out the Editor Windows + +You can reposition the various windows (or _panes_ ) within the Visual Basic Editor. Your choice of layout depends largely on the size and resolution of your screen and your personal preferences, but here are a couple of suggestions: + + * Always make the Code window large—maximize it. If you write long lines of code, you'll want to have as much space in the Visual Basic Editor window as possible. That way your lines won't wrap and the code will be easier to read. + * Some people find that much of the time they're actively writing code, they can dispense with the Project Explorer, displaying it only when needed. As a handy way of restoring it, you can put the Project Explorer display command on the Code window, Code window break, Watch window, Immediate window, and Locals window context menus. (You'll learn how to customize the editor's menus in the next section.) You can also quickly display the Project Explorer by pressing its shortcut key, Ctrl+R. + * If you're using a multimonitor arrangement, you'll wish you could drag the child windows outside the Visual Basic Editor parent window and onto the second monitor. Unfortunately, they won't go far beyond the boundaries of the parent window. But you can achieve a similar effect by expanding the Visual Basic Editor window from your right-hand monitor onto the left-hand monitor and then docking the Properties window and the Project Explorer on the left-hand monitor. The appearance of the menu bar and toolbar will suffer, but you'll have more space for the Code window, and all three windows will be available. + +## Customizing the Toolbar and Menu Bar + +The Visual Basic Editor supports the same toolbar and menu bar customizations as the classic, pre-Ribbon Microsoft applications used to offer, such as those found in Office 2003. + +However, since the Ribbon was introduced in Office 2007, the lone toolbar is the Quick Access Toolbar, and there are no menus at all in the main application. But the Visual Basic Editor retains the older interface style—enabling you to customize its menus and toolbars in the classic fashion. + +To customize the Visual Basic Editor, choose View ⇒ Toolbars ⇒ Customize (or right-click a displayed toolbar or the menu bar and choose Customize from the context menu) to display the Customize dialog box, shown in Figure 2.21. + +Figure 2.21 Use the Customize dialog box to customize the Visual Basic Editor's menus, toolbars, and context menus. + +* * * + +Limitations of Menu and Keyboard Shortcuts + +The Visual Basic Editor doesn't let you create new menus of your own or customize its keyboard shortcuts. + +* * * + +You can customize the Visual Basic Editor's toolbars, menus, and context menus to suit the way you work. 
Above all, if you use the context menus, be sure to customize them so they provide the commands you need. + +In particular, you may want to add two key commands to the context menus: Comment Block and Uncomment Block. The Comment Block command adds a comment apostrophe (') to the beginning of each line of code in a multiline block of text you select. This transforms these lines into a multiline comment that VBA won't execute. + +The Uncomment Block command reverses the process. It removes the first comment apostrophe from each command in the selected block. This makes the lines executable. (Any line that was commented before you employed the Comment Block command helpfully remains commented after you run the Uncomment Block command. Run the Uncomment Block command again, and you remove further commenting.) + +These commands are available from the Edit toolbar in the normal configuration of the Visual Basic Editor, but you'll probably find it more convenient to make them available at all times from the Code window's context menu. + +The Visual Basic Editor offers the context menus listed in Table 2.1. To customize a context menu, right-click anywhere within the toolbars and menus area. Then choose Customize from the shortcut menu. Now click the Toolbars tab in the Customize dialog box. + +Table 2.1 Context menus in the Visual Basic Editor + +**Context Menu** | **Appears When You Right-Click In or On** +---|--- +MSForms | A user form +MSForms Control | A control on a user form +MSForms Control Group | A group of controls on a user form +MSForms MPC | A multipage control on a user form +Code Window | The Code window in Design mode +Code Window (Break) | The Code window in Break mode +Watch Window | The Watch window +Immediate Window | The Immediate window +Locals Window | The Locals window +Project Window | The Project window in Design mode +Project Window (Break) | The Project window in Break mode +Object Browser | The Object Browser +MSForms Palette | The clear space on a page in the Toolbox +MSForms Toolbox | The tab on a page in the Toolbox +MSForms DragDrop | An item on a user form that can be dragged and dropped elsewhere on the user form +Property Browser | A property in the Properties window +Docked Window | A docked window (for example, the Project Explorer) + +Select the Shortcut Menus check box in the Toolbars list on the Toolbars page of the Customize dialog box. Then click the Commands tab in the Customize dialog box and drag the command you want from the Commands page to the context menu (see Figure 2.22). + +Figure 2.22 Use the Shortcut Menus toolbar to put key commands on the context menus in the Visual Basic Editor. + +Here are some suggestions for customizing the Visual Basic Editor: + + * If you use the Locals window often to track the value of variables when stepping through your code to debug it, place a button for that window on a toolbar that you always keep displayed (the default button for Locals is located by default only on the Debug toolbar), or place an item for it on the context menus for the Code window (both in Design mode and in Break mode), Watch window, and Immediate window. + * Put the Watch window and the Immediate window options on the context menus for the windows from which you'll invoke them. + * If you have a medium-sized monitor, consider grouping all the toolbar buttons you commonly use on one toolbar so that you don't waste space by displaying multiple toolbars horizontally. 
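
To illustrate the Comment Block command mentioned earlier in this section: selecting these two (hypothetical) statements and clicking the Comment Block button

    Selection.TypeText Text:="Draft"
    Selection.TypeParagraph

turns them into comments that VBA skips over:

    'Selection.TypeText Text:="Draft"
    'Selection.TypeParagraph

Clicking the Uncomment Block button with the same lines selected strips the leading apostrophes and makes the statements executable again.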

## Customizing the Toolbox

You can also customize the Toolbox, a special pane that contains controls for building user forms. It can be made visible only when a user form is visible in the Visual Basic Editor. (Chapters 14 and 15 show you how to build user forms.)

You can customize this Toolbox by adding and removing controls and adding new Toolbox pages of your own. Some programmers put their most-used controls on the Toolbox, all on one page, to save themselves time. These controls can include customized variations on the regular Toolbox controls, and by putting them on the Toolbox, you avoid having to customize them again.

For example, many dialog boxes you create need an OK button that dismisses the dialog box, implements some code, and then continues execution of the procedure. Each OK button needs its Name property set to cmdOK, its Caption property set to OK, its Default property set to True, and its Height and Width properties set to a size smaller than the clunky dimensions the Visual Basic Editor assigns by default. Once you've thus customized a command button by modifying all these properties, you can place a copy of the special button on the Toolbox and easily just reuse it for subsequent forms. This saves time. Another candidate for this kind of customization is the TextBox. The default TextBox displays only a single line and uses a nearly unreadable font size of 8. To avoid having to modify these default properties each time you use a TextBox, create a custom TextBox that has multiple lines and is set to a font size of 11.

Another reason to customize the Toolbox is to add advanced controls that extend the things you can do with dialog boxes and user forms.

### Adding Controls to the Toolbox

The first way you'll probably want to add controls to the Toolbox is directly from a user form. For example, once you've created your custom OK and Cancel buttons, or a TextBox, you can copy them from the user form to the Toolbox so you can reuse them in any user forms you subsequently create.

To copy one of your custom controls from a displayed user form to the Toolbox, just drag it and drop it, as shown in Figure 2.23. (Chapter 14 shows you how to put controls onto user forms you create yourself.)

Figure 2.23 The quickest way to add a control to the Toolbox is to drag it there from a user form.

Microsoft and other vendors also provide a variety of prewritten controls you can add to your Toolbox. To add these controls, follow these steps:

1. Right-click in the Toolbox page where you want to add controls. (You'll learn how to add new pages to the Toolbox in the section "Adding Pages to the Toolbox" a little later in this chapter.)

2. Choose Additional Controls from the context menu to display the Additional Controls dialog box shown in Figure 2.24.

3. In the Available Controls list box, click the check boxes for the controls you want to add to the Toolbox, and then click the OK button.

Figure 2.24 In the Additional Controls dialog box, select the check boxes for the controls you want to add, and then click the OK button.

Once you are finished, if you would like to collapse the list to only the currently selected items, click the Selected Items Only check box in the Show group box.

Depending on your computer and what software is installed on it, you may find a variety of interesting and useful controls.
There are numerous controls, but these are among the most noteworthy: + + * A set of Microsoft Outlook controls + * A control for Apple's QuickTime + * A status-bar control + +Some of these controls can add important functionality to your macros. You can also search the Internet for additional specialized controls like calendars, security locks, and so on. Adding prebuilt controls can save you time because you simply drag and drop functionality onto your user forms—functionality that doesn't require you to spend days writing code. + +You can move a control from one page of the Toolbox to another by dragging it from the page it's on and moving the mouse pointer (still dragging) over the tab of the destination page to display that page. Then, move the mouse pointer down (again, still dragging) into the body of that page and drop the control. + +### Renaming a Toolbox Control + +When you move the mouse pointer over a control in the Toolbox, a ScreenTip appears, showing the name of that control. To rename a control, right-click it in the Toolbox and choose the Customize option from the context menu to display the Customize Control dialog box. + +Type the name for the control in the Tool Tip Text box in the Customize Control dialog box (delete or change the existing name as necessary). This name appears as a ScreenTip when the user moves the mouse pointer over the control in the Toolbox. Then, if you wish, assign a different picture to the control's Toolbox icon, as described in the next section. Otherwise, click the OK button to close the Customize Control dialog box. + +### Assigning a Picture to a Control's Toolbox Icon + +Each control in the Toolbox is identified by a picture. You can assign a new picture to the control by displaying the Customize Control dialog box, clicking the Load Picture button, and selecting the picture or icon in the resulting dialog box. + +You can edit the picture assigned to some controls by displaying the Customize Control dialog box, clicking the Edit Picture button, and using the Edit Image dialog box to color the pixels that make up the picture. + +### Removing Controls from the Toolbox + +To remove a control from the Toolbox, right-click it and choose Delete from the context menu. The item is identified by the name of the control—for example, if you right-click a control named Company Name Combo Box, the menu item is named Delete Company Name Combo Box. + +If the item is a custom control you created, this action gets rid of the control and you can't restore it (unless you have a copy elsewhere). If the item is one of the Microsoft-supplied controls that come with the Microsoft Forms 2.0 package (which is part of VBA), you can restore it to the Toolbox using the Additional Controls dialog box. Just select the check box for the appropriate object (for example, Microsoft Forms 2.0 CommandButton). + +You can also remove controls from the Toolbox by deleting the entire page they're on. See "Removing Pages from the Toolbox," later in this chapter. + +### Adding Pages to the Toolbox + +To add a page to the Toolbox, right-click the tab at the top of a page (or the label on the tab) and choose New Page from the context menu. The Visual Basic Editor adds a new page named New Page, to which it adds the Select Objects control. You'll probably want to rename the new page immediately. + +By the way, the Select Objects control (its icon is a black arrow) appears on _every_ page in the Toolbox, and you can't remove it. 
This is strange since you can go years without ever clicking it. This "control" is unlike others. It isn't added to a form. Instead, it must be selected in the Toolbox when you're resizing or repositioning, or when you otherwise need to select a true control on the form. However, when you merely click a control (and following many other actions), VBA automatically activates this "select object" feature—so you'll find that you never actually click it. + +### Renaming Pages in the Toolbox + +To change the name of a Toolbox page, right-click its tab or label and choose Rename from the context menu to display the Rename dialog box. Type the name in the Caption text box, type any control tip text in the Control Tip Text box, and click the OK button to close the dialog box. + +### Removing Pages from the Toolbox + +To remove a page from the Toolbox, right-click its tab or label and choose Delete Page from the context menu. The Visual Basic Editor removes the page from the Toolbox without any confirmation, regardless of whether the page contains controls. + +### Importing and Exporting Toolbox Pages + +If you want to share Toolbox pages, you can save them as separate files and distribute them to your colleagues. Toolbox pages have a .pag filename extension. + +To import a Toolbox page, right-click the tab or label on an existing page in the Toolbox and choose Import Page from the context menu to display the Import Page dialog box. Select the page you want to import and click the Open button in the dialog box. The Visual Basic Editor adds the new page after the last page currently in the Toolbox and names it New Page. + +Right-click the page's tab or label, choose Rename, type a new name and description, and then click the OK button. + +Likewise, you can export a Toolbox page by right-clicking its tab or label and choosing Export Page from the context menu to display the Export Page dialog box. Type a name for the page, choose the folder in which to save it, and then click the Save button to save it. Now anyone can import your page into their editor as described previously. + +### Moving Pages in the Toolbox + +To move a page in the Toolbox, right-click its tab or label and choose Move from the context menu to display the Page Order dialog box. In the Page Order list box, select the page or pages you want to move (Shift+click to select multiple contiguous pages, Ctrl+click to select multiple pages individually) and use the Move Up and Move Down buttons to rearrange the pages as desired. Click the OK button to close the Page Order dialog box when you've finished. + +# The Bottom Line + +**Open the Visual Basic Editor.** + +When you want to create a new macro by hand-programming (as opposed to recording) or need to modify or test a macro, the Visual Basic Editor is a powerful tool. + +**Master It** + +Open the Visual Basic Editor in Word and create a simple macro. + +**Open a Macro in the Visual Basic Editor.** + +You edit and test macro code in the Code window of the Visual Basic Editor. + +Master It + +Open the Visual Basic Editor and display a particular macro in the Code window. + +**Understand the Project Explorer's two views.** + +The Project Explorer window displays a tree of current projects. You can choose between viewing only the files or the folders and files. + +Master It + +Switch between folder and contents view in the Project Explorer. + +**Set properties for a project.** + +You can specify a project's name, an associated Help file, and other qualities of a project. 
+ +Master It + +Lock a project so others can't modify or even read its contents. + +**Customize the Visual Basic Editor.** + +The Visual Basic Editor can be customized in many ways, including personalizing classic menus and toolbars. + +Master It + +Undock the Properties window and change its size. Then redock it. +Chapter 3 + +Editing Recorded Macros + +In this chapter, you'll use the Visual Basic Editor to edit the Word and Excel macros you recorded with the Macro Recorder in Chapter 1, "Recording and Running Macros in the Office Applications." In addition, you'll create a new macro in PowerPoint and see how to edit it. Even if you're working with an application that doesn't include the Macro Recorder (such as PowerPoint), you may still want to read through this chapter because it shows you how to use some of the key editing features of the Visual Basic Editor. + +There are three reasons for working with macros in the Visual Basic Editor: + + * First, to fix any problems in the behavior of a macro you recorded. For example, if you accidentally hit the Enter key while recording the macro, the macro will keep performing that wrong instruction every time you run it unless you remove or change the instruction. You would want to delete this line of code in your macro: + + Selection.TypeParagraph + +(Alternatively, it's sometimes easier to just rerecord the macro.) + + * Second, to add further instructions to the macro to make it behave differently. This is a great way to get started learning VBA because sometimes by just making relatively small or simple changes to a recorded macro, you can greatly increase its power and flexibility. In the process, you become familiar with the language. + * Third, to create new macros by writing them in the Visual Basic Editor instead of recording them. You can write a new macro from scratch or paste in parts of an existing macro, as appropriate. + +In this chapter you will learn to do the following: + + * Test a macro in the Visual Basic Editor + * Set breakpoints and use comments + * Edit the recorded Word macro + * Edit the recorded Excel macro + * Edit a new PowerPoint macro + +# Testing a Macro in the Visual Basic Editor + +If a macro fails when you try to run it from the host application, the quickest way to find out what's going wrong is to open the macro in the Visual Basic Editor, run it, and see where in the code it fails: + +1. In the host application, press Alt+F8 or choose Tools ⇒ Macro ⇒ Macros to display the Macros dialog box. + +2. Select the macro, and then click the Edit button. The host application opens an instance of the Visual Basic Editor and displays the macro for editing. + +3. Start the macro running by pressing F5. Alternatively, you could choose Run ⇒ Run Sub/UserForm or click the Run Sub/UserForm button (a green arrow) on the Standard toolbar in the Visual Basic Editor (see Figure 3.1). + +Figure 3.1 Click the Run Sub/UserForm button on the Standard toolbar to start running the code. + +4. If the macro encounters an error and halts execution (goes into _Break mode_ ), VBA displays an error-message box onscreen and selects the offending statement in the Code window (displays white letters on a blue background). You can then edit the statement to fix the problem. Once you've done so, step through the macro as described in the next section. 
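
If you'd like to watch this behavior in a safe setting, run a trivial macro that's deliberately broken. This sketch is hypothetical and assumes you're working in Word with fewer than 999 documents open:

    Sub WatchItFail()
        MsgBox "About to hit a run-time error..."
        Documents(999).Activate   ' no such member of the collection, so VBA halts here
    End Sub

Press F5 to run it: VBA displays its error-message box, and clicking the Debug button drops you into Break mode with the offending statement highlighted.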
+ +* * * + +Understanding the VBA Editor Modes + +The VBA Editor is always in one of three modes: + + * _Design mode_ when you're designing a user form or writing code + * _Execution mode_ when you've pressed F5 and are running your code, usually to see how it behaves to test it + * _Break mode_ when execution has been halted (so you can examine variables or otherwise take a look at what's going on in the code) + +The Editor halts execution and enters Break mode in several ways: when you press Ctrl+Break, each time you press F8 to single-step through the code, when it encounters a breakpoint that you've set within the code (discussed shortly), or when certain types of errors occur. + +You can tell if you're in Break mode by looking at the Editor's title bar. If you see the word [ _break_ ], you're in Break mode. If it just says _Normal_ , you're in Design mode. When you're in Break mode, you can return to normal Design (editing) mode (so you can type in the Code window to revise and retest the macro) by clicking the Reset button on the Standard toolbar in the Visual Basic Editor (it's the blue square next to the equals sign (Break button)—see Figure 3.1). If you ever find yourself unable to type in the Editor, or the Editor is otherwise behaving strangely, remember to click this Reset button to get out of Break mode and restore normalcy. + +* * * + +* * * + +Test Macros Only on Files You Don't Care About + +Always test your macros on files (or copies of files) that you don't care about. There are few better ways to lose valuable work than to unleash untested macros on a document and watch it get mangled or worse. Store your code in a central location (such as Normal.dotm in Word or the Personal Macro Workbook in Excel) so that it's accessible to all your files rather than only the file that contains it. If you create a macro in the wrong file, export it from that file and import it into your centralized storage. To export the macro, right-click its module in the Project Explorer, choose Export File from the context menu, use the Export File dialog box to specify the folder and filename, and then click the Save button. To import a module, right-click the destination project in the Project Explorer, choose Import File, select the file in the Import File dialog box, and then click the Open button. + +* * * + +## Stepping through a Macro + +To see exactly what a macro does (and what it does wrong), you can _step through_ the macro—go through the macro, executing one command at a time—so that you can see the effect of each command. Stepping through a macro can be time-consuming—you're seeing the macro run in slow motion—but it's one of the best ways to identify problems and fix them. + +Usually debugging is a matter of finding out _where_ in the code something goes wrong. And although you generally already know _what_ goes wrong, you still need to figure out the location of the problem in your code; then you can figure out how the error happens. + +To step through a macro, follow these steps: + +1. Open the host application, and then open the macro for editing: press Alt+F8, select the macro, and then click the Edit button. + +2. Sometimes it's helpful to arrange the Visual Basic Editor window and the host application's window so that you can see them both simultaneously. Either arrange the windows manually or use a Windows command to do so. For example, stack the windows by right-clicking in open space on the Windows Taskbar and choosing Show Windows Stacked from the context menu. 
Alternatively, you can select Show Windows Side By Side. If you have any other applications currently running, minimize them so they won't be included in your stack. (If you have two monitors, you can dedicate one to the Editor and one to the application.) In Windows 7 or 8, the quickest way to display two windows is to drag one of them to the far left (drop it, and it will snap to that location and resize so it takes up 50 percent of the screen). Drag the other window to the right. + +3. Set up conditions the macro expects. Perhaps you need to have a document open. For example, to run properly, a macro that applies a style to a paragraph requires that a paragraph is actually available. + +4. Click somewhere in the macro code. The location of the insertion cursor is how the Editor decides which macro you want to work with. + +5. Press F8 to step through the macro command by command. Each time you press F8, one line of your VBA code will be executed. The Visual Basic Editor highlights each command as it's executed, and you can watch the effect in the application window to catch errors. + +* * * + +Pressing F8 Is the Easiest Way to Step Through Macros + +You can also step through a macro by choosing Debug ⇒ Step Into or clicking the Step Into button on the Debug toolbar, but the F8 key is easiest to use. After all, you'll often need to step repeatedly until you locate the problem. Pressing a single key is quite a bit more efficient than repetitively opening a menu. + +* * * + +Figure 3.2 provides an example of stepping through a macro recorded in Word. As you'll see, to catch what a macro is doing wrong, arrange the application window and the Visual Basic Editor window so that you can see them both. Then step through the macro by pressing the F8 key or using the Step Into command. + +Figure 3.2 Stepping through a macro recorded in Word + +You'll learn about debugging macros in detail in Chapter 17, "Debugging Your Code and Handling Errors." However, let me briefly introduce two additional important techniques that can help you locate bugs in your macros: setting breakpoints and commenting out lines. + +## Setting Breakpoints + +A _breakpoint_ can be set on a line of code to tell VBA to stop executing the macro there. By using a breakpoint, you can run quickly through known functional parts of a macro at full speed (press F5 to run), and then the Editor automatically stops at the breakpoint. You put a breakpoint just before where you suspect a bug is located in the code. That way, you don't have to step through _all_ your code. You can execute the macro at normal, rapid speed—but then halt near the suspicious location and begin pressing F8 to step through the code, executing it slowly, statement by statement, to closely observe the behaviors. You can set as many breakpoints as you wish. + +To toggle a breakpoint on or off, right-click in a line of executable code (not a comment line, described in the following section) and choose Toggle ⇒ Breakpoint from the context menu or click the Toggle Breakpoint button on the Edit toolbar. Even easier, just click in the gray margin indicator bar to the left of the line of code. + +A line of code on which you set a breakpoint is shaded red by default. The breakpoint itself is designated by a red circle in the margin indicator bar (see Figure 3.3). + +Figure 3.3 Use a breakpoint (the red circle that appears in the margin indicator bar) to stop code execution at a line of your choice. 
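
One related trick: because breakpoints aren't saved with your code (see the sidebar that follows), some programmers temporarily insert a Stop statement where they want execution to halt. Stop behaves like a hard-coded breakpoint that survives between editing sessions; just remember to remove it when you've finished debugging. A minimal, hypothetical sketch:

    Sub FillReport()
        ' ...statements you already trust run at full speed when you press F5...
        Stop    ' execution halts here and the Editor enters Break mode
        ' ...press F8 from here to step through the suspect statements...
    End Sub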
+ +* * * + +Breakpoints Are Not Persistent + +Breakpoints are temporary—the Visual Basic Editor doesn't save them with your code. You must specify them for each editing session. + +* * * + +## Commenting Out Lines + +Like most programming languages, VBA lets you add comments to your code so that it's easier to understand. Comments can be invaluable both when you're creating code and when you're revisiting your own code long enough after you've written it to forget what it does—or, worse, trying to figure out what someone else's code does. + +But there's another use for commenting. You can also _comment out_ lines of code to prevent the Visual Basic Editor from executing them. In other words, comments are normally just notes to self that are not part of the macro proper—they are not written in VBA. However, sometimes while debugging you'll want to comment out an actual line of executable code in your macro. That way during execution, this line is simply not executed. It's ignored. + +This can be a useful technique for temporarily skipping over suspect lines of code without actually removing them from the macro. Then you run the code and see what the difference is with the commented lines ignored. If the bug goes away, it's probably located within the lines that are commented out. + +To comment out a line manually, type an apostrophe (') at the very beginning of the line. Alternatively, you can use the Rem command instead of the apostrophe. ( _Rem_ is short for _remark_ , and comment lines are sometimes called remark lines.) To uncomment the line manually, just delete the apostrophe or Rem. + +The Visual Basic Editor provides the Comment Block and Uncomment Block commands for commenting out multiple lines automatically. Select the lines of code (or click in the single line you want to affect), and then click the Comment Block button on the Edit toolbar to place an apostrophe at the beginning of each line; to uncomment a line or a group of selected lines, click the Uncomment Block button, and the Visual Basic Editor removes an apostrophe from each line. + +The Comment Block and Uncomment Block commands work only with apostrophes, not with Rem lines. If you prefer to use Rem, you must comment and uncomment lines manually. Few people, though, use Rem these days. + +* * * + +Comment Block Commands Can Be Efficient + +The Comment Block command adds an apostrophe to the beginning of each line in the selected block, even for lines that are already commented off (this does no harm). Likewise, the Uncomment Block command removes apostrophes one at a time from each line in the selected block rather than removing all apostrophes at once. This behavior helps preserve comment lines and enables you to use different levels of commenting. + +* * * + +## Stepping Out of a Macro + +Once you've identified and fixed the problem with a macro, you probably won't want to step through the rest of the macro command by command. To run the rest of the macro and the rest of any macro that called it (triggered it), you can press the F5 key. Alternatively, you can click the Run Sub/UserForm button on the Standard toolbar or the Debug toolbar (see Figure 3.4), or you can choose Run ⇒ Continue. If you want to run only the rest of _this_ macro, and then return to stepping through the macro that called this one, use the Step Out command. 
The Step Out command finishes executing the current macro or procedure at full speed, but if the code then continues with another procedure, the Visual Basic Editor reverts to Break mode so you can examine that procedure's code. We'll explore what it means to _call_ procedures later in this book.

Figure 3.4 The Debug toolbar contains commands for running code, stepping into it and out of it, and displaying key windows for debugging.

To issue the Step Out command, press Ctrl+Shift+F8, click the Step Out button on the Debug toolbar, or choose Debug ⇒ Step Out.

# Editing the Word Macro

Now, edit the Transpose_Word_Right macro that you recorded in Word in Chapter 1, and use it to build another macro. To begin, open the macro in the Visual Basic Editor:

1. Start Word if it's not already running, or activate it.

2. Press Alt+F8 or choose Tools ⇒ Macro ⇒ Macros to display the Macros dialog box.

3. Select the Transpose_Word_Right macro, and then click the Edit button.

In the Code window, you should see code similar to Listing 3.1, except for the line numbers, which I'm using here to identify the lines of code.

**Listing 3.1**: The recorded transpose-words macro

     1. Sub Transpose_Word_Right()
     2. '
     3. ' Transpose_Word_Right Macro
     4. ' Transposes the current word with the word to its right. _
     5. 'Created 5/5/13 by Nanci Selest-Gomes.
     6. '
     7. Selection.Extend
     8. Selection.Extend
     9. Selection.EscapeKey
     10. Selection.Cut
     11. Selection.MoveRight Unit:=wdWord, Count:=1
     12. Selection.PasteAndFormat (wdFormatOriginalFormatting)
     13. Selection.MoveLeft Unit:=wdWord, Count:=1
     14. End Sub

Here's what the macro does:

 * Line 1 starts the macro with the Sub Transpose_Word_Right() statement, and line 14 ends the macro with the End Sub statement. The Sub and End Sub lines mark the beginning and end of the macro (as they do any macro).
 * Lines 2 and 6 are blank comment lines the Macro Recorder inserts to make your macro easier to read. You can use any number of blank lines or blank comment lines in a macro to help separate statements into groups. (A blank line doesn't have to be commented out—it can just be blank—but the Macro Recorder has added commenting to these blank lines to make it clear what they are.)
 * Lines 3 through 5 are comment lines that contain the name of the macro and its description. The Macro Recorder entered these lines from the information you typed into the Record Macro dialog box.
 * Line 7 records the first keystroke of the F8 key, which starts Extend mode—a way of selecting text in a Word document.
 * Line 8 records the second keystroke of the F8 key, which continues Extend mode and thereby selects the current word.
 * Line 9 records the keystroke of the Esc key, which cancels Extend mode.
 * Line 10 records the Cut command, which cuts the selection (in this case, the selected word) to the Clipboard.
 * Line 11 records the Ctrl+→ command, which moves the insertion point one word to the right.
 * Line 12 records the Paste command, which pastes the selection into the document at the current position of the insertion point. Whatever formatting was originally applied to the selection is retained (rather than applying the formatting in effect at the new location).
 * Line 13 records the Ctrl+← command, which moves the insertion point one word to the left.

## Stepping Through the Transpose_Word_Right Macro

Try stepping through this macro in Break mode using the Step Into command:

1.
Arrange your screen so you can see both the active Word window and the Visual Basic Editor window (for example, by right-clicking the Taskbar and choosing Show Windows Stacked from the context menu or by snapping each window to a side of the screen). + +2. Click in the Visual Basic Editor, and then click to place the blinking insertion point at the start (on the Sub) of the Transpose_Word_Right macro in the Code window. + +3. Press F8 to step through the code one active line at a time. You'll notice that VBA skips the blank lines and the comment lines because they're supposed to be ignored. VBA highlights the current statement each time you press F8, and you see the actions taking place in the Word window. + +The Visual Basic Editor leaves Break mode when it reaches the end of the macro (in this case, when you press F8 to execute the End Sub statement in line 14). The Editor returns to Design mode. You can also exit Break mode at any time by clicking the Reset button (blue square) on the Standard or the Debug toolbar or by choosing Run ⇒ Reset. + +## Running the Transpose_Word_Right Macro + +If the macro works fine when you step through it, you may also want to run it from the Visual Basic Editor. Just press F5. In Break mode, F5 executes the macro from the current instruction (where the insertion cursor is located). + +## Creating a Transpose_Word_Left Macro + +At this point we'll modify the macro. We'll create a Transpose_Word_Left macro by making minor adjustments to the Transpose_Word_Right macro. Follow these steps. + +1. In the Code window, select all the code for the Transpose_Word_Right macro, from the Sub Transpose_Word_Right() line to the End Sub line. You can select in three ways: by dragging with the mouse, by holding down Shift and using the arrow keys to extend the selection, or by positioning the insertion point at one end of the macro and then Shift+clicking the other end. + +2. Copy the code by issuing a Copy command (for example, by right-clicking and choosing Copy from the context menu or by pressing Ctrl+C or Ctrl+Insert). + +3. Click to move the insertion point to the line below the End Sub statement for the Transpose_Word_Right macro in the Code window. + +4. Paste the code by issuing a Paste command (by right-clicking and choosing Paste from the context menu or by pressing Ctrl+V or Shift+Insert). The Visual Basic Editor automatically enters a horizontal line between the End Sub statement for the Transpose_Word_Right macro and the new macro you've pasted. + +5. Change the name of the second Transpose_Word_Right macro to Transpose_Word_ **Left** by editing the Sub line: + + Sub Transpose_Word_Left() + +6. Edit the comment lines at the beginning of the macro accordingly—for example, + + 'Transpose_Word_Left Macro + 'Transposes the current word with the word to its left. _ + 'Created 5/5/13 by Nanci Selest-Gomes. + +7. Now all you need to do is replace the MoveRight method with the MoveLeft method. This will move the insertion point one word to the left instead of one word to the right. While you could do that by typing the correction or by using Cut and Paste to replace the Selection.MoveRight line with the commented-out Selection.MoveLeft line, try using the List Properties/Methods feature instead. Just for practice, follow these steps: + +a. Click to place the insertion point in the word MoveRight. + +b. Click the List Properties/Methods button on the Edit toolbar to display the list of properties and methods. It's the first button on the far left. 
Or just press Ctrl+J. (If the Edit toolbar isn't visible, right-click one of the existing toolbars and choose Edit from the context menu.)

c. Double-click the MoveLeft method in the list to make it replace the MoveRight method in the code line.

8. Now that you no longer need it, delete the line Selection.MoveLeft Unit:=wdWord, Count:=1 from the end of the macro.

You should end up with a macro that looks like Listing 3.2.

**Listing 3.2**: The edited transpose-words macro

    Sub Transpose_Word_Left()
    '
    ' Transpose_Word_Left Macro
    ' Transposes the current word with the word to its left. _
    'Created 5/5/13 by Nanci Selest-Gomes.
    '
    Selection.Extend
    Selection.Extend
    Selection.EscapeKey
    Selection.Cut
    Selection.MoveLeft Unit:=wdWord, Count:=1
    Selection.PasteAndFormat (wdFormatOriginalFormatting)
    End Sub

Try stepping through this macro to make sure it works. If it does, you're ready to save it—and perhaps to create a Quick Access Toolbar button, or keyboard shortcut, for it in Word if you plan to use it in your writing.

## Save Your Work

When you finish working with this or any other macro, choose File ⇒ Save (Ctrl+S) from the Visual Basic Editor to save the document or template that contains the macro and the changes you've made to it. Then press Alt+Q or choose File ⇒ Close And Return To Microsoft Word to close the Visual Basic Editor and return to Word.

# Editing the Excel Macro

In the following sections, you'll edit the Excel macro that you recorded in Chapter 1. This time, you won't create a new macro—instead, you'll add to the existing one.

## Unhiding the Personal Macro Workbook

Before you can edit the Excel macro, you'll need to unhide the Personal Macro Workbook if it's currently hidden:

1. Open the View tab on the Ribbon.

2. If the Unhide button is gray (disabled) in the Window group, then no workbooks are hidden, including Personal. You can skip the following steps. However, if the Unhide button is black (enabled), click it to display the Unhide dialog box.

3. Select PERSONAL.XLSM or PERSONAL.XLSB and click the OK button. If you stored the macro from Chapter 1 in another workbook, open that workbook before trying to proceed. To hide the Personal Macro Workbook again after editing the macro, click the Hide button on the Ribbon while the Personal Macro Workbook is active.

* * *

Creating a Backup Copy of Your Files

Eventually you'll have a collection of macros in the Personal workbook. It's a good idea to keep a backup copy of these files in case something happens—such as reinstalling your Office applications when you buy a new computer. You don't want to lose your macro collection. To create a backup file, just locate PERSONAL.XLSB in Windows 8 by pressing the Windows key+F (or in Windows 7, just by pressing the Windows key) to open the Windows Search field and typing in its name. Then right-click PERSONAL.XLSB in the search-results list and choose Open File Location.

Now you can copy the file, save it to another location, and rename it something like PERSONAL.BAK. You can also find PERSONAL.XLSB by using Windows Explorer to locate it in this folder: Users\ _YourNameHere_ \AppData\Roaming\Microsoft\Excel\XLStart.

Also make a backup copy of any other important macro collections, such as Word's Normal.dotm file.

* * *

## Opening the Macro for Editing

Now take the following steps to open the macro you recorded in Chapter 1 for viewing and editing:

1. Press Alt+F8 to display the Macros dialog box.
+ +2. Select the macro named New_Workbook_with_Months. + +3. Click the Edit button to display the macro for editing in the Visual Basic Editor. Listing 3.3 shows code similar to what you should be seeing. + +**Listing 3.3**: New "workbook with months added" macro + + 1. Sub New_Workbook_with_Months() + 2. ' + 3. ' New_Workbook_with_Months Macro + 4. ' Creates a new workbook with the months filled in for a year. + 5. ' + 6. ' + 7. Workbooks.Add + 8. Range("A1").Select + 9. ActiveCell.FormulaR1C1 = "Jan-2011" + 10. Range("B1").Select + 11. ActiveCell.FormulaR1C1 = "Feb-2011" + 12. Range("A1:B1").Select + 13. Selection.AutoFill Destination:=Range("A1:L1"), Type:=xlFillDefault + 14. Range("A1:L1").Select + 15. ActiveWorkbook.SaveAs Filename:= _ + "C:\Users\ _Richard_ \Documents\Sample Workbook.xlsx", FileFormat:= _ + xlOpenXMLWorkbook, CreateBackup:=False + + 16. End Sub + +(If you are using a version of Office prior to Office 2013, the file location specified in line 15 is likely C:\Users\Richard\AppData\Roaming\Microsoft\Excel\XLSTART\Sample Workbook.xlsx. Replace _Richard_ with your name.) + +Here's what happens in the macro in Listing 3.3: + + * Line 1 starts the macro with the Sub New_Workbook_with_Months() statement, and line 16 ends the macro with the End Sub statement. + * Lines 2, 5, and 6 are comment lines that the Macro Recorder automatically adds. (The comment line in line 6 seems superfluous. It's there because Excel allows you to enter two lines in the Description text box in the Record Macro dialog box, but this macro uses only one line. Delete any blank or comment lines you wish. They'll have no effect on the behavior of the macro, though removing them could make it less readable in the Editor. It's your call.) + * Line 3 is a comment line that gives the macro's name and describes it as a macro, and line 4 contains the description from the Record Macro dialog box. + * Line 7 creates a new blank workbook by using the Add method on the Workbooks collection object. (A _collection_ object, or more concisely a _collection_ , is an object that contains objects of a given type. For example, a worksheet will contain a PivotTables collection of all the PivotTables on that worksheet.) + * Line 8 selects the Range object A1, making cell A1 active. + * Line 9 enters Jan-2011 in the active cell. Notice that the Macro Recorder has stored the parsed date value rather than the text that you typed in (January 2011). Also, keep in mind that the date displayed in the cell may be in a different format than MMM. + * Line 10 selects the Range object B1, making cell B1 active, and line 11 enters Feb-2011 in that cell. + * Line 12 selects the range A1:B1. + * Line 13 performs a default AutoFill operation on the range A1:L1, and line 14 selects that range. Note how the Macro Recorder has recorded two separate actions, although in the Excel interface you performed only one action. + * Line 15 saves the workbook under the name and folder given. Note that the Macro Recorder has automatically broken this long statement onto three lines by using the continuation character, an underscore preceded by a space. You can break lines of code anywhere between keywords to make the lines of code a comfortable length for working within the Editor. Again, lines broken with an underscore at the end have no effect on macro execution. They're merely formatting issues, so it's your call. + +## Editing the Macro + +Now modify the macro by following these steps: + +1. Select lines 8 through 13. + +2. 
## Editing the Macro

Now modify the macro by following these steps:

1. Select lines 8 through 13.

2. Copy these lines by pressing Ctrl+C or right-clicking in the selection and choosing Copy from the context menu.

3. Click at the start of line 14 to move the insertion point there.

4. Paste the copied lines by pressing Ctrl+V, choosing Edit ⇒ Paste, or right-clicking at the insertion point and choosing Paste from the context menu.

5. If necessary, press the Enter key to move the line Range("A1:L1").Select down one line. (Press Enter if this code is red, indicating that it should be moved down one line rather than appended to line 13's code.)

Your new macro should look like Listing 3.4.

**Listing 3.4**: New extended version

     1. Sub New_Workbook_with_Months()
     2. '
     3. ' New_Workbook_with_Months Macro
     4. ' Creates a new workbook with the months filled in for a year.
        ' Recorded 5/5/13 by Abe Normal.
     5. '
     6. '
     7. Workbooks.Add
     8. Range("A1").Select
     9. ActiveCell.FormulaR1C1 = "Jan-2011"
    10. Range("B1").Select
    11. ActiveCell.FormulaR1C1 = "Feb-2011"
    12. Range("A1:B1").Select
    13. Selection.AutoFill Destination:=Range("A1:L1"), Type:=xlFillDefault
    14. Range("A1").Select
    15. ActiveCell.FormulaR1C1 = "Jan-2011"
    16. Range("B1").Select
    17. ActiveCell.FormulaR1C1 = "Feb-2011"
    18. Range("A1:B1").Select
    19. Selection.AutoFill Destination:=Range("A1:L1"), Type:=xlFillDefault
    20. Range("A1:L1").Select
    21. ActiveWorkbook.SaveAs Filename:= _
            "C:\Users\Richard\AppData\Roaming\Microsoft\Excel\XLSTART\Sample Workbook.xlsx", _
            FileFormat:=xlOpenXMLWorkbook, CreateBackup:=False
    22. End Sub

Now, change the macro by taking the following steps:

1. Delete line 6. It's not doing any good, just taking up space in the Code window.

2. Delete line 20. It's not necessary for what the macro does—you don't need the macro to select the range, because the AutoFill instruction in line 19 is enough to perform the AutoFill operation without selecting the range.

3. Change line 14 to select cell A2 instead of cell A1:

    Range("A2").Select

4. Change line 15 so that it enters the value 100 instead of Jan-2011:

    ActiveCell.FormulaR1C1 = 100

5. Change line 16 to select cell B2 instead of cell B1:

    Range("B2").Select

6. Change line 17 so that it enters the value 200 instead of Feb-2011:

    ActiveCell.FormulaR1C1 = 200

7. Change line 18 so that it selects the range A2:B2:

    Range("A2:B2").Select

8. Change line 19 so that it performs the AutoFill operation on the range A2:L2:

    Selection.AutoFill Destination:=Range("A2:L2"), Type:=xlFillDefault

9. Break line 13 with a space, underscore, and carriage return before the Type argument, as shown here. Indent the second line by one tab.

    Selection.AutoFill Destination:=Range("A1:L1"), _
        Type:=xlFillDefault

10. Similarly, break line 19 with a space, underscore, carriage return, and tab before the Type argument.

11. Click the Save button or choose File ⇒ Save to save the changes you made.

The macro should now read like Listing 3.5.

**Listing 3.5**: Streamlined macro

     1. Sub New_Workbook_with_Months()
     2. '
     3. ' New_Workbook_with_Months Macro
     4. ' Creates a new workbook with the months filled in for a year.
        ' Recorded 5/5/13 by Abe Normal.
     5. '
     6. Workbooks.Add
     7. Range("A1").Select
     8. ActiveCell.FormulaR1C1 = "Jan-2011"
     9. Range("B1").Select
    10. ActiveCell.FormulaR1C1 = "Feb-2011"
    11. Range("A1:B1").Select
    12. Selection.AutoFill Destination:=Range("A1:L1"), _
            Type:=xlFillDefault
    13. Range("A2").Select
    14. ActiveCell.FormulaR1C1 = 100
    15. Range("B2").Select
    16. ActiveCell.FormulaR1C1 = 200
    17. Range("A2:B2").Select
    18. Selection.AutoFill Destination:=Range("A2:L2"), _
            Type:=xlFillDefault
    19. ActiveWorkbook.SaveAs Filename:= _
            "C:\Users\Richard\AppData\Roaming\Microsoft\Excel\XLSTART\temp.xlsx", _
            FileFormat:=xlOpenXMLWorkbook, CreateBackup:=False
    20. End Sub

Now step through the macro and watch what happens: it creates the new workbook as before and enters the months, but then it enters the values 100 through 1200 in the second row of cells. This one is fun to watch on a split screen because you can see the cells fill with data as you step through it.

At the end, the macro attempts to save the workbook as before. However, an error message or dialog box warns that a previous workbook exists by this name (unless you've already deleted it). Later you'll see how to handle this type of error so the macro doesn't halt or confuse the user with these kinds of odd error messages or dialog boxes.
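Until then, if the prompt gets in your way while you're experimenting, one common workaround (shown here as a sketch, not part of the recorded macro) is to turn off Excel's alerts around the SaveAs call so the earlier copy is silently overwritten:

    ' Sketch: replace the SaveAs statement with this sequence to
    ' suppress the "file already exists" prompt during testing.
    Application.DisplayAlerts = False
    ActiveWorkbook.SaveAs Filename:= _
        "C:\Users\Richard\AppData\Roaming\Microsoft\Excel\XLSTART\temp.xlsx", _
        FileFormat:=xlOpenXMLWorkbook, CreateBackup:=False
    Application.DisplayAlerts = True

Remember to set DisplayAlerts back to True; leaving it off hides every warning Excel would normally show you.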
Range("B2").Select + 16. ActiveCell.FormulaR1C1 = 200 + 17. Range("A2:B2").Select + 18. Selection.AutoFill Destination:=Range("A2:L2"), _ + Type:=xlFillDefault + 19. ActiveWorkbook.SaveAs Filename:= _ + "C:\Users\Richard\AppData\Roaming\Microsoft\Excel\XLSTART\temp.xlsx", _ + FileFormat:=xlOpenXMLWorkbook, CreateBackup:=False + 20. End Sub + +Now step through the macro and watch what happens: it creates the new workbook as before and enters the months, but then it enters the values 100 through 1200 in the second row of cells. This one is fun to watch on a split screen because you watch the cells fill with data as you step through it. + +At the end, the macro attempts to save the workbook as before. However, an error message or dialog box warns that a previous workbook exists by this name (unless you've already deleted it). Later you'll see how to handle this type of error so the macro doesn't halt or confuse the user with these kinds of odd error messages or dialog boxes. + +## Save Your Work + +When you finish working with this macro, choose File ⇒ Save from the Visual Basic Editor to save the workbook that contains the macro and the changes you've made to it. Then press Alt+Q or choose File ⇒ Close And Return To Microsoft Excel to close the Visual Basic Editor and return to Excel. + +# Editing a PowerPoint Macro + +In this section, you'll edit a PowerPoint macro. PowerPoint no longer includes a macro recorder, so you'll either have to type in the code for the following example or, better, just copy and paste it from this book's Web page at www.sybex.com/go/masteringvba2013. + +Start by opening the PowerPoint Visual Basic Editor: + +1. Open PowerPoint, and choose the blank presentation template (in PowerPoint 2010 and earlier versions, the blank presentation is opened by default). Now add a shape by clicking the Insert tab on the Ribbon, then clicking the Shapes icon in the Illustrations section. + +2. Click a rectangle shape of your choice, and drag on the slide to create it. This will be object 1 in the Shapes collection, so we can refer to it in the code like this: + + ActiveWindow.Selection.SlideRange.Shapes(1).Select + +3. Open the PowerPoint Visual Basic Editor by pressing Alt+F11. + +4. Create a new, empty module by choosing Insert ⇒ Module in the Editor. Now you're ready to add some code. + +5. Type in (or paste from this book's web page) the code shown in Listing 3.6. + +**Listing 3.6**: Add a slide in PowerPoint + + 1. Sub Add_Slide_and_Format_Placeholder() + 2. ' + 3. ' Sample macro that adds a slide, formats its placeholder, + ' and adds text to it. Recorded 6/16/13 by Batfield Dial. + 4. ' + 5. ActiveWindow.View.GotoSlide Index:= _ + ActivePresentation.Slides.Add(Index:=2, _ + Layout:=ppLayoutText).SlideIndex + 6. ActiveWindow.Selection.SlideRange.Layout = ppLayoutTitle + 7. ActiveWindow.Selection.SlideRange.Shapes(1).Select + 8. With ActiveWindow.Selection.ShapeRange + 9. .IncrementLeft -6# + 10. .IncrementTop -125.75 + 11. End With + 12. ActiveWindow.Selection.ShapeRange.ScaleHeight 1.56, msoFalse, _ + msoScaleFromTopLeft + 13. ActiveWindow.Selection.SlideRange.Shapes(1).Select + 14. ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Select + 15. ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Characters _ + (Start:=1, Length:=0).Select + 16. With ActiveWindow.Selection.TextRange _ + 17. .Text = "The quick brown dog jumped over the lazy fox" + 18. With .Font + 19. .Name = "Arial" + 20. .Size = 44 + 21. .Bold = msoFalse + 22. .Italic = msoFalse + 23. 
.Underline = msoFalse + 24. .Shadow = msoFalse + 25. .Emboss = msoFalse + 26. .BaselineOffset = 0 + 27. .AutoRotateNumbers = msoFalse + 28. .Color.SchemeColor = ppTitle + 29. End With + 30. End With + 31. ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Characters _ + (Start:=1, Length:=42).Select + 32. With ActiveWindow.Selection.TextRange.Font + 33. .Name = "Impact" + 34. .Size = 54 + 35. .Bold = msoFalse + 36. .Italic = msoFalse + 37. .Underline = msoFalse + 38. .Shadow = msoFalse + 39. .Emboss = msoFalse + 40. .BaselineOffset = 0 + 41. .AutoRotateNumbers = msoFalse + 42. .Color.SchemeColor = ppTitle + 43. End With + 44. End Sub + +Here's what happens in the macro: + + * Line 1 starts the macro, and line 44 ends it. + * Lines 2 and 4 are blank comment lines used to set off the description of the macro, which appears in line 3. + * Line 5 adds the slide to the presentation. This statement is a little complicated, but don't worry about it too much just yet. For now, note two things: First, the statement uses the Add method with the Slides collection object to add a slide to the collection (in other words, to create a new slide in this case). This is similar to the way the Excel macro explored earlier in this chapter used the Add method to add a workbook to its Workbooks collection. Second, the layout of the slide is ppLayoutText, the VBA constant for the Text slide layout that PowerPoint uses for a default new slide. + * Line 6 applies the Title layout (ppLayoutTitle) that you chose when recording the macro. (If you chose a different slide layout, you'll see a different constant than ppLayoutTitle.) + * Line 7 selects the first shape in the Shapes collection on the active slide. (For the moment, don't worry about how you get to the active slide.) + * Lines 8 to 11 are a With block. This block begins with a With statement that specifies properties or behaviors ( _methods_ ) for the shape that has been selected (ActiveWindow.Selection.ShapeRange). A With statement is a way of simplifying object references, and everything between the With statement and the End With statement refers to the objects that the With statement first mentions. In this case, line 9 uses the IncrementLeft method with a negative value to move the shape to the left, and line 10 uses the IncrementTop method with a negative value to move the shape up the slide. + +* * * + +The With Command Has Two Uses + +With statements have two benefits: They simplify code (because you don't need to specify the object in each of the lines between the With and End With lines), and they make code run faster. + +* * * + + * Line 13 selects the first shape in the Shapes collection, and line 14 selects the TextRange object in the TextFrame object in the shape. When you're working interactively, PowerPoint makes this selection process seamless: You click in a shape displaying the legend "Click to add title" (or whatever), and PowerPoint selects the text range in the shape's text frame—but all you see is that the text in the shape becomes selected. In VBA, you have to go through a couple of unseen layers in the object model before getting to the text. + * When you select the placeholder text, PowerPoint gets rid of it. The same thing happens when you select the placeholder text via VBA. So line 15 makes a new selection at the beginning of the first character in the text range. The Length of the selection is 0, meaning that the selection is collapsed to an insertion point rather than containing any characters. 
Line 16 starts a With statement that continues until line 30. The With ActiveWindow.Selection.TextRange statement in line 16 lets line 17 reference the Text property of the TextRange object in the ActiveWindow object's Selection object much more simply (instead of ActiveWindow.Selection.TextRange.Text), and it lets line 18 reference the Font property of the TextRange object in the Selection object in the ActiveWindow object easily (instead of ActiveWindow.Selection.TextRange.Font). + * Line 17 sets the Text property of the ActiveWindow.Selection.TextRange object to the text typed. + * Line 18 then begins a nested With statement that sets the properties of the Font object for the TextRange object. Line 19 sets the Name property of the Font object to Arial; line 20 sets the Size property of the Font object to 44; line 21 sets the Bold property of the Font object to msoFalse, the Microsoft Office (mso) constant for False; and so on. These statements are not necessary for our purposes in this macro. But they're harmless, so you can leave them in your code or, if you wish, delete this entire With block (as we'll do shortly). Line 29 ends the nested With statement. + +* * * + +With Blocks Can Be Nested + +A nested With statement is one that is placed within another With statement and specifies an object within the object specified in the outer With statement. You can nest multiple-level With statements when necessary. You can see that the With block that begins on line 18 is nested within the outer With block that begins on line 16. + +* * * + + * Line 31 uses the Select method to select characters 1 through 42 in the text range. This is the same as pressing the Ctrl+Shift+Home key combination. Because this statement specifies the characters to select, you'll need to change it if you change the text that this macro inserts. (If you run the statement on a text range that has fewer than 42 characters, it will return an error. If you run it on a text range that has more than 42 characters, it will select only the first 42 characters in the text range—not what you want.) + * Line 32 begins another With statement that works with the Font object of the TextRange object. This With statement imitates what happens if the user opens and modifies the Font dialog box. + * Line 43 ends the With statement, and line 44 ends the macro. + +You can edit this macro by slimming it down a little and changing the text it inserts: + +1. Delete the unnecessary With statement in lines 18 through 29. + +2. Delete line 30. + +3. Change lines 16 and 17 into a single statement without With: + + ActiveWindow.Selection.TextRange.Text = _ + "The quick brown dog jumped over the lazy fox" + +4. Now change the text that the new line 16 inserts. Type text of your choice between the double quotation marks. + +5. Change line 31 to use the Select method on the text _range_ rather than specifying which characters to select. Delete Characters(Start:=1, Length:=42) to leave this statement: + + ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Select + +6. By specifying a range rather than a particular character count, you avoid the problem discussed earlier of having to count characters anytime you change the message. Specifying a character count is called _hard-coding_ and it's to be avoided whenever possible. If there's a way—as there is here with the TextRange property—let the computer figure out the count rather than specifying it in your code. + +7. 
Click the Save button on the Standard toolbar or choose File ⇒ Save to save the changes you've made to the presentation. In the Save As dialog box, locate the Save As Type drop-down list and change it from the default .pptx type (which cannot contain macros) to the .pptm type (which can).

You should now have code that reads like Listing 3.7.

**Listing 3.7**: The macro slimmed down and modified

     1. Sub Add_Slide_and_Format_Placeholder()
     2. '
     3. ' Sample macro that adds a slide, formats its placeholder,
        ' and adds text to it. Recorded 6/16/13 by Batfield Dial.
     4. '
     5. ActiveWindow.View.GotoSlide Index:= _
            ActivePresentation.Slides.Add(Index:=2, _
            Layout:=ppLayoutText).SlideIndex
     6. ActiveWindow.Selection.SlideRange.Layout = ppLayoutTitle
     7. ActiveWindow.Selection.SlideRange.Shapes("Rectangle 4").Select
     8. With ActiveWindow.Selection.ShapeRange
     9.     .IncrementLeft -6#
    10.     .IncrementTop -125.75
    11. End With
    12. ActiveWindow.Selection.ShapeRange.ScaleHeight 1.56, msoFalse, _
            msoScaleFromTopLeft
    13. ActiveWindow.Selection.SlideRange.Shapes("Rectangle 4").Select
    14. ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Select
    15. ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Characters _
            (Start:=1, Length:=0).Select
    16. ActiveWindow.Selection.TextRange.Text = "Welcome to Acme Industries"
    17. ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Select
    18. With ActiveWindow.Selection.TextRange.Font
    19.     .Name = "Impact"
    20.     .Size = 54
    21.     .Bold = msoFalse
    22.     .Italic = msoFalse
    23.     .Underline = msoFalse
    24.     .Shadow = msoFalse
    25.     .Emboss = msoFalse
    26.     .BaselineOffset = 0
    27.     .AutoRotateNumbers = msoFalse
    28.     .Color.SchemeColor = ppTitle
    29. End With
    30. End Sub

Now step through the changed macro and make sure it works as you expect it to. You may need to change Rectangle 4 in the code to match the actual name of the rectangle on your slide.

## Save Your Work

When you finish working with this macro, choose File ⇒ Save from the Visual Basic Editor to save the presentation that contains the macro and the changes you've made to it. Be sure to change the file type from the default .pptx to the macro-enabled .pptm file type. Then press Alt+Q or choose File ⇒ Close And Return To Microsoft PowerPoint to close the Visual Basic Editor and return to PowerPoint.

* * *

When Should You Use the Macro Recorder?

As you've seen so far in this book, you can create VBA code in two ways. First, you can use the Macro Recorder (in the two applications—Word and Excel—that provide one) to record a series of actions when working interactively in the application. Or, second, you can type VBA statements into the Code window in the Visual Basic Editor. You're probably wondering when you should record a macro and when you should create code from scratch. Writing a procedure from scratch is clearly more difficult and more advanced than recording a procedure—so should you always record if a Recorder is available?

Using the Macro Recorder has advantages and disadvantages. The advantages are as follows:

 * The Macro Recorder creates usable code every time (provided you run the macro under suitable conditions).
 * It is quick and easy to use.
 * It can help you discover which VBA objects, methods, and properties correspond to which part of an application's interface.
+ +And here are the disadvantages: + + * Code created in the Macro Recorder may contain unnecessary statements because the Macro Recorder records _everything_ you do in the application—including all the options in every built-in dialog box you use when recording the macro. For example, if you start the Macro Recorder from Word, choose Tools ⇒ Options to display the View page of the Options dialog box, click the Edit tab to display the Edit page, and change the Auto-Keyboard Switching setting, the Macro Recorder will record all the settings on the Edit page as well as all those on the View page. The result is about 40 lines of unnecessary code. (If you visit any other pages in the Options dialog box on the way to the Edit page, the Macro Recorder will record all the settings in those pages as well.) If you create the code manually in the Visual Basic Editor, you can achieve the same effect by using one statement rather than dozens. + * Code created by the Macro Recorder can work only in the active document because whichever document you're working with interactively automatically becomes the active document. Later in this book, you'll learn how to use objects in the applications' object models to work with documents other than the active document. Working with other documents can have advantages; for example, you can make your code run faster or hide from the user the manipulations you're performing. + * The Macro Recorder can create VBA code for only _some_ of the actions you perform in the host application. For example, if you want to display a dialog box or a user form in the course of a procedure, you need to write the appropriate statement manually—you can't record it. The subset of VBA actions available through the Macro Recorder is similar to the set of actions you can take in the host application when working interactively, so you can get a lot done with it. Still, you'll find it's limited compared to the full range of actions you can perform through VBA. + +However expert you become with VBA, consider the Macro Recorder a useful tool for creating either rough-and-ready macros or the basis of more complex procedures. You'll often find it makes sense to have the Macro Recorder handle as much of the strain of creating a procedure as possible. If you can save time by using the Macro Recorder to quickly identify the VBA object or property that you need, then do so. + +In addition, the Macro Recorder can show you how to write some code that you can't figure out how to write on your own. The Recorder always gets the syntax right. + +* * * + +# The Bottom Line + +**Test a macro in the Visual Basic Editor.** + +When you need to modify or debug a macro, the Visual Basic Editor is your best friend. It's filled with tools to make your job easier. + +Master It + +Open a macro; then step through it to see if anything goes wrong. + +**Set breakpoints and use comments.** + +Setting breakpoints allows you to press F5 to execute a macro, but forces the Editor to enter Break mode when execution reaches the line where the breakpoint resides. Comments help you understand the purpose of code—they describe it but are ignored during execution of the macro's code. "Commenting out" a line of code allows you to temporarily render it inactive to see what effect this has during execution. This is sometimes a good way to see if that line is causing the bug you're tracking down. + +Master It + +Set a breakpoint in, and add a comment to, a macro. + +**Edit a recorded macro.** + +Make some changes to a Word macro. 
+ +Master It + +With the Visual Basic Editor open, choose a macro and modify it. +Chapter 4 + +Creating Code from Scratch in the Visual Basic Editor + +In this chapter, you'll practice creating procedures from scratch in the Visual Basic Editor. The examples walk you through creating a procedure in Word, Excel, and PowerPoint. + +For the examples in this book, the Visual Basic Editor should be set up a certain way and (for good practice) set to require explicit declarations of variables. So we'll start off this chapter by ensuring that these conditions are met. + +The purpose of this chapter is to give you a feel for creating code in the Visual Basic Editor before you study the details of the language. You'll work briefly with VBA elements (such as objects, properties, methods, variables, and constants) that you'll learn about more fully later in this book. Along the way, you'll meet several of the many helpful tools that the Visual Basic Editor provides, including the Macro Recorder, the Object Browser, and the Help system. You'll explore these tools more thoroughly later in this book, as well. + +In this chapter you will learn to do the following: + + * Set up the Visual Basic Editor for creating procedures + * Create a procedure for Word + * Create a procedure for Excel + * Create a procedure for PowerPoint + * Create a procedure for Access + +# Setting Up the Visual Basic Editor for Creating the Procedures + +You'll find it easiest to follow the instructions in the following procedures—and in the rest of the book—if you have the Visual Basic Editor set up in a default configuration (like the layout you see the first time you display the Visual Basic Editor from a VBA host). Any changes you make to the VBA Editor will be in effect across all VBA-enabled Office applications. So, if you set up the Editor as described next, it will look like this whether you open it in Excel, Word, Access, Outlook, or PowerPoint. + +The following steps describe how to set up the Visual Basic Editor so it looks like Figure 4.1: + +1. If the Project Explorer isn't displayed, choose View ⇒ Project Explorer or press Ctrl+R to display it. + +2. If the Properties window isn't displayed, choose View ⇒ Properties Window or press the F4 key to display it. + +3. Unless you really prefer things otherwise, dock the Project Explorer in its conventional position at the upper-left corner of the main Visual Basic Editor area. Dock the Properties window below the Project Explorer, again in its default position. (To change docking, choose Tools ⇒ Options, click the Docking tab, and select the Docking options.) To dock an undocked (floating) window, double-click its title bar. + +4. Set up the Visual Basic Editor to require variables to be declared explicitly. The Editor will then enforce a rule that you must declare each variable formally before you can use it in the code. Choose Tools ⇒ Options to display the Options dialog box, select the Require Variable Declaration check box on the Editor page, and then click the OK button. More on variable declaration later in the book, but here's a brief summary. This setting makes the Visual Basic Editor automatically enter an Option Explicit statement for all modules and user forms you create from now on. 
And _that_ statement causes the Editor to check during runtime for any implicitly declared variables and remind you that you must declare them _explicitly_ , like this: + + Dim txtName As String + +Figure 4.1 The default configuration for the VBA Editor + +# Creating a Procedure for Word + +The procedure you'll create for Word causes the Track Changes feature to toggle (between Strikethrough and Hidden) how deleted text will be displayed. With this macro, you'll be able to switch instantly between having deleted text remain onscreen with a line through it or having it simply disappear. + +Start by using the Macro Recorder to provide the necessary object qualifications. Then you can modify the code by hand in the Editor to create the toggle behavior. + +Follow these steps to record the macro: + +1. Start Word. If Word is already running, exit it and restart it. + +2. Record a macro to get to the object qualifications (properties and settings) you need. (Remember that to some, recording may feel like cheating, but the Macro Recorder is truly a gift when it comes to finding objects and getting complicated syntax correctly coded.) Follow these substeps: + +a. Click the Developer tab on the Ribbon; then click the Record Macro button in the Code section to display the Record Macro dialog box. + +b. Either accept the macro name that the Macro Recorder automatically assigns (Macro1, Macro2, and so on) or create a scratch name of your own, such as Temp, that will remind you to delete the macro if you forget to do so. + +c. Leave the Store Macro In drop-down list set to All Documents (Normal.dotm). Leave the description blank. This is a temporary macro just for practice, so we won't add it to our permanent collection. + +d. Click the OK button to start recording the macro. + +e. Click the Review tab on the Ribbon, and then click the small arrow in the lower-right corner of the Tracking section. The Track Changes Options dialog box opens. In that box click the Advanced Options button. (Note that the Advanced Track Changes Options dialog box looks somewhat different in Office 2010 and earlier versions. And you open the first dialog box by clicking the bottom half of the Track Changes icon.) Now ensure that Strikethrough is selected in the Deletions drop-down list (see Figure 4.2), and then click OK twice to close the two Track Changes Options dialog boxes. (Strikethrough is the default, so it's probably already selected—but we want the Recorder to show us how this option is coded in VBA. Clicking OK to close a dialog box records all the current settings in that box.) + +Figure 4.2 The Advanced Track Changes Options dialog box in Word + +f. Repeat the preceding step (e.) to reopen the Track Changes Options dialog box. Now, select Hidden in the Deletions drop-down list, and again click OK to close the dialog box. + +g. Stop recording the macro by clicking the white recording button in the status bar or by clicking the Stop Recording button on the Developer tab on the Ribbon. + +3. Press Alt+F8 to display the Macros dialog box. Select the macro you just recorded and click the Edit button to open it for editing in the Visual Basic Editor. Your code should look like this: + + 1. Sub temp() + 2. ' + 3. ' temp Macro + 4. ' + 5. ' + 6. With Options + 7. .InsertedTextMark = wdInsertedTextMarkUnderline + 8. .InsertedTextColor = wdRed + 9. .DeletedTextMark = wdDeletedTextMarkStrikeThrough + 10. .DeletedTextColor = wdRed + 11. .RevisedPropertiesMark = wdRevisedPropertiesMarkNone + 12. 
.RevisedPropertiesColor = wdByAuthor
    13. .RevisedLinesMark = wdRevisedLinesMarkOutsideBorder
    14. .CommentsColor = wdRed
    15. .RevisionsBalloonPrintOrientation = _
            wdBalloonPrintOrientationPreserve
    16. End With
    17. ActiveWindow.View.RevisionsMode = wdMixedRevisions
    18. With Options
    19. .MoveFromTextMark = wdMoveFromTextMarkDoubleStrikeThrough
    20. .MoveFromTextColor = wdGreen
    21. .MoveToTextMark = wdMoveToTextMarkDoubleUnderline
    22. .MoveToTextColor = wdGreen
    23. .InsertedCellColor = wdCellColorLightBlue
    24. .MergedCellColor = wdCellColorLightYellow
    25. .DeletedCellColor = wdCellColorPink
    26. .SplitCellColor = wdCellColorLightOrange
    27. End With
    28. With ActiveDocument
    29. .TrackMoves = False
    30. .TrackFormatting = True
    31. End With
    32. With Options
    33. .InsertedTextMark = wdInsertedTextMarkUnderline
    34. .InsertedTextColor = wdRed
    35. .DeletedTextMark = wdDeletedTextMarkHidden
    36. .DeletedTextColor = wdRed
    37. .RevisedPropertiesMark = wdRevisedPropertiesMarkNone
    38. .RevisedPropertiesColor = wdByAuthor
    39. .RevisedLinesMark = wdRevisedLinesMarkOutsideBorder
    40. .CommentsColor = wdRed
    41. .RevisionsBalloonPrintOrientation = _
            wdBalloonPrintOrientationPreserve
    42. End With
    43. ActiveWindow.View.RevisionsMode = wdMixedRevisions
    44. With Options
    45. .MoveFromTextMark = wdMoveFromTextMarkDoubleStrikeThrough
    46. .MoveFromTextColor = wdGreen
    47. .MoveToTextMark = wdMoveToTextMarkDoubleUnderline
    48. .MoveToTextColor = wdGreen
    49. .InsertedCellColor = wdCellColorLightBlue
    50. .MergedCellColor = wdCellColorLightYellow
    51. .DeletedCellColor = wdCellColorPink
    52. .SplitCellColor = wdCellColorLightOrange
    53. End With
    54. With ActiveDocument
    55. .TrackMoves = False
    56. .TrackFormatting = True
    57. End With
    58. End Sub

4. That's a daunting amount of code for the few rather simple actions you took. Remember that this is because the Macro Recorder records the settings for _all_ of the possible options in the Track Changes Options dialog box that you visited, not just the option you selected. Look over the code briefly to see the many settings that were recorded from the options inside the dialog box displayed in Figure 4.2.

If you look at the figure, you can see how the code reflects the settings. For example, see the .SplitCellColor = wdCellColorLightOrange line of code and locate the setting it refers to in the dialog box.

5. A second set of nearly identical settings in the code represents your second visit to the dialog box. Notice lines 9 and 35 in particular; these are key. Line 35 reflects the change made on your second visit—specifying a hidden rather than a strikethrough mark for the DeletedTextMark property of the Options object. Notice, too, the two values for this property: wdDeletedTextMarkStrikeThrough (when you recorded the Deletions drop-down specifying Strikethrough) and wdDeletedTextMarkHidden (when you set it to Hidden).

6. Now in the Editor, select the entire recorded macro, from the Sub temp statement down to the End Sub statement, and press the Delete key to get rid of it.

7. Make sure the Visual Basic Editor is set up as described in the section "Setting Up the Visual Basic Editor for Creating the Procedures," earlier in this chapter.

8. In the Project Explorer window, right-click anywhere in the Normal item and choose Insert ⇒ Module from the context menu. The Visual Basic Editor inserts a new module in the Normal.dotm global template and displays a Code window for it.
9. Press the F4 key to activate the Properties window for the new module. (By _activate_ I mean _give the focus to_—whatever window has the focus is the one where typing will be displayed or mouse clicks will have an effect.) The Visual Basic Editor selects the (Name) property, the only property available for this new module. (Confusingly, the property's name is enclosed in parentheses.)

10. Type a name for the new module in the Properties window. For this example, delete the default name (Module1 or Module2 or whatever it is) and type the name Procedures_to_Keep_1.

11. Press the F7 key or click in the Code window to activate it.

12. Verify that the Visual Basic Editor has entered the Option Explicit statement in the declarations area at the top of the code sheet (the code area) in the Code window. If not, go back and complete step 4 in the list at the start of this chapter.

13. Below the Option Explicit statement, type the Sub statement for the procedure and press the Enter key. Name the procedure Toggle_Track_Changes_between_Hidden_and_Strikethrough:

    Sub Toggle_Track_Changes_between_Hidden_and_Strikethrough

14. When you press the Enter key, the Visual Basic Editor inserts for you the required parentheses at the end of the Sub statement, a blank line, and the End Sub statement and places the insertion point on the blank line, ready for you to start typing in some programming:

    Sub Toggle_Track_Changes_between_Hidden_and_Strikethrough()

    End Sub

15. Press the Tab key to indent the first line below the Sub statement.

16. Type **if options.** (in lowercase, and be sure to end with the period). Now the Editor displays the List Properties/Methods drop-down list.

17. Type down through the list (type **d**, **e**, and then **l**) and use the ↓ key, or simply scroll with the mouse, to select the DeletedTextMark entry.

18. Now just type **=** (the equal sign). The Visual Basic Editor enters the DeletedTextMark command for you, followed by the equal sign, and then displays the List Properties/Methods list of constants that can be used with the DeletedTextMark property (see Figure 4.3).

Figure 4.3 The Visual Basic Editor's List Properties/Methods list displays the constants available for the DeletedTextMark property.

19. Select the wdDeletedTextMarkHidden item and enter it into your code by pressing the Tab key or by double-clicking it.

20. Type **Then** and press the Enter key. Note that when you start the next line of code (by pressing Enter), the Visual Basic Editor checks the line of code for errors. If you used lowercase for the If Options part of the statement, the Visual Basic Editor applies capitalization (this is just for show—VBA pays no attention to capitalization when executing code). If there are no space characters on either side of the equal sign, the Visual Basic Editor adds them too.

21. Enter **Options.DeletedTextMark=wdDeletedTextMarkStrikeThrough**, using the assistance offered by the Visual Basic Editor's _Auto List Members_ feature (described earlier, in steps 16 through 18), and then press Enter.

22. Press the Backspace key or Shift+Tab to unindent the new line of code by one tab stop.

23. Type the **ElseIf** keyword, and then enter the rest of the procedure as follows:

    ElseIf Options.DeletedTextMark = wdDeletedTextMarkStrikeThrough Then
        Options.DeletedTextMark = wdDeletedTextMarkHidden
    End If

24.
Make sure your completed procedure looks like this: + + Sub Toggle_Track_Changes_between_Hidden_and_Strikethrough() + If Options.DeletedTextMark = wdDeletedTextMarkHidden Then + Options.DeletedTextMark = wdDeletedTextMarkStrikeThrough + ElseIf Options.DeletedTextMark = wdDeletedTextMarkStrikeThrough Then + Options.DeletedTextMark = wdDeletedTextMarkHidden + End If + End Sub + +25. Press Alt+F11 to switch to Word, and then type in a line or two of text. + +26. Arrange the Word window and the Visual Basic Editor window side by side. In Word, click the Review tab on the Ribbon, and click the upper half of the Track Changes button (the graphic icon) to activate the feature that marks up (or otherwise handles) revisions. Delete a word in your text. Notice whether it is struck through or is simply hidden. You have a macro that toggles between these two behaviors, so in the Visual Basic Editor, press the F5 key or click the Run Sub/UserForm button (on the Standard and Debug toolbars) to run the macro. Back in Word, see what effect the deletion has now. You can also take a look at the Track Changes Options dialog box to see that the Deletions setting has changed. + +27. Click the Save button on the Standard toolbar in the Visual Basic Editor. + +Note that you could alternatively write this macro using a With statement for the Options object so that it looks like this: + + Sub Toggle_Track_Changes_between_Hidden_and_Strikethrough_2() + With Options + If .DeletedTextMark = wdDeletedTextMarkHidden Then + .DeletedTextMark = wdDeletedTextMarkStrikeThrough + ElseIf .DeletedTextMark = wdDeletedTextMarkStrikeThrough Then + .DeletedTextMark = wdDeletedTextMarkHidden + End If + End With + End Sub + +There are usually several ways to code a given behavior in VBA. Although formal (professional) programmers learn a set of "best practices," if you're just a hobbyist writing VBA for your own personal use, go ahead and code however you wish. Whatever works. + +# Creating a Procedure for Excel + +The procedure you'll create for Excel is short but helpful: When the user runs Excel, the procedure maximizes the Excel window and opens the last file used. The procedure also illustrates some useful techniques, including these: + + * Writing a macro that executes when an application first starts up + * Working with events + * Using the Object Browser to find the objects, methods, and properties you need + +Follow these steps to create the procedure: + +1. Start Excel if it's not already running. + +2. Press Alt+Tab to cycle through your workbooks to locate Personal.xlsb. If your Personal Macro Workbook is currently hidden, click the Unhide button in the Window section of the View tab on the Ribbon. Select PERSONAL.XLSB in the Unhide Workbook list box, and then click the OK button. + +3. Press Alt+F11 to open the Visual Basic Editor. + +4. Make sure the Visual Basic Editor is set up as described in the section "Setting Up the Visual Basic Editor for Creating the Procedures" earlier in this chapter. + +5. In the Project Explorer window, expand VBAProject (PERSONAL.XLSB) if it's collapsed. To expand it, either double-click its name or click the + sign to its left. + +6. Expand the Microsoft Excel Objects folder. + +7. Double-click the ThisWorkbook item to open its code sheet in a Code window. The ThisWorkbook object represents the current workbook. + +8. Verify that the Visual Basic Editor has entered the Option Explicit statement in the declarations area at the top of the code sheet. 
If not, go back and complete step 4 in the list at the start of this chapter. However, note that at the time of this writing, even if you select the Require Variable Declaration option (via the Tools ⇒ Options menu in the Excel version of the VBA Editor), Option Explicit is not automatically inserted into your Code window. + +9. In the Code window, type + + Private Sub Auto_Open + +10. and then press the Enter key. The Editor will add the required parentheses and the End Sub line. + +* * * + +Macros Have Scope + +The Private keyword limits the scope of a macro—the area in which it can operate. Private scope makes the macro available to all procedures in the module that contains it, but not to procedures in other modules. Chapter 6, "Working with Variables, Constants, and Enumerations," explains scope in more detail. + +* * * + +11. Open the Object Browser. Press the F2 key, choose View ⇒ Object Browser, or click the Object Browser button on the Standard toolbar to display the Object Browser window (see Figure 4.4). + +Figure 4.4 Use the Object Browser to find the objects, methods, and properties you need for a procedure. + +12. The first action we want to take in this macro is to maximize the Excel's application window. As in any application, VBA uses the Application object to represent the Excel application, but you need to find the correct property of this object to work with. Select Excel in the Project/Library drop-down list (see the label in Figure 4.4), type **maximize** in the Search Text box, and either click the Search button or press the Enter key. The Object Browser displays the result of the search (see Figure 4.5) in its Search Results pane (which was collapsed and not visible in Figure 4.4). The constant xlMaximized is a member of the class XlWindowState. + +Figure 4.5 The result of the search for "maximize" in the Object Browser + +13. Press the F7 key to activate the Code window. (Alternatively, click the Code window, choose View ⇒ Code, or choose the Code window from the Window menu.) + +14. Type **application.** (in lowercase and including the period) so that the Visual Basic Editor displays the drop-down list, type **w** to jump to the items beginning with _W_ , and select the WindowState item. + +15. Type **=** to enter the WindowState item in your code and to display the list of constants available for WindowState (see Figure 4.6). + +Figure 4.6 Use the list of constants to enter the constant quickly and easily. + +16. Select the xlMaximized item and press Enter to insert that property in the code, and move down a line to start writing a new statement. + +17. The second action for the macro is to open the last file used—file 1 on the recently used files list (this is the list that appears in the Recent Documents list when you click the Recent item in the File tab on the Ribbon). Press the F2 key to activate the Object Browser again. + +18. Leave Excel selected in the Project/Library drop-down list, type **recent** , and either press the Enter key or click the Search button. The Object Browser displays the results of the search (see Figure 4.7). The item you need is the RecentFiles property of the Application object. The RecentFiles property returns the RecentFiles _collection_ , an object that knows the information about the files in the recently used files list. + +Figure 4.7 The result of the search for "recent" in the Object Browser + +19. Press the F7 key to return to the Code window. 
Type **application.** and select RecentFiles from the List Properties/Methods drop-down list. Then type **(1).** to indicate the first item in the RecentFiles collection, and select the Open method from the List Properties/Methods list:

    Application.RecentFiles(1).Open

20. That's it. Your procedure should look like this:

    Private Sub Auto_Open()

        Application.WindowState = xlMaximized
        Application.RecentFiles(1).Open

    End Sub

21. Press Alt+Q or choose File ⇒ Close And Return To Microsoft Excel to return to Excel.

22. Click the File tab on the Ribbon and choose Save.

23. Click the Hide button in the Window section of the View tab on the Ribbon. This hides PERSONAL.XLSB from view.

24. Open a sample workbook, type something into one of the cells, save it, and close it.

25. Press Alt+F4 to exit Excel. If you are asked whether you want to save the changes you made to the current workbook and your Personal Macro Workbook, choose Yes.

26. Restart Excel. Notice how Excel automatically maximizes the application window and opens the most recently used file.

27. If you see an error message, it most likely means that you've renamed or moved the most recently used file. To prevent this problem, you can add some error-trapping code. We'll explore the On Error command thoroughly in Chapter 17, "Debugging Your Code and Handling Errors," but if you wish, you can make the following changes to your Auto_Open macro:

    Private Sub Auto_Open()

        **On Error GoTo Problem**

        Application.WindowState = xlMaximized
        Application.RecentFiles(1).Open

        **Exit Sub**

    **Problem:**

        **MsgBox "Error: " & Application.RecentFiles(1).Path & " can't be opened."**

    End Sub

The Auto_Open name is special. When you name a macro Auto_Open, VBA knows that whatever actions are in the macro code should be executed when Excel starts running. This is one of a handful of special names tied to Excel's _events_—things that happen to an object, in this case the Open event of the Excel application. (Notice that an object's _methods_ are actions it can take, such as a print method sending a document to the printer. Conversely, an object's _events_ are things that can happen to it, such as a user clicking a button or opening an application.)
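Excel also offers a true event procedure for this purpose: code named Workbook_Open placed in the ThisWorkbook code sheet runs automatically when that workbook opens. Here's a sketch of the same behavior written as an event procedure (an alternative to, not a replacement for, the steps above):

    Private Sub Workbook_Open()
        ' Runs automatically when the workbook containing this code opens.
        Application.WindowState = xlMaximized
        Application.RecentFiles(1).Open
    End Sub

A convenience of the event version is that the Editor can create the stub for you: with the ThisWorkbook code window active, choose Workbook in the left drop-down list at the top of the window and Open in the right one.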
* * *

How to Turn off Default Templates

The following section describes how to use a template that comes with PowerPoint. You've likely noticed that when you start Office 2013 applications, they display a set of templates. Some users are likely to never use these templates and would prefer the traditional Office applications' behavior: starting with a blank document and bypassing this display of templates. To turn this off, choose File ⇒ Options, then uncheck Show The Start Screen When This Application Starts.

* * *

# Creating a Procedure for PowerPoint

The procedure you'll create for PowerPoint is short and straightforward, but it can save the user enough effort over the long run to make it worthwhile. It adds a title slide to the active presentation, inserting a canned title that includes the current date and the company's name as the presenter.

Follow these steps to create the procedure:

1. Start PowerPoint. If PowerPoint is already running, close it and restart it. If PowerPoint creates a default presentation on startup, close the presentation (click the File tab and choose Close).

2. Create a new presentation based on the Contemporary Photo Album template. In Office 2013, locate the list of Suggested Searches at the top of the default templates window displayed when you first run PowerPoint. Then click Photo Albums. (In previous versions of Office, click the File tab and choose New And Sample Templates.) Make sure the default slide on the presentation has the Title Slide layout by right-clicking a blank area in the slide, then choosing Layout ⇒ Title And Content (it will be called _Title Slide_ in earlier versions of Office) to apply it to the default slide.

3. Press Alt+F11 to open the Visual Basic Editor.

4. Make sure the Visual Basic Editor is set up as described in the section "Setting Up the Visual Basic Editor for Creating the Procedures" earlier in this chapter.

5. In the Project Explorer window, right-click anywhere in the VBAProject(Presentation1) item and choose Insert ⇒ Module from the context menu. The Visual Basic Editor inserts a new module in the project, displays a Code window containing the code sheet for the module, and expands the project tree in the Project Explorer.

6. Verify that the Visual Basic Editor has entered the Option Explicit statement in the declarations area at the top of the code sheet. If not, go back and complete step 4 in the list at the start of this chapter.

7. Press the F4 key to activate the Properties window.

8. Replace the default name _Module1_ by typing (in the Properties window) **General_Procedures**.

9. Press the F7 key or click in the Code window to activate it.

10. Below the Option Explicit statement, type the Sub statement for the procedure and press the Enter key:

    Sub Add_Title_Slide

11. When you press Enter, the Visual Basic Editor enters the parentheses at the end of the Sub statement, a blank line, and the End Sub statement for you, and places the insertion point on the blank line:

    Sub Add_Title_Slide()

    End Sub

12. Press the Tab key to indent the first line below the Sub statement.

13. Now identify the objects you need by using the Help system. You'll be working with the active presentation, which is represented by the ActivePresentation object. As you'll see in Part 6 of this book, "Programming the Office Applications"—which is all about objects—there are several ways to get information when programming with objects. For now, let's try searching online help rather than using the Editor's built-in Object Browser. Using Google or Bing, search for **object model reference powerpoint 2013**. You should then be able to locate the details about the Application object's ActivePresentation property, as shown in Figure 4.8.

Figure 4.8 The ActivePresentation property screen

14. Click the _Presentation_ link in "Returns a Presentation object..." near the top, as shown in Figure 4.8. This link will take you to the Presentation object's Help screen. We're drilling down in this Help system to find example code and other assistance that will show us how to work with slides and related objects. All this will become much clearer to you in Part 6. For now, just follow along to get the general idea.

15. Now on the Presentation object's Help page, click the Presentation Object Members link (scroll to find it near the bottom of this web page), and then scroll down to locate the Slides object in the properties list. Click the Slides link (see Figure 4.9), then in the new web page that appears, click a Slides link again (it's near the top where it says "Returns a Slides collection...").
Now you see the information about the Slides Collection object, as shown in Figure 4.10. + +Figure 4.9 Select the Slides object from the list. + +Figure 4.10 The Slides Collection Object Help screen + +16. From this screen, you learn two pieces of information: first, that a slide is represented by a Slide object (stored in a Slides collection), and second, that you use the Add method to create a new slide. + +17. Type a declaration for an object variable of the Slide object type to represent the slide the procedure creates. Notice that after you type **as** and a space, the Visual Basic Editor displays the list of available objects. Type down through the list (type **s** and **l** ) until you have selected Slide, and then press the Enter key to complete the term and start a new line of code: + + Dim sldTitleSlide As Slide + +18. Use a Set statement to assign to the sldTitleSlide object a new slide you create by using the Add method. Type **set sld** and then press Ctrl+spacebar to make the Editor's Complete Word feature enter sldTitleSlide for you. Then type **= activepresentation.slides.add(** , using the Visual Basic Editor's assistance, so that the line reads as shown here: + + Set sldTitleSlide = ActivePresentation.Slides.Add( + +19. When you type the parenthesis, the Auto Quick Info feature displays the syntax for the Add method, as shown in Figure 4.11. + +Figure 4.11 The Auto Quick Info feature displays the syntax for the Add method when you type the parenthesis after the Add method + +20. Type the **Index** argument, a colon, an equal sign, the value **1** (because the title slide is to be the first slide in the presentation), and a comma: + + Set sldTitleSlide = ActivePresentation.Slides.Add(Index:=1, + +* * * + +Choosing between Labeled and Implied Argument Lists + +When a method uses arguments, as the Add method does here, you can choose between specifying the argument names or omitting them and letting VBA infer the arguments from the order of the values or constants. For example, in this case you can specify either Add(Index:=1, Layout:=ppLayoutTitle) or Add(1, ppLayoutTitle). The latter is more concise and easier to type in, but the former is much clearer to read. + +* * * + +21. Break the statement to the next line with a line-continuation character (an underscore preceded by a space). Then type a tab to indent the new line, type the **Layout** argument, a colon, and an equal sign, and pick the ppLayoutTitle constant from the List Properties/Methods drop-down list, as shown in Figure 4.12. + +Figure 4.12 Choose the ppLayoutTitle constant for the Layout argument. + +22. Type the parenthesis to end the statement: + + Set sldTitleSlide = ActivePresentation.Slides.Add(Index:=1, _ + Layout:=ppLayoutTitle) + +23. Press the Enter key to start a new line, and then press either the Backspace key or Shift+Tab to unindent the new line by one tab stop. + +24. You'll be working with the sldTitleSlide from here on, so create a With statement using it, and place the insertion point on the line between the With statement and the End With statement: + + With sldTitleSlide + + End With + +25. Next, the macro will manipulate the two items on the slide. To make it do so, you need to know the objects that represent them. You could use the Macro Recorder to find the objects, but this time try a more direct method: Place the insertion point on the line within the With statement and type . 
(a period) to display the List Properties/Methods drop-down list of available properties and methods for the Slide object.

26. Sometimes the List Properties/Methods drop-down list is of little help because it displays so many possibly relevant properties and methods that you can't identify the property you need. But if you scan the list in this case, you'll see that the Shapes property (which returns the Shapes collection) is the only promising item.

27. Press Ctrl+G, choose View ⇒ Immediate Window, or click the Immediate Window button on the Debug toolbar to display the Immediate window for a bit of testing.

28. Type the following exploratory statement into the Immediate window and press the Enter key to execute it:

    ActivePresentation.Slides(1).Shapes(1).Select

(The Immediate window is a quick way to test individual lines of code without having to run the entire macro.) Now press Alt+F11 or click the View Microsoft PowerPoint button on the Standard toolbar to switch to the PowerPoint window and verify that VBA has selected the first Shape object on the slide (it should have a frame drawn around it).

29. Okay, this is the right object to start with, but now you need to find out how to add text to the shape. Go back to the Code window (click in the Code window or press the F7 key). Press the Backspace key to delete the period, and then type it again to redisplay the list. Type **te** to jump down to the items in the list whose names start with _text_. Select the TextFrame item in the list, and then type a period to enter the term and display the next list. Scroll down the list, select the TextRange object, and type a period to enter the term and display the next list. In the next list, select the Text property. Type an equal sign to enter the term. Then type double quotation marks, the text to assign to the Text property (**Pollution Update:** with a space after it), closing double quotation marks, an ampersand, and the date (supplied by the Date function):

    .Shapes(1).TextFrame.TextRange.Text = "Pollution Update: " & Date

30. Assign information to the second Shape in the same way:

    .Shapes(2).TextFrame.TextRange.Text = "JMP Industrials"

31. The finished procedure should look like this:

    Sub Add_Title_Slide()
        Dim sldTitleSlide As Slide
        Set sldTitleSlide = ActivePresentation.Slides.Add(Index:=1, _
            Layout:=ppLayoutTitle)
        With sldTitleSlide
            .Shapes(1).TextFrame.TextRange.Text = _
                "Pollution Update: " & Date
            .Shapes(2).TextFrame.TextRange.Text = _
                "JMP Industrials"
        End With
    End Sub

32. Press F5 to test the procedure. Look at the slides in PowerPoint. There should be a new first slide in the collection of slides on the left. Then delete all slides from the presentation (select slides by pressing Shift while clicking a range of slides in the left pane, then press Delete).

33. If you wish, right-click the Quick Access Toolbar in the upper-left corner of PowerPoint's screen, then choose Customize Quick Access Toolbar. Then add a Quick Access Toolbar button for the Add_Title_Slide macro.

34. Save the presentation under a name such as Procedures.pptm. You might see a warning about personal-information risks; click OK to dismiss it.

35. Create a new presentation; then test the toolbar button or menu item for the procedure. If you see a security warning, read the sidebar titled "A Warning about Security" in Chapter 1.
Close the presentation without saving changes. + +# Creating a Procedure for Access + +Access has a long tradition of autonomy from the other Office applications, and this applies as well to its implementation of macros. It has no Recorder, for example, nor does it permit you to assign macros to shortcut key combinations. + +In addition, Access includes a legacy "Macro Builder," which you can take a look at by clicking the Macro button on the Create tab of the Ribbon. (Note that in Access there is no _Developer_ tab on the Ribbon. You can open the Visual Basic Editor from the Database Tools tab or press Alt+F11.) + +The Macro Builder utility has been generally unpopular over the years because the Visual Basic Editor offers far more options, objects, and features. The Builder is for nonprogrammers—a way to create simple macros via lists rather than actual programming. However, the Builder was somewhat improved in Access 2007, including provisions for error handling and the ability to embed macros within individual forms. And additional improvements were made for Access 2010, enough improvements that Microsoft renamed it the Macro Designer. But a rose by any other name is still a rose. If you're interested in details about the Macro Designer and its curious, some might say simplistic, reliance on repeated If...Then structures, see the sidebar titled "Using The Macro Builder" in Chapter 28, "Understanding the Access Object Model and Key Objects." + +For the reasons I mentioned, you will likely prefer to use the Visual Basic Editor rather than the Builder/Designer for any but the most elementary macros. After all, relying on a list of If queries is not only limiting, it's downright dated. + +Let's get a feel for writing real VBA macros in Access. In this example, you'll write a macro that displays today's date and time: + +1. Start Access. + +2. Double-click the Blank Desktop Database icon (in Access 2010 and earlier, double-click the Blank Database button). + +3. Press Alt+F11 to open the Visual Basic Editor. + +4. Right-click the database name in the Project Explorer, then choose Insert Module to open a new module in the Code window, where you can write macros. + +5. In the Code window, type the following macro: + + Sub ShowDate() + + MsgBox ("It is: " & Now) + + End Sub + +6. Click anywhere within this code, and then press F5 to execute the macro. You should see a message box that displays the current date and time. (Note that you don't type the End Sub; Access automatically inserts it for you.) + +We'll cover Access macro programming in depth in Chapter 28 and Chapter 29, "Manipulating the Data in an Access Database via VBA." Also, you might have noticed that the Editor automatically inserted a line of code at the top: Option Compare Database. This specifies a particular way to go about comparing text strings. + +# The Bottom Line + +**Set up the Visual Basic Editor for creating procedures.** + +How you arrange the various components of the Visual Basic Editor is your personal choice, but while using this book, it's easiest if you set up the Editor to resemble the way it appears in the book's figures. Besides, this arrangement is quite close to the default layout, which has proven to be the most effective one for the majority of programmers (according to various focus groups and polls) for the decades that Visual Basic has been used. + +Master It + +Press a single key to display, then hide, the Properties window. 
+
+**Create a procedure for Word.**
+
+Using the Help feature in any VBA-enabled application allows you to find code examples that you can copy and paste into your own code.
+
+Master It
+
+Open the Code window and use Help to find a code example.
+
+**Create a procedure for Excel.**
+
+Certain procedure names are special. In a previous Excel exercise, you added line numbering and gave that procedure a name of your own choice. But some procedure names have a special meaning—they are triggered by an _event_ in Excel itself. They will execute _automatically_ when that event takes place (you don't have to run events by choosing Run from the Macro dialog box or by assigning the macro to a keyboard shortcut or Quick Access Toolbar button). One such event is Excel's Auto_Open procedure.
+
+Master It
+
+Display a message to the user when Excel first executes.
+
+**Create a procedure for PowerPoint.**
+
+As you type a procedure, the Visual Basic Editor provides you with lists of objects' members (the Auto List Members feature) and with syntax examples, including both required and optional arguments (the Auto Quick Info feature). These tools can be invaluable in guiding you quickly to the correct object and syntax for a given command.
+
+Master It
+
+Use the Auto List Members and Auto Quick Info features to write a macro that saves a backup copy of the currently active presentation.
+
+**Create a procedure for Access.**
+
+Although Access includes a variety of macro-related features that are unique (such as its Macro Builder/Designer), its Visual Basic Editor is quite similar to the Visual Basic Editors in the other Office applications.
+
+Master It
+
+Open the Visual Basic Editor in Access and write a macro that displays today's date using the Date function rather than the Now function. Use the Access Visual Basic Editor Help system to understand the difference between these two functions.
+Part 2
+
+Learning How to Work with VBA
+
+  * **Chapter 5: Understanding the Essentials of VBA Syntax**
+  * **Chapter 6: Working with Variables, Constants, and Enumerations**
+  * **Chapter 7: Using Array Variables**
+  * **Chapter 8: Finding the Objects, Methods, and Properties You Need**
+
+Chapter 5
+
+Understanding the Essentials of VBA Syntax
+
+In this chapter, you'll learn the essentials of VBA syntax, building on what you learned via practical examples in the previous chapters. This chapter defines the key terms that you need to know about VBA to get going with it, and you'll practice using some of the features in the Visual Basic Editor.
+
+* * *
+
+If You Don't Understand a Programming Term, Look Ahead
+
+You'll find lots of definitions of programming terms as you work your way through this chapter. If you come across something that doesn't yet make sense to you, just keep going; you'll most likely find an explanation in the next few pages.
+
+* * *
+
+In this chapter you will learn to do the following:
+
+  * Understand the basics of VBA
+  * Work with procedures and functions
+  * Use the Immediate window to execute statements
+  * Understand objects, properties, methods, and events
+
+# Getting Ready
+
+To learn most efficiently in this next section, arrange the Visual Basic Editor in Word by performing the following steps. This chapter focuses on Word because it's the most widely distributed of the VBA-enabled applications. If you don't have Word, read along anyway without performing the actions on the computer; the examples are easy to follow.
(Much of this will work on any VBA host application, though many of the commands shown here are specific to Word.) Here are the steps: + +1. Start Word. + +2. Launch the Visual Basic Editor by pressing Alt+F11 or clicking the Developer tab on the Ribbon and then clicking the Visual Basic button. + +3. Arrange the Word window and the Visual Basic Editor window so that you can see both of them at once. For example, if these are the only two open windows that are not minimized, right-click the Taskbar and choose Show Windows Stacked or Show Windows Side By Side from the context menu to arrange the windows, or just drag them by their title bars to the right or left side. + +4. Display the Immediate window in the Visual Basic Editor by pressing Ctrl+G, choosing View ⇒ Immediate Window, or clicking the Immediate Window button on the Debug toolbar. Your setup should look like Figure 5.1. + +Figure 5.1 The Visual Basic Editor set up alongside a Word document. This is a good way to edit or debug macros. You can see where you are in the code and, often, the effect the macro is having. + +* * * + +Using Dual Monitors + +If you're using a multiple-monitor setup, you can dedicate one monitor to Word and another to the Visual Basic Editor. + +* * * + +# Procedures + +A _procedure_ in VBA is a named unit of code that contains a sequence of statements to be executed as a group. VBA itself has a library of procedures. + +For example, VBA contains a function (a type of procedure) named Left, which returns the left portion of a text string that you specify. For example, hello is a string of text five characters long. The statement Left("hello", 3) returns the leftmost three characters of the string: hel. (You could then display this three-character string in a message box or use it in code.) The name assigned to the procedure gives you a way to refer to the procedure. + +In addition, when you write a macro, you are writing a procedure of your own (as opposed to a procedure built into VBA already). + +Any executable code (your macros) in VBA must be contained in a procedure—if it isn't, VBA can't execute it and an error occurs. (The exception is statements you execute in the Immediate window, which take place outside a procedure. However, the contents of the Immediate window exist only during the current VBA session and are used for testing code. They cannot be executed from the host application via buttons, ribbons, or keyboard shortcuts.) + +A macro—in other words the code from Sub to End Sub—is a procedure. + +Procedures are contained within modules, which in turn are contained within project files, templates, or other VBA host objects, such as user forms. + +There are two types of procedures: functions and subprocedures (subs). + +## Functions + +A _function_ in VBA is one of two types of procedures. Like a sub, a function is a procedure designed to perform a specific task. For example, the built-in VBA Left function returns the left part of a text string, and the Right function, its counterpart, returns the right part of a text string. Each function has a clear task that you use it for, and it doesn't do anything else. To take a ridiculous example, you can't use the Left function to print a document in Word or make characters boldface—for those tasks, you need to use the appropriate functions, methods, and properties. Left just does its one, simple job. + +VBA comes with many built-in functions, but you can create your own as well. You'll create your own functions later in the book. 
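+
+To give you a taste of what's coming, here's a minimal sketch of a user-defined function; the name and the job it does are purely illustrative:
+
+    Function FirstWord(strText As String) As String
+        ' Return everything before the first space, or the
+        ' whole string if it contains no space at all.
+        Dim lngPos As Long
+        lngPos = InStr(strText, " ")
+        If lngPos = 0 Then
+            FirstWord = strText
+        Else
+            FirstWord = Left(strText, lngPos - 1)
+        End If
+    End Function
+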
They will begin with a Function statement and end with an End Function statement. + +_Each function returns a value_. For example, the Left function returns the left part of the string. Other functions return different kinds of results. Some, for example, just test a condition and return True if the condition is met and False if it is not met. But just remember that what distinguishes a function is that it returns some value. + +## Subprocedures + +A _subprocedure_ (also called a _sub_ or _subroutine_ ), like a function, is a complete procedure designed to perform a specific task, but unlike a function, a sub _does not return a value_. + +Note that many tasks need not return a result. For example, the Transpose_Word macros you created earlier in this book merely switch a pair of words in a document. There's no need for any value to be returned to VBA for further use. On the other hand, if your procedure calculates sales tax, there _is_ a result, the amount of tax, that must be returned by the procedure for display to the user or further manipulations by the VBA code. + +All the macros you record using the Macro Recorder are subprocedures, as are many of the procedures you'll look at in the rest of this book. + +Each subprocedure begins with a Sub statement and ends with an End Sub statement. + +* * * + +Functions Aren't Displayed in the Macros Dialog Box + +Only subprocedures appear in the Macros dialog box. Should you choose to write a function, it will not appear in that box. + +* * * + +# Statements + +When you create a macro in VBA, you're writing _statements_ , which are similar to sentences in ordinary speech. A _statement_ is a unit of code that describes an action, defines an item, or gives the value of a variable. VBA usually has one statement per line of code, although you can put more than one statement on a line by separating them with colons. (This isn't usually a good idea because it makes your code harder to read. Most programmers stick to one statement per line.) + +You can also break a lengthy line of code onto a second line or a subsequent line to make it easier to read (although this isn't usually necessary). You continue a statement onto the next line by using a line-continuation character: an underscore (_) preceded by a space (and followed by a carriage return; in other words, press the Enter key). You continue a line strictly for visual convenience; VBA still reads continued lines as a single "virtual" line of code. In other words, no matter how many line continuations you use for easy-to-read formatting, during execution it's still a single statement to VBA. + +So, think of VBA code as a series of sentences, each on its own line (or continued), that are usually executed one by one down from the top. + +* * * + +You Can't Break Strings with the Line-Continuation Character + +You can't break a string (text enclosed in quotation marks) with the line-continuation character. If you need to break a line that involves a long string in quotes, break the string into shorter strings and concatenate them using the & operator: "This" & "that". + +* * * + +VBA statements vary widely in length and complexity. A statement can range in length from a single word (such as Beep, which makes the computer beep, or Stop, which halts the execution of VBA code) to very long and complicated lines involving many components. But to make it easy to read your code, try to make your lines as brief as possible. + +That said, let's examine the makeup of several sample VBA statements in Word. 
Most of these will use the ActiveDocument object, which represents the active document in the current session of Word; a couple use the Documents collection, which represents all open documents (including the active document); and one uses the Selection object, which represents the current selection within a document (selected text or the location of the blinking insertion cursor). Don't worry if some of these statements aren't immediately comprehensible—you'll understand them soon enough.
+
+Here are some example statements for you to try:
+
+    Documents.Open "c:\temp\Sample Document.docm"
+    MsgBox ActiveDocument.Name
+    ActiveDocument.Words(1).Text = "Industry"
+    ActiveDocument.Close SaveChanges:=wdDoNotSaveChanges
+    Documents.Add
+    Selection.TypeText "The quick brown fox jumped over the lazy dog."
+    Documents.Close SaveChanges:=wdDoNotSaveChanges
+    Application.Quit
+
+Let's look at each of these statements in turn. The statement
+
+    Documents.Open "c:\temp\Sample Document.docm"
+
+uses the Open method of the Documents collection to open the specified document—in this case, Sample Document.docm. Enter this statement in the Immediate window, substituting the path and filename of a document that exists on your computer for **_c:\temp\Sample Document.docm_**.
+
+Press the Enter key, and VBA opens the document in the Word window. Just as when you open a document by hand while working interactively in Word, this statement in the macro makes this document the active document (the document whose window has the _focus_; in other words, the window that is currently selected and will therefore take input from keystrokes or mouse activity).
+
+The statement
+
+    MsgBox ActiveDocument.Name
+
+uses the MsgBox function (built into VBA) to display the Name property of the ActiveDocument object (in this example, Sample Document.docm). As an experiment, type this MsgBox statement into the Immediate window (type in lowercase, and use VBA's Help features as you choose) and press the Enter key. VBA displays a message box over the Word window. Click the OK button to dismiss the message box.
+
+Now you see how you can quickly test a statement using the Immediate window. You don't have to execute an entire macro; you can just try out a single statement (a single line of code) in the Immediate window if you want to see its effect.
+
+Next, the statement
+
+    ActiveDocument.Words(1).Text = "Industry"
+
+uses the _assignment operator_ (the equal [=] sign) to assign the value Industry to the Text property of the first item in the Words collection in the ActiveDocument object. Enter this statement in the Immediate window and press the Enter key. You'll see the word _Industry_ displayed in the current typeface at the beginning of the document you opened.
+
+Note that after this line executes, the blinking insertion point appears at the _beginning_ of this word rather than at the end of the word, where it would be if you'd typed the word. This happens because VBA manipulates the properties of the document (in this case the Words collection) directly rather than imitating "typing" into it.
+
+The statement
+
+    ActiveDocument.Close SaveChanges:=wdDoNotSaveChanges
+
+uses the Close method to close the ActiveDocument object. It uses one _argument_, SaveChanges, which controls whether Word saves the document that's being closed (if the document contains unsaved changes). In this case, the statement uses the constant wdDoNotSaveChanges to specify that Word shouldn't save changes when closing this document.
Enter this statement in the Immediate window and press the Enter key, and you'll see VBA make Word close the document. + +An _argument_ is information you send to a procedure. For example, in this next statement the argument is the text string show, which is sent to the built-in VBA MsgBox function: + + MsgBox ("show") + +A MsgBox function will display _any_ text. So you send it an argument: the particular text you want it to display. You'll learn more about arguments shortly. + +Now try entering this statement in the Immediate window: + + Documents.Add + +This statement uses the Add method of the Documents collection to add a new Document object to the Documents collection. In other words, it creates a new document. Because the statement doesn't specify which template to use, the new document is based on the default template (Normal.dotm). When you enter this statement in the Immediate window and press Enter, Word creates a new document. As usual, this new document becomes the active document. + +The statement + + Selection.TypeText "The quick brown fox jumped over the lazy dog." + +uses the TypeText method of the Selection object to type text into the active document at the position of the insertion point or current selection. (The Selection object represents the current selection, which can be either a "collapsed" selection—a mere insertion point with nothing actually selected, as in this example—or one or more selected objects, such as one or more words.) + +If text is selected in the active document, that selection is overwritten as usual—unless you've cleared the Typing Replaces Selected Text check box by pressing Alt+F then I, and then clicking the Advanced option in the left pane of the Word Options dialog box. In that case, the selection is collapsed to its beginning and the new text is inserted before the previously selected text. + +But in this example—because you just created a new document—nothing is selected. Enter the previous Selection.TypeText statement in the Immediate window and press the Enter key, and Word enters the text. Note that this time the insertion point ends up _after_ the inserted text; the TypeText method of the Selection object _is_ analogous to typing something into Word yourself. + +The statement + + Documents.Close SaveChanges:=wdDoNotSaveChanges + +is similar to an ActiveDocument.Close SaveChanges:=wdDoNotSaveChanges statement except that it works on the Documents collection rather than the ActiveDocument object. The Documents collection represents _all_ open documents in the current Word session. So this statement closes all open documents and doesn't save any unsaved changes in them. Enter this statement in the Immediate window and press Enter, and you'll see that Word closes all the open documents. + +The statement + + Application.Quit + +uses the Quit method of the Application object to close the Word application. Enter the statement in the Immediate window and press the Enter key. Word closes itself, also closing the Visual Basic Editor in the process because Word is the host for the Visual Basic Editor. + +* * * + +Getting Help in Visual Basic for Applications + +The Visual Basic Editor offers comprehensive help for the Visual Basic for Applications programming language. To view it, choose Help ⇒ Microsoft Visual Basic For Applications Help from the Visual Basic Editor. You're taken to a website devoted to the current application (in this case, Word 2013). + +Pressing F1 works two ways. 
If your blinking cursor is on a blank space or an empty line in the Code window, F1 displays a generic Office Help page. This page contains the link "Welcome to the Visual Basic for Applications language reference for Office 2013." Click that link.
+
+Here's a second way to press F1 for help. Often the quickest way to get help is to click a keyword in your code, such as ActiveWindow or MsgBox. By clicking, you put the blinking insertion cursor in that command, "selecting" it. Now when you press F1, the Editor tries to locate online help for that particular command.
+
+Most of the built-in VBA statements and functions are illustrated with code examples, which can be particularly useful when you're creating and troubleshooting your own code. The samples show you how it's done.
+
+The Visual Basic Help files use a couple of conventions you should know about before you try to use them:
+
+  * Italics denote variables or values you'll need to change yourself.
+  * Brackets ([ ]) denote optional arguments.
+
+This book uses the same conventions, so you'll see them in use soon.
+
+If you don't find what you need by searching the Microsoft Visual Basic Help web pages, choose Help ⇒ MSDN On The Web. That's a more generic Office 2013 help site, with links for all the various Office applications and their object library references.
+
+* * *
+
+# Keywords
+
+A _keyword_ is a word that is part of the built-in VBA language. Here are some examples:
+
+  * The Sub keyword indicates the beginning of a subprocedure, and the End Sub keywords mark the end of a subprocedure.
+  * The Function keyword indicates the beginning of a function, and the End Function keywords mark the end of a function.
+  * The Dim keyword starts a declaration (for example, of a variable) and the As keyword links the item declared to its type, which is also a keyword. For example, in the statement Dim strExample As String, there are three keywords: Dim, As, and String.
+
+The names of functions and subprocedures are not keywords (neither the built-in procedures nor procedures you write). Note that in this book I sometimes use the term _command_ as a synonym for _keyword_.
+
+* * *
+
+Identifying Keywords by Color
+
+The Visual Basic Editor displays all keywords in blue. But if you wish, you can specify a different color for keyword text on the Editor Format tab of the Options dialog box (choose Tools ⇒ Options from the Visual Basic Editor). If you're not sure whether an item is a keyword, check whether the color the Visual Basic Editor gives the item is the same color as keywords such as Sub.
+
+* * *
+
+# Expressions
+
+An _expression_ involves multiple words. It consists of a combination of keywords, operators, variables, and/or constants that results in (or _resolves to_) a string, number, or object. For example, you could use an expression to do a math calculation or to compare one variable against another. Here's an example of a numeric expression (it's shown in boldface) that compares the variable _N_ to the number 4 by using the > (greater than) operator:
+
+    If **N > 4** Then
+
+The result of this expression will depend on whatever value is currently held in the variable _N_. If it holds 12, then the expression will result in True because 12 is greater than 4. More on expressions later.
+
+# Operators
+
+An _operator_ is a symbol you use to compare, combine, or otherwise work with values in an expression. VBA has four kinds of operators:
+
+  * _Arithmetic operators_ (such as + and –) perform mathematical calculations.
+ * _Comparison operators_ (such as < and >, less than and greater than, respectively) compare values. + * _Logical operators_ (such as And, Not, and Or) build logical structures. + * The _concatenation operator_ (&) joins two strings together. + +You'll look at the different kinds of operators and how they work in Chapter 11, "Making Decisions in Your Code." + +# Variables + +A _variable_ is a location in memory set aside for storing a piece of information that can change while a procedure is running. (Think of it as a named, resizable compartment within the memory area.) + +For example, if you need the user to input their name via an input or a dialog box, you'll typically store the name in a variable so you can work with it further down in some later statement in the procedure. Or perhaps you're adding several numbers that the user types in. You would have a variable that holds the current sum total—which keeps changing (varying) as the user types in more numbers. + +VBA uses several types of variables, including these: + +_Strings_ store text characters or groups of characters. + +_Integers_ store whole numbers (numbers without fractions). + +_Objects_ store objects. + +_Variants_ can store any type of data. Variant is the default type of variable. + +Either you can let VBA create Variant variables as the default type, or you can specify another data type if you wish. Specifying the types of variables has certain advantages that you'll learn about in due course. + +For the moment, try creating a variable in the Immediate window. Type the following line and press Enter: + + myVariable = "Some sample text" + +Nothing visible happens, but VBA has created the myVariable variable. It has set aside some memory and labeled that area myVariable. It also stored the text string Some sample text in that variable. Now, type the following line and press Enter: + + MsgBox myVariable + +This time, you can see the result: VBA goes to the memory area you specified (with the variable name myVariable) and retrieves the value, the string. A message box appears containing the text you had stored in the variable. + +You can declare variables either explicitly or implicitly. An _explicit_ declaration is a line of code that specifies the name you want to give the variable, and usually its type, before you use the variable in your code. Here's an explicit variable declaration: + + Dim myVariable As String + +An _implicit_ declaration means that you don't bother with that explicit declaration statement. Instead, you just use the variable name in some other statement. VBA then stores the data in a Variant variable. (You have not specified the type.) + +In other words, if you _just use_ a variable in your code without declaring it, it's implicit. + +Here's an example of implicit declaration: + + myVariable = "Some sample text" + +You never _explicitly_ declared this variable. The first time it appeared in your code, you just assigned some data, the text, to it. So VBA assumes that you want to create the variable implicitly. + +In the next few chapters, you'll use a few implicit variable declarations to keep things simple. In other words, you won't have to type in lines of code to declare implicit variables. VBA will create them for you when you first use them in an assignment or other statement. + +However, many educators and professional programmers insist on explicit declaration, so we'll do that for the most part in the later sections of this book. 
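+
+To see the two styles side by side, here's a minimal sketch (the variable names are hypothetical); the first assignment declares its variable implicitly, while the second variable is declared explicitly before it's used:
+
+    Sub Compare_Declarations()
+        ' Implicit: VBA creates the Variant variable Greeting
+        ' the moment this statement uses it.
+        Greeting = "Welcome back"
+
+        ' Explicit: strGreeting is declared, with a data type,
+        ' before anything is stored in it.
+        Dim strGreeting As String
+        strGreeting = "Welcome back"
+
+        MsgBox Greeting & " / " & strGreeting
+    End Sub
+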
+Explicit variable declarations make your code run faster and make it easier to understand. What's more, some types of errors can be avoided if you explicitly declare all your variables. So declaring is a good habit to get into.
+
+# Constants
+
+A _constant_ is similar to a variable. It's a named item that keeps a constant value while a program is executing. The constant's meaning _doesn't change_ during the macro's execution. (So in this way, it's unlike a _variable_.)
+
+VBA uses two types of constants: _intrinsic constants_, which are built into the VBA language itself (and individual Office applications' implementations of VBA), and _user-defined constants_, which you can create. For example, the built-in constant vbOKCancel is always available in VBA to be used with the MsgBox function. This constant creates a message box that contains an OK and a Cancel button. There are sets of built-in constants for colors, printing (vbTab, for example), and other properties.
+
+Concerning constants that you define, you might want to create one to store a piece of information that doesn't change, such as the name of a procedure or the distance between Boston and New York. In practice, the built-in _intrinsic_ constants are used quite often in VBA programming; user-defined constants, not so much. It's just as easy to put the distance between those cities in a variable, even though it won't vary.
+
+# Arguments
+
+An _argument_ is a piece of information—supplied by a constant, a variable, a literal, or an expression—that you pass to a procedure, a function, or a method. Some arguments are required; others are optional. The text hello there in this MsgBox function is an argument:
+
+    MsgBox "hello there"
+
+Here's another example. As you saw earlier, the following statement uses the optional argument SaveChanges to specify whether Word should save any unsaved changes while closing the active document:
+
+    ActiveDocument.Close SaveChanges:=wdDoNotSaveChanges
+
+This optional argument uses the built-in constant wdDoNotSaveChanges.
+
+* * *
+
+Understanding Literals
+
+A _literal_ can be used instead of a constant or variable, if you wish. With a literal, you just type the actual value into the argument. For example, you could display a message box that says "Hi there!" by using a variable:
+
+    txtMsg = "Hi there!"
+    MsgBox (txtMsg)
+
+Or you could simply avoid the variable and employ a literal (the actual text string) as the argument:
+
+    MsgBox ("Hi there!")
+
+Both of these approaches have the same result.
+
+* * *
+
+The Visual Basic Editor's helpful prompts and the Visual Basic Help file show the list of arguments for a function, a procedure, or a method in parentheses, with any optional arguments enclosed in brackets. If you have its Auto Quick Info feature activated, the Editor displays the argument list for a function, procedure, or method after you type its name followed by a space.
+
+Figure 5.2 shows the argument list for the Documents collection's Open method. Type **Documents.Open**, then press the spacebar to see the argument list.
+
+Figure 5.2 Optional arguments are enclosed within brackets.
+
+The FileName argument is required, so it isn't surrounded by brackets. All the other arguments (ConfirmConversions, ReadOnly, AddToRecentFiles, and so on) are optional and therefore are surrounded by brackets.
+
+If you don't supply a value for an optional argument, VBA uses the default value for the argument. (To find out the default value for an argument, consult the VBA Help file.
The default is usually the most commonly employed value.) The Visual Basic Editor uses boldface to indicate the current argument in the list; as you enter each argument, the next argument in the list becomes bold.
+
+## Specifying Argument Names vs. Omitting Argument Names
+
+You can add arguments in either of two ways:
+
+  * Enter the name of the argument (for example, ConfirmConversions), followed by a colon, an equal sign (ConfirmConversions:=), and the constant or value you want to set for it (ConfirmConversions:=True). For example, the start of the statement might look like this:
+
+    Documents.Open FileName:="c:\temp\Example.docm", _
+        ConfirmConversions:=True, ReadOnly:=False
+
+  * Or enter the constant or value in the appropriate position in the argument list for the method, without entering the name of the argument. The previous statement would look like this:
+
+    Documents.Open "c:\temp\Example.docm", True, False
+
+When you use the first approach—naming the arguments—you don't need to put them in order because VBA looks at their names to identify them. The following statements are functionally equivalent:
+
+    Documents.Open ReadOnly:=False, FileName:="c:\temp\Example.docm", _
+        ConfirmConversions:=True
+
+    Documents.Open FileName:="c:\temp\Example.docm", _
+        ConfirmConversions:=True, ReadOnly:=False
+
+You also don't need to indicate to VBA which optional arguments you're omitting.
+
+By contrast, when you don't employ argument names, you're specifying which argument is which by its position in the list. Therefore, _the arguments must be in the correct order_ for VBA to recognize them correctly. If you choose not to use an optional argument but to use another optional argument that follows it, enter a comma (as a placeholder) to denote the omitted argument. For example, the following statement omits the ConfirmConversions argument and uses a comma to denote that the False value refers to the ReadOnly argument rather than the ConfirmConversions argument:
+
+    Documents.Open "c:\temp\Example.docm",, False
+
+Remember that when you type the comma in the Code or the Immediate window, Auto Quick Info moves the boldface to the next argument in the argument list to indicate that it's next in line for your attention.
+
+* * *
+
+Required Arguments Precede Optional Arguments
+
+Typically, required arguments are listed first in the argument list—before optional arguments. That way, you don't have to use commas to indicate the omission of optional arguments if you want to enter only the required arguments. You can just leave out all the rest of the items in the argument list.
+
+* * *
+
+## When to Include the Parentheses around the Argument List
+
+Most programmers enclose argument lists within parentheses. It makes the code easier to read. However, parentheses can be omitted in some circumstances. When you're assigning the result of a function to a variable or other object, you _must_ enclose the whole argument list in parentheses. For example, to assign to the variable objMyDocument the result of opening the document c:\temp\Example.docm, use the following statement (note the Set keyword, which VBA requires when assigning an object):
+
+    Set objMyDocument = Documents.Open(FileName:="c:\temp\Example.docm", _
+        ConfirmConversions:=True, ReadOnly:=False)
+
+However, when you aren't assigning the result of an operation to a variable or an object, you don't need to use the parentheses around the argument list, even though it's common practice to do so.
The following examples illustrate how you can either use or leave out parentheses when not assigning a result to a variable or other object: + + MsgBox ("Hi there!") + MsgBox "Hi there!" + +# Objects + +To VBA, each application consists of a series of _objects_. Here are a few examples: + + * In Word, a document is an object (the Document object), as is a paragraph (the Paragraph object) and a table (the Table object). Even a single character is an object (the Character object). + * In Excel, a workbook is an object (the Workbook object), as are the worksheets (the Worksheet object) and charts (the Chart object). + * In PowerPoint, a presentation is an object (the Presentation object), as are its slides (the Slide object) and the shapes (the Shape object) they contain. + +Most of the actions you can take in VBA involve manipulating objects. For example, as you saw earlier, you can close the active document in Word by using the Close method on the ActiveDocument object: + + ActiveDocument.Close + +# Collections + +A _collection_ is an object that contains other objects, the way an umbrella-stand object contains umbrella objects. Collections provide a way to access all their members at the same time. For example, the Documents collection contains all the open documents, each of which is an object. Instead of closing Document objects one by one, you can close all open documents by using the Close method on the Documents collection: + + Documents.Close + +Likewise, you can use a collection to change the properties of all the members of a collection simultaneously. + +Here's an example of some code that displays, in the Immediate window of the Editor, all the names of the objects in Word's CommandBars collection: + + 'fetch the number of commandbars + n = CommandBars.Count + + 'display all their names + For i = 1 To n + Debug.Print CommandBars(i).Name + Next i + +# Properties + +Each object has a number of _properties_. Think of properties as the qualities of an object, such as its color, size, and so on. + +For example, the current document in Word has properties such as the number of sentences in the document. Type this into the Immediate window, then press Enter: + + MsgBox (ActiveDocument.Sentences.Count) + +Here you're using the Count property of the Sentences collection to find out how many sentences are in the document. + +Likewise, even a single character has various properties, such as its font, font size, and various types of emphasis (bold, italic, strikethrough, and so on). + +# Methods + +A _method_ is something an object can _do_. A capability. Different objects have different methods, just as different people have different talents. 
For example, here's a list of some of the methods of the Document object in Word (many of these methods are also available to objects such as the Workbook object in Excel and the Presentation object in PowerPoint): + +**Activate** + +Activates the document (the equivalent of selecting the document's window with the keyboard or mouse) + +**Close** + +Closes the document (the equivalent of pressing Alt+F then C, or clicking the Close button after clicking the File tab on the Ribbon) + +**Save** + +Saves the document (the equivalent of pressing Alt+F then S, or clicking the Save button after clicking the File tab on the Ribbon) + +**SaveAs** + +Saves the document under a specified name (the equivalent of pressing Alt+F then A, or clicking the Save As button after clicking the File tab on the Ribbon) + +# Events + +When an _event_ occurs, VBA is aware that something happened, usually something that happened _to_ an object. For example, the opening of a file (either by a user or by a macro procedure) typically generates an event. The user clicking a button in the toolbar generates a Click event. Another way to put it is that when you click a button, you trigger that button's Click event, and VBA becomes aware that this has happened. + +By writing code for an event, you can cause VBA to respond appropriately when that event occurs. For example, let's say you display a user form (a window). You might write some code in an OK button's Click event. This code might check that all necessary settings were specified by the user when the user clicked the OK button to close the user form and apply the settings. You might write more code within that button's Click event that responded (perhaps by displaying a message box) if the user had failed to type in some required information. In essence, you can write code in an event to tell VBA what to do if that event is triggered. You don't have to write code for all events; sometimes you'll write code in only one of them. But if you put a button captioned "Display Results" on a user form, you'd better at least write some code in that button's Click event to display some results. + +* * * + +Objects and Their Components + +I'll have much more to say about objects throughout the rest of this book. For now, see if you can identify the three primary parts of a typical object: properties (its qualities), methods (ways you can make the object behave), and events (something that happens to an object while a program or application is executing). Collectively, these three components of an object are called the object's _members_. + +Take a look at the following code window. See if you can spot the members of the Document object—its properties, its methods, and an event. + +Here, you can see that the ThisDocument object is selected in the Project Explorer on the left. This object has available to it the many properties in the long list displayed in the Properties window on the left side. You can either modify those properties directly in the Visual Basic Editor or write code that modifies them on the fly while the macro executes. + +On the right side is a drop-down list of events—actions that can happen to a Document object, or at least happen while the document is in existence within the computer. You can write code in any of these events (in the Code window, each event will be a separate subprocedure, enclosed within the Sub and End Sub statements). 
Here, you can see that we're writing code that will execute when the Document_Close event is triggered: + + Private Sub Document_Close() + +In this example, I'm writing code to query users if they attempt to close the document. This code will execute anytime this document's Close event is triggered (when the user clicks the x button in the upper-right corner of the window, for instance). + +Only one _method_ is shown in the Code-window illustration. Can you spot it? It's in boldface in the following code example: + + Private Sub Document_Close() + Dim intAnswer As Integer + intAnswer = MsgBox("Do you want to check the spelling?", _ + vbOKCancel, "Document Is Being Closed") + If intAnswer = 1 Then ' they clicked OK. 1 = OK 2 = Cancel + ThisDocument. **CheckSpelling** + End If + End Sub + +As you can see, CheckSpelling is a method (a task that an object is able to carry out). + +* * * + +# The Bottom Line + +**Understand the basics of VBA.** + +VBA includes two types of procedures, used for different purposes. + +Master It + +Name the two types of procedures used in VBA (and indeed in most computer languages), and describe the difference between them. + +**Work with procedures and functions.** + +A procedure is a container for a set of programming statements that accomplish a particular job. + +Master It + +Write a subprocedure in the Visual Basic Editor that displays a message to the user. Then execute that subprocedure to test it. + +**Use the Immediate window to execute individual statements.** + +When you're writing code, you often want to test a single line (a statement) to see if you have the syntax and punctuation right or if it produces the expected result. + +Master It + +Open the Immediate window, type in a line of code, and then execute that line. + +**Understand objects, properties, methods, and events.** + +Object-oriented programming (OOP) means creating objects to use in your programming. OOP has become the fundamental paradigm upon which large programming projects are built. Generally speaking, macros are not large and therefore don't profit from the clerical, security, and other benefits that OOP offers—particularly for people who write large applications as a team. + +However, code libraries, such as the vast VBA set of objects and their members (not to mention the even vaster .NET libraries that tap into the power of the operating system itself) _are_ written by large groups of people, and written at different times. These libraries themselves are huge. There must be a way to organize their objects and functions—to categorize them and allow you to execute the methods and manage their properties and arguments. As a result, another aspect of OOP—taxonomy—is quite valuable even when writing brief macros. It's a way to quickly locate the members you're interested in. + +Master It + +Look up the Document object in the Visual Basic Editor's Help system; then look at its methods. +Chapter 6 + +Working with Variables, Constants, and Enumerations + +This chapter covers the basics of working with variables, constants, and enumerations. _Variables_ are used very often; they provide a way of storing and manipulating information. Variables come in several types, such as String variables for storing text, various numeric data types for storing numbers (for example, Integer variables for storing integer values), Date variables for storing dates and time, Boolean variables for storing True/False values, and even Object variables for storing objects. 
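+
+Here's a quick sketch of several of these types in use; the variable names are hypothetical, and the Object example assumes the code is running in Word:
+
+    Sub Show_Variable_Types()
+        Dim strCustomer As String    ' text
+        Dim intUnits As Integer      ' whole numbers
+        Dim datOrdered As Date       ' dates and times
+        Dim blnShipped As Boolean    ' True/False values
+        Dim objDoc As Object         ' an object reference
+
+        strCustomer = "Jane Magnolia"
+        intUnits = 12
+        datOrdered = Now
+        blnShipped = False
+        Set objDoc = ActiveDocument  ' objects are assigned with Set
+    End Sub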
+ +A _constant_ is a named item that stores a value that doesn't change. Constants, like variables, exist only while a program is executing. Most programmers rarely create their own constants; they just use variables instead. However, there is another kind of constant that the programmer does not create. And this type of constant is used all the time. Many useful constants are built into VBA, to represent elements in Access, text color options in Excel, styles in Word, and so on. + +For our purposes, the term _enumeration_ means a numbered list—like a list of all the items you need to buy to paint a room. The list contains both the numbers and the names of the items. So you can refer to each item either by its number in the list or by its name. Essentially, an enumeration is a group of related, predefined constants. But constants are more commonly identified by their names rather than their numbers in the list. That's because the name AnimationFlyIntoFromLeft is easier to use in your programming than its number, 1312. + +The one type of variable that this chapter doesn't discuss is the Array variable, which is used to store a set of multiple pieces of related information at the same time. It's similar to an enumeration. Arrays are so important in computer programming that I'll devote an entire chapter to them: Chapter 7, "Using Array Variables." + +In this chapter you will learn to do the following: + + * Understand what variables are and what you use them for + * Create and use variables + * Specify the scope and lifetime of a variable + * Work with constants + * Work with enumerations + +# Working with Variables + +Variables are used in nearly all computer programs, even short programs like macros. Think of a variable as a named area in the computer's memory that you use for storing data while a procedure is running. For example, in Chapter 5, "Understanding the Essentials of VBA Syntax," you created a variable that stored a simple string of text that you then displayed in a message box: + + myVariable = "Sample variable text" + MsgBox myVariable + +The first statement sets aside an area in memory, names it myVariable, and assigns the string Sample variable text to it. The second statement retrieves the contents (called the _value_ ) of myVariable from memory and uses the MsgBox function to display it in a message box. The contents of myVariable remain in memory, so you can use the value again if necessary while the macro is running. Or you can even change the contents. In other words, the value in a variable can _vary_ while the program runs. A _constant_ , by contrast, doesn't vary during program execution. + +## Choosing Names for Variables + +VBA imposes several constraints on how you name your variables: + + * Variable names must start with a letter and can be up to 255 characters in length. Usually, you'll want to keep them much shorter than this so that you can easily type them into your code and so that your lines of code don't rapidly reach awkward lengths. + * The Visual Basic Editor's AutoComplete feature helps make long variable names a little more manageable: Type enough of the variable's name to distinguish it from any keywords and other variable names, and press Ctrl+spacebar. If you've typed enough letters to uniquely identify the variable, the Visual Basic Editor inserts its name; if not, the Visual Basic Editor displays the drop-down list of keywords and names starting with those letters. 
+ * Variable names can't contain characters such as periods, exclamation points, mathematical operators (+, –, /, *), or comparison operators (=, <>, >, >=, <, <=), nor can they internally contain type-declaration characters (@, &, $, #). (You'll learn about the type-declaration characters later in this chapter.) + * Variable names can't contain spaces but can contain underscores, which you can use to make the variable names more descriptive by combining words. User_Response is one example. However, it's more common to just omit the underscore and let capitalization segregate the words, as in UserResponse. + +As a general rule, you're pretty safe if you stick with straightforward alphanumerics enlivened with the occasional underscore if you like underscores. + +For example, all of the following variable names are fine, although the last one is awkwardly long: + + * i + * John + * MyVariable + * MissionParameters + * The_String_the_User_Entered_in_the_Input_Box + +On the other hand, these variable names are not usable: + + * My Variable—Contains a space + * My!Variable—Contains an exclamation point + * Time@Tide—Contains a type-declaration character (@) + * 1_String—Does not start with a letter + +Each variable name must be unique within the scope in which it's operating (to prevent VBA from confusing it with any other variable). Typically, the scope within which a variable operates is a procedure, but if you declare the variable as public or private (discussed later in the chapter), its scope is wider. + +The other constraint on variable names is that you should avoid assigning to a variable a name that VBA already uses in its own language or the name of a built-in function, statement, or object member. Doing so is called _shadowing_ a VBA keyword. It doesn't necessarily cause problems, but it may prevent you from using that function, statement, or method without specifically identifying it to VBA by prefacing its name with VBA. For example, instead of Date, you'd have to use VBA.Date—no big deal, but worth avoiding in the first place. After all, why add this complexity when it's simpler to just make up your own, unique variable names? Why do things that provide you with no real benefit and have drawbacks like making your code harder to read? + +There's no reason to shadow a VBA keyword, but VBA has so many keywords that it's surprisingly easy to do so. + +Don't worry about accidentally creating a variable name that violates one of the rules listed in this section. VBA will throw you an error message if you use @ or start your variable name with 6 or try any other illegal moves. VBA will either report "Invalid Character" or separate your variable name into multiple words, such as changing 56nin into 56 nin, thinking you are trying to use line numbers in your code. (You can, if you wish, number your lines, and VBA will execute the code by just ignoring the line numbers. I number the lines in the code in this book so I can reference them in the text.) + +## Declaring a Variable + +Recall from Chapter 5 that VBA lets you declare variables either implicitly or explicitly. As you'll see shortly, each approach has its pros and cons. However, as you'll also see, explicit declarations are almost always a good idea, and when you've been working with VBA for even a little while, you'll probably use them all the time. For this reason, it's best to declare your variables explicitly right from the beginning. 
But this chapter also illustrates how to make implicit declarations so you know that technique if that's your preference. + +### Declaring a Variable Implicitly + +Declaring a variable implicitly means that you just use it in your code without first declaring it explicitly. When you declare a variable implicitly, VBA checks to make sure that there isn't already an existing variable with that name. It then automatically creates a variable with that name for you and assigns it the Variant data type, which can contain any type of data except a fixed-length string. + +For example, in the previous chapter, you declared the variable myVariable by using the following implicit declaration: + + myVariable = "Sample variable text" + +Here, myVariable is implicitly declared as a variable—because it is used in a statement rather than first being declared explicitly (usually with the Dim command). + +VBA assigns an implicitly declared variable to the Variant data type, which has a dozen or so subtypes. In this case, the variable's subtype is a string because it contains text. VBA usually assigns the variable the value Empty (a special value used to indicate Variant variables that have never been used) when it creates it, but in this case the variable receives a value immediately (because the string of text is assigned to it). VBA then assigns the string type because it can see you're storing a string in the variable. + +The advantage of declaring a variable implicitly is that you write less code. When you want a variable, you simply declare it on the spot by using it in a statement. But declaring a variable implicitly also has a couple of disadvantages: + + * It's easier to make a mistake typing the variable's name elsewhere in your code. For example, suppose you implicitly declare the variable FilesToCreate and then later type FllesToCreate instead. VBA doesn't query the latter spelling (with its double ll typo). No error messages are displayed. VBA merely creates another, new, different variable with the ll name. + +When you're working with a number of variables, it can be difficult and time-consuming to catch little typo mistakes like this. And a mistake like this (having two variables when you think you have only one) causes errors. The problem in this example is that you think you're referring to the FilesToCreate variable, but you're not. VBA can detect this kind of error, but only if _explicit declaration_ is enforced. ( _Enforced_ here means that if you try to get away with using an undeclared variable—the one with the typo was never formally declared—the Visual Basic Editor displays an error message and halts execution.) + + * The Variant variable type takes up more memory than other types of variables because it has to be able to store various types of data. This difference is negligible under most normal circumstances, particularly if you're using only a few variables or writing only short procedures. However, if you're using many variables in a huge program running on a computer with limited memory, the extra memory used by Variant variables might slow down a procedure or even run the computer out of memory. What's more important on an underpowered computer is that manipulating Variants takes longer than manipulating the other data types. This is because VBA has to keep checking to see what sort of data is in the variable. 
+
+You can get around this second disadvantage in a couple of ways: first, by using a type-declaration character to specify the data type when you declare a variable implicitly or, second (as you will see in the next section), by simply telling VBA to force you to declare variables explicitly—and to display an error message if you don't.
+
+A _type-declaration character_ is a character that you add to the end of a variable's name in an implicit declaration to tell VBA which data type to use for the variable. Table 6.1 lists the type-declaration characters.
+
+Table 6.1 Type-declaration characters
+
+**Character** | **Data Type of Variable** | **Example**
+---|---|---
+% | Integer | Quantity%
+& | Long | China&
+@ | Currency | Profits@
+! | Single | Temperature!
+# | Double | Differential#
+$ | String (variable length) | myMessage$
+
+So you could implicitly declare the String variable UserName with the following statement, which assigns the value Jane Magnolia to the variable:
+
+    UserName$ = "Jane Magnolia"
+
+And you could implicitly declare the Currency variable Price by using this statement:
+
+    Price@ = Cost * Margin
+
+You use the type-declaration character only when declaring the variable. Thereafter, you can refer to the variable by its name—UserName and Price in the previous examples.
+
+### Declaring a Variable Explicitly
+
+Declaring a variable explicitly means telling VBA that the variable exists before you use it. VBA allocates memory space to that variable and registers it as a known quantity. You can also declare the variable type at the same time—a good idea but not obligatory.
+
+You can declare a variable explicitly at any point in code before you use it, but custom and good sense recommend declaring all your variables at the beginning of the procedure that uses them. (Or, to give a variable greater scope, declare it in the General Declarations area up at the top of the Code window. More on scope later.)
+
+Locating all your declarations at the top of a procedure makes them easy to find, which helps anyone reading the code.
+
+Declaring variables explicitly offers the following advantages:
+
+  * Your code is easier to read and to debug, both for you yourself and for other programmers. When you write complex code, this is an important consideration.
+  * Forcing explicit variable declarations is accomplished by adding an Option Explicit statement at the top of a module—in the General Declarations section of the Code window. This enforcement makes it more difficult for you to create new variables unintentionally by mistyping the names of existing variables.
+  * It is more difficult for you to wipe out the contents of an existing variable unintentionally when trying to create a new variable.
+  * VBA can catch some data-typing errors at design time or compile time that with implicit declarations wouldn't surface until runtime.
+
+* * *
+
+Store the Correct Type of Value in a Variable
+
+A data-typing error occurs when you assign the wrong type of information to a variable. For example, if you declare an Integer variable and then assign a string of text to it, VBA triggers an error because it can't store string information in an Integer variable.
+
+* * *
+
+  * Your code runs a fraction faster because VBA won't need to determine each variable's type while the code is running.
+
+The disadvantage of declaring variables explicitly is that doing so takes a little more time, effort, and thought. For most code, however, this disadvantage is far outweighed by the advantages.
+ +To declare a variable explicitly, you use one of the following keywords: Dim, Private, Public, or Static. + +For example, the following statement declares the variable MyValue: + + Dim MyValue + +Dim is the most common keyword to use for declaring a variable, and you'll probably want to use it for most of your variable declarations. You use the other keywords to specify a different scope, lifetime, and data type for the variable in the declaration. In the previous example, the MyValue variable receives the default scope and lifetime and the Variant data type, which makes it suitable for general-purpose use. + +You can also declare multiple variables on the same line by separating the variable statements with commas: + + Dim Supervisor As String, ControllerCode As Long + +This can help you keep down the number of declaration lines in your code, but it makes the declarations harder to read, so it's not usually a good idea. + +Be warned that when you declare multiple variables on the same line, you must specify the data type for each, as in the previous example. You might be tempted to try a little abbreviation, like this, hoping for a couple of String variables: + + Dim strManager, strReportingEmployee As String + +This statement doesn't create two String variables: strReportingEmployee is a String variable, but strManager is a Variant because the As String part of the code applies only to strReportingEmployee. + +## Choosing the Scope and Lifetime of a Variable + +The _scope_ of a variable is the area in VBA where it can operate. Think of it as similar to your scope of activity at work: those areas in which you perform tasks and those areas in which you don't. Your scope might be the office-cubicles area of the building, but if you were found slinking around inside the walk-in safe, there would be trouble. Entering the safe is not part of your job description. + +The default scope of a variable is the procedure that declares the variable (either implicitly or explicitly). In other words, the scope is between the Sub and End Sub (or Function and End Function) that define the start and end of a procedure. Macros are most often fairly short, and thus their code is most often contained within a single procedure. For these typical macros, there's no reason for a variable to have a scope any larger than its own procedure. + +Here's an example of procedure-level scope. Suppose you have a module named Financial_Procedures that contains the procedures Breakeven_Table and Profit_Analysis_Table, each of which uses a variable named Gross_Revenue and another named Expenses. The variables in each procedure are distinct from the variables in the other procedure, so there is no danger of VBA confusing the two. (For the human reader, though, using the same variable names in different procedures rapidly becomes confusing when debugging. In general, it's a good idea to use unique variable names, even at the default procedure level.) + +The _lifetime_ of a variable is the period during which VBA remembers the value of the variable. You need different lifetimes for your variables for different purposes. A variable's lifetime is tied to its scope. Lifetime, here, refers to how long during program execution the variable is in existence. + +Sometimes you need to access a variable from outside the procedure in which it's declared. In these cases, you need to declare a different, wider scope for the variable. 
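+
+For instance, two procedures can share a single variable if you widen its scope, as in this minimal sketch (the names are hypothetical); the sections that follow spell out the rules:
+
+    ' In the General Declarations area, above the first procedure:
+    Private strSupervisor As String
+
+    Sub Set_Supervisor()
+        strSupervisor = "Paul Smith"
+    End Sub
+
+    Sub Show_Supervisor()
+        ' This works because strSupervisor is visible to every
+        ' procedure in this module.
+        MsgBox strSupervisor
+    End Sub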
* * *

Require Explicit Declarations for Variables

Most experts urge you to declare variables explicitly, and you can set VBA to require it. Most programmers and developers find this feature useful because it prevents you from declaring any variables implicitly, whether intentionally or otherwise.

To require variable declarations globally—so explicit declaration is automatically enforced in any new module you create—choose Tools ⇒ Options in the Visual Basic Editor to display the Options dialog box, select the Require Variable Declaration check box in the Code Settings area, and then click the OK button. (The Require Variable Declaration check box is cleared by default, enabling you to declare variables implicitly, which is usually the easiest way to learn how to work with variables.) The Visual Basic Editor then adds an Option Explicit statement to each new module that you create. This statement enforces explicit variable declarations for the module it's in.

When you select the Require Variable Declaration check box, the Visual Basic Editor doesn't add the Option Explicit statement to your existing modules. You must type the Option Explicit statement into your existing modules manually if you want to force explicit declarations in them too.

To require variable declarations only for specified modules, put an Option Explicit statement at the beginning of each module for which you want to require declarations. The Option Explicit statement must go before the Sub or Function statement for the first procedure in the module—if you put it inside a procedure, or between procedures, VBA gives an error when you try to run any of the code in the module. This zone—above the first procedure in a module—is called the _General Declarations_ area.

If you've set Option Explicit either globally or for a module, VBA checks the code before running it. More precisely, when VBA tries to compile the code and discovers that you haven't declared one or more of the variables, it warns you, as shown here in this screenshot. VBA also highlights the undeclared variable in your code.

If you get this message box, you can solve the problem either by declaring the variable or by turning off the requirement of variable declarations for the module. To turn off the requirement, remove the Option Explicit statement from the module by selecting and deleting the line that contains it or by commenting out the line by putting a single-quote symbol (') at the start of it.

* * *

A variable can have three types of scope:

 * Procedure
 * Private
 * Public

### Procedure Scope

A variable with _procedure scope_ (also known as _procedure-level scope_ or _local scope_) is available only to the procedure that contains it. As a result, the lifetime of a _local variable_ is limited to the duration of the procedure that declares it: As soon as that procedure stops running, VBA removes all local variables from memory and reclaims the memory that held them. This is true even if that same procedure is executed again later. Local variables don't _persist_ if execution moves outside their procedure.

Procedure scope is all you'll need for variables that operate only in the procedure in which they're declared.
For example, say you implicitly declare a Variant variable named Supervisor, like this:

    Supervisor = "Paul Smith"

You can then use the Supervisor variable in the rest of that procedure—for example, retrieving the text stored in it or changing that text. When the procedure stops running, VBA removes the variable and reclaims the memory it occupied.

* * *

Implicitly Declared Variables Are Always Local

When you declare a variable implicitly, it's automatically assigned procedure scope.

* * *

To explicitly declare a local variable, use the Dim keyword and place the declaration inside the procedure, like this:

    Sub Create_Weekly_Report()
        Dim strSupervisor As String
        Dim lngController As Long
        ...
    End Sub

Here, the second line declares the variable strSupervisor as the String data type, and the third line declares the variable lngController as the Long data type. (The section "Specifying the Data Type for a Variable," a bit later in this chapter, goes through the variable types.)

On the other hand, if you need to share any of these variables with another procedure that you call from the current procedure, procedure scope isn't sufficient—you need to use either private scope or public scope.

### Private Scope

A variable with private scope is available to all the other procedures in the module that contains it, but not to procedures in other modules. Using private variables enables you to pass the value of a variable from one procedure to another. Unlike local variables, which retain their value only as long as the procedure that contains them is running, private variables retain their value as long as the project that contains them is open.

To declare a variable with private scope, you can use either the Dim keyword or the Private keyword at the beginning of a module, placing it up top before the Sub statement for the first procedure in the module, like this:

    Dim strSupervisor As String
    Private blnConsultantAssigned As Boolean

    Sub Assign_Personnel()

The Visual Basic Editor displays the private declarations above the dividing line that appears between the General Declarations area and the code below it (see Figure 6.1).

Figure 6.1 Private variable declarations appear in the declarations area.

You'll notice that the Dim statement here uses exactly the same syntax as the earlier declaration for the local variable. The only difference is that to declare a private variable, you place the Dim statement in the declarations area rather than within a procedure. Because the Private statement has the same effect as the Dim statement for declaring private variables but can't be used within a procedure, it's clearer to use the Private statement for declaring private variables.

### Public Scope

A variable declared with _public_ scope is available anywhere in a project. It's accessible by all procedures in all modules in the project that contains it.

To declare a public variable, you use the Public keyword in the General Declarations area at the beginning of a module (up above the Sub statement for the first procedure in the module). Here's an example:

    Option Explicit
    Public intMyVar As Integer

The second statement declares the public variable intMyVar as the Integer type.

Like private variables, public variables retain their value as long as the project that contains them is open (still running).
For example, if you want to track the user's name through a series of operations in Word, you can create an AutoExec procedure that prompts users to enter their name when they start Word. (AutoExec is the built-in name for a procedure that runs automatically when Word starts. Word, when you start it, searches to see if there is a Sub named AutoExec and, if so, executes that procedure.)

* * *

The Declarations Area Appears at the Top of the Code Window as Necessary

The General Declarations area appears at the beginning of each module that contains declarations. For example, if you choose to use explicit variable declarations (by selecting the Require Variable Declaration check box on the Editor page of the Tools ⇒ Options dialog box), the Visual Basic Editor automatically enters the Option Explicit declaration at the start of each new module you create. If not, the Visual Basic Editor creates the declarations area when you first enter a statement there manually.

* * *

By storing the result of the user's input in a public variable, you can then retrieve the value for use anytime later in the same Word session. You can see how this would be handy if several macros needed the information contained in a variable. Remember that local variables (those declared inside a procedure) are destroyed as soon as that procedure reaches its End Sub statement and shuts down.

* * *

Use Prefixes to Identify Variable Types

You'll likely notice in the various examples in this chapter that it's common to employ prefixes to identify a variable's data type (more on this later in the chapter). For instance, instead of naming a variable CurrentUser in Listing 6.1, I named it strCurrentUser. This str prefix identifies CurrentUser as a variable that holds text strings. These prefixes make your code easier to read and modify because each variable, everywhere in the code, is identified as a particular type. Prefixes commonly used include str for String, int for Integer, var for Variant, lng for Long, obj for Object, and so on. As you'll see later in this book, a similar set of prefixes is used to identify controls you place on a user form: txt for Text, btn for Button, and so on. If you're interested in following this convention, you can find lists of prefixes at this location in _Wikipedia_:

* * *

Listing 6.1 shows an AutoExec procedure.

**Listing 6.1**: An AutoExec procedure

    1. Public strCurrentUser As String
    2.
    3. Sub AutoExec()
    4. strCurrentUser = InputBox("Please enter your name.", _
       "Current User Identity")
    5. End Sub
    6.
    7. Sub Identify_Current_User()
    8. MsgBox "The current user is " & strCurrentUser, _
       vbOKOnly + vbInformation, "Current User"
    9. End Sub

This code consists of three different parts:

 * Line 1 declares the public String variable strCurrentUser.
 * Lines 3 through 5 contain the AutoExec procedure. This procedure runs each time the user starts Word. Line 4 displays an input box that prompts the user to enter their name and stores their response in the public variable strCurrentUser.
 * Lines 7 through 9 contain the Identify_Current_User procedure, which simply displays a message box that gives the name of the user, along with lead-in text and an information icon and title bar for completeness.

You can test these procedures by stepping through (by pressing the F8 key) first the AutoExec procedure and then the Identify_Current_User procedure in the Visual Basic Editor.
But to see their effect, you'll have to create the procedures and then exit Word. When you restart Word, the AutoExec procedure displays the input box for you to enter your name. At any point thereafter (until you exit Word), you can access the value in the strCurrentUser variable. For example, you could run the Identify_Current_User procedure at any time (until you close Word itself), and VBA displays a message box with the name you entered. A public variable is said to _persist_.

* * *

A Large Number of Public Variables Can Clog Memory

Why not just make all variables public? When writing short programs like macros, this wouldn't cause as much difficulty as when writing large programs or programming professionally in a team. However, there are various reasons to keep the scope of variables as local as possible. For an interesting take on the advantages and disadvantages of public (global) variables, see this website:

* * *

### Using Static Variables

Besides declaring variables with Dim, Private, and Public, you can also use the Static keyword, which is special. You can use it to cause even a _local_ variable to persist. Use Static instead of Dim when you want to declare a _static_ variable—a variable whose value you want to preserve between calls to the procedure in which it is declared.

Static variables are similar to public variables in that their lifetime is not limited to the duration of the procedure that declares them. The difference is that static variables, once declared, are available only to the procedure that declared them, whereas public variables are available to all procedures once they've been declared. So, a static variable has the scope of a local variable but the _lifetime_ of a public or private variable. There is one particular situation where static variables come in handy: _toggling_.

Static variables are useful for maintaining information on a process that you need to run a number of times during a session of the application, either to maintain a running total (for example, a count of the times you performed a procedure) or to keep at hand a piece of information that may prove useful when you run a procedure a second or subsequent time. Typically you employ a static variable in a procedure that _toggles_ something between two states. For example, you could create a procedure that when first executed turns on italics, then when next executed turns italics off, then back on, and so on. Such a toggle would look something like this:

    Sub ToggleItal()

        Static switch As Boolean

        switch = Not switch

        If switch Then
            MsgBox "On"
        Else
            MsgBox "Off"
        End If

    End Sub

You can test this by stepping through it (pressing F8 after clicking the first line of the procedure). Each time you execute the procedure, you get a different message. The Not operator switches a Boolean variable back and forth between True and False. A Boolean variable has only those two possible values.

The following statement declares the static String variable strSearchTerm1:

    Static strSearchTerm1 As String

## Specifying the Data Type for a Variable

Table 6.2 explains the data types that VBA supports and the amount of memory each variable type requires.
Table 6.2 VBA variable data types

**Variable** | **Short Description** | **Memory Required**
---|---|---
Boolean | True or False | 2 bytes
Byte | An integer from 0 to 255 | 1 byte
Currency | A positive or negative number with up to 15 digits to the left of the decimal point and 4 digits to the right of it | 8 bytes
Date | A floating-point number with the date to the left of the decimal point and the time to the right of it | 8 bytes
Decimal | An unsigned integer scaled by a power of 10 | 12 bytes
Double | A floating-point number with a negative value between –1.79769313486231570E+308 and –4.94065645841246544E-324 or a positive value between 4.94065645841246544E-324 and 1.79769313486231570E+308 | 8 bytes
Integer | An integer from –32,768 to 32,767 | 2 bytes
Long | An integer from –2,147,483,648 to 2,147,483,647 | 4 bytes
Object | A reference to an object | 4 bytes
Single | A floating-point number with a negative value between –3.4028235E+38 and –1.401298E-45 or a positive value between 1.401298E-45 and 3.4028235E+38 | 4 bytes
Variable-length String | A string of text | 10 bytes plus the storage for the string
Fixed-length String | A string whose length doesn't change | Whatever size is specified for the length
Variant | Any type of data except a fixed-length string (the data is stored in a subtype of the Variant) | Variants containing numbers: 16 bytes; Variants containing characters: 22 bytes plus the storage for the characters

The next few pages discuss these data types in detail.

### Do You Need to Specify the Data Type?

Specifying the data type for each variable you create is a good idea, but it's not compulsory. You can almost always use the default Variant data type (as you've done a couple of times so far in this book's examples) and let VBA figure out which subtype to assign to the Variant.

There are four disadvantages to using the Variant data type like this:

 * Sometimes VBA makes a mistake when trying to interpret which kind of subtype you intended. This can cause rather obscure bugs.
 * Using the Variant data type causes your code to run more slowly. However, with short procedures (or long procedures involving relatively few variables), memory and speed are rarely an issue.
 * Your code is harder for humans to read and to debug. This can be more of a concern than memory or speed issues.
 * The Variant data type takes up more memory than any of the other data types except long strings.

### Boolean

A Boolean variable can be set only to True or False. You can use the keywords True and False to set the value of a Boolean variable, as in the second line in the following code (the first declares the Boolean variable blnProduct_Available):

    Dim blnProduct_Available As Boolean
    blnProduct_Available = True

You can then retrieve the result of the Boolean variable and take action accordingly:

    If blnProduct_Available = True Then
        MsgBox "The product is available."
    Else 'blnProduct_Available = False
        MsgBox "The product is not available."
    End If

When you convert a Boolean variable to another data type (such as a numeric value), True returns –1 and False returns 0. When you convert a numeric value to a Boolean value, 0 returns False and all other numbers (whether positive or negative) return True.

Boolean variables take up 2 bytes each in memory.

### Byte

A Byte variable takes up the least memory of any data type—just 1 byte—and can store a number from 0 to 255.

### Currency

The Currency data type is designed for use with money.
It allows for positive and negative numbers with up to 15 digits to the left of the decimal point and 4 digits to the right of it. Unlike the Single and Double data types, the Currency data type is exact, not rounded.

To implicitly declare a Currency variable, use the type-declaration character @. For example, you could work out your weekly salary with a little simple math:

    Sub Calculate_Weekly_Salary()
        Salary@ = InputBox("Enter your salary.", _
            "Calculate Weekly Salary")
        WeeklySalary@ = Salary / 52
        MsgBox WeeklySalary
    End Sub

Currency variables take up 8 bytes each.

### Date

The Date data type is relatively complex. VBA works with dates and times as floating-point numbers, with the date displayed to the left of the decimal point and the time to the right. VBA can handle dates from 1 January 100 to 31 December 9999 and times from 0:00:00 to 23:59:59.

* * *

Fixed-Point Numbers Are More Efficient

Computer programming typically stores a number in either of two ways: as a floating-point number or as a fixed-point number. A floating-point number is a number in which the quantity is given by one number multiplied by a power of the number base (for example, 10): the decimal point "floats" to different locations. A fixed-point number is one in which the decimal place remains in the same location. Fixed-point numbers should be used whenever practical because the computer can calculate with them more quickly, for the same reason that addition, multiplication, and subtraction are easier to learn in school than long division and fractions.

* * *

You can enter date variables as literal date values—such as **6/30/36** or **June 30, 1936**—by placing a **#** sign before and after the literal date value:

    #June 30, 1936#

When you move the insertion point from the line in the Code window in which you've entered a literal date value between # signs, VBA converts the data to a number and changes the display to the date format set in your computer. For example, if you enter **June 30, 1936**, VBA will probably display it as 6/30/36. Likewise, you can enter a literal time value (for example, **#10:15PM#**), and VBA converts it to a number and displays it according to the current time format (for example, 10:15:00 PM).

Date variables take up 8 bytes each.

* * *

Always Specify the Century When Managing Date Data

Always specify the century of the dates you use (such as 1909 or 2009), because VBA may supply the wrong century if you don't. When given a two-digit year, VBA assigns any year from 00 through 29 to the twenty-first century (2000–2029) and any year from 30 through 99 to the twentieth century (1930–1999).

* * *

### Decimal

The Decimal data type stores unsigned integers, scaled by powers of 10. _Unsigned_ means that the integers carry no plus or minus designation. Note that you can't declare a Decimal variable directly: you can use the Decimal data type only within a Variant data type (discussed later in this section).

Decimal variables take up 12 bytes each.

### Double

The Double data type is for floating-point numbers and can handle negative values from –1.79769313486231570E+308 to –4.94065645841246544E-324 and positive values from 4.94065645841246544E-324 to 1.79769313486231570E+308.

Some numbers in this range cannot be represented exactly in binary, so VBA rounds them.

_Double_ here stands for double-precision floating point—the way in which the number is handled by the computer.
_Single_ (discussed later) stands for single-precision floating point.

You can use the # type-declaration character to declare a Double variable implicitly. Double variables take up 8 bytes each.

### Integer

The Integer data type is the most efficient way of handling numbers within its range (from –32,768 to 32,767), a range that makes it useful for many procedures. For example, if you wanted to repeat an action 300 times, you could use an Integer variable for the counter, as in the following lines:

    Dim intMyVar As Integer
    For intMyVar = 1 To 300
        'repeat actions
    Next intMyVar

Integer variables take up 2 bytes each. The Integer is the most commonly used numeric data type for many programming tasks. This is because unless you're working with something like moon rockets or the national debt, most math will fall within the Integer type's range.

### Long

The Long data type is for the national debt. A Long can hold integer values larger or smaller than those the Integer data type can handle: Long variables can handle numbers from –2,147,483,648 to 2,147,483,647. (For numbers even larger or smaller than these, use the Double data type, but beware of its rounding.)

Long variables use the type-declaration character & for implicit declarations and take up 4 bytes each.

### Object

The Object data type is for storing addresses that reference objects (for example, objects in an application's object model), providing an easy way to refer to an object.

Object variables take up 4 bytes each.

### Single

The Single data type, like the Double data type, is for working with floating-point numbers. Single can handle negative values from –3.4028235E+38 through –1.401298E-45 and positive values from 1.401298E-45 through 3.4028235E+38.

Some numbers in this range cannot be represented exactly in binary, so VBA rounds them.

Use the exclamation-point type-declaration character to declare a Single variable implicitly (if you must use implicit declarations). Single variables take up 4 bytes each.

### String

The String data type is for handling text:

 * Variable-length String variables can contain up to about 2 billion characters. They take up 10 bytes plus the storage required for the string.
 * Fixed-length String variables can contain from 1 to about 64,000 characters. They take up the storage specified for their length. If the data assigned to the String variable is shorter than the fixed length, VBA pads the data with trailing spaces to make up the full complement of characters. If the data assigned to the String variable is longer than the fixed length, VBA truncates the data after the relevant character, counting from the left end of the string—for example, if you assign the string Output to a fixed-length String variable that's four characters long, VBA stores Outp. Fixed-length String variables are rarely used in most programming, with the exception of managing certain databases where there's a rule that a string cannot be longer than a specified length.
 * Strings can contain letters, numbers (digits), spaces, and punctuation, not to mention special characters like @ and *.
 * You can use the $ type-declaration character to declare a String variable implicitly, but (as usual) you'll do best to declare your String variables explicitly, along with all your other variables.

### Variant

The Variant data type, as mentioned earlier in this chapter, is the default type.
It's assigned by VBA to any variable whose data type isn't specified by you—so a declaration such as Dim myUntypedVariable creates a Variant. However, Dim intVariable As Integer creates a variable of the Integer data type. (You can also declare a Variant variable explicitly: Dim myVariant As Variant, for example.)

Variants can handle most of the different types of data, but there are a couple of characteristics of Variants to keep in mind:

 * Variants can't contain fixed-length string data. If you need to use a fixed-length string, you must specify a fixed-length String data type.
 * Variant variables can contain four special values: Empty (which means the variable hasn't yet been initialized), Error (a special value used for tracking errors in a procedure), Nothing (a special value used for disassociating a variable from the object it was associated with), and Null (which you use to indicate that the variable deliberately contains no data).

Variant variables take up more memory than other types. Variant variables that contain numbers take up 16 bytes, and Variant variables that contain characters take up 22 bytes plus the storage required for the characters.

### Deciding among Types for Variables

If you found the details of the different types of variables confusing, relax. First, you can usually avoid the whole issue of choosing a variable type by declaring the variable either implicitly or explicitly and letting VBA assign the Variant data type with the appropriate subtype. Second, if you do choose to specify data types for some or all of your variables, you can apply a few straightforward rules to direct your choices:

 * If the variable will contain only the values True and False, declare it as the Boolean data type.
 * If the variable will always contain an integer (if it will never contain a fraction), declare it as the Integer data type. (If the number may be too big for the Integer data type, declare it as the Long data type instead.)
 * If the variable will be used for calculating money, or if you need fractional values that must not be rounded, use the Currency data type.
 * If the variable may sometimes contain a fraction, declare it as the Single or Double data type.
 * If the variable will always contain a string, declare it as the String data type.

* * *

If You're Unsure, Test a Variable's Type Using a Variant

If you aren't sure what type of variable will best contain the information you're planning to use, start by declaring the variable as a Variant. Then step through the procedure in Break mode with the Locals window displayed (View ⇒ Locals Window). The Locals window displays local variables, their value, and their type. As you press F8 to step through your procedure, see what Variant subtype VBA assigns to the variable. You'll see the type, such as Variant/Double or Variant/String, in the Type column. Test the procedure a couple more times to make sure this subtype is consistent, and then try declaring the variable as the data type indicated by the subtype. Run the code a few times to make sure the new data type works.

* * *

# Working with Constants

A constant is a named item that keeps a constant value during execution of a program. VBA provides many built-in constants, but you can also declare your own constants to help you work with information that stays constant through a procedure. But recall that many programmers simply use variables rather than constants, even for values that won't change (such as the number of eggs in a dozen).
However, constants are available if you or your superiors find them of value.

## Declaring Your Own Constants

To declare your own constants, use the Const statement. By declaring a constant, you can simplify your code when you need to reuse a set value a number of times in your procedures.

### Syntax

The syntax for the Const statement is as follows:

    [Public/Private] Const _constant_ [As _type_] = _expression_

Here, Public and Private are optional keywords used for declaring public or private scope for a constant. You'll learn how they work in a moment. _constant_ is the name of the constant, which follows the normal rules for naming variables. _type_ is an optional argument that specifies the data type of the constant. _expression_ is a literal (a value written into your code), another constant, or a combination of the two.

As with variables, you can declare multiple constants in the same line by separating the statements with a comma:

    Const conPerformer As String = "Carmen Singer", _
        conTicketPrice As String = "$34.99"

### Example

Declaring a constant in VBA works in a similar way to declaring a variable explicitly, but you declare the value of the constant when you declare the constant (rather than at a later point of your choosing). You can't change its value afterward.

As an example, take a look at the following statements:

    Const conVenue As String = "Davies Hall"
    Const conDate As Date = #December 31, 2013#
    MsgBox "The concert is at " & conVenue & " on " _
        & conDate & "."

The first line declares the constant conVenue as a String data type and assigns it the data Davies Hall. The second line declares the constant conDate as a Date data type and assigns it the date December 31, 2013. (When you finish creating this line of code and move the insertion point to another line, VBA changes the date to the date format set on your computer—#12/31/2013#, for example.) The third line displays a message box containing a string concatenated from the three text items in double quotation marks, the conVenue string constant, and the conDate date constant.

## Choosing the Scope and Lifetime for Your Constants

Scope works the same way for constants as it does for variables. The default scope for a constant declared in a procedure is local—that is, its scope is the procedure that declares it. Consequently, its lifetime is the time for which the procedure runs. But you can set a different scope and lifetime for your constants by using the Public or Private keyword.

 * To declare a private constant, place the declaration at the beginning of the module in which you want the constant to be available. A private constant's lifetime isn't limited, but it's available only to procedures in the module in which it's declared:

    Private Const conPerformer As String = "Carmen Singer"

 * To declare a public constant, place the declaration at the beginning of a module. A public constant's lifetime isn't limited, and it's available to all procedures in all modules in the project in which it's declared:

    Public Const conTicketPrice As String = "$34.99"

# Working with Enumerations

In addition to constants you can create in your code, VBA includes sets of predefined constants. An _enumeration_ is a predefined list of unique integers (numbers) that have individual names. It's a set of items, related in some way.

Here's an enumeration, a set of items that you need to paint a room.
Note that another way to describe this is that it's a numbered list:

1. Brushes

2. Paint

3. Masking tape

4. Drop cloth

5. Sandpaper

You could now refer to any of these items either by their number in the enumeration or by their name.

An enumeration is typically used in your programming to specify a property of an object. Each integer in the enumeration has a meaning to VBA and a name that allows you to refer to it easily. The names that correspond to the integers in the enumeration are called _enumerated constants_.

For example, when you use the MsgBox function to display a message box using VBA, you can pick one of the enumerated constants in the VbMsgBoxStyle enumeration to specify the type of message box you want to show. If you require an icon in the message box, you can specify which icon from the list of available built-in icons. For example, one of the icons—a stop sign—is the enumerated constant vbCritical (or the integer 16). The enumerated constant vbQuestion (integer 32) displays a question-mark icon, and the enumerated constant vbExclamation (48) displays an exclamation-point icon. The enumerated constant vbInformation (64) refers to an information icon. However, in practice, the integers are rarely used. The enumerated constants (names like vbQuestion) are far easier for humans to grasp, read, and remember than the values (the various integers like 16, 32, 64, and so on) to which they are mapped. So, although you _could_ use the integers in your code, it's better to stick with the enumerated constants like vbQuestion.

VBA includes many built-in enumerations, and the Visual Basic Editor displays the list of available enumerated constants to help you select the appropriate integer value when you're creating code. To see such a list, type this into a procedure:

    msgbox("inga",

As soon as you type the comma, up pops the list of enumerated constants—all the available button and icon styles for a message box, including vbQuestion, vbYesNo, vbOKOnly, and so on. As you might guess, the vbOKOnly style displays only a single button, captioned OK. The vbYesNo style displays two buttons, one captioned Yes, the other No.

You just click one of these styles in the list of enumerated constants to enter it into your code. If you don't see the list, choose Tools ⇒ Options in the Visual Basic Editor, then select the Auto List Members check box.

You can also define your own enumerations in custom objects that you create.

# The Bottom Line

**Understand what variables are and what you use them for.**

Variables are a cornerstone of computer programming; they are extremely useful for the same reason that file folders are useful in the real world. You give a name to a variable for the same reason that you write a name on a folder: to identify its contents. And a folder can, over time, contain various different papers, just as the value contained in a programming variable can vary. In both cases, the contents vary; the name remains the same. It's good practice to always declare a variable explicitly before using it in your code. This is called _explicit declaration_.

Master It

Explicitly declare a variable named CustomersAge.

**Create and use variables.**

When creating (declaring) a new variable, you should avoid using words or commands that are already in use by VBA, such as **Stop** or **End**. There are also restrictions such as not using special characters.

Master It

This variable name cannot be used, for two reasons.
Fix it so it is a legitimate variable name:

    Dim 1Turn! as Integer

**Specify the scope and lifetime of a variable.**

Variables have a range of influence, depending on how you declare them.

Master It

Create a variable named AnnualSales that will be available to any procedure within its own module but not to other modules.

**Work with constants.**

Constants, like variables, are named locations in memory that contain a value. Unlike with variables, however, the value in a constant does not change during program execution.

Master It

Define a string constant using the Const command. Name your constant FirstPrez, and assign it the value George Washington.

**Work with enumerations.**

Enumerations provide a handy name for each item in a list, often a list of properties.

Master It

In the Project Explorer, click the ThisDocument object to select it. Then locate the JustificationMode property in the Properties window, and choose one of that property's enumerated constants by clicking the small down arrow that appears, then clicking one of the constants in the drop-down list.

Chapter 7

Using Array Variables

In this chapter, you'll learn how to use arrays—containers that can store multiple values at the same time. An array is a kind of super-variable.

You'll start by examining what arrays are and what you use them for. You'll then examine how to create them, populate them, and erase them. Along the way, you'll look at how to resize an array to make it contain more (or fewer) values, how to specify the scope for an array, and how to find out while your macro executes whether a particular variable name represents an array or just an ordinary, single-value variable.

In this chapter you will learn to do the following:

 * Understand what arrays are and what you use them for
 * Create and use arrays
 * Redimension an array
 * Erase an array
 * Find out whether a variable is an array
 * Sort an array
 * Search an array

# What Is an Array?

An _array_ is a variable on steroids—a variable that can contain multiple values (but they must be of the same data type).

You can access the array itself as a whole to work with all the values it contains at once. Or you can access any individual value stored within the array by specifying its index number, which indicates its position within the array.

If you're having difficulty visualizing what this means, try picturing an array as a numbered list, similar to an enumeration (as described in Chapter 6, "Working with Variables, Constants, and Enumerations"). Each item in the list is located in its own row and is identified by an index number, so you can access the value of the item by just specifying its index number. It's like houses on a street: they all share the same street name, such as Maple Drive, but each has a distinguishing number all its own. You'll see visual examples of arrays later in this chapter.

The previous description is of a simple array—a numbered list like a row of houses on a street. Such an array is said to have only one _dimension_. However, later in this chapter you'll see that you can construct more complicated arrays, which are called _multidimensional_. They're more like a crossword puzzle with both rows _and columns_.

* * *

Variant Arrays Can Store Values of Differing Data Types

An array with the Variant data type can store multiple subtypes of data. That's because a Variant permits any kind of data: strings, integers, and so on.
It's a shape-shifter, unique among data types in that it can contain data of all types.

* * *

For now, though, let's look at the qualities of the most common, and most easily visualized, array structure, the _one-dimensional array_.

* * *

Use Option Base 1 to Simplify Indexes

Although your code will be less portable—and other programmers who use other computer languages might object—if you're writing macros for your own private use you might want to employ the controversial Option Base 1 statement.

An array is _delimited_ (or bounded) by a lower bound and an upper bound. In other words, the array's index numbers start with 1 (the lower bound) and end with whatever number of items are in the array (the upper bound). An array representing the eggs in an egg carton would have a lower bound of 1 and an upper bound of 12. That's the simple way to construct and visualize an array, but there's a catch: many computer languages, including VBA, employ a lower bound of _zero_ rather than one by default.

This means that the first item in an array is indexed as zero—it's the zeroth item. This can be confusing, because it means that you're always working with an index number that's one lower than the item's position in the array. In such an array, January would be the zeroth month, February the first month (array index number 1), March the second (index 2), and so on. It's as if your shopping list looked like this:

0. Brushes

1. Paint

2. Masking tape

3. Drop cloth

4. Sandpaper

Nobody writes lists with a zeroth item, but this is just one of the kinks in computer programming caused by carelessness when programming languages were first invented.

However, _unlike_ most other computer languages, VBA allows you to normalize the way array indexes work: beginning them with index 1, the way humans count items in sets or lists.

VBA lets you make 1 the default index number of the first item in an array by entering an Option Base 1 statement at the beginning of a module. Type this option up in the General Declarations section of your Code window, and the index number for each item in the array is then the same as the item's position in the array, so the array is easier to work with—easier to visualize.

Why does the first item in an array default to zero anyway? Forty years ago, people who wrote programming languages decided to do this, and it has persisted. The major exception was the BASIC language, VBA's ancestor. It defaulted, sensibly, to 1 as the lower bound of any array. Eventually (with version 6 of Visual Basic), BASIC was modified to make it conform to the other languages, and those in charge changed VBA's lower bound to zero. But BASIC did preserve the programmer's option to specify the lower bound as 1 with this Option Base statement.

Arrays are lists, and we humans don't start lists with zero. We have a first birthday party, not a zeroth one. A winning team comes in first place, not zeroth place. Nonetheless, computer programmers have been wrestling with zero-based array indexing for decades now—and introducing countless bugs into their code as a result. You're fortunate to be working with VBA, where you have an option to avoid this problem if it bothers you. But note that if you are studying programming or plan to use other languages or program professionally, you will have to accustom yourself to the types of error messages generated by this zero index hitch.
Then you can say, "Oh, this is probably an indexing problem," and fiddle with an index number to fix it. Generally, you'll subtract 1 from the index number and that'll do the trick.

* * *

# Declaring an Array

An array is a kind of variable, so you declare an array by using the familiar keywords: Dim, Private, Public, and Static. To indicate that it's an array, however, you add a pair of parentheses after the array's name. For example, the following statement declares an array named varMonthProfit:

    Dim varMonthProfit()

If you had left off the parentheses, then you would have created an ordinary variable capable of holding only a single value:

    Dim varMonthProfit

Because no data type was specified in the declaration (Dim) of the preceding array example, this example creates a Variant array. VBA then assigns the appropriate data types (String, Integer, and so on) when you store data in the array.

But you can specify the data type of an array, just as you would for an ordinary variable. For example, the following statement declares an array named curMonthProfit and makes it the Currency data type:

    Dim curMonthProfit() As Currency

You can also specify the number of items in the array by using an _array subscript_. For example, the following statement declares the array named curMonthProfit, assigns the Currency data type, and specifies that the array contains 12 items:

    Dim curMonthProfit(11) As Currency

Now you can see one aspect of the zeroth problem. This array holds 12 items, but in its declaration we must specify 11! The array _subscript_ in the Dim curMonthProfit(11) As Currency statement is _11_ rather than 12 because by default an array's index starts at 0 rather than 1. That 0 index number gives this list _an extra element_. The 1st item is curMonthProfit(0), the 2nd is curMonthProfit(1), and the 12th is curMonthProfit(11). (You can avoid this counterintuitive approach by using the Option Base 1 statement.)

Figure 7.1 shows a simple representation of the single-dimensional array created by the Dim curMonthProfit(11) As Currency statement.

Figure 7.1 The single-dimensional array created by the statement Dim curMonthProfit(11) As Currency can be thought of as looking like this.

To make numbering start at 1, add an Option Base 1 statement to the declarations area at the beginning of the module in which you declare the array. Here is an example:

    Option Base 1 'at the beginning of the code sheet

    Dim curMonthProfit(12) As Currency

Figure 7.2 shows a simple representation of how this array would look.

Figure 7.2 The single-dimensional array created by the statement Dim curMonthProfit(12) As Currency with the Option Base 1 statement. Compare this to Figure 7.1.

* * *

Variants Can Be Inefficient under Extreme Circumstances

Recall that omitting the data type when declaring an array (and thus making VBA automatically use the Variant data type) causes slightly increased memory usage, which could (under extreme circumstances) slow the performance of the computer. Because an array needs storage for each item it contains, a very large array can consume a significant amount of memory. This is particularly true of the multidimensional arrays discussed later in this chapter.

* * *

You can also specify both the lower and upper bounds of an array explicitly.
This example code states that the lower bound is to be 1 and the upper bound is 12:

    Option Base 1 'at the beginning of the code sheet

    Dim curMonthProfit(1 To 12) As Currency

Because learning to use arrays is much easier for beginners if we start with an index of 1, the examples in the rest of this chapter use Option Base 1 statements.

# Storing Values in an Array

To assign a value to an item in an array, you use each item's index number to identify it. For example, the following statements assign the values London, Hong Kong, and Taipei to the first three items in an array named strLocations:

    Option Base 1

    Dim strLocations(6) As String

    strLocations(1) = "London"
    strLocations(2) = "Hong Kong"
    strLocations(3) = "Taipei"

Figure 7.3 shows how this array can be envisioned.

Figure 7.3 A simple String array with three values assigned

# Multidimensional Arrays

The curMonthProfit example in the previous section is a one-dimensional array, which is the easiest kind of array to use. But VBA supports arrays with up to 60 dimensions—enough to tax the visualization skills of anyone without a PhD in multidimensional modeling. You probably won't want to get this complicated with arrays—two, three, or four dimensions are enough for most purposes. In fact, one dimension is enough for many purposes.

To declare a multidimensional array, you separate the dimensions with commas. For example, the following statements declare a two-dimensional array named MyArray with three items in each dimension:

    Option Base 1
    Dim MyArray(3, 3)

Figure 7.4 shows how you might represent the resulting array. Note that inside each item in this figure's table you can see the pair of index numbers you would use to access it, such as item 1,2 or item 3,2.

Figure 7.4 You can think of a two-dimensional array as consisting of rows and columns.

Multidimensional arrays sound forbidding, but a two-dimensional array is quite straightforward if you think of it basically as a _table_ that consists of rows and columns.

In this example, the first series of three elements appears down the first column of the table, the second series of three elements appears down the second column, and so on.

The information in any series doesn't need to be related to the information in the other series, although it does need to be of the same data type. For example, you could assign three folder names to the first dimension of a String variable array (they would be in column 1), the names of your three cats to the second dimension (more strings), a list of the names of the Three Stooges to the third dimension (the third column in the table), and so on. You could then access the information in the array by specifying the position of the item you want to access—for instance, the second item in the first column of the table (item 1,2). You'll learn how to do this in just a minute.

Similarly, you could picture a three-dimensional array as being something like a workbook of spreadsheets—rows and columns, with further rows and columns in the third dimension (down, or away from you).

But that's about the limit of easily picturable arrays—four-dimensional and larger arrays start to tax the imagination. A row of honeycombs, a set of apartment buildings? It gets difficult.

# Declaring a Dynamic Array

You can declare both _fixed-size_ arrays and _dynamic_ arrays. The examples you've seen so far were fixed-size arrays. For instance, the curMonthProfit array was specified as having 12 items.
Dynamic arrays are useful when the number of values you need to store will vary. For example, for a procedure that arranges windows side by side, you might create an array to contain the name of each open window. But while writing the code, you can't know how many windows will be open when the macro runs, so you'll probably want to use a dynamic array to contain the information. That way the array can be sized to fit the situation.

To declare a dynamic array, you use a declaration statement _without_ specifying the number of items (you include the parentheses but leave them empty). For example, the following statement declares a dynamic array named arrTestArray and causes VBA to assign it the Variant data type (because no data type is specified):

    Dim arrTestArray()

# Redimensioning an Array

You can change the size of, or _redimension_, a dynamic array by using the ReDim statement. For example, to redimension the dynamic array arrTestArray declared in the previous example and assign it a size of five items, you could use the following statement:

    **ReDim** arrTestArray(5)

When you use ReDim to redimension an array like this, you lose the values currently in the array. If so far you've only declared the array as a dynamic array and it contains nothing, losing its contents won't bother you. There are no contents.

But in other situations an array might be full of data, so you'll want to increase the size of the array while keeping its current contents. To preserve the existing values in an array when you raise its upper bound, use a ReDim Preserve statement instead of a straight ReDim statement:

    ReDim **Preserve** arrTestArray(5)

If you use ReDim Preserve to reduce the size of the array (to lower its upper bound), you of course lose the information stored in any items not included in the redimensioned array. For example, if you have a five-subscript (five-item) array with information in each item and then you redimension it using ReDim Preserve so that it has only three subscripts, you lose the information in the fourth and fifth subscripts.

Note that ReDim Preserve works only for the last dimension of a multidimensional array. You can't preserve the data in the other dimensions of a multidimensional array.

# Returning Information from an Array

To get information from an array, you use an index number to specify the position of the information you want to return. For example, the following statement returns the fourth item in the array named arrMyArray and displays it in a message box:

    Option Base 1

    MsgBox arrMyArray(4)

The following statement returns the fifth item in the second dimension of a two-dimensional array named arrMy2DArray and displays it in a message box:

    Option Base 1

    MsgBox arrMy2DArray(2, 5)

To return multiple items from an array, specify each item individually.

# Erasing an Array

To erase the contents of an array, use the Erase statement with the name of the array. This statement reinitializes the items in a fixed-size array and frees the memory taken by items in dynamic arrays (completely erasing the array). For example, the following statement erases the contents of the fixed-size array named arrMyArray:

    Erase arrMyArray

# Finding Out Whether a Variable Is an Array

Because an array is a type of variable, you may occasionally need to check whether a particular variable name denotes an array or an ordinary variable (sometimes called a _scalar variable_).
To find out whether a variable is an array, use the IsArray function with the variable's name. For example, the following statements check the variable MyVariable and display the results in a message box:

    If IsArray(MyVariable) = True Then
        Msg = "MyVariable" & " is an array."
    Else
        Msg = "MyVariable" & " is not an array."
    End If
    MsgBox Msg, vbOKOnly + vbInformation, "Array Check"

# Finding the Bounds of an Array

To find the bounds of an array, you use the LBound function and the UBound function. LBound returns the _lower bound_, the index number of the first item; UBound returns the _upper bound_, the index number of the last item.

The LBound function and the UBound function have the following syntax:

    LBound( _array_ [, _dimension_])
    UBound( _array_ [, _dimension_])

Here, _array_ is a required argument specifying the name of the array, and _dimension_ is an optional variant specifying the dimension whose bound you want to return—1 for the first dimension, 2 for the second, and so on. (If you omit the _dimension_ argument, VBA assumes you mean the first dimension.)

For example, the following statement returns the upper bound of the second dimension in the array named arrMyArray and displays it in a message box:

    MsgBox UBound(arrMyArray, 2)

# Sorting an Array

You'll sometimes need to sort an array, especially when you load information into the array from an external source rather than assigning values one by one in your code.

Sorting is easy to understand conceptually: You simply rearrange things into the desired order. For example, you could sort the strings in one array into alphabetical order or reverse alphabetical order, or the numbers in another array into ascending order or descending order. But writing a program that sorts is much more difficult. So, don't write it. Just copy it from examples on the Internet, or from the following example.

This section shows you a simple form of sorting—the bubble sort, so called because the items that belong at the earlier positions in the array gradually bubble up to the top as the sort proceeds. The bubble sort consists of two _loops_ that compare two items in the array; if the second item belongs further up the list than the first item, the sort reverses their positions, and the comparisons continue until the whole list is sorted into order. The bubble sort is a relatively inefficient method of sorting items, but it's easy to grasp, and processor cycles are comparatively cheap these days. The bubble sort hasn't itself become any more efficient over the years, but processor speeds have sure ramped up.

This example also introduces you to a major element of programming: the _loop_. Loops are an important tool found in many procedures and projects. In effect, a loop repeats some action until a condition is met. It's like saying, "Keep rearranging these attendance cards until the stack is alphabetized." Chapter 12, "Using Loops to Repeat Actions," shows you how to work with loops.

Listing 7.1 contains the code for the bubble sort.

**Listing 7.1**: A bubble sort

    1. Option Explicit
    2. Option Base 1
    3.
    4. Sub Sort_an_Array()
    5.
    6. 'declare the array and other variables
    7. Dim strArray(12) As String
    8. Dim strTemp As String
    9. Dim strMsg As String
    10. Dim X As Integer, Y As Integer, i As Integer
    11.
    12. 'assign strings to the array
    13. strArray(1) = "nihilism"
    14. strArray(2) = "defeatism"
    15. strArray(3) = "hope"
    16. strArray(4) = "gloom"
    17. strArray(5) = "euphoria"
    18. strArray(6) = "despondency"
    19. strArray(7) = "optimism"
    20. strArray(8) = "pessimism"
    21. strArray(9) = "misery"
    22. strArray(10) = "happiness"
    23. strArray(11) = "bliss"
    24. strArray(12) = "mania"
    25.
    26. strMsg = "Current items in array:" & vbCr & vbCr
    27. For i = 1 To UBound(strArray)
    28. strMsg = strMsg & i & ":" & vbTab & strArray(i) & vbCr
    29. Next i
    30. MsgBox strMsg, vbOKOnly + vbInformation, "Array Sorting: 1"
    31.
    32. For X = LBound(strArray) To (UBound(strArray) - 1)
    33. For Y = (X + 1) To UBound(strArray)
    34. If strArray(X) > strArray(Y) Then
    35. strTemp = strArray(X)
    36. strArray(X) = strArray(Y)
    37. strArray(Y) = strTemp
    38. strTemp = ""
    39. End If
    40. Next Y
    41. Next X
    42.
    43. strMsg = "Items in sorted array:" & vbCr & vbCr
    44. For i = 1 To UBound(strArray)
    45. strMsg = strMsg & i & ":" & vbTab & strArray(i) & vbCr
    46. Next i
    47. MsgBox strMsg, vbOKOnly + vbInformation, "Array Sorting: 2"
    48.
    49. End Sub

Read through this code, and the explanation of it that follows, to see how much of it you can understand. At this point, you might not grasp much at all. But don't worry; things will become clearer as you progress through this book. What's more, you need never write a bubble sort from scratch anyway—just copy this one, modifying it a little to sort whatever array you're dealing with. And remember, you can copy all the code in this book from this book's website at www.sybex.com/go/masteringvba2013.

* * *

How to Locate Line Numbers in the Editor

In this book, code examples more than a few lines long are given line numbers so the lines can be referenced easily in the explanatory text. If you're following along with a code description in this book, you'll sometimes want to know what line the blinking cursor is on in the editor. Just look at the field at the far right of the editor's Standard toolbar, right next to the blue Help question mark. This field always displays the current line number and character number, as you can see in this screenshot.

* * *

Here's what happens in Listing 7.1:

 * Line 1 contains an Option Explicit statement to force explicit declarations of variables, and line 2 contains an Option Base 1 statement to make array index numbers start at 1 rather than 0. These two statements appear in the General Declarations zone of the code sheet, above any other procedure in the Code window. Line 3 is a spacer—a blank line inserted just to make the code easier to read. You can remove it if you wish, or add more spacers—it's your call. VBA ignores blank lines.
 * Line 4 begins the Sort_an_Array procedure. Line 5 is a spacer.
 * Line 6 is a comment line prefacing the declaration of the array and the variables. Line 7 declares the String array strArray with 12 subscripts (array items). Line 8 declares the String variable strTemp. Line 9 declares the String variable strMsg. Line 10 declares the Integer variables X, Y, and i. Line 11 is a spacer.
 * Line 12 is a comment line explaining that the next 12 statements (lines 13 through 24) assign strings to the array. The strings used are words describing various moods. Line 25 is a spacer.
 * Lines 26 through 30 build a string out of the strings assigned to the array and then display it in a message box. This section of code is included to help users easily see what's going on if they run the procedure rather than stepping through it.
Line 26 assigns introductory text and two carriage returns (two vbCr characters) to the String variable strMsg. Line 27 starts a For... Next loop that runs from i = 1 to i = UBound(strArray)—in other words, once for each item in the array. (The loop could also have run to i = 12 because the upper bound of the array is set, but using the upper bound is more flexible than hard-coding values.) Line 28 adds to strMsg the value of the counter variable i, a colon, a tab (vbTab), the contents of the array item currently referenced (strArray(i)), and a carriage return (vbCr). Line 29 concludes the loop, and line 30 displays a message box containing strMsg, as shown in Figure 7.5. Line 31 is a spacer. + +Figure 7.5 The Sort_an_Array procedure displays a message box of the unsorted terms so that the user can see how things start out. + + * The sorting part of the procedure takes place in lines 32–41. Here are the details: + * Line 32 begins a set of nested loops: one inside another. There's an outer loop and an inner loop. The outer For... Next loop ends in line 41 with the Next X statement. This loop runs from X = LBound(strArray) (in other words, X = 1) to X = (UBound(strArray) - 1) (in other words, X = 11, the upper bound of the array, minus 1). + * Line 33 begins the inner (nested) For... Next loop, which runs from Y = (X + 1) to Y = UBound(strArray). Line 40 ends this loop. + * Line 34 compares strArray(X) to strArray(Y). If strArray(X) is greater than strArray(Y)—in other words, if strArray(X) should appear after strArray(Y) in the alphabetized array—line 35 assigns strArray(X) to strTemp, line 36 assigns strArray(Y) to strArray(X), and line 37 assigns strTemp to strArray(Y), thus switching the values. Line 38 restores strTemp to an empty string. Line 39 ends the If statement. Line 40 ends the inner loop, line 41 ends the outer loop, and line 42 is a spacer. + * Lines 43 through 47 essentially repeat lines 26 through 30, displaying a message box (shown in Figure 7.6) of the now-sorted array so that the user can see that the sort has worked. + +Figure 7.6 When the Sort_an_Array procedure has finished sorting, it displays the sorted list in a second message box. + + * Line 48 is a spacer, and line 49 ends the procedure. + +# Searching through an Array + +Another task you sometimes need to perform with an array is searching to find a particular value in it. This is similar to rifling through a box of recipe cards until you find _Ralph's Jailhouse Chili_. + +The following sections show you two methods of searching—a linear search, which you can perform on either a sorted array or an unsorted array, and a binary search, which is faster but works only on a sorted array. + +## Performing a Linear Search through an Array + +A _linear_ search is a simple kind of search: You start at the beginning of the array and check each item until you find your target, or until you reach the end of the array and must report _not found_. + +Before executing this code, display the Immediate window in the editor by pressing Ctrl+G or choosing View ⇒ Immediate Window. This procedure prints information in the Immediate window so that you can see what's going on—and whether the code is running as intended. Using the Immediate window like this to check output is often preferable to displaying message boxes as we did in the previous section. With the Immediate window, you don't have to click the message boxes closed, and the window can also be scrolled, displaying as much information as you wish.
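If you haven't used Debug.Print before, here is a minimal sketch of the technique on its own (the procedure name is invented for this illustration); run it, and three test values appear in the Immediate window: + + Sub Demo_Immediate_Window() + + 'print a few test values to the Immediate window + Dim i As Integer + For i = 1 To 3 + Debug.Print "Test value: " & i + Next i + + End Sub + +Because Debug.Print sends its output only to the Immediate window, a user running the macro normally never sees it.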
+ +Listing 7.2 contains the code for a simple linear search through a one-dimensional array. + +**Listing 7.2**: A simple linear search + + 1. Option Explicit + 2. Option Base 1 + 3. + 4. Sub Linear_Search_of_Array() + 5. + 6. 'declare the array and the variables + 7. Dim intArray(10) As Integer + 8. Dim i As Integer + 9. Dim varUserNumber As Variant + 10. Dim strMsg As String + 11. + 12. 'add random numbers between 0 and 10 to the array + 13. 'and print them to the Immediate window for reference + 14. For i = 1 To 10 + 15. intArray(i) = Int(Rnd * 10) + 16. Debug.Print intArray(i) + 17. Next i + 18. + 19. Loopback: + 20. varUserNumber = InputBox _ + ("Enter a number between 1 and 10 to search for:", _ + "Linear Search Demonstrator") + 21. If varUserNumber = "" Then End + 22. If Not IsNumeric(varUserNumber) Then GoTo Loopback + 23. If varUserNumber < 1 Or varUserNumber > 10 Then GoTo Loopback + 24. + 25. strMsg = "Your value, " & varUserNumber & _ + ", was not found in the array." + 26. + 27. For i = 1 To UBound(intArray) + 28. If intArray(i) = varUserNumber Then + 29. strMsg = "Your value, " & varUserNumber & _ + ", was found at position " & i & " in the array." + 30. Exit For + 31. End If + 32. Next i + 33. + 34. MsgBox strMsg, vbOKOnly + vbInformation, "Linear Search Result" + 35. + 36. End Sub + +Here's what happens in Listing 7.2: + + * As in the previous listing, line 1 contains an Option Explicit statement to force explicit declarations of variables, and line 2 contains an Option Base 1 statement to make the index numbers of arrays start at 1 rather than 0. These two statements appear in the declarations part of the code sheet, before any other procedure. Line 3 is a spacer. + * Line 4 begins the Linear_Search_of_Array procedure. Line 5 is a spacer. + * Line 6 is a comment line prefacing the declaration of the array and the other variables that the code uses. Line 7 declares the Integer array intArray with 10 subscripts. Line 8 declares the Integer variable i (traditionally programmers use the name i for a loop's counter variable— _i_ for _increment_ or _iteration_ ). + * Line 9 declares the Variant variable varUserNumber, which the code uses to store the user's input from an input box. (More on this in a moment.) Line 10 declares the String variable strMsg. Line 11 is a spacer. + * The procedure declares the variable varUserNumber as a Variant rather than an Integer. This way, Visual Basic doesn't automatically halt execution and display an error message if the user enters something other than an integer (for example, text) in the input box. + * Lines 12 and 13 contain an extended comment on the code in lines 14 through 17. (These two lines can't be combined into one logical line with a continuation character, because VBA doesn't recognize line continuation inside comments; each line of an extended comment must begin with its own apostrophe.) + * Line 14 begins a For... Next loop that repeats 10 times: from i = 1 to i = 10. Line 15 assigns to the current item in the intArray array the integer result of a random number multiplied by 10: intArray(i) = Int(Rnd * 10). (The Rnd function generates a random number between 0 and 1 with a good number of decimal places. So the procedure multiplies that random number by 10 to get a number between 0 and 10 and then takes the integer portion of the number. In other words, the Int command strips off any fractional result, any values to the right of the decimal point.)
Line 16 then uses the Print method of the Debug object to print the current item in intArray to the Immediate window. This is an easy way for you, the programmer, to examine the values generated randomly for the array. The user never sees the Immediate window. Line 17 ends the loop with the Next i statement. Line 18 is a spacer. + * Line 19 contains a _label_ , named Loopback, used to return execution to this point in the code if the user's input does not meet required conditions (if it's not between 1 and 10). + * Line 20 assigns to the Variant variable varUserNumber the result of the user's input. An input box (shown in Figure 7.7) prompts the user to enter a number between 1 and 10. + * Line 21 then compares the contents of varUserNumber to an empty string—the result you get if the user clicks the Cancel button in the input box or clicks the OK button without entering anything in the text box. If varUserNumber is an empty string, the End statement ends execution of the procedure. + * Line 22 uses the IsNumeric function to see whether the contents of varUserNumber are numeric. If they're not, the GoTo Loopback statement returns execution to the Loopback label, after which the input box is displayed again for the user to try their luck once more. Line 23 checks to see if varUserNumber is less than 1 or greater than 10. If either is the case, another GoTo Loopback statement returns execution to the Loopback label, and the input box makes another appearance. Line 24 is a spacer. + +Figure 7.7 The Linear_Search_of_Array procedure displays an input box prompting the user to enter a number between 1 and 10. The array itself is printed in the Immediate window. + +* * * + +VBA Is Flexible + +Note the flexibility of VBA here: The code solicits user input and makes sure that it's a number between 1 and 10 (inclusive). Though that number is still stored in a Variant rather than explicitly converted to an Integer, VBA still performs the comparison needed. + +* * * + + * Line 25 assigns to the String variable strMsg a preliminary message stating that the value (which it specifies) was not found in the array. (If the code finds the value in the array, it changes the message before displaying it.) Line 26 is a spacer. + * Lines 27 through 32 contain the searching part of the procedure. Line 27 begins a For... Next loop that runs from i = 1 to i = UBound(intArray)—once for each subscript in the array. Line 28 compares intArray(i) to varUserNumber; if there's a match, line 29 assigns to strMsg a string telling the user at which position in the array the value was found, and line 30 uses an Exit For statement to exit the For... Next loop. (If line 28 finds no match, the Next i statement in line 32 causes the code to loop.) + * Line 33 is a spacer. Line 34 displays a message box containing strMsg to convey to the user the result of the linear search operation. Figure 7.8 shows the result of a successful search. Line 35 is a spacer, and line 36 ends the procedure. + +Figure 7.8 Line 34 of Listing 7.2 displays a message box telling the user the result of the linear search operation. + +* * * + +How to Generate Random Numbers + +Sharp-eyed readers will notice that a 0 sometimes appears in the array in the previous example, and what's more, 10 never appears. In other words, the code Int(Rnd * 10) randomly produces numbers ranging from 0 to 9. This is a byproduct of the truncation performed by the Int command: Rnd always returns a value less than 1, so Rnd * 10 is always less than 10, and Int simply drops the fractional part. Here's how to use the Rnd command to produce the exact range of numbers you want.
+ +When asking VBA for a random number, you specify the upper limit of the range of numbers you want and then multiply that number by Rnd. For example, if you want to simulate rolling dice, you need random numbers from 1 to 6, so 6 is the upper limit. You multiply the result that Rnd gives you by 6. _But then you must add 1_ to make the result range from 1 to this upper limit. (Otherwise, the result is a range between 0 and the upper limit, minus 1, as in the code in Listing 7.2, which provided numbers from 0 to 9 rather than 1 to 10.) + +The Int function must be used because Rnd provides only fractions. Here are some typical results when the Rnd function executes: + + * 0.4542078 + * 0.3570231 + * 0.1499811 + * 0.7043958 + * 0.928786 + +Because these are fractions, you multiply them to scale them up to the range you want, and the Int command then strips off any fractional part of the final result. So here is how you would get a random number from 1 to 50: + + X = Int(Rnd * 50 + 1) + +To get a range from 0 to an upper limit, specify as the upper limit a number 1 higher than you actually want. And don't add 1 inside the parentheses. This example provides a random number from 0 to 50: + + X = Int(Rnd * 51) + +* * * + +## Performing a Binary Search through an Array + +As you saw in the previous section, a linear search is easy to perform, but it's pretty simple and slow—it starts looking at the beginning of the array and then checks each element, each item, in turn. This approach works fine for small searches, such as the 10-subscript array you searched in the last example, but you wouldn't want to try it on anything the size of a phone book—even in a small town. For serious, heavy-duty searching, you need a smarter approach. + +For most purposes, a _binary search_ is a good way to approach searching a sorted array. A binary search formalizes the technique you probably use when searching for something like a lost TV remote control. You expect it to be in a given location—somewhere in the living room, probably near the couch. So you focus your attention on the relevant area and search it thoroughly. (With a _linear search_ , you search everywhere in the house, from start to finish, without any attempt to intelligently narrow the search area.) + +The binary search technique (technically called an _algorithm_ ) determines the most likely target area by dividing the sorted array in half, establishing which half will contain the search item, and then repeating the divide-and-interrogate procedure until it either finds the search item or reaches the last subdivisible unit of the array without finding it. Remember, this array is presorted, so if the algorithm is looking for the number 12 in a list from 1 to 20, a single comparison against the midpoint value, 10, tells it that the target must be in the second half of the list. + +Here's another example. Say that a binary search is looking for the value 789,789 in a million-subscript array that contains the numbers 1 through 1,000,000 in ascending order. It divides the array into two halves, each of which contains a half million subscripts. It establishes whether the search item is in the first half or the second half and then narrows the search to the appropriate half and divides it into new halves. It establishes whether the search item is in the first of these halves or the second and then focuses on that half, dividing _it_ into halves—and so on until it finds the term or has gotten down to a single subscript. + +This is a simple example, but a million is still a hefty number.
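To see how few steps that halving takes, you can do the math right in the Immediate window. The line below is a quick check you can type there (the ? prefix is shorthand for Print); VBA's Log function returns natural logarithms, so dividing by Log(2) converts the result to base 2: + + ?Int(Log(1000000) / Log(2)) + 1 + +The answer, 20, is the maximum number of halvings a binary search needs to pin down one item among a million, compared with up to a million comparisons for a linear search.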
Listing 7.3 makes things even simpler by using an array of a thousand subscripts that contains the numbers 1 through 1000 in order: The first subscript contains the number 1, the second subscript contains the number 2, and so on up to 1000. The example is unrealistic, but it makes it easy to see what's happening in the code. + +**Listing 7.3**: Searching through a large array + + 1. Option Explicit + 2. Option Base 1 + 3. + 4. Sub Binary_Search_of_Array() + 5. + 6. 'declare the array and the variables + 7. Dim intThousand(1000) As Integer + 8. Dim i As Integer + 9. Dim intTop As Integer + 10. Dim intMiddle As Integer + 11. Dim intBottom As Integer + 12. Dim varUserNumber As Variant + 13. Dim strMsg As String + 14. + 15. 'populate the array with numbers 1 to 1000, in order + 16. For i = 1 To 1000 + 17. intThousand(i) = i + 18. Next i + 19. + 20. 'prompt the user for the search item + 21. Loopback: + 22. varUserNumber = InputBox _ + ("Enter a number between 1 and 1000 to search for:", _ + "Binary Search Demonstrator") + 23. If varUserNumber = "" Then End + 24. If Not IsNumeric(varUserNumber) Then GoTo Loopback + 25. + 26. 'search for the search item + 27. intTop = UBound(intThousand) + 28. intBottom = LBound(intThousand) + 29. + 30. Do + 31. intMiddle = (intTop + intBottom) / 2 + 32. If varUserNumber > intThousand(intMiddle) Then + 33. intBottom = intMiddle + 1 + 34. Else + 35. intTop = intMiddle - 1 + 36. End If + 37. Loop Until (varUserNumber = intThousand(intMiddle)) _ + Or (intBottom > intTop) + 38. + 39. 'establish whether the search discovered the search item + 'or not and add the appropriate information to strMsg + 40. If varUserNumber = intThousand(intMiddle) Then + 41. strMsg = "The search found the search item, " _ + & varUserNumber & ", at position " & intMiddle _ + & " in the array." + 42. Else + 43. strMsg = "The search did not find the search item, " _ + & varUserNumber & "." + 44. End If + 45. + 46. MsgBox strMsg, vbOKOnly + vbInformation, "Binary Search Result" + 47. + 48. End Sub + +Here's what happens in Listing 7.3: + + * Line 1 contains an Option Explicit statement to force explicit declarations of variables, and line 2 contains an Option Base 1 statement to make the numbering of arrays start at 1 rather than 0. These two statements appear in the declarations part of the code sheet, before any procedure. + * Line 3 is a spacer. Line 4 declares the Binary_Search_of_Array procedure, and line 5 is another spacer. + * Line 6 is a comment line prefacing the declaration of the array (the thousand-subscript Integer array intThousand, declared in line 7) and the other variables that the procedure uses: the Integer variables i (line 8), intTop (line 9), intMiddle (line 10), and intBottom (line 11); the Variant variable varUserNumber (line 12); and the String variable strMsg (line 13). Line 14 is yet another spacer. + * Line 15 is a comment line announcing that lines 16 through 18 populate the array with the numbers 1 to 1000 in order. To do so, these lines use a For... Next loop that runs from i = 1 to i = 1000, assigning the current value of i to the subscript in the array referenced by i—in other words, assigning to each subscript the number that corresponds to its position in the array. Line 19 is a spacer. + * Line 20 is a comment line introducing the section of code (lines 21 through 24) that uses an input box (shown in Figure 7.9) to prompt users to enter a number to search for, and checks that they do so.
As in the previous listing, this section of code checks to make sure users don't enter an empty string in the input box (line 23) and terminates execution of the procedure if they do. It also uses a label named Loopback (in line 21), to which the code returns if what a user entered in the input box (in line 22) turns out not to be numeric when line 24 checks. Because this time you know which numbers the array will contain, you don't need to check to make sure that users enter a suitable value. If they want to enter a value that doesn't appear in the array, so be it. + +Figure 7.9 The Binary_Search_of_Array procedure prompts the user to enter a number between 1 and 1000. + + * Line 25 is a spacer, and line 26 is a comment that introduces the section of code that searches for the search item the user entered. Line 27 assigns to the intTop variable the upper bound of the array, and line 28 assigns to intBottom the lower bound. Line 29 is a spacer. + * Lines 30 through 37 contain a Do... Loop Until loop that performs the bulk of the binary searching. Here are the details: + * Line 30 starts the Do... Loop Until loop with the Do keyword, and line 37 ends it with the Loop Until keywords and the condition ((varUserNumber = intThousand(intMiddle)) Or (intBottom > intTop)). You'll look at loops in detail in Chapter 12; for now, all you need to know is that a Do... Loop Until runs once and then evaluates the condition in the Loop Until statement to determine whether it should end or run again. The condition here specifies that the loop continue until either the value of the subscript in the array identified by intMiddle (that is, intThousand(intMiddle)) matches the value in varUserNumber or the value of intBottom is greater than the value of intTop (intBottom > intTop). + * Line 31 sets the value of the Integer variable intMiddle to the sum of intTop and intBottom divided by 2: (intTop + intBottom) / 2. Doing so gives the midpoint for dividing the array. For example, in the thousand-subscript array, intTop has a value of 1000 on the first iteration of the loop, and intBottom has a value of 1, so intMiddle receives the value 500 (1001 divided by 2, rounded to an Integer when the result is assigned). + * Line 32 tests whether varUserNumber is greater than the value stored in the subscript identified by intMiddle—intThousand(intMiddle), the midpoint of the current section of the array. If it is, the search needs to work on the top half of the array, so line 33 resets intBottom to intMiddle + 1. If it's not, the Else statement in line 34 kicks in, and line 35 resets intTop to intMiddle - 1 so that the search works on the lower half of the array. + * Line 36 ends the If statement, and line 37 tests the condition and continues or terminates the loop, as appropriate. + * Line 38 is a spacer. Line 39 contains a two-line comment introducing the code in lines 40 through 44, which establish whether the search found the search item and assign suitable information to the strMsg String variable. Line 40 compares varUserNumber to intThousand(intMiddle); if it matches, line 41 assigns to strMsg a string telling the user where the search item was found in the array. If it doesn't match, line 43 assigns a string telling the user that the search did not find the search item. Line 45 is a spacer, and line 46 displays a message box telling the user the result of the search. Figure 7.10 shows examples—one successful, one otherwise—of the message box. + +Figure 7.10 The Binary_Search_of_Array procedure tells the user whether the search was successful (left) or not.
+ + * Line 47 is another spacer, and line 48 ends the procedure. + +The most complex part of the procedure is what happens in the loop. Download the code from the book's website at www.sybex.com/go/masteringvba2013. + +Copy the code, and paste it into the Visual Basic Editor (this code will work in any VBA-enabled application). Then open up the module and follow these steps: + +1. Display the Locals window (View ⇒ Locals Window) so that you can track the values of the variables intTop, intMiddle, and intBottom. Figure 7.11 shows the Locals window while the procedure is running. + +Figure 7.11 Use the Locals window to track the values of the intTop, intMiddle, and intBottom variables as the procedure runs. + +2. Set a breakpoint in the procedure on line 22 by clicking in the margin indicator bar next to the statement that begins varUserNumber = InputBox. (Because the statement is broken onto three lines, the Visual Basic Editor displays three red dots rather than one in the margin indicator bar, to indicate the breakpoint.) + +3. Press the F5 key (or choose Run ⇒ Run Sub/UserForm) to run the code up to the breakpoint. VBA creates and populates the array and then stops at line 22. + +4. Press the F8 key to step through the next statements. The first press displays the input box. Enter the value **67** for this example and click the OK button. + +5. As the code enters the Do loop and cycles through it, watch the values of the variables intTop, intMiddle, and intBottom in the Locals window. You'll see them change, as shown in the following list, which gives the values as they stand at the end of each iteration. (When the division produces a result ending in .5, VBA rounds it to the nearest even number as it assigns it to the Integer variable intMiddle.) + +**Iteration** | **intMiddle** | **intBottom** | **intTop** +---|---|---|--- +1 | 500 | 1 | 499 +2 | 250 | 1 | 249 +3 | 125 | 1 | 124 +4 | 62 | 63 | 124 +5 | 94 | 63 | 93 +6 | 78 | 63 | 77 +7 | 70 | 63 | 69 +8 | 66 | 67 | 69 +9 | 68 | 67 | 67 +10 | 67 | 67 | 66 + +At the end of the tenth iteration of the loop, intThousand(intMiddle) is equal to varUserNumber, so the loop ends. As you can see, breakpoints, single-stepping, and the Locals window are excellent debugging tools. Chapter 17, "Debugging Your Code and Handling Errors," further explores these and other debugging techniques. + +# The Bottom Line + +**Understand what arrays are and what you use them for.** + +Arrays play an important role in computer programming. In some ways they resemble a mini-database, and organized data is central to computing. Computers are sometimes called data processors for good reason, and arrays make it easier for you to manipulate variable data. + +**Master It** + +What is the difference between an array and an ordinary variable? + +**Create and use arrays.** + +When you create a new array, you _declare_ it and, optionally, specify the number of values it will contain. + +**Master It** + +There are four keywords that can be used to declare arrays. Name at least three of them. + +**Redimension an array.** + +If you want to resize an existing dynamic array, you can redimension it. + +**Master It** + +Redimensioning an array with the ReDim statement causes you to lose any values that are currently in that array. However, you can preserve these values using a special keyword. What is it? + +**Erase an array.** + +You can erase all the values in a fixed-size array or completely erase a dynamic array. + +**Master It** + +Write a line of code that erases an array named arrMyArray. + +**Find out whether a variable is an array.** + +An array is a type of variable, and you may occasionally need to check whether a particular variable name denotes an array or an ordinary _scalar variable_ (a variable that isn't an array). + +**Master It** + +Which built-in function can you use in VBA to find out whether a variable is an array or an ordinary, single-value variable?
+ +**Sort an array.** + +Visual Basic .NET includes array objects with built-in search and sort methods. In VBA, however, you must write a bit of code to search and sort the values in an array. + +**Master It** + +Name a popular, understandable, but relatively inefficient sorting technique. + +**Search an array.** + +Searching through an array can be accomplished in two primary ways. If you have a relatively small array, you can use the simpler, but less efficient technique. With large amounts of data, though, it's best to use the more robust approach. + +**Master It** + +Name two common ways to search an array. +Chapter 8 + +Finding the Objects, Methods, and Properties You Need + +In this chapter, you'll learn how to find the objects you need in the applications you're using. To learn the material in this chapter, you'll build on what you've learned in the earlier chapters. You'll start by examining the concepts involved: what objects and collections are, what properties are, and what methods are. You'll then learn how to find the objects, collections, properties, and methods you need to make your code work. To identify these items, you'll use a number of tools you've already read about, including the Object Browser (which you used briefly in Chapter 4, "Creating Code from Scratch in the Visual Basic Editor") and the VBA online Help resources. + +Along the way, this chapter explains how to use Object variables to represent objects in your code. + +In this chapter you will learn to do the following: + + * Understand and use objects, properties, and methods + * Use collections of objects + * Find objects, properties, and methods + * Use Object variables to represent objects + +# What Is an Object? + +VBA-enabled applications (and many other modern applications) consist of a number of discrete objects, each with its own characteristics and capabilities. + +## The Benefits of OOP + +Building an application out of objects is called _object-oriented programming_ ( _OOP_ ). In theory, object-oriented programming has a number of benefits—for example, the code is easier to build and maintain (update) because you break it down into objects of a manageable size. + +Object-oriented programs should also be easier to understand than monolithic programs because it's less difficult for most people to grasp the concept of individual objects with associated characteristics and actions than to remember a far longer list of capabilities for the application as a whole. + +Figuring out which commands to use to accomplish your programming goals can also be faster thanks to OOP taxonomy. For example, a table in Word is represented by a Table object, and a column is represented by a Column object. The Column object has a Width property that sets or returns its width. It's simpler to manage this information when it's broken down into small pieces than to deal with some complex command such as WordTableSetColumnWidth or WordTableGetColumnWidth. + +A third benefit of object-oriented programming is that the VBA language itself can be extended. The programmer can build custom objects to implement functionality that the language itself didn't originally contain. For example, you can use VBA to build your own objects that do things that the Office applications themselves can't do. + +Another, rather different, use for OOP is somewhat clerical: OOP can be of help when a group of programmers are working together on a single program. 
They can easily step on each other's toes in various ways—using the wrong version, changing each other's code, and so on. We'll look at the ways OOP is employed in team programming at the end of this chapter. + +Objects can—and frequently do—contain other objects. Typically, the objects in an object-oriented application are arranged into a hierarchy called the _object model_ of the application. This hierarchy is intended to make it easier to figure out where—within a large library of objects—you'll find a particular object that you want to use in your macros. It's similar to the way a biography is likely to be found in the library's nonfiction area. + +* * * + +Object Models Covered in Depth Later in the Book + +This chapter discusses object models only a little, at the conceptual level: You need to know what an object model is in order to make sense of what you'll be learning in the following chapters, but you don't need to know the specifics of each object model to manipulate the objects used in the examples. Part 5 of this book, "Creating Effective Code," examines the object models of each of the applications covered in this book in enough detail to get you started on exploring the depths of each object model on your own. + +* * * + +Most VBA host applications, including all the major Office applications, have an Application object that represents the application as a whole. The Application object has properties and methods for things that apply to the application as a whole. For example, many applications have a Quit method that exits the application and a Visible property that controls whether the application is visible or hidden. + +In a typical object model, the Application object essentially contains all the other objects (and collections—groups—of objects) that make up the application. For example, Excel has an Application object that represents the Excel application, a Workbook object (grouped into the Workbooks collection) that represents a workbook, and a Worksheet object (grouped into the Sheets collection) that represents a worksheet. The Workbook object is contained within the Application object because you normally need to have the Excel application open to work with an Excel workbook. + +In turn, the Worksheet object is contained within the Workbook object because you need to have an Excel workbook open to use a worksheet. Walking further down the object model, the Worksheet object contains assorted other objects, including Row objects that represent the individual rows in the worksheet, Column objects that represent columns in the worksheet, and Range objects (which represent ranges of cells). And these objects in turn contain further objects. + +To get to an object, you typically walk down through the hierarchy of the object model until you reach the object you're looking for. + +To get to a Range object in Excel, for example, you would go through the Application object to the Workbook object, through the Workbook object to the appropriate Sheet object, and then finally to the Range object. The following statement shows how to select the range A1 in the first worksheet in the first open workbook (more on this in a minute): + + Application.Workbooks(1).Sheets(1).Range("A1").Select + +## Understanding Creatable Objects + +The Application object, however, is optional and is usually left out of code lines. Why? 
Because you'd have to go through the Application object to get to pretty much _anything_ in the application, most applications _expose_ (make available to you) a number of _creatable_ objects. Creatable merely means that you can access something without having to type the word Application in your code. It's assumed. This is similar to the fact that you don't have to include the word _Earth_ when addressing an envelope. There's only that one possibility. + +These creatable objects are usually the most-used objects for the application, and by going through them, you can access most of the other objects without having to refer to the Application object. For example, Excel exposes the Workbooks collection as a creatable object, so you can use the following statement, which doesn't require that you type in Application. See the alternative example a couple of paragraphs earlier in this chapter. + + Workbooks(1).Sheets(1).Range("A1").Select + +Any object can have properties and methods. The next sections discuss these items in detail. + +## Properties + +In VBA, a _property_ is an attribute or characteristic of an object. Most objects have multiple properties, each specifying a different aspect of the object. + +Each property has a specific data type for the information it stores. For example, the objects that represent files (such as documents, workbooks, or presentations) typically have a Boolean property named Saved that stores a value denoting whether all changes in the object have been saved (a value of True) or not (a value of False). These two values encompass the entire range of possibilities for the object: it can either contain unsaved changes or not contain unsaved changes. There is no third state. And a Boolean data type is used because that type has only two possible values. + +Similarly, most objects that represent files have a Name property that contains the name of the file in question. The Name property contains a String data type because it needs to contain text. And that text can be just about anything, limited only by the 255-character length that Windows permits for a filename and by certain characters—such as colons and pipe (|) characters—that Windows forbids in filenames. + +To work with a property, you _get_ (fetch or return) it to find out its current value or _set_ (change) it to a value of your choosing. Many properties are _read/write_ , meaning that you can both _get_ and _set_ their values, but some properties are _read-only_ , meaning that you can view their values but not change them. + +The Saved property is read/write for most applications, so you can set it. This means that you can tell the application that a file contains unsaved changes when it really doesn't or that it contains no unsaved changes when it actually has some. (Changing the Saved property can be useful when you're manipulating a file without the user's knowledge.) But the Name property of a file object is read-only—you'll typically set the name by issuing a Save As command, after which you cannot change the name from within the application while the file is open. So you can get (read, return, or fetch) the Name property but not set it. You'll also encounter some write-only properties, properties that you can set but not get. + +When an object contains another object, or contains a collection, it typically has a property that you _call_ (invoke) to return the contained object or collection.
For example, the Word Document object includes a PageSetup property that returns the PageSetup object for the document (the PageSetup object contains settings such as paper size, orientation, lines per page, and margins for the document) and a Tables property that you call to return the Tables collection. Here's how you can _call_ the PageSetup object (which is contained in the Document object): + + Sub GetLinesPage() + + Dim sngLinesPerPage As Single + + **sngLinesPerPage = ActiveDocument.PageSetup.LinesPage** + + MsgBox sngLinesPerPage + + End Sub + +Each object of the same type has the same set of properties but stores its own particular values for them. For example, if you're running PowerPoint and have three Presentation objects open, each has its own Name property. The value in each Name property is specific to each Presentation object. In other words, the value in a property in one object has nothing to do with the value in that property in another object: each object is independent of the other objects. + +## Methods + +A _method is_ an action that an object can perform, a capability an object has. For example, the Document object in various applications has a Save method that saves the document. You can use the Save method on different Document objects—Documents(1).Save saves the first Document object in the Documents collection, and Documents(2).Save saves the second Document object—but the Save method does the same thing in each case. An object can have one or more methods associated with it. Some objects have several dozen methods to implement all the functionality they need. + +The Save method is very common. It appears in many applications, as do other methods, such as SaveAs (which saves the file with a different name, location, or both) and Close (which closes the file). + +But other methods are unique to each application. For example, the Presentation object in PowerPoint has an AddBaseline method that applies a baseline (consisting either of the active presentation or of a specified presentation file) that enables you to track changes for a merge. The Document object in Word has no AddBaseline method, but it has an AcceptAllRevisions method that accepts all revisions in the document. PowerPoint doesn't have an AcceptAllRevisions method. + +Just as methods like Save are common to multiple applications, some methods are found in more than one object. For example, the Delete method is associated with many different objects. As its name suggests, the Delete method usually deletes the specified object. But other implementations of the Delete method behave somewhat differently, depending on the object they're working with. So even if you're familiar with a method from using it with one object, you need to make sure that it will have the effect you expect when you use it with another object. + +Some methods take no arguments. Other methods take one or more arguments (to supply necessary information). Just as with built-in VBA functions like MsgBox, some methods' arguments are required, while others are optional. + +When a method applies to multiple objects, it may have different syntax for different objects. Again, even if you're familiar with a method, you need to know exactly what it does with the object for which you're planning to use it. + +To use a method, you access it through the object involved. 
For example, to close the ActivePresentation object, which represents the active presentation in PowerPoint, you use the Close method (but you must specify the ActivePresentation object, like this): + + ActivePresentation.Close + +* * * + +**Max the Dog: Visualizing Objects, Methods, and Properties** + +If you have a hard time getting a grip on objects, their properties, and methods, here's a somewhat strained comparison between the virtual objects, properties, and methods in VBA and physical objects, properties, and actions in the real world. Consider this example. + +Let's say you have a massive dog named Max—a Pyrenean mountain dog, white, 200 pounds, four years old, male, and not _fixed_. + +Max performs all the usual dog actions—sleep, run, eat, bark, growl, chew things, various unmentionable actions that we'll skip over—but also has a couple of unusual (for dogs) actions built in, such as slobbering on command, knocking people down, and biting mail carriers. + +If Max were implemented in VBA, he'd be a Dog object in a Dogs collection. The Dog object for Max would have properties such as these: + + Name This is a read-only String with a value of Max. + Sex This is a read-only String with a value of Male. + Fixed This is a read/write Boolean with a value of False. + Height This is a read/write Long with a value of 36. + Weight This is a read/write Long with a value of 200. + Age This is a read/write Integer with a value of 4. + Type This is a read/write String with a value of Pyrenean Mountain. + Color This is a read/write String with a value of White. + +Max would have methods such as Slobber, Bark, KnockDown, Intimidate, Chew, Run, and so on. Some of these methods would require arguments. The Slobber method would definitely need arguments like this, probably using Dog-specific constants that start with the dog designation: + + Dogs("Max").Slobber OnWhat:="MyKnee", How:=dogSlobberDisgustingly + +The Dog object would contain objects representing the many components of the dog—ears, eyes, tongue, brain, stomach, legs, tail, and so on. Each of these objects in turn would have its own properties and methods as appropriate. For example, the Tail object would need a Wag method, which you would probably invoke ( _call_ ) something like this: + + Dogs("Max").Tail.Wag Direction:=dogWagHorizontal, Frequency:=200 + +* * * + +# Working with Collections + +When an object contains more than one object of the same type, the contained set of objects is said to be grouped into a _collection_. For example, Word uses Document objects, which are grouped into the Documents collection; PowerPoint has a Presentations collection for Presentation objects, and Excel has the Workbooks collection. + +As in these examples, the name of a collection is usually simply the plural of the name of the objects it contains. There _are_ some exceptions, such as the Sheets collection in Excel that contains the Worksheet objects. But by and large the names of most collections are easy to derive from the name of the objects they contain—and vice versa. + +A collection—taken as a whole—is an object too and can have its own properties and methods. For example, many collections have a Count property that tells you how many objects are in the collection. This next example tells you how many documents are in the Documents collection: + + Sub GetDocCount() + + Dim lngCount As Long + + lngCount = Documents.Count + + MsgBox lngCount + + End Sub + +Collections tend to have fewer properties and methods than individual objects.
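Because a collection is an object, you can also walk through its members directly. Here's a minimal sketch for Word (the procedure name is invented for this example) that uses a For Each loop, one of the loop structures covered in Chapter 12, to list the names of all open documents in the Immediate window: + + Sub List_Open_Documents() + + 'visit each Document object in the Documents collection + Dim doc As Document + For Each doc In Documents + Debug.Print doc.Name + Next doc + + End Sub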
Most collections have an Add method for adding another object to the collection. Some collections, however, are read-only and do not have an Add method. Most collections have an Item property (the default property) for accessing an item within the collection. + +Most collections in VBA have the core group of properties listed in Table 8.1. + +Table 8.1 Core properties for collections in VBA + +**Property** | **Explanation** +---|--- +Application | A read-only property that returns the application associated with the object or collection—the root of the hierarchy for the document. For example, the Application property for objects in PowerPoint returns Microsoft PowerPoint. +Count | A read-only Long property that returns the number of items in the collection—for example, the number of Shape objects in the Shapes collection in a PowerPoint slide. +Creator | In Microsoft applications, a read-only Long property that returns a 32-bit integer indicating the application used to create the object or collection. +Item | A read-only property that returns a specified member of the collection. Item is the default property of every collection, which means that you seldom need to specify it. +Parent | In Microsoft applications, a read-only property that returns the parent object for the object or collection. The _parent_ object is the object that contains the object in question; the contained object is the _child_ object. For example, a Document object is a child of the Documents collection. + +## Working with an Object in a Collection + +To work with an object in a collection, you identify the object within the collection either by its name or by its position in the collection. For example, the following statement returns the first Document object in the Documents collection and displays its Name property in a message box: + + MsgBox Documents(1).Name + +* * * + +Most Collections Are One-Based + +Recall that arrays are zero-based by default in VBA. They employ a 0 index number for the first item in the array (unless you use the Option Base 1 statement to force the first index number to 1 as we did in Chapter 7, "Using Array Variables"). + +Fortunately, most VBA collections default to the more sensible 1 for the first item in the collection. This makes it easy to identify the object you need. For example, Documents(1) gives you the first document, Workbooks(2) gives you the second workbook, and so on. + +But notice the word _most_. Sadly, there are exceptions to this rule. Be warned that _some_ collections in VBA implementations are zero-based—their numbering starts at 0 (zero) rather than 1. For example, Access—nearly always the special case in VBA—employs zero-based collections. If you're not sure whether a particular collection is one- or zero-based, consult the Help topic for that collection. + +* * * + +You can optionally use the Item property to return an object from the collection, but because Item is the default property of a collection, you don't need to use it. It's assumed. The following two statements have the same effect, so there's no advantage to using the Item property: + + strName = Documents(1).Name + strName = Documents.Item(1).Name + +## Adding an Object to a Collection + +To create a new object in a collection, you add an object to the collection. In many cases, you use the Add method to do so.
For example, the following statement creates a new Document object in Word: + + Documents.Add + +# Finding the Objects You Need + +The Visual Basic Editor provides a number of tools for finding the objects you need: + + * The Macro Recorder, which you used to record macros in some Microsoft Office applications in Chapter 1, "Recording and Running Macros in the Office Applications" + * The Object Browser, which you used briefly in Chapter 4 + * The online Help system, which can provide detailed help on the objects in the application + * The Auto List Members feature in the Visual Basic Editor + +The following sections show you how to use these tools to find objects. + +## Using the Macro Recorder to Add Code for the Objects You Need + +If you're using a Microsoft application, chances are that the easiest way to find the objects you need is to run the Macro Recorder to record a quick macro using the objects you're interested in. While you perform various actions in the application, the Macro Recorder creates code that you can then open in the Visual Basic Editor, examine, and modify if necessary. + +In spite of its advantages, the Macro Recorder does have two drawbacks: + + * First, you can't record _every_ action that you might want. Let's say you're working in Excel and want to create a statement that performs an action on a specified workbook in the Workbooks collection rather than on the active workbook. With the Macro Recorder, you can record only actions performed on the active workbook. (This is the case because the Macro Recorder can record only those actions you can perform interactively in Excel, and you can't work interactively with any workbook other than the active one.) Here's another example: Some Ribbon actions are not recorded. In Word, clicking the Review ⇒ Show Markup Formatting feature to deselect it results in no recorded code. You would need to write the following code in the Editor yourself: + + ActiveWindow.View.ShowFormatChanges = False + + * Second, the Macro Recorder is apt to record more statements than you need, particularly when you're trying to record a setting in a dialog box. + +You saw an example of the second problem in Chapter 4. Here's another example. This time we'll record a macro to create an AutoCorrect entry. Let's say that you often have to type the word _references_ in your job. Dozens of times every day. You can speed up your work by merely typing **reffs** (or some other abbreviation of your choice). Then Word will automatically replace _reffs_ with _references_ as you type. Here's how to create this macro: + +1. Start Word. + +2. Click the Record Macro button on the status bar, or click the Developer tab on the Ribbon and then click the Record Macro button in the Code section. This displays the Record Macro dialog box. Type **Add_Item_to_AutoCorrect** in the Macro Name text box, and type a description in the Description text box. Make sure All Documents (Normal.dotm) is selected in the Store Macro In drop-down list, and then click the OK button to start recording. + +3. Press Alt+F then I. Then click the Proofing button and the AutoCorrect Options button to display the AutoCorrect dialog box. Type **reffs** in the Replace box and **references** in the With box, and click the Add button. Then click OK twice to close both open dialog boxes. + +4. Click the Stop Recording button on the Ribbon or the status bar to stop the Macro Recorder. 
+ +Now press Alt+F8 to display the Macros dialog box, select the Add_Item_to_AutoCorrect entry, and click the Edit button to open the macro in the Visual Basic Editor. The code should look like this: + + Sub Add_Item_to_AutoCorrect() + ' + ' Add_Item_to_AutoCorrect Macro + ' Change reffs to references + ' + **AutoCorrect.Entries.Add Name:="reffs", Value:="references"** + With Options + .AutoFormatAsYouTypeApplyHeadings = False + .AutoFormatAsYouTypeApplyBorders = True + .AutoFormatAsYouTypeApplyBulletedLists = True + .AutoFormatAsYouTypeApplyNumberedLists = True + .AutoFormatAsYouTypeApplyTables = True + .AutoFormatAsYouTypeReplaceQuotes = True + .AutoFormatAsYouTypeReplaceSymbols = True + .AutoFormatAsYouTypeReplaceOrdinals = True + .AutoFormatAsYouTypeReplaceFractions = True + .AutoFormatAsYouTypeReplacePlainTextEmphasis = False + .AutoFormatAsYouTypeReplaceHyperlinks = True + .AutoFormatAsYouTypeFormatListItemBeginning = True + .AutoFormatAsYouTypeDefineStyles = False + .TabIndentKey = True + End With + With AutoCorrect + .CorrectInitialCaps = True + .CorrectSentenceCaps = True + .CorrectDays = True + .CorrectCapsLock = True + .ReplaceText = True + .ReplaceTextFromSpellingChecker = True + .CorrectKeyboardSetting = False + .DisplayAutoCorrectOptions = True + .CorrectTableCells = True + End With + With OMathAutoCorrect + .UseOutsideOMath = False + .ReplaceText = True + End With + With Options + .AutoFormatApplyHeadings = True + .AutoFormatApplyLists = True + .AutoFormatApplyBulletedLists = True + .AutoFormatApplyOtherParas = True + .AutoFormatReplaceQuotes = True + .AutoFormatReplaceSymbols = True + .AutoFormatReplaceOrdinals = True + .AutoFormatReplaceFractions = True + .AutoFormatReplacePlainTextEmphasis = True + .AutoFormatReplaceHyperlinks = True + .AutoFormatPreserveStyles = True + .AutoFormatPlainTextWordMail = True + End With + Options.LabelSmartTags = False + End Sub + +Here, the Recorder has created dozens of lines of unnecessary code. The only statement you actually need to accomplish your task is this: + + AutoCorrect.Entries.Add Name:="reffs", Value:="references" + +This line shows you that to add an AutoCorrect entry, you need to work with the Entries collection object in the AutoCorrect object. You use the Add method on the Entries collection to add an AutoCorrect entry to the list. + +All the other lines of code specifying the status of various options are unnecessary because you are not interested in changing any of them in this macro. + +By removing these extraneous lines from this recorded macro, you can reduce it to just the single line it needs to contain (together with the comment lines, which you can also remove if you want): + + Sub Add_Item_to_AutoCorrect() + ' + ' Add_Item_to_AutoCorrect Macro + ' Change reffs to references + ' + AutoCorrect.Entries.Add Name:="reffs",Value:="references" + End Sub + +You used the Recorder to see the correct syntax for adding an entry to the AutoCorrect feature. There's no point to leaving in lines of code unrelated to your purposes. What's more, such extraneous code would make it harder at some future date to read and understand the macro's purpose. Even worse, these extra lines can set properties to conditions that you, or someone else using this macro, might not want. Let's say you run this macro in the future and you are working in a document that must not have any bullet symbols in it. 
So you've clicked the File tab on the Ribbon and chosen Options ⇒ Proofing ⇒ AutoCorrect Options ⇒ AutoFormat As You Type and turned off bullets. However, when you run this macro, bullets are turned back on by this unneeded line in the code: + + .AutoFormatAsYouTypeApplyBulletedLists = True + +In spite of its limitations, the Macro Recorder does provide quick access to the objects you need to work with, and you can always modify the resulting code in the Visual Basic Editor. What's more, the code that the Recorder generates is, if nothing else, guaranteed to execute without bugs. + +## Using the Object Browser + +For many programmers, the primary tool for writing code for objects is the Object Browser, which you used briefly in Chapter 4. In the following sections, you'll get to know the Object Browser better and learn to use it to find the information you need about objects. To see the Object Browser, press F2 in the Editor. + +### Components of the Object Browser + +The Object Browser provides the following information about both built-in objects and custom objects you create: + + * Classes (formal definitions of objects) + * Properties (the attributes of objects or aspects of their behavior) + * Methods (actions you can perform on objects) + * Events (for example, the opening or closing of a document) + * Constants (named items that keep a constant value while a program is executing) + +Figure 8.1 shows the components of the Object Browser. + +Figure 8.1 The Object Browser provides information on built-in objects and custom objects. Here, the application is Excel. + +Here's what the different elements of the Object Browser do: + + * The Project/Library drop-down list provides a list of object libraries available to the current project. (An _object library_ is a collection of objects made available to programs. There can be several libraries in use at a given time. For example, one library might contain objects that specialize in rendering graphics, a second library might contain objects that assist with security features, and so on.) Use the drop-down list to choose the object libraries you want to view. For example, you might choose to view only objects in Outlook by choosing Outlook in the Project/Library drop-down list. Alternatively, you could stay with the default choice of <All Libraries>. + * In the Search Text box, enter the string you want to search for: Either type it in or choose a previous string in the current project session from the drop-down list. Then either press Enter or click the Search button to find members containing the search string. + +* * * + +Improve Your Searches with These Techniques + +To make your searches less specific, you can use wildcards such as ? (to represent any single character) and * (to represent any group of characters). You can also choose to search for a whole word only (rather than matching your search string with part of another word) by right-clicking anywhere in the Object Browser (except in the Project/Library drop-down list or in the Search Text box) and choosing Find Whole Word Only from the context menu. The Find Whole Word Only choice has a check mark next to it in the context menu when it's active; to deactivate it, choose Find Whole Word Only again on the context menu. + +* * * + + * Click the Go Back button to retrace one by one your previous selections in the Classes list and the Members Of list. Click the Go Forward button to move forward through your previous selections one by one.
The Go Back button becomes available when you go to a class or member in the Object Browser; the Go Forward button becomes available only when you've used the Go Back button to go back to a previous selection. + * Click the Copy To Clipboard button to copy the selected item from the Search Results list, the Classes list, the Members Of list, or the Details pane to the Clipboard so that you can paste it into your code. + * Click the View Definition button to display a Code window containing the code for the object selected in the Classes list or the Members Of list. The View Definition button is available (undimmed) only for objects that contain code, such as procedures and user forms that you've created. + * Click the Help button to display any available help for the currently selected item. Alternatively, press the F1 key. + * Click the Search button to search for the term entered in the Search Text box. If the Search Results pane isn't open, VBA opens it at this point. + * Click the Show/Hide Search Results button to toggle the display of the Search Results pane on and off. + * The Search Results list in the Search Results pane contains the results of the latest search you've conducted for a term entered in the Search Text box. If you've performed a search, the Object Browser updates the Search Results list when you use the Project/Library drop-down list to switch to a different library. Choosing a different library in the Project/Library drop-down list is a handy way of narrowing, expanding, or changing the focus of your search. + * The Classes list shows the available classes in the library or project specified in the Project/Library drop-down list. + * The Members Of list displays the available elements of the class selected in the Classes list. A method, constant, event, property, or procedure that has code written for it appears in boldface. The Members Of list can display the members either grouped into their different categories (methods, properties, events, and so on) or ungrouped as an alphabetical list of all the members available. To toggle between grouped and ungrouped, right-click in the Members Of list and choose Group Members from the context menu; click either to place a check mark (to group the members) or to remove the check mark (to ungroup the members). + * The Details pane displays the definition of the member selected in the Classes list or in the Members Of list. For example, if you select a procedure in the Members Of list, the Details pane displays its name, the name of the module and template or document in which it's stored, and any comment lines you inserted at the beginning of the procedure. The module name and project name contain hyperlinks (jumps) so that you can quickly move to them. You can copy information from the Details pane to the Code window by using either copy and paste or drag and drop. + * Drag the three split bars to resize the panes of the Object Browser to suit yourself. (You can also resize the Object Browser window as needed or maximize it so that it docks itself in the Code window.) + +The Object Browser uses different icons to indicate the various types of object that it lists. Figure 8.1 shows several icons; Table 8.2 shows the full range of icons and what they represent. + +A blue dot in the upper-left corner of a Property icon or a Method icon indicates that that property or method is the default. 
+ +Table 8.2 Object Browser icons + +The icon images themselves aren't reproduced here; the Object Browser displays a distinct icon for each of the following item types: Property, User-defined type, Method, Global, Constant, Library, Module, Project, Event, Built-in keyword or type, Class, and Enum (enumeration). + +### Adding and Removing Object Libraries + +The default object libraries are sufficient for most typical macros, so you generally need not worry about adding any specialized libraries. If you get into some kinds of advanced macro programming, however, you will need to add other libraries (you'll modify the Ribbon in Chapter 31, "Programming the Office 2013 Ribbon," and to do that you have to add a special library). You can add and remove object libraries by choosing Tools ⇒ References in the editor and using the References dialog box to make your selections: + + * By adding object libraries, you can make additional objects available to work with. + * By removing object libraries that you don't need to view or use, you can reduce the number of object references that VBA needs to resolve when it is compiling the code in a project. This allows the code to run faster, though as I've mentioned before, today's computers are so fast that increasing the speed of macro execution is rarely an issue for most people. + +When you start the Visual Basic Editor, it automatically loads the object libraries required for using VBA and user forms with the host application. You don't have to change this set of object libraries until you need to access objects contained in other libraries. For example, if you create a procedure in Word that needs to employ a feature found in Excel, you'll have to add to Word's VBA Editor a reference to an Excel object library to make Excel's objects available. + +You can adjust the priority (or _order of precedence_ ) of different references by adjusting the order in which the references appear in the References dialog box. The priority of references matters when you use in your code an object whose name appears in more than one reference: VBA checks the References list to determine the order of the references that contain that object name and uses the first one unless specifically told to do otherwise by use of an unambiguous name. + +To add or remove object libraries, follow these steps: + +1. In the Visual Basic Editor, choose Tools ⇒ References to display the References dialog box (see Figure 8.2). You can also display the References dialog box by right-clicking in the Object Browser and choosing References from the context menu. + +Figure 8.2 You add and remove object libraries by using the References dialog box. + +2. In the Available References list box, select the check boxes for the object libraries you want to have access to, and clear the check boxes for the references you want to remove because you don't need them. You should find a reference for an object library for each application that supports automation and is installed on your computer. _Automation_ , in this context, means that an application permits the automation of tasks (in other words, macros). Another way to put this is that an application supporting automation _exposes its objects_ , meaning that the application makes its objects available to programmers. + +3. The references that are in use appear together at the top of the Available References list box, not in alphabetical order (in order of precedence, as described earlier in this chapter). + +4. 
Adjust the order of precedence of the references if necessary by selecting a reference and using the up- and down-arrow Priority buttons to move it up or down the list. Usually, you'll want to keep Visual Basic for Applications and the object library of the application you're working with at the top of your list. + +* * * + +Adding a Reference Library + +You can even add new reference libraries to the list of available references in the References dialog box by clicking the Browse button to display the Add Reference dialog box, selecting the library file, and then clicking the Open button. + +* * * + +5. Click OK to close the References dialog box and return to the Object Browser. + +### Navigating with the Object Browser + +To browse the objects available to a project, follow these steps: + +1. First, activate a code module by double-clicking it in the editor's Project Explorer. + +2. Display the Object Browser by choosing View ⇒ Object Browser, by pressing the F2 key, or by clicking the Object Browser button on the Standard toolbar. (If the Object Browser is already displayed, make it active by clicking it or by selecting it from the list at the bottom of the Window menu.) + +3. In the Project/Library drop-down list, select the name of the project or the library that you want to view. The Object Browser displays the available classes in the Classes list. + +4. In the Classes list, select the class you want to work with. For example, if you chose a project in step 3, select the module you want to work with in the Classes list. + +5. If you want to work with a particular member of the class or project, select it in the Members Of list. For example, if you're working with a template project, you might want to choose a specific procedure or user form to work with. + +Once you've selected the class, member, or project, you can perform the following actions on it: + + * View information about it in the Details pane at the bottom of the Object Browser window. + * View the definition of an object by clicking the View Definition button. Alternatively, right-click the object in the Members Of list and choose View Definition from the context menu. The View Definition button and the View Definition command are enabled (available, undimmed) only for objects that contain code, such as procedures and user forms that you've created. + +* * * + +A "Definition" Is Contained Code + +The definition of a procedure is the code that it contains. The definition of a module is all the code in all the procedures that it contains. The definition of a user form is the code in all the procedures attached to it. To see how the View Definition button works, type the name of one of your macros in the Object Browser's Search Text box (to the left of the Search button). Then click the Search button to locate this macro. Then click the View Definition button, and the Code window will open, displaying this macro's code. + +* * * + + * Copy the text for the selected class, project, or member to the Clipboard by clicking the Copy To Clipboard button or by issuing a standard Copy command (pressing Ctrl+C or Ctrl+Insert). + +## Using Help to Find the Object You Need + +VBA's online Help system provides another easy way to access the details of the objects you want to work with. The Help files provide a hyperlinked reference to all the objects, methods, and properties in VBA, including graphics that show how the objects are related to each other, and plenty of code samples to show you the correct syntax.
+ +The quickest way to access VBA Help is to press the F1 key while working in the Visual Basic Editor. + +### Pressing F1 to Go to a General VBA Help Page + +F1 works in two different ways. Press F1 with the cursor on a blank line, and you're taken to the VBA portal shown in Figure 8.3. However, press F1 with the cursor on a language keyword such as Variant or InputBox, and you're taken to a Help page with specific information about that particular keyword. + +Figure 8.3 The generic VBA portal + +First, try clicking a blank line in the Code window, then press F1. Your browser opens the generic Office website shown in Figure 8.3. + +For us Office programmers, the web page shown in Figure 8.3 contains two useful links: the _Office_ link under Platforms, down at the bottom, and the _Welcome to the Visual Basic for Applications language reference for Office 2013_ link in the middle of the page. + +Click the Office link and you're taken to the Office for Developers help page, shown in Figure 8.4. There you'll find many useful links to code samples, Office application–specific pages, video lessons, and whatnot. + +Figure 8.4 This Help page contains many valuable links. + +### Pressing F1 to Go Directly to a Command's Help Page + +The second way to use F1 takes you directly to the Help page for the keyword you're interested in. If you want to see how to manipulate the active window, for example, just type **activewindow** into the Editor's Code window, and then, with the blinking insertion cursor somewhere in that word, press F1. See Figure 8.5. + +Figure 8.5 Put your insertion cursor on a command, then press F1 to get context-sensitive help. + +After you press F1 on the activewindow command, as shown in Figure 8.5, the Help page for this command opens, as you can see in Figure 8.6. + +Figure 8.6 Here's the main Help page for the ActiveWindow property. + +Apart from the regular Help information you'll find in the Help pages online, here are a few additional ways to find help: + + * At the top of most Microsoft help windows, you'll see a field titled _Search MSDN With Bing_. Try this: Type **Word 2013 selection object** into the Bing search field. A page is displayed with plenty of links. Click the top link, and you'll see several helpful code examples. + * When looking for help, you can also try clicking the Help menu in the Editor, then choosing one of the two help options listed: Microsoft Visual Basic For Applications Help or MSDN On The Web. These two options open different entry points into the Help system, from which you can drill down until you locate the explanations or code samples you're after. + * Finally, when looking for help with objects, don't forget you can press F2 to display the built-in Object Browser. + +## Using the Auto List Members Feature + +You've already used the Auto List Members feature a couple of times in the previous chapters. To recap, in VBA code—as with most other programming languages—objects and their _members_ (properties and methods) are separated by periods. This punctuation helps you see the relationships between parent objects, child objects, and members. Notice the two periods in this code: + + sngLinesPerPage = ActiveDocument.PageSetup.LinesPage + +When you're entering a statement in the Visual Basic Editor and you type the period at the end of the current object, the Auto List Members feature displays a list of properties and methods appropriate to the statement you've entered so far.
(Turn this feature on in the Visual Basic Editor by choosing Tools ⇒ Options, then selecting the Auto List Members check box.) + +Technically, there's a distinction between Auto List Members and a somewhat similar List Properties/Methods feature. The former feature is triggered by typing a period (.) following the name of an object in a line of code. The latter is triggered by pressing Ctrl+J or by right-clicking the name of an object in a line of code and choosing List Properties/Methods from the menu that appears. Of the two, I find Auto List Members more useful. + +The Auto List Members feature provides a quick way of completing statements, but you need to know which object you should work with before you can work with its members. Sometimes using this feature is a bit like finding your way through a maze and being given detailed directions that end with the phrase, "But you can't get there from here." + +Once you know the object from which to start, though, you can easily find the property or method you need. For example, to put together the statement Application.Documents(1).Close to close the first document in the Documents collection in Word, you could work as follows: + +1. Place the insertion point on a fresh line in an empty procedure (between the Sub and End Sub statements). Create a new procedure if necessary. + +2. Type the word **application** , or type **appl** and press Ctrl+spacebar to have the Complete Word feature complete the word for you. + +3. Type the period (.) after **Application**. The Auto List Members feature displays the list of properties and methods available to the Application object. + +4. Choose the Documents item in the Auto List Members list. You can scroll to it using the mouse and then double-click it to enter it in the Code window, scroll to it by using the arrow keys and enter it by pressing Tab, or type the first few letters of its name (to automatically locate it) and then enter it by pressing Tab. The latter method is shown in Figure 8.7, which uses Word. + +Figure 8.7 Using the Auto List Members feature to enter code + +5. Type **(1).** after Documents. When you type this period, the Auto List Members feature displays the list of properties and methods available to a Document object. Note that without the (1), you're working with the Documents collection, but as soon as you add the (1), you're then working with a specific document, namely the first one in the collection. + +6. Choose the Close method in the Auto List Members list by scrolling to it with the mouse or with the down arrow key. Because this is the end of the statement, press the Enter key to enter the method and start a new line (rather than pressing the Tab key, which enters the method but continues the same line of code). + +* * * + +Automatic Selection Helps You Keep Your Hands on the Keyboard + +For most people, the quickest way to enter statements in the Code window is to keep their hands on the keyboard. After all, you're _typing_ your programming. To help you do this, the Visual Basic Editor automatically selects the current item in the Auto List Members list when you type a period or an opening parenthesis. In the previous example, you can type **Application.** to display the list, **Do** to select the Documents item, and **(** to enter the Documents item. + +* * * + +# Using Object Variables to Represent Objects + +As you learned in Chapter 6, "Working with Variables, Constants, and Enumerations," one of the data types available for variables in VBA is the _Object_ type.
You use an Object variable to represent an object in your code: instead of referring to the object directly, you can employ the Object variable to access or manipulate the object it represents. + +Here's one major benefit of this approach: Using Object variables makes your code easier to read. It's simpler to see which object a section of code is working with, especially when you're working with multiple objects in the same section of code. Plus, you can give names to these variables that are descriptive and easily understood. What's more, object variables are often a necessity when you need to manipulate collections of objects. + +For example, say you create a procedure that manipulates the three open workbooks in Excel, copying a range of cells from one to the other two. If you have only those three workbooks open, you'll be able to refer to them directly as Workbooks(1), Workbooks(2), and Workbooks(3), respectively, because they'll occupy the first (and only) three slots in the Workbooks collection. + +But if your procedure changes the order of the workbooks, closes one or more workbooks, or creates one or more new workbooks, things rapidly get confusing. If, however, you've created Object variables (named, say, xlWorkbook1, xlWorkbook2, and xlWorkbook3) to refer to those specific workbooks, it will be much easier to keep them straight. This is because no matter which workbook moves to first position in the Workbooks collection, you'll be able to refer to the object represented by the Object variable xlWorkbook1 and know that you'll be accessing the workbook you're after. In other words, when you create Object variables, you get to _name them_ , using words that are more easily understood than index numbers. More important, once it's named, an Object variable's name does not change. Index numbers can change. + +To create an Object variable, you declare it in almost exactly the same way as you declare any other variable, using a Dim, Private, or Public statement. For example, the following statement declares the Object variable objMyObject: + + Dim objMyObject As Object + +As usual for the Dim statement, if you use this declaration within a procedure, it creates a variable with local scope. If you use it in the declarations section at the top of a code sheet, it creates a variable with module-level private scope. Similarly, the Private and Public keywords create module-level private and public Object variables, respectively. + +Once you've declared the Object variable, you can assign an object to it. (Assigning objects works a bit differently from the way you use just an equal sign to assign a value to an ordinary variable.) To assign an object to an Object variable, you use a Set statement. The syntax for a Set statement is as follows: + + Set _objectvariable_ = {[New] _expression_ |Nothing} + +Here's how that syntax breaks down: + + * _objectvariable_ is the name of the Object variable to which you're assigning the object. + * New is an optional keyword that you can use to implicitly create a new object of the specified class. However, usually it's better to create objects _explicitly_ and then assign them to Object variables rather than use New to create them implicitly. + * _expression_ is a required expression that specifies or returns the object you want to assign to the Object variable. + * Nothing is an optional keyword that you assign to an existing Object variable to obliterate its contents and release the memory they occupied. 
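 + +The bullet list above mentions the New keyword, but the chapter's examples don't show it in action. Here's a minimal sketch of my own (the Collection class is just a convenient built-in object to demonstrate with, and the sample strings are invented) contrasting explicit creation with the implicit As New form: + + ' Explicit creation: declare the variable, then create the object with Set...New + Dim colNames As Collection + Set colNames = New Collection + colNames.Add "Sandra" + + ' Implicit creation: As New makes VBA create the object the first time you use it; + ' this form is usually avoided because you don't control when the object is created + Dim colCities As New Collection + colCities.Add "Tulsa" + +Either way, you can later assign Nothing to the variable to release the object, as described in the last bullet point.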
 + +For example, the following statements declare the Object variable objMyObject and assign to it the active workbook in Excel: + + Dim objMyObject As Object + Set objMyObject = ActiveWorkbook + +The following statement uses the Nothing keyword to release the memory occupied by the objMyObject Object variable: + + Set objMyObject = Nothing + +What's different about declaring an Object variable versus declaring other types of variables is that not only can you declare the Object variable as being of the type Object and then use the Set command, but you can also specify which type of object it is. For example, if an Object variable will always represent a Workbook object, you can declare it as being of the Workbook data type. The following statement declares the Object variable xlWorkbook1 as being of the Workbook data type: + + Dim xlWorkbook1 As Workbook + +Strongly associating a type with an Object variable like this has a couple of advantages. First, once you've _strongly typed_ (as it's called) the Object variable, the Visual Basic Editor can provide you with full assistance for the Object variable, just as if you were dealing with the object directly. For example, once you've created that Object variable xlWorkbook1 of the Workbook object type, the Visual Basic Editor displays the Auto List Members drop-down list when you type that Object variable's name followed by a period, as shown in Figure 8.8. + +Figure 8.8 When you strongly type your Object variables, you get the full benefit of the Visual Basic Editor's code-completion features for those Object variables. + +Second, when you strongly type an Object variable, you make it a bit harder to get things wrong in your code. If you try to assign the wrong type of object to a strongly typed Object variable, VBA gives an error. For example, if you create a Worksheet Object variable in Excel, as in the first of the following statements, but assign to it a Workbook object, as in the second statement, VBA displays a "Type Mismatch" error message when you execute this code—as well it should: + + Dim wksSheet1 As Worksheet + Set wksSheet1 = ActiveWorkbook + +Finding out at this testing stage that you've created a problem is usually preferable to finding out later (for example, when you go to manipulate the wksSheet1 object and discover it doesn't behave as you expect it to). + +The main argument for _not_ strongly typing an Object variable is that you might not be sure ahead of time (while writing the code) what kind of object that variable will eventually reference during execution or if the kind of object it will store may vary from one execution of the code to another. (If either is the case, your code will need to be flexible enough to accommodate objects of different types for the same Object variable.) Usually, though, you'll want to strongly type all your Object variables. + +If you're not sure which object type to use for an Object variable, start by declaring the Object variable as being of the Object data type. Then run through the code a couple of times with the Locals window (View ⇒ Locals) displayed, and note the data type that VBA assigns to the Object variable. For example, if you press F8 repeatedly to step through the following statements in a Visual Basic Editor session hosted by Excel, the readout in the Locals window at first identifies the Object variable wks only as Object (as shown on the left in Figure 8.9). That's not too useful.
However, press F8 again to execute the Set command, and you see loads of information (click the + icon next to wks). You now see Object/Sheet1 (as shown on the right in Figure 8.9) because executing the second statement assigns the first sheet in the active workbook to the variable. You also can see all the members, their current values, and their type. + + Dim wks As Object + Set wks = ActiveWorkbook.Sheets(1) + +Figure 8.9 You can use the Locals window to help identify the object type that an Object variable will contain. + +* * * + +There Are Drawbacks to Weakly Typed Variables + +As you learned earlier in the book, you can avoid specifying data types altogether. For example, the statement Dim varMyVariant creates a Variant variable because the statement does not specify a data type. Variant variables can contain objects as well as other data types—but as before, using Variants requires VBA to do a little more work each time it encounters the variable (because VBA has to determine what data type the variable currently is) and denies you the benefits of strongly typing your variables. Weak typing also makes your code harder to read. + +* * * + +# Team Programming and OOP + +VBA is used by individual programmers as well as teams. OOP can offer some advantages when you are trying to manage a group of programmers working together on a large, complex VBA solution. OOP can help people avoid stepping on each other's toes: duplicating global variable names, creating version problems (everyone's individual copy of the code is _their_ latest version but not the group's latest official version), and other kinds of interference. + +Group programming needs management, and OOP, among its other benefits, assists in avoiding chaos when a team needs to work together on a common goal. + +One feature of OOP is _encapsulation_. This means that an object is self-contained and sealed off. It's like a black box that is plugged into your video system to improve the picture. You don't _open_ the box. Nobody is supposed to _modify_ the innards. You just use it. + +As an example, say that the boss wants all documents from now on to emphasize the company's name. You give Sandra the task of creating an object that is supposed to italicize and capitalize all references to _ACME WINDOWORKS_ in all company documents. And you ask Joe to create an object that ensures that any use of the company name is displayed in green rather than the normal black letters. (In reality, you would likely want to code these simple manipulations into _functions_ —see Chapter 10, "Creating Your Own Functions"—rather than _objects_. Objects tend to perform multiple related jobs rather than a single, simple job like turning something green. But this is just an example, so we'll keep it simple here.) + +When this code is encapsulated into sealed-off objects, nobody has to worry that Sandra and Joe might use the same variable names or otherwise interfere with each other's code. Instead, within their totally separate, sealed-off objects, they can go ahead and write code as they please. This is because the scope of the code is local to the object, and also, neither Joe nor Sandra can view, much less modify, each other's code. + +A document is passed to Sandra's ItalAndCap object, and the document comes out the other end (returns) with all instances of _ACME WINDOWORKS_ italicized and capitalized. Then the document is passed to Joe's object and in turn spit out with _ACME WINDOWORKS_ in green.
Thus, each component of the overall solution, the larger program, does its own job without interference from any other component (object). You thus avoid a lot of problems if people are working on individual tasks with the assurance that nobody else will be able to mess with their code or accidentally interact with it in some unpredictable way. Also, it's easier to track down bugs because each job is isolated from other jobs—and if the company name is only turning green half the time, you can tell Joe to take another look at his object. + +It's true that over the years OOP theory has grown quite arcane, abstract, and academic. OOP can be, in the upper reaches of universities, a terribly complex subject of study. In fact, they say that, like quantum mechanics, advanced OOP theory is understood by only 12 people in the world—and _they're_ fooling themselves. Nonetheless, if you are in charge of a team that's responsible for building a large application for Office, take some time to employ OOP features. Each individual programmer will be responsible for how their object works. The other programmers can merely _use_ that object without worrying about debugging it. They are not even allowed to see its internal code. Consider the objects that are built into VBA itself, such as Word's Selection object. It was written by somebody at Microsoft. You can put this object in your code and ask it to do things for you, such as move the cursor one word to the left: + + Selection.MoveLeft Unit:=wdWord, Count:=1 + +But you never see the actual code within the Selection object. You aren't allowed to modify it. And its code does not interact with your code's variables or cause other unwanted side effects. In other words, the built-in VBA objects are encapsulated—usable as black boxes, but sealed off. + +To create your own encapsulated objects in VBA, you add _class modules_ to a project, which are distinct from regular code modules. You'll see how to do this in Chapter 16, "Building Modular Code and Using Classes." + +# The Bottom Line + +**Understand and use objects, properties, and methods.** + +Contemporary programming employs a hierarchical method of organization known as object-oriented programming (OOP). At the very top of the hierarchy for any given application is the Application object. You go through this object to get to other objects that are lower in the hierarchy. + +Master It + +By using _creatable_ objects, you can often omit the Application object when referencing it in code. What are creatable objects? + +**Use collections of objects.** + +Collections are containers for a group of related objects, such as the Documents collection of Document objects. + +Master It + +Are collections objects? Do they have their own methods and properties? + +**Find objects, properties, and methods.** + +The Visual Basic Editor offers several ways to locate objects' members and add them to your programming code. There's an extensive Help system, the Object Browser, a List Properties/Methods feature, and the Auto List Members tool. + +Master It + +How do you employ Auto List Members to find out which properties and methods are available for Word's Document object? + +**Use Object variables to represent objects.** + +You can create variables that contain objects rather than typical values like strings or numbers. + +Master It + +What keywords do you use to declare an Object variable? 
+Part 3 + +Making Decisions and Using Loops and Functions + + * **Chapter 9: Using Built-in Functions** + * **Chapter 10: Creating Your Own Functions** + * **Chapter 11: Making Decisions in Your Code** + * **Chapter 12: Using Loops to Repeat Actions** + +Chapter 9 + +Using Built-in Functions + +VBA comes with a large number of built-in functions that perform commonly needed operations—everything from determining whether a file exists to returning the current date and converting data from one format to another. (For example, you can use a function to convert numeric data into a text string.) + +This chapter demonstrates what functions are, what they do, and how to use them. Along the way, you'll get to know some of the key functions built into VBA—including functions that convert data from one data type to another, functions that manage file operations, functions that do math, and many others. + +You can also create custom functions of your own to supplement VBA's built-in functions. The next chapter tells you how to build your own when VBA's functions don't meet your needs. + +In this chapter you will learn to do the following: + + * Understand what functions are and what they do + * Use functions + * Use key VBA functions + * Convert data from one type to another + * Manipulate strings and dates + +# What Is a Function? + +A _function_ is a type of procedure. A function differs from a subroutine (subprocedure) in that a function always returns a value and a subroutine doesn't. And in common practice, a function almost always takes one or more arguments. Although subroutines _can_ be written to take arguments, most programmers don't write their code this way. + +So, to sum up, here are the key differences between functions and subroutines: + +**Subroutines** + +These never return values and are rarely sent arguments. Subs are also generally self-contained. + +**Functions** + +These communicate more with code outside their own, accepting incoming data from arguments, processing that data in some way, and sending back a result. + +You'll often use functions that are built into VBA. Typically, you feed information into a built-in function by sending it arguments. The built-in function then processes that info and returns a value for you to use. But you can also create your own functions in the Code window if you wish. + +Built-in functions are so essential to VBA that you've already used several in examples in this book. However, we'll now explore them more fully. For example, in Chapter 7, "Using Array Variables," you used the Rnd function to generate random numbers to fill an array named intArray, and the Int function to turn the random numbers into integers: + + intArray(i) = Int(Rnd * 10) + +Rnd is one of the rare functions that does not have to take one or more arguments. (Rnd _can_ take one optional argument, but the previous example doesn't use it.) + +The Int function, on the other hand, requires an argument—the number or expression that it's turning into an integer. The argument in this example is supplied by the expression Rnd * 10. Here the Rnd function returns a value that the Int function uses; the Int function then returns a value to the procedure, which uses it to populate a subscript in the array. + +An _argument_ is a piece of information that gets passed to a function. (Arguments are also passed to methods and other commands.) You can tell when arguments are optional in Help descriptions because they're shown enclosed within brackets.
When they are optional, you can either provide or omit the arguments displayed in the brackets. For example, the full Help syntax for the Rnd function looks like this: + + Rnd([ _number_ ]) As Single + +The brackets indicate that the _number_ argument is optional, and the As Single part of the syntax denotes that the value _returned_ by the function will be of the Single data type. + +Different functions return different data types suited to their job: Many functions return a Variant, but yes/no functions, such as the IsNumeric function used in Chapter 7, return a Boolean value, either True or False. When necessary, VBA may even sometimes convert the result of a function to a different data type needed by another function in the expression. + +If any pair of brackets contains two arguments, you have to use both of them at once (blessedly, this is quite rare). For example, the MsgBox function displays a message box. The syntax for the MsgBox function is as follows: + + MsgBox( _prompt_ [, _buttons_ ] [, _title_ ][, _helpfile, context_ ]) + +Here, _prompt_ is the only required argument: _buttons, title, helpfile_ , and _context_ are all optional. But notice that _helpfile_ and _context_ are enclosed within a single set of brackets instead of each having its own pair, meaning that you need to use either both of these arguments or neither of them; you cannot use one without the other. Chapter 13, "Getting User Input with Message Boxes and Input Boxes," shows you how to use the MsgBox function in your code. + +# Using Functions + +To use a function, you _call_ it (or _invoke_ it) from a procedure—either a subprocedure (Sub) or from another function (Function). + +To call a function, you can use a _call_ statement, either with the optional Call keyword or by just using the name of the function. Using the Call keyword allows you to search through all calls in your project by searching for "call " ( _call_ followed by a space). However, using the Call keyword is overkill for everyday functions; programmers rarely use it. + +The syntax for the Call statement is as follows: + + [Call] _name_ [ _argumentlist_ ] + +Here, _name_ is a required String argument giving the name of the function or procedure to call, and _argumentlist_ is an optional argument providing a comma-delimited list of the variables, arrays, or expressions to pass to the function or procedure. When calling a function, you'll almost always need to pass arguments (except for those few functions that take no arguments). + +The brackets around the Call keyword indicate that it is optional. If you do use this keyword, you need to enclose the _argumentlist_ argument in parentheses. In most cases, it's easier to read the code if you don't use the Call keyword when calling a function. + +For example, the following statement calls the MsgBox function, supplying the required argument _prompt_ (in this example, it's the string Hello, World!): + + MsgBox "Hello, World!" + +You could use the Call keyword instead, as shown in the following statement, but there's little advantage in doing so (and note that with Call, the parentheses around the argument become mandatory, as mentioned above): + + Call MsgBox("Hello, World!") + +Note that the MsgBox function is one of the few with which you can omit the parentheses around the argument list. + +You can assign to a variable the result returned by a function. For example, consider the following code fragment. The first two of the following statements declare the String variables strExample and strLeft10. The third statement assigns a string of text to strExample.
The fourth statement uses the Left function to return the leftmost 10 characters from strExample and assign them to strLeft10, which the fifth statement then displays in a message box (see Figure 9.1): + + Dim strExample As String + Dim strLeft10 As String + strExample = "Technology is interesting." + strLeft10 = Left(strExample, 10) + MsgBox strLeft10 + +Figure 9.1 Using the Left function to take the left part of a string—in this case, the first 10 characters of the string + +If you prefer, you can assign the result of each function to a variable, as in this next example. Here the first string variable, str1, is assigned the leftmost 13 characters from the string This is Pride and Patriotism. So after its code line executes, str1 holds the value This is Pride. Then str2 is assigned the rightmost 5 characters from str1, resulting in Pride. + + Dim str1 As String + Dim str2 As String + + str1 = Left("This is Pride and Patriotism", 13) + str2 = Right(str1, 5) + + MsgBox str2 + +However, after you become accustomed to working with functions, you can collapse them in various ways in your code. Instead of assigning the result of a function to a variable, you can insert it directly in your code or pass it (as an argument) to another function. This is a common shortcut. Take a look at the following statement. It does the same thing as the previous example but collapses the code into one line, avoiding the use of variables altogether: + + MsgBox Right(Left("This is Pride and Patriotism", 13), 5) + +This statement uses three functions: the MsgBox function, the Left function, and the Right function. (The Right function is the counterpart of the Left function and returns the specified number of characters from the right side of the specified string.) + +When you have multiple sets of parentheses in a VBA statement, the code is executed starting from the innermost pair of parentheses and working outward. This is the same way that nested parentheses are handled in math. + +So, in the previous example the Left function is evaluated first, returning the leftmost 13 characters from the string: This is Pride (the spaces are characters too). VBA passes this new string to the Right function, which in this case returns the rightmost five characters from it: Pride. VBA then passes this second new string to the MsgBox function, which displays it in a message box. + +* * * + +Limit Your Nesting + +You can nest functions to many levels without giving VBA any trouble, but multilevel nesting can become hard for us humans to read and troubleshoot. For most practical purposes, it's a good idea to limit nesting to only a few levels, if that. + +* * * + +## Passing Arguments to a Function + +When a function takes more than one argument, you can pass the arguments to it in any of three ways: + + * By supplying the argument values, without their names, _positionally_ (in the order in which the function expects them) + * By supplying the arguments, with their names, in the order in which the function expects them + * By supplying the arguments, with their names, in any order you choose + +The first method, supplying the arguments positionally without using their names, is usually the quickest way to proceed. The only disadvantage to doing so is that anyone reading your code may not know immediately which value corresponds to which argument—though they can look this up without trouble. To omit an optional argument, you place a comma where it would appear in the sequence of arguments. 
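 + +To see positional omission in practice, here's a minimal sketch of my own (the prompt and title strings are invented placeholders) that skips the optional _buttons_ argument of the MsgBox function by leaving its slot empty: + + ' The empty slot between the two commas skips the optional buttons + ' argument while keeping the title argument in its correct position + MsgBox "Processing complete.", , "Status Report" + +Because _buttons_ is omitted, VBA falls back on the default vbOKOnly button set.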
 + +It does take extra time to type in argument names, but it makes your code easier to read. And when you omit an argument from a named argument list, you don't need to use the comma to indicate that you're skipping it. + +There's no advantage to using named arguments out of order over using them in order unless you happen to find doing so easier. + +For example, the DateSerial function returns a Variant/Date containing the date for the given year, month, and day. The syntax for DateSerial is as follows: + + DateSerial( _year, month, day_ ) + +Here, _year_ is a required Integer argument supplying the year, _month_ is a required Integer argument supplying the month, and _day_ is a required Integer argument supplying the day. + +The following statement supplies the arguments positionally without their names: + + MsgBox DateSerial(2010, 12, 31) + +This statement is equivalent but supplies the arguments positionally with their names: + + MsgBox DateSerial(Year:=2010, Month:=12, Day:=31) + +The following statement supplies the arguments, with their names, out of order: + + MsgBox DateSerial(Day:=31, Year:=2010, Month:=12) + +All three of these statements work fine and achieve the same result. You'll cause a problem only if you list out-of-order arguments that you're supplying without names (positionally), if you name some arguments and don't name others, or if you omit required arguments. Figure 9.2 shows one of the errors you may encounter. In this case, I left out the required _month_ argument. + +Figure 9.2 An "Argument not optional" error occurs when you omit a required argument. + +# Using Functions to Convert Data + +Data-type conversion isn't needed frequently in VBA, but you might as well at least understand what it does. Some computer languages are pretty strict about requiring explicit data typing (sometimes called _strong data typing_ ). And there _are_ a few specialized situations where you will need to convert one variable type into another. For example, you might be using the InputBox command to get some information from the user. The user is typing on a keyboard, so all the data they input will be characters (text string) data. But if your macro needs to do any math with this input, such as using the + operator to add numbers, you must first convert the string data into numeric variables (or use the default Variant type). To convert a string to an integer number, you could use the CInt function. This same problem arises if you are importing data from another source, such as a database that stores everything as a string variable. + +VBA provides a full set of simple functions for converting data from one data type to another. Table 9.1 lists VBA's functions for simple data conversion. + +Table 9.1 VBA's functions for simple data conversion + +**Function (Arguments)** | **Data Type Returned** +---|--- +CBool( _number_ ) | Boolean +CByte( _expression_ ) | Byte +CCur( _expression_ ) | Currency +CDate( _expression_ ) | Date +CDec( _expression_ ) | Decimal +CDbl( _expression_ ) | Double +CInt( _expression_ ) | Integer +CLng( _expression_ ) | Long +CSng( _expression_ ) | Single +CStr( _expression_ ) | String +CVar( _expression_ ) | Variant + +For example, the following statements declare the untyped variable varMyInput and the Integer variable intMyVar and then display an input box prompting the user to enter an integer. In the third statement, the user's input is assigned to varMyInput, which automatically becomes a Variant/String.
The fourth statement uses the CInt function to convert varMyInput to an integer, assigning the result to intMyVar. The fifth statement compares intMyVar to 10, converts the result to Boolean by using the CBool function, and displays the result (True or False) in a message box. + + Dim varMyInput + Dim intMyVar As Integer + varMyInput = InputBox("Enter an integer:", "10 Is True, Other Numbers Are False") + intMyVar = CInt(varMyInput) + MsgBox CBool(intMyVar = 10) + +Recall that a Boolean variable can be only True or False. So in the final line of this example, you're saying in effect, "If the value in the variable intMyVar is 10, the Boolean result will be True. If the value is anything other than 10, the result will be False." + +VBA also has a set of functions that manipulate data in more complicated ways. Only two of these more complex manipulation functions—Format and Chr—are used much in VBA programming, so we'll explore them in depth in this chapter. + +Table 9.2 lists VBA's functions for more complex data manipulation. + +Table 9.2 VBA's functions for complex data conversion + +**Function (Arguments)** | **Returns** +---|--- +Asc( _string_ ) | The ANSI character code for the first character in the string. +Chr( _number_ ) | The string for the specified character code (a number between 0 and 255). +Format( _expression_ , _format_ ) | A variant containing _expression_ formatted as specified by _format_. (You'll see how Format works in "Using the Format Function to Format an Expression" later in the chapter.) +Hex( _number_ ) | A string containing the hexadecimal value of _number_. +Oct( _number_ ) | A string containing the octal value of _number_. +RGB( _number1_ , _number2_ , _number3_ ) | A Long integer representing the color value specified by _number1, number2_ , and _number3_. +QBColor( _number_ ) | A Long containing the RGB value for the specified color. +Str( _number_ ) | A Variant/String containing a string representation of _number_. Use the superior CStr function instead. +Val( _string_ ) | The numeric portion of _string_ ; if _string_ does not have a numeric portion, Val returns 0. Use the superior CInt function instead. + +## Using the Asc Function to Return a Character Code + +This function isn't used much. Asc tells you which numeric value has been assigned to a particular letter according to the ANSI character code that's used in Windows. A _character code_ is a list of numbers by which computers refer to letters of the alphabet. For example, the character code used in Windows for a capital _A_ is 65 and for a capital _B_ is 66; a lowercase _a_ is 97, and a lowercase _b_ is 98. + +The syntax for the Asc function is straightforward: + + Asc( _string_ ) + +Here, _string_ is any string expression. For example, Asc("A") returns 65. + +The following statements use the Asc function to return the character code for the first character of the current selection in the active document and display that code in a message box: + + strThisCharacter = **Asc** (Selection.Text) + MsgBox strThisCharacter, vbOKOnly, "Character Code" + +## Using the Val Function to Extract a Number from the Start of a String + +The Val function, like Asc, is not much used. But for completeness, I've included it. The Val function converts the numbers contained in a text string into a numeric value. Val follows these rules: + + * It reads only numbers in a string. + * It starts at the beginning of the string and reads only as far as the string contains characters that it recognizes as numbers (digits).
 + * It ignores tabs, line feeds, and blank spaces. + * It recognizes the period as a decimal separator, but not the comma. + +This means that if you feed Val a string consisting of tabbed columns of numbers, such as the second line here, it will read them as a single number (in this case, 445634.994711): + + Item# Price Available On Order Ordered + 4456 34.99 4 7 11 + +If, however, you feed it something containing a mix of numbers and letters, Val will read only the numbers and the strings it recognizes as numeric expressions (for example, Val("4E5") returns 400000 because it reads the expression as scientific notation). For example, if fed the address shown in the next example, Val returns 8661, ignoring the other numbers in the string (because it stops at the _L_ of _Laurel_ , the first character that isn't a number, a tab, a line feed, or a space): + + 8661 Laurel Avenue Suite 3806, Oakland, CA 94610 + +The syntax for Val is straightforward: + + Val( _string_ ) + +Here, _string_ is a required argument consisting of any string expression. + +The following statement uses Val to extract the street number from the string Address1 and assign it to the numeric variable StreetNumber: + + StreetNumber = Val(Address1) + +* * * + +Using CInt Instead of Val + +You should generally use the CInt function rather than the Val function when converting text to numbers. The reason is that CInt takes into account where you are located (the _regional settings_ in Windows). In America, for example, we use a comma to indicate thousands: 12,000. The CInt function can handle this; Val cannot (and converts "12,000" into 12): + + Dim StrVar As String + StrVar = "12,000" + MsgBox "Val = " & Val(StrVar) & " CInt = " & CInt(StrVar) + +When you execute this code, the message box reports that Val returns 12 while CInt returns 12000. This illustrates why you should use CInt rather than Val. + +Remember that Val stops when it reaches the first non-digit character. So that comma trips it up when trying to convert 12,000. + +* * * + +## Using the Str Function to Convert a Number into a String + +Just as you can use CInt to convert a text string into a numeric value as described in the previous section, you can also convert a numeric value to a string with the Str function. But you should use the newer CStr function rather than the Str function, for the same reasons that CInt is superior to the older Val command. + +You'll need to convert a number to a string when you want to _concatenate_ the information contained in a value with a string. Concatenation means appending one string to another, as in "123" & "654", which results in the text "123654". + +Concatenation cannot be accomplished by simply using the + operator because VBA would attempt to perform the mathematical operation addition rather than the string operation you want: concatenation. + +A text string is just that: text. It's one or more alphanumeric characters, such as "55"—and that's quite different from the number 55. You can't concatenate "55" and 55. They're not the same kind of data at all. + +Here's an example. Suppose you've declared a String variable named strYourAge and a numeric variable named intAge. You can't use a strYourAge + intAge statement to concatenate them because they're different data types. You first need to create a string from the intAge variable and then concatenate that string with the strYourAge string. (Alternatively, you can use the & operator to concatenate the two variables.) + +To convert a value to a string, use the CStr function.
The syntax for the CStr function is this: + + CStr( _number_ ) + +Here, _number_ is a variable containing a numeric expression (such as an Integer data type, a Long data type, or a Double data type). + +The following short procedure provides an example of converting a value to a string: + + Sub Age() + Dim intAge As Integer, strYourAge As String + intAge = InputBox("Enter your age:", "Age") + strYourAge = "Your age is " & **CStr** (intAge) & "." + MsgBox strYourAge, vbOKOnly + vbInformation, "Age" + End Sub + +* * * + +Using a Declaration Shortcut + +Notice in the example Sub Age how the Dim statement uses a kind of shorthand. Two different variables, separated by a comma, are declared on the same line using the same Dim command. This is equivalent to + + Dim intAge As Integer + Dim strYourAge As String + +* * * + +## Using the Format Function to Format an Expression + +The Format function is a powerful tool for changing numbers, dates and times, and strings into a format that you prefer. + +The syntax for the Format function is as follows: + + Format( _expression_ [, _format_ [, _firstdayofweek_ [, _firstweekofyear_ ]]]) + +These are the components of the syntax: + + * _expression_ is any valid expression. + * _format_ is an optional argument specifying a named format expression or a user-defined format expression. More on this in a moment. + * _firstdayofweek_ is an optional constant specifying the day that starts the week (for date information): The default setting is vbSunday (1), but you can also set vbMonday (2), vbTuesday (3), vbWednesday (4), vbThursday (5), vbFriday (6), vbSaturday (7), or vbUseSystem (0; uses the system setting). + * _firstweekofyear_ is an optional constant specifying the week considered first in the year (again, for date information), as shown in Table 9.3. + +Table 9.3 Constants that specify how a year starts + +**Constant** | **Value** | **Year Starts with Week** +---|---|--- +vbUseSystem | 0 | Use the system setting. +vbFirstJan1 | 1 | The week in which January 1 falls (the default setting). +vbFirstFourDays | 2 | The first week with a minimum of four days in the year. +vbFirstFullWeek | 3 | The first full week (seven days) of the year. + +You can define your own formats for the Format function as described in the following sections if none of the predefined numeric formats (described next) suit your needs. + +### Using Predefined Numeric Formats + +Table 9.4 lists the predefined numeric formats that you can use with the Format function. + +Table 9.4 Predefined numeric formats + +**Format Name** | **Explanation** | **Example** +---|---|--- +General Number | The number is displayed with no thousand separator. | 124589 +Currency | The number is displayed with two decimal places, a thousand separator, and the currency symbol appropriate to the system locale. | $1,234.56 +Fixed | The number is displayed with two decimal places and at least one integer place. | 5.00 +Standard | The number is displayed with two decimal places, at least one integer place, and a thousand separator (when needed). | 1,225.00 +Percent | The number is displayed multiplied by 100, with two decimal places and with a percent sign. | 78.00% +Scientific | The number is displayed in scientific notation. | 5.00E+00 +Yes/No | A nonzero number is displayed as Yes; a zero number is displayed as No. | Yes +True/False | A nonzero number is displayed as True; a zero number is displayed as False. | False +On/Off | A nonzero number is displayed as On; a zero number is displayed as Off. | Off
 + +For example, the following statement returns $12,345.00: + + Format("12345", "Currency") + +### Creating a Numeric Format + +If none of the predefined numeric formats suit your needs, you can create your own numeric formats by using your choice of a combination of the characters listed in Table 9.5. + +Table 9.5 Characters for creating your own number formats + +**Character** | **Explanation** +---|--- +[None] | Displays the number without any formatting. (You won't usually want to use this option.) +0 | Placeholder for a digit. If there's no digit, VBA displays a zero. If the number has fewer digits than the format has zeroes, VBA displays leading or trailing zeroes as appropriate. + +# | Placeholder for a digit. If there's no digit, VBA displays nothing. +. | Placeholder for a decimal. Indicates where the decimal separator should fall. The decimal separator varies by locale (for example, a decimal point in the United States, a comma in Germany). +% | Placeholder for a percent character. VBA inserts the percent character and multiplies the expression by 100. +, | Thousand separator (depending on locale, a comma or a period). +: | Time separator (typically a colon, but again this depends on the locale). +/ | Date separator. (Again, what you'll see depends on the locale.) +E- E+ e- e+ | Scientific format: E- or e- places a minus sign next to negative exponents. E+ or e+ places a minus sign next to negative exponents and places a plus sign next to positive exponents. +- + $ ( ) | Displays the literal character. +\\[character] | Displays the literal character. +"[string]" | Displays the literal string. Use Chr(34) (the character code for double quotation marks) to provide the double quotation marks. + +For example, the following statement returns a currency value formatted with four decimal places: + + Format("123456", "$00.0000") + +### Creating a Date or Time Format + +Similarly, you can create your own date and time formats by mixing and matching the characters listed in Table 9.6. + +Table 9.6 Characters for creating your own date and time formats + +**Character** | **Explanation** +---|--- +: | Time separator (typically a colon, but this depends on the locale). +/ | Date separator (also locale-dependent). +c | Displays the date (if there is a date or an integer value) in the system's short date format and the time (if there is a date or a fractional value) in the system's default time format. +d | Displays the date (1 to 31) without a leading zero for single-digit numbers. +dd | Displays the date with a leading zero for single-digit numbers (01 to 31). +ddd | Displays the day as a three-letter abbreviation (Sun, Mon, Tue, Wed, Thu, Fri, Sat) with no period. +dddd | Displays the full name of the day. +ddddd | Displays the complete date (day, month, and year) in the system's short date format. +dddddd | Displays the complete date (day, month, and year) in the system's long date format. +aaaa | Displays the full, localized name of the day. +w | Displays an integer from 1 (Sunday) to 7 (Saturday) containing the day of the week. +ww | Displays an integer from 1 to 54 giving the number of the week in the year. The number of weeks is 54 rather than 52 because most years start and end with partial weeks rather than having 52 start-to-finish weeks. +m | Displays an integer from 1 to 12 giving the number of the month without a leading zero on single-digit months. When used after h, returns minutes instead of months. +mm | Displays a number from 01 to 12 giving the two-digit number of the month. When used after h, returns minutes instead of months.
+mmm | Displays the month as a three-letter abbreviation (except for May) without a period. +mmmm | Displays the full name of the month. +oooo | Displays the full localized name of the month. +q | Displays a number from 1 to 4 giving the quarter of the year. +y | Displays an integer from 1 to 366 giving the day of the year. +yy | Displays a number from 00 to 99 giving the two-digit year. +yyyy | Displays a number from 0100 to 9999 giving the four-digit year. +h | Displays a number from 0 to 23 giving the hour. +hh | Displays a number from 00 to 23 giving the two-digit hour. +n | Displays a number from 0 to 59 giving the minute. +nn | Displays a number from 00 to 59 giving the two-digit minute. +s | Displays a number from 0 to 59 giving the second. +ss | Displays a number from 00 to 59 giving the two-digit second. +ttttt | Displays the full time (hour, minute, and second) in the system's default time format. +AM/PM | Uses the 12-hour clock and displays AM or PM as appropriate. +am/pm | Uses the 12-hour clock and displays am or pm as appropriate. +A/P | Uses the 12-hour clock and displays A or P as appropriate. +a/p | Uses the 12-hour clock and displays a or p as appropriate. +AMPM | Uses the 12-hour clock and displays the AM or PM string literal defined for the system. + +For example, the following statement returns Thursday, April 01, 2010: + + Format(#4/1/2010#, "dddddd") + +### Creating a String Format + +The Format function also lets you create custom string formats using the options shown in Table 9.7. + +Table 9.7 Characters for creating your own string formats + +**Character** | **Explanation** +---|--- +@ | Placeholder for a character. Displays a character if there is one, and a space if there is none. +& | Placeholder for a character. Displays a character if there is one, and nothing if there is none. +< | Displays the string in lowercase. +> | Displays the string in uppercase. +! | Causes VBA to fill placeholders from left to right instead of from right to left (the default direction). + +For example, the following statement assigns to strUser a string consisting of four spaces if there is no input in the input box: + + strUser = Format(InputBox("Enter your name:"), "@@@@") + +## Using the Chr Function and Constants to Enter Special Characters in a String + +To insert special characters (such as a carriage return or a tab) into a string, specify the built-in constant (for those special characters that have built-in constants defined) or enter the appropriate character code using the Chr function. The syntax for the Chr function is straightforward: + + Chr( _charactercode_ ) + +Here, _charactercode_ is a number that identifies the special character you want to add. + +Table 9.8 lists the most useful character codes and character constants.
+
+## Using the Chr Function and Constants to Enter Special Characters in a String
+
+To insert special characters (such as a carriage return or a tab) into a string, specify the built-in constant (for those special characters that have built-in constants defined) or enter the appropriate character code using the Chr function. The syntax for the Chr function is straightforward:
+
+    Chr( _charactercode_ )
+
+Here, _charactercode_ is a number that identifies the special character you want to add.
+
+Table 9.8 lists the most useful character codes and character constants.
+
+Table 9.8 VBA character codes and character constants
+
+**Code** | **Built-in Character Constant** | **Character**
+---|---|---
+Chr(9) | vbTab | Tab
+Chr(10) | vbLf | Line feed
+Chr(11) | vbVerticalTab | Soft return (Shift+Enter)
+Chr(12) | vbFormFeed | Page break
+Chr(13) | vbCr | Carriage return
+Chr(13) + Chr(10) | vbCrLf | Carriage return/line feed combination
+Chr(14) | — | Column break
+Chr(34) | — | Double straight quotation marks (")
+Chr(39) | — | Single straight quotation mark/apostrophe (')
+Chr(145) | — | Opening single smart quotation mark (')
+Chr(146) | — | Closing single smart quotation mark/apostrophe (')
+Chr(147) | — | Opening double smart quotation mark (")
+Chr(148) | — | Closing double smart quotation mark (")
+Chr(149) | — | Bullet
+Chr(150) | — | En dash
+Chr(151) | — | Em dash
+
+Here's a practical example that puts these character constants to work. Say you wanted to build a string containing a person's name and address from individual strings containing items of that information. You also wanted the individual items separated by tabs in the resulting string so that you could insert the string into a document and then easily convert it into a table.
+
+To do this, you could use the following code:
+
+    Sub FormatTabular()
+
+    Dim strFirstName As String
+    Dim strLastName As String
+    Dim strAddress As String
+    Dim strCity As String
+    Dim strState As String
+    Dim strAllInfo As String
+
+    strFirstName = "Phil"
+    strLastName = "Mortuqye"
+    strAddress = "12 Batwing Dr."
+    strCity = "Tulsa"
+    strState = "OK"
+
+    strAllInfo = strFirstName & vbTab & strLastName _
+        & vbTab & strAddress & vbTab & strCity _
+        & vbTab & strState & vbCr
+
+    Selection.TypeText strAllInfo
+    End Sub
+
+The individual strings (strFirstName, strLastName, and so on) are concatenated into the string strAllInfo with tabs—vbTab characters—between them. The final character added to the built string is vbCr (a carriage-return character), which creates a new paragraph.
+
+The final line enters the strAllInfo string into the current document, thus building a tab-delimited list containing the names and addresses. This list can then be easily converted into a table whose columns each contain one item of information: The first column contains the strFirstName string, the second column the strLastName string, and so on.
+
+# Using Functions to Manipulate Strings
+
+String variables are often useful for holding text. You can use them to store any quantity of text, from a character or two up to a large number of pages from a Word document or other text document. You can also use strings to store specialized information, such as filenames and folder names. Once you've stored text in a string, you can manipulate it according to your needs.
+
+Table 9.9 lists VBA's built-in functions for manipulating strings. Because many of these functions are useful, and some are complex, you'll find detailed examples after the table.
+
+Table 9.9 VBA's string-manipulation functions
+
+**Function (Arguments)** | **Returns**
+---|---
+InStr( _start_ , _string1_ , _string2_ , _compare_ ) | A Variant/Long giving the position of the first instance of the search string ( _string2_ ) inside the target string ( _string1_ ), starting from the beginning of the target string
+InStrRev( _stringcheck_ , _stringmatch_ , _start_ , _compare_ ) | A Variant/Long giving the position of the first instance of the search string ( _stringmatch_ ) inside the target string ( _stringcheck_ ), starting from the end of the target string
+LCase( _string_ ) | A String containing the lowercased _string_
+Left( _string_ , _number_ ) | A Variant/String containing the specified number of characters from the left end of _string_
+Len( _string_ ) | A Long containing the number of characters in _string_
+LTrim( _string_ ) | A Variant/String containing _string_ with any leading spaces trimmed off it
+Mid( _string_ , _start_ , _length_ ) | A Variant/String containing the specified number of characters from the specified starting point within _string_
+Right( _string_ , _number_ ) | A Variant/String containing the specified number of characters from the right end of _string_
+RTrim( _string_ ) | A Variant/String containing _string_ with any trailing spaces trimmed off it
+Space( _number_ ) | A Variant/String containing _number_ of spaces
+StrComp( _string1_ , _string2_ , _compare_ ) | A Variant/Integer containing the result of comparing _string1_ and _string2_
+StrConv( _string_ , _conversion_ , _LCID_ ) | A Variant/String containing _string_ converted as specified by _conversion_ for the (optional) specified Locale ID ( _LCID_ )
+String( _number_ , _character_ ) | A Variant/String containing _number_ of instances of _character_
+StrReverse( _expression_ ) | A String containing the characters of _expression_ in reverse order
+Trim( _string_ ) | A Variant/String containing _string_ with any leading spaces or trailing spaces trimmed off it
+UCase( _string_ ) | A String containing the uppercased _string_
+
+## Using the Left, Right, and Mid Functions to Return Part of a String
+
+Frequently, you'll need to use only part of a string in your macros. For example, you might want to take only the first three characters of the name of a city to create a location code.
+
+VBA provides several functions for returning from strings the characters you need:
+
+ * The Left function returns a specified number of characters from the left end of the string.
+ * The Right function returns a specified number of characters from the right end of the string.
+ * The Mid function returns a specified number of characters starting from a specified location inside a string.
+
+* * *
+
+Some String Functions Come in Two Flavors
+
+VBA provides two versions of a number of string functions, including the Left, Right, and Mid functions: the versions shown here, which return String-type Variant values, and versions whose names end with $ (Left$, Right$, Mid$, and so on), which return pure String values.
+
+The functions that return the pure Strings run faster (though you're not likely to notice any difference with normal use) but return an error if you use them on a Null value. The functions that return the String-type Variants can deal with Null values with no problem. Which approach you employ can depend on, for example, the type of data you're manipulating. Some databases employ Null, some do not.
+
+* * *
+
+### Using the Left Function
+
+The Left function returns the specified number of characters from the left end of a string. The syntax for the Left function is as follows:
+
+    Left( _string, length_ )
+
+Here, the _string_ argument is any string expression—that is, any expression that returns a sequence of contiguous characters. Left returns Null if _string_ contains no data. The _length_ argument is a numeric expression specifying the number of characters to return. _length_ can be a straightforward number (such as 4, 7, or 11), or it can be an expression that results in a number. For example, if the length of a word were stored in the variable named LenWord and you wanted to return two characters fewer than LenWord, you could specify the expression LenWord - 2 as the _length_ argument; to return three characters more than LenWord, you could specify LenWord + 3 as the _length_ argument.
+
+One way to use the Left function would be to separate the area code from a telephone number that was provided as an unseparated 10-digit number from a database. In the following statements, the telephone number is stored in the String variable strPhone, which the code assumes was created earlier:
+
+    Dim strArea As String
+    strArea = Left(strPhone, 3)
+
+These statements create the variable strArea and fill it with the leftmost three characters of the variable strPhone.
+
+### Using the Right Function
+
+The Right function is the mirror image of the Left function. Right returns a specified number of characters from the right end of a string. The syntax for the Right function is as follows:
+
+    Right( _string, length_ )
+
+Again, the _string_ argument is any string expression, and _length_ is a numeric expression specifying the number of characters to return. And, again, Right returns Null if _string_ contains no data, and _length_ can be a number or an expression that results in a number.
+
+To continue the previous example, you could use the Right function to separate the last seven digits of the phone number stored in the string strPhone from the area code:
+
+    Dim strLocalNumber As String
+    strLocalNumber = Right(strPhone, 7)
+
+These statements create the variable strLocalNumber and fill it with the rightmost seven characters from the variable strPhone.
+
+### Using the Mid Function
+
+The Left and Right functions extract a substring from the left or right side of a string. The Mid function fetches a substring out of the middle of a string.
+
+The Mid function returns the specified number of characters from inside the given string. You specify a starting position in the string and the number of characters (to the right of the starting position) that you want extracted.
+
+The syntax for the Mid function is as follows:
+
+    Mid( _string, start_ [, _length_ ])
+
+Here are the elements of the syntax:
+
+ * As in Left and Right, the _string_ argument is any string expression. Mid returns Null if _string_ contains no data.
+ * _start_ is a numeric value specifying the character position in _string_ at which to start extracting characters. If _start_ is larger than the number of characters in _string_ , VBA returns a zero-length string. In code, an empty string is typed as two quotation marks with nothing inside: strState = "".
+ * _length_ is an _optional_ numeric expression specifying the number of characters to return.
If you omit _length_ or use a _length_ argument greater than the number of characters in _string_ , VBA returns all the characters from the _start_ position to the end of _string_. _length_ can be an ordinary literal number or an expression that results in a number.
+
+Using the phone-number example, you could employ Mid to pluck the local exchange code out from within a 10-digit phone number (for instance, extract the 555 from 510 **555** 1212), like this:
+
+    Dim strPhone As String
+
+    strPhone = "5105551212"
+    MsgBox Mid(strPhone, 4, 3)
+
+This statement displays three characters from the variable strPhone, starting at the fourth character.
+
+* * *
+
+**Don't Torture Your Users—Accept a Variety of Formats**
+
+All too often programmers, for no good reason, make it hard for users to succeed. How many times have you tried to type your phone number in a website and been told that the only acceptable format is xxx-xxx-xxxx? Or (xxx) xxx-xxxx? Or numbers only! You _will_ do things our way. Well...why?
+
+Because the programmer was lazy and refused to permit a variety of input.
+
+People write down their phone number various ways. Some type it in like this: (xxx) xxx-xxxx; others favor variations like xxx xxx-xxxx. Have you seen those instructions that say "use no hyphens" or "you must use hyphens"?
+
+This is simply slothful programming. The programmer doesn't want to take a little extra time to deal with varying input, so they transfer the work to the user. Make life easier for your users by writing a little extra code to translate various typical formats into whatever your program expects. Don't force the users to provide data "just so."
+
+Avoid such user frustration by simply writing some code that tests the user's input. Here are a few easy solutions:
+
+Use the InStr function (described later in this chapter) to check for parentheses or hyphens. Or use Mid to extract only the numeric values in the user's string entry—ignoring whatever blank spaces or non-numeric characters the user might have typed in. Your program's goal is to end up with 5105551212. After discarding the non-digits, you can then show the user a _useful_ error message if they have not entered the necessary 10 digits. (You'll find a minimal sketch of this digits-only approach right after this sidebar.)
+
+Test the number with the Len function to see if there are 10 digits. If not, tell the user they made a mistake and to please reenter the phone number because there are not enough (or too many) digits.
+
+Your error message should also display the user's entry so they can see the problem. But you're just being lazy and annoying if you tell them they can't use parentheses or hyphens or _must_ use those punctuation marks—to satisfy _you_. Who are you?
+
+Your code should accept several predictable variations of user input. There's no need to reject legitimate user input simply because that input is punctuated in a different way than your data store or your code prefers. After all, why waste the time of perhaps thousands of users when it only takes a little extra coding to accommodate them?
+
+* * *
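+
+Here's a minimal sketch of that digits-only approach. It walks the user's entry one character at a time with Mid, keeps only the digits, and then uses Len to check the count (the variable names and messages are invented for illustration):
+
+    Dim strRaw As String, strDigits As String, i As Integer
+    strRaw = InputBox("Enter your phone number:")
+    For i = 1 To Len(strRaw)
+        ' Keep a character only if it's a digit
+        If Mid(strRaw, i, 1) >= "0" And Mid(strRaw, i, 1) <= "9" Then
+            strDigits = strDigits & Mid(strRaw, i, 1)
+        End If
+    Next i
+    If Len(strDigits) <> 10 Then
+        MsgBox "A 10-digit phone number is needed; you entered " & _
+            Len(strDigits) & " digits: " & strDigits
+    End If
+
+This way, entries such as (510) 555-1212, 510 555 1212, and 5105551212 all end up as the same 10 digits.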
We've seen how to extract a substring using Mid. But this function has another use as well. You can also use Mid to _find_ the location of a character within a string. In the following snippet, the Do Until... Loop walks backward through the string strFilename (which contains the FullName property of the template attached to the active document in Word) until it reaches the first backslash (\), storing the resulting character position in the Integer variable intLen. The message box then displays that part of strFilename to the right of the backslash (determined by subtracting intLen from the length of strFilename)—the name of the attached template without its path:
+
+    Dim strFilename As String, intLen As Integer
+    strFilename = ActiveDocument.AttachedTemplate.FullName
+    MsgBox strFilename
+
+    intLen = Len(strFilename)
+    Do Until Mid(strFilename, intLen, 1) = "\"
+        intLen = intLen - 1
+    Loop
+    MsgBox Right(strFilename, Len(strFilename) - intLen)
+
+This example is more illustrative than realistic for two reasons: First, you can get the name of the template more easily by just using the Name property rather than the FullName property. Second, there's a function called InStrRev (discussed next) that returns the position of one string within another by walking backward through it.
+
+## Using InStr and InStrRev to Find a String within Another String
+
+You can use the Mid function to find an individual character within a string, but what if you need to find a set of characters within a string? The InStr function is designed to find one string within another string. For example, you could check, say, the current paragraph to see if it contained a particular word. If it did, you could take action accordingly—for instance, replacing that word with another word or selecting the paragraph for inclusion in another document. Maybe your company has changed its name and you need to do a search and replace in a large number of document templates.
+
+The InStrRev function is the counterpart of the InStr function, working in a similar way but in the reverse direction.
+
+The syntax for InStr is as follows:
+
+    InStr([ _start_ , ] _string1, string2_ [, _compare_ ])
+
+Here are the arguments:
+
+ * _start_ is an optional argument specifying the starting position in the first string, _string1_. If you omit _start_ , VBA starts the search at the first character in _string1_ (which is usually where you want to start). However, you do need to use _start_ when you use the _compare_ argument to specify the type of string comparison to perform.
+ * _string1_ is a required argument specifying the string expression in which to search for _string2_.
+ * _string2_ is a required argument specifying the string expression for which to search in _string1_.
+ * _compare_ is an optional argument specifying the type of string comparison you want to perform. Text can be compared two ways: a _binary comparison_ , which is case sensitive, or a _textual comparison_ , which is not case sensitive. The default is a binary comparison, which you can specify by using the constant vbBinaryCompare or the value 0 for _compare_. Although specifying this value isn't necessary (because it's the default), you might want to include it to make your code ultra-clear. To specify a textual, case-insensitive comparison, use the constant vbTextCompare or the value 1 for _compare_.
+
+* * *
+
+Use Textual Comparisons with Unpredictable String Data
+
+A textual comparison is a useful weapon when you're dealing with data that may arrive in a variety of ways, like the telephone-number punctuation problem described in this chapter's Real World Scenario. Here's another example: If you wanted to search a selection for instances of a name, you'd probably want to find _all_ instances of the name—uppercase, lowercase, or title case (initial caps). Otherwise, you'd find only the name with exactly the same capitalization as you specified in the _string2_ argument.
+
+* * *
+
+Another way to use InStr is to find the _location_ of a certain string within another string so that you can then _change_ that substring. You might want to do this if you needed to move a file from its current position in a particular folder or subfolder to another folder that had a similar subfolder structure. For instance, suppose you work with documents stored in a variety of subfolders beneath a folder named _In_ (such as z:\Documents\In\), and after you're finished with them, you save them in corresponding subfolders beneath a folder named _Out_ (z:\Documents\Out\). The short procedure shown in Listing 9.1 automatically saves the documents in the _Out_ subfolder.
+
+**Listing 9.1**: Changing a file path
+
+    1. Sub Save_in_Out_Folder()
+    2. Dim strOName As String, strNName As String, _
+           intToChange As Integer
+    3. strOName = ActiveDocument.FullName
+    4. intToChange = InStr(strOName, "\In\")
+    5. strNName = Left(strOName, intToChange - 1) & "\Out\" _
+           & Right(strOName, Len(strOName) - intToChange - 3)
+    6. ActiveDocument.SaveAs strNName
+    7. End Sub
+
+The code in Listing 9.1 works as follows:
+
+ * Line 1 begins the procedure, and line 7 ends it.
+ * Line 2 declares the String variable strOName (as in _original name_ ), the String variable strNName (as in _new name_ ), and the Integer variable intToChange. Line 3 then assigns strOName the FullName property of the ActiveDocument object: the full name of the active document, including the path to the document (for example, z:\Documents\In\Letters\My Letter.docm).
+ * Line 4 assigns to the variable intToChange the value of the InStr function that finds the string \In\ in the variable strOName. Using the example path from the previous paragraph, intToChange will be assigned the value 13 because the 1st character of the \In\ string is the 13th character in the strOName string.
+ * Line 5 assigns to the variable strNName the new filename created in the main part of the statement. This breaks down as follows:
+ * Left(strOName, intToChange - 1) takes the left section of the strOName string, returning the number of characters specified by intToChange - 1—the number stored in intToChange minus one.
+ * & "\Out\" adds to the partial string specified in the previous bullet item (to continue the previous example, z:\Documents) the characters \Out\, which effectively replace the \In\ characters, thus changing the directory name (z:\Documents\Out\).
+ * & Right(strOName, Len(strOName) - intToChange - 3) completes the partial string by adding the right section of the strOName string, starting from after the \In\ string (Letters\My Letter.docm), giving z:\Documents\Out\Letters\My Letter.docm. The number of characters to take from the right section is determined by subtracting the value stored in intToChange from the length of strOName and then subtracting 3 from the result. The value 3 works because intToChange stores the position of the first backslash of \In\, so you need to count only the _I_ , the _n_ , and the second backslash to reach its end.
+ * Line 6 saves the document using the name in the strNName variable.
+
+The syntax for InStrRev is similar to that of InStr:
+
+    InStrRev( _stringcheck, stringmatch_ [, _start_ [, _compare_ ]])
+
+These are the arguments:
+
+ * _stringcheck_ is a required String argument specifying the string in which to search for _stringmatch_.
+ * _stringmatch_ is a required String argument specifying the string for which to search.
+ * _start_ is an optional numeric argument specifying the starting position for the search. If you omit _start_ , VBA starts at the last character of _stringcheck_.
+ * _compare_ (as for InStr) is an optional argument specifying how to search: vbTextCompare for a textual comparison, vbBinaryCompare for a binary comparison.
+
+## Using LTrim, RTrim, and Trim to Remove Spaces from a String
+
+Often you'll need to trim strings before concatenating them to avoid ending up with extra spaces in inappropriate places, such as in the middle of eight-character filenames.
+
+Data can contain appended or prepended spaces. And always remember that users might randomly type spaces in various ways when entering data. You never know. Your programming (and databases), however, need data in a predictable format (so the data can easily be searched, sorted, and otherwise manipulated).
+
+For example, if 500 users entered their zip code, some might type a space before entering the digits. Any such entries would be placed at the start of a list after the list was alphabetically sorted (the space character is seen as "lower" than ordinary characters by a sorting function), so the sort would produce an inaccurate result. It's easy, though, to use the Trim functions to get rid of spaces.
+
+As you saw in Table 9.9, VBA provides three functions specifically for trimming leading spaces and trailing spaces from strings:
+
+ * LTrim removes leading spaces from the specified string.
+ * RTrim removes trailing spaces from the specified string.
+ * Trim removes both leading and trailing spaces from the specified string.
+
+* * *
+
+Trim Is Often the Only Space-Removal Function You Need
+
+In many cases, you can simply use Trim instead of figuring out whether LTrim or RTrim is appropriate for what you expect a variable to contain. At other times, you'll need to remove either leading or trailing spaces while _retaining_ spaces on the other end. In those special cases, you'll need to use either LTrim or RTrim. RTrim is especially useful for working with fixed-length String variables, which will contain trailing spaces if the data assigned to them is shorter than their fixed length.
+
+* * *
+
+The syntax for the LTrim, RTrim, and Trim functions is straightforward:
+
+    LTrim( _string_ )
+    RTrim( _string_ )
+    Trim( _string_ )
+
+In each case, _string_ is any string expression.
+
+You could use the Trim function to remove both leading and trailing spaces from a string derived from the current selection in the active document in Word. The first line in this next code example declares strUntrimmed and strTrimmed as String variables. The second line assigns the data in the current selection to the strUntrimmed string. The third line assigns the trimmed version of the strUntrimmed string to the strTrimmed string:
+
+    Dim strUntrimmed As String, strTrimmed As String
+    strUntrimmed = Selection.Text
+    strTrimmed = Trim(strUntrimmed)
+
+## Using Len to Check the Length of a String
+
+To find out how long a string is, use the Len function. The syntax for the Len function is straightforward:
+
+    Len( _string_ )
+
+Here, _string_ is any valid string expression. (If _string_ is Null, Len also returns Null.)
+
+One use for Len is to make sure a user's entry in an input box or in a text box on a form is of a suitable length. A United States phone number must be 10 digits, for instance.
+
+The CheckPassword procedure shown in Listing 9.2 uses Len to make sure a password the user enters is long enough to be difficult to guess, but not too long.
+
+**Listing 9.2**: Testing password length with the Len function
+
+    1. Sub CheckPassword()
+    2. Dim strPassword As String
+    3. BadPassword:
+    4. strPassword = InputBox _
+       ("Enter the password to protect this item from changes:" _
+       , "Enter Password")
+    5. If Len(strPassword) = 0 Then
+    6. End
+    7. ElseIf Len(strPassword) < 6 Then
+    8. MsgBox "The password you chose is too short." _
+       & vbCr & vbCr & _
+       "Choose a password between 6 and 15 characters in length.", _
+       vbOKOnly + vbCritical, "Unsuitable Password"
+    9. GoTo BadPassword
+    10. ElseIf Len(strPassword) > 15 Then
+    11. MsgBox "The password you chose is too long." _
+        & vbCr & vbCr & _
+        "Choose a password between 6 and 15 characters in length.", _
+        vbOKOnly + vbCritical, "Unsuitable Password"
+    12. GoTo BadPassword
+    13. End If
+    14. End Sub
+
+Listing 9.2 ensures that a password contains between 6 and 15 characters (inclusive). Here's how the code works:
+
+ * Line 2 declares a String variable named strPassword.
+ * Line 3 contains the label BadPassword, to which the GoTo statements in line 9 and line 12 redirect execution if the password fails either of the checks. Labels are locations within code that you might need to jump to during execution. A label is a word on its own line in the code that ends with a colon. Labels are discussed in Chapter 11, "Making Decisions in Your Code."
+ * Line 4 assigns to strPassword the result of an input box that invites the user to enter the password for the item.
+ * Lines 5 through 13 then use an If statement to check that the password is an appropriate length. First, line 5 checks strPassword for zero length, which would mean that the user clicked either the Cancel button or the close button on the input box or clicked the OK button with no text entered in the input box. If the length of strPassword is zero, the End statement in line 6 terminates the procedure. If the password passes that test, line 7 checks to find out if its length is less than 6 characters; if so, the procedure displays a message box alerting the user to the problem and then redirects execution to the BadPassword label. If the password is 6 or more characters long, line 10 checks to see if it's more than 15 characters long; if it is, the user is shown another message box, and execution again returns to the BadPassword label.
+
+## Using StrConv, LCase, and UCase to Change the Case of a String
+
+If you need to change the case of a string, use the StrConv (whose name comes from _string conversion_ ), LCase, and UCase functions. Of these, the easiest to use is StrConv, which can convert a string to a variety of different formats varying from straightforward uppercase, lowercase, or _propercase_ (as VBA refers to initial capitals, also known as title case) to the Japanese _hiragana_ and _katakana_ phonetic characters.
+
+### Using StrConv
+
+The StrConv function has the following syntax:
+
+    StrConv( _string, conversion_ )
+
+Here, the _string_ argument is any string expression, and the _conversion_ argument is a constant or value specifying the type of conversion required. The most useful conversion constants and values are shown in Table 9.10.
+
+Table 9.10 The most common conversion constants
+
+**Constant** | **Value** | **Effect**
+---|---|---
+vbUpperCase | 1 | Converts the given string to uppercase characters
+vbLowerCase | 2 | Converts the given string to lowercase characters
+vbProperCase | 3 | Converts the given string to propercase (aka title case—the first letter of every word is capitalized)
+vbUnicode | 64 | Converts the given string to Unicode using the system's default code page
+vbFromUnicode | 128 | Converts the given string from Unicode to the system's default code page
+
+For example, suppose you received from a database program a string called strCustomerName containing a person's name. You could use StrConv to make sure that it was in title case by using a statement such as this:
+
+    strProperCustomerName = StrConv(strCustomerName, vbProperCase)
+
+* * *
+
+StrConv Ignores the Capitalization You Feed It
+
+Note that StrConv doesn't care about the _case_ of the string you feed it—it simply returns the case you asked for. For example, feeding StrConv uppercase and asking it to return uppercase doesn't cause any problem.
+
+* * *
+
+### Using LCase and UCase
+
+If you don't feel like using StrConv, you can alternatively use the LCase and UCase functions, which convert a string to lowercase and uppercase, respectively.
+
+LCase and UCase have the following syntax:
+
+    LCase( _string_ )
+    UCase( _string_ )
+
+Here, _string_ is any string expression.
+
+For example, the following statement lowercases the string MyString and assigns it to MyLowerString:
+
+    MyLowerString = LCase(MyString)
+
+## Using the StrComp Function to Compare Apples to Apples
+
+As you've seen already, you can compare one item to another item by simply using the = operator:
+
+    If 1 = 1 Then MsgBox "One is one."
+
+This straightforward comparison with the = operator also works with two strings, as shown in the second line here:
+
+    strPet = InputBox("Is your pet a dog or a cat?", "Pet")
+    If strPet = "Dog" Then MsgBox "We do not accept dogs."
+
+The problem with this code as written is that the strings need to match exactly in capitalization for VBA to consider them equal. If the user enters dog or DOG (not to mention dOG, doG, dOg, or DoG) rather than Dog, the condition isn't met. Again, permit your users a variety of correct responses—don't enforce pointless capitalization and punctuation rules.
+
+To accept variations of capitalization, you could use the Or operator to hedge your bets:
+
+    If strPet = "Dog" Or strPet = "dog" Or strPet = "DOG" _
+        Then MsgBox "We do not accept dogs."
+
+As you can see, such code rapidly becomes clumsy, and it still omits some variations, such as dOG. Or you could change the case of one or both strings involved to make sure their case matched, but it's simpler to just use the StrComp function, which is designed to permit you to ignore case. The syntax for StrComp is as follows:
+
+    StrComp( _string1, string2_ [, _compare_ ])
+
+Here, _string1_ and _string2_ are required String arguments specifying the strings to compare, and _compare_ is an optional argument specifying textual comparison (vbTextCompare) or binary comparison (vbBinaryCompare).
+
+The following statement uses StrComp to settle the pet question once and for all:
+
+    If StrComp(strPet, "dog", vbTextCompare) = 0 Then _
+        MsgBox "We do not accept dogs."
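+
+Note that the comparison is to 0, not True: StrComp returns 0 when the two strings are considered equal under the chosen comparison, -1 when _string1_ sorts before _string2_ , and 1 when _string1_ sorts after it (and Null if either string is Null). Here's a quick sketch of the three outcomes:
+
+    MsgBox StrComp("apple", "banana", vbTextCompare)   ' -1: "apple" sorts first
+    MsgBox StrComp("DOG", "dog", vbTextCompare)        ' 0: equal, ignoring case
+    MsgBox StrComp("pear", "apple", vbTextCompare)     ' 1: "pear" sorts later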
+
+# Using VBA's Mathematical Functions
+
+VBA provides a solid suite of functions for standard mathematical operations. Table 9.11 lists these functions with examples.
+
+Table 9.11 VBA's mathematical functions
+
+**Function (Argument)** | **Returns** | **Example**
+---|---|---
+Abs( _number_ ) | The absolute value of _number_ —the unsigned magnitude of the number. | Abs(-100) returns 100.
+Atn( _number_ ) | The arctangent of _number_ in radians. | Atn(dblMyAngle)
+Cos( _number_ ) | The cosine of angle _number_. | Cos(dblMyAngle)
+Exp( _number_ ) | e, the base of natural logarithms, raised to the power of _number_. | Exp(5) returns 148.413159102577.
+Fix( _number_ ) | The integer portion of _number_ (without rounding). If _number_ is negative, returns the negative number greater than or equal to _number_. | Fix(3.14159) returns 3. Fix(-3.14159) returns -3.
+Int( _number_ ) | The integer portion of _number_ (again, without rounding). If _number_ is negative, returns the negative number less than or equal to _number_. | Int(3.14159) returns 3. Int(-3.14159) returns -4.
+Log( _number_ ) | The natural logarithm of _number_. | Log(dblMyValue)
+Rnd([ _number_ ]) | A random Single value greater than or equal to 0 and less than 1; the optional _number_ controls how the sequence is seeded. | Rnd returns a random number such as 0.7055475.
+Sgn( _number_ ) | -1 if _number_ is negative, 0 if _number_ is 0, 1 if _number_ is positive. | Sgn(7) returns 1. Sgn(-7) returns -1. Sgn(0) returns 0.
+Sin( _number_ ) | The sine of the angle specified by _number_ (measured in radians). | Sin(dblMyAngle)
+Sqr( _number_ ) | The square root of _number_. If _number_ is negative, VBA gives a runtime error. | Sqr(9) returns 3.
+Tan( _number_ ) | The tangent of the angle specified by _number_ (measured in radians). | Tan(dblMyAngle)
+
+# Using VBA's Date and Time Functions
+
+VBA provides a full complement of date and time functions, as listed in Table 9.12. The table provides brief examples of working with the functions. The sections after the table provide longer examples showing how to use some of the more complex functions.
+
+Table 9.12 VBA's date and time functions
+
+**Function (Arguments)** | **Returns** | **Example**
+---|---|---
+Date | A Variant/Date containing the current date according to your computer | MsgBox Date might display 04/01/2010. (The format depends on your Windows date settings.)
+DateAdd( _interval_ , _number_ , _date_ ) | A Variant/Date containing the date of the specified interval after the specified date | DateAdd("m", 1, "6/3/2010") returns 7/3/2010.
+DatePart( _interval_ , _date_ ) | The part (specified by _interval_ ) of the specified date | See the example in the next section.
+DateSerial( _year_ , _month_ , _day_ ) | A Variant/Date containing the date for the specified year, month, and day | dteCompanyFounded = DateSerial(1997, 7, 4)
+DateValue( _date_ ) | A Variant/Date containing the specified date | dteDeath = DateValue("July 2, 1971")
+Day( _date_ ) | A Variant/Integer between 1 and 31, inclusive, representing the day of the month for _date_ | If Day(Date) = 1 And Month(Date) = 1 Then MsgBox "Happy new year!"
+Hour( _time_ ) | A Variant/Integer between 0 and 23, inclusive, representing the hour for _time_ | dteHour = Hour(dteLoggedIn)
+Minute( _time_ ) | A Variant/Integer between 0 and 59, inclusive, representing the minute for _time_ | dteMinute = Minute(dteLoggedIn)
+Month( _date_ ) | A Variant/Integer between 1 and 12, inclusive, representing the month for _date_ | strThisDate = Month(Date) & "/" & Day(Date)
+MonthName( _month_ ) | A String containing the name of the month represented by _month_ | MsgBox MonthName(Month(Date)) displays a message box containing the current month.
+Now | A Variant/Date containing the current date and time according to your computer | MsgBox Now might display 04/01/2010 9:25:15 PM. (The format of date and time will depend on your Windows date settings.)
+Second( _time_ ) | A Variant/Integer between 0 and 59, inclusive, representing the second for _time_ | dteSecond = Second(dteLoggedIn)
+Time | A Variant/Date containing the current time according to your computer | MsgBox Time might display 9:25:15 PM. (The time format and time will depend on your Windows date settings.)
+Timer | A Single giving the number of seconds that have elapsed since midnight | If Timer > 43200 Then MsgBox _ "This code only works in the morning.": End
+TimeSerial( _hour_ , _minute_ , _second_ ) | A Variant/Date containing the time for the specified hour, minute, and second | TimeSerial(11, 12, 13) returns 11:12:13 AM. (The format will depend on your Windows date settings.)
+TimeValue( _time_ ) | A Variant/Date containing the time for _time_ | TimeValue(Now)
+Weekday( _date_ ) | A Variant/Integer containing the day of the week represented by _date_ | See the next entry.
+WeekdayName( _weekday_ ) | A String containing the weekday denoted by _weekday_ | WeekdayName(Weekday(#4/1/2013#)) returns Monday, the day of the week for April Fool's Day 2013.
+
+## Using the DatePart Function to Parse Dates
+
+The DatePart function lets you take a date and separate it into its components. You can often achieve the same results by using other date functions, but DatePart is a great tool to have in your VBA toolbox.
+
+The syntax for DatePart is as follows:
+
+    DatePart( _Interval, Date_ [ _, FirstDayOfWeek_ [ _, FirstWeekOfYear_ ]])
+
+The components of the syntax are as follows:
+
+ * _Interval_ is a required String expression giving the unit in which you want to measure the interval: yyyy for year, q for quarter, m for month, y for the day of the year, d for day, w for weekday, ww for week, h for hour, n for minute (because m is for month), and s for second.
+ * _Date_ is a required Variant/Date giving the date you want to examine.
+ * _FirstDayOfWeek_ is an optional constant specifying the day that starts the week (for date information). The default setting is vbSunday (1), but you can also set vbMonday (2), vbTuesday (3), vbWednesday (4), vbThursday (5), vbFriday (6), vbSaturday (7), or vbUseSystem (0; this uses the system setting).
+ * _FirstWeekOfYear_ is an optional constant specifying the week considered first in the year. Table 9.13 shows the options for this constant.
+
+Table 9.13 The options for the FirstWeekOfYear constant
+
+**Constant** | **Value** | **Year Starts with Week**
+---|---|---
+vbUseSystem | 0 | Use the system setting.
+vbFirstJan1 | 1 | The week in which January 1 falls (the default setting).
+vbFirstFourDays | 2 | The first week with a minimum of four days in the year.
+vbFirstFullWeek | 3 | The first full week (7 days) of the year.
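+
+The two optional arguments matter mainly for the w and ww intervals. As a quick illustration (a minimal sketch; the date is arbitrary), the following statement asks for the week number of July 4, 2010, counted using Monday-based weeks and starting the year at the first week containing at least four days:
+
+    ' Week-of-year calculation with explicit week rules
+    MsgBox DatePart("ww", #7/4/2010#, vbMonday, vbFirstFourDays)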
+
+For example, the following statement assigns the current year to the variable dteThisYear:
+
+    dteThisYear = DatePart("yyyy", Date)
+
+## Using the DateDiff Function to Figure Out a Time Interval
+
+The DateDiff function returns the interval (the number of days, weeks, hours, and so on) between two specified dates. The syntax for DateDiff is as follows:
+
+    DateDiff( _interval_ , _date1_ , _date2_ [, _firstdayofweek_ [, _firstweekofyear_ ]])
+
+Here are the components of the syntax:
+
+ * _interval_ is a required String expression giving the unit in which you want to measure the interval: yyyy for year, q for quarter, m for month, y for the day of the year, d for day, w for weekday, ww for week, h for hour, n for minute (because m is for month), and s for second.
+ * _date1_ and _date2_ are the dates between which you're calculating the interval.
+ * _firstdayofweek_ is an optional constant specifying the day that starts the week (for date information). The default setting is vbSunday (1), but you can also set vbMonday (2), vbTuesday (3), vbWednesday (4), vbThursday (5), vbFriday (6), vbSaturday (7), or vbUseSystem (0; this uses the system setting).
+ * _firstweekofyear_ is an optional constant specifying the week considered first in the year. Table 9.13 shows the options for this constant.
+
+For example, the following statement returns the number of weeks between June 3, 2009, and September 30, 2009:
+
+    MsgBox DateDiff("ww", "6/3/2009", "9/30/2009")
+
+## Using the DateAdd Function to Add or Subtract Time from a Date
+
+The DateAdd function lets you easily add an interval of time to, or subtract an interval of time from, a specified date, returning the resulting date. The syntax for DateAdd is as follows:
+
+    DateAdd( _interval, number, date_ )
+
+Here are the components of the syntax:
+
+ * _interval_ is a required String expression giving the unit of measurement for the interval: yyyy for year, q for quarter, m for month, y for the day of the year, d for day, w for weekday, ww for week, h for hour, n for minute, and s for second.
+ * _number_ is a required numeric expression giving the number of intervals to add (a positive number) or to subtract (a negative number). If _number_ isn't already of the data type Long, VBA rounds it to the nearest whole number before evaluating the function.
+ * _date_ is a required Variant/Date or literal date giving the starting date.
+
+For example, the following statement returns the date 10 weeks from May 27, 2010:
+
+    DateAdd("ww", 10, #5/27/2010#)
+
+# Using File-Management Functions
+
+The following sections demonstrate how to use a couple of key VBA file-management functions: the Dir function, which you use to find out whether a file exists, and the CurDir function, which returns the current path.
+
+## Using the Dir Function to Check Whether a File Exists
+
+Often when managing files, you'll need to first check whether a particular file already exists. For instance, if you're about to save a file, you may want to make sure the save operation won't overwrite an existing file—a file with the same name in the same location on the hard drive.
+
+Or if you're about to open a file, you may want to see if that file exists before you use the Open method; otherwise, VBA will give an error.
+
+To test whether a file exists, you can use a straightforward procedure such as the one shown in Listing 9.3.
+
+**Listing 9.3**: Checking if a file exists with the Dir function
+
+    1. Sub Does_File_Exist()
+    2.
Dim strTestFile As String, strNameToTest As String, _
+       strMsg As String
+    3. strNameToTest = InputBox("Enter the file name and path:")
+    4. If strNameToTest = "" Then End
+    5. strTestFile = Dir(strNameToTest)
+    6. If Len(strTestFile) = 0 Then
+    7. strMsg = "The file " & strNameToTest & _
+       " does not exist."
+    8. Else
+    9. strMsg = "The file " & strNameToTest & " exists. "
+    10. End If
+    11. MsgBox strMsg, vbOKOnly + vbInformation, _
+        "File-Existence Check"
+    12. End Sub
+
+The procedure in Listing 9.3 uses the Dir function to check whether a file exists and displays a message box indicating whether it does or doesn't. Figure 9.3 shows examples of the message box. This message box is for demonstration purposes only. In a real-world macro you'd likely use the result of the test to branch (execute different code blocks) based on whether the file exists. Branching is covered in Chapter 11.
+
+Figure 9.3 You can use the Dir function to check whether a file exists so that you don't accidentally overwrite it or cause an error by trying to open a nonexistent file.
+
+Here's how the code works:
+
+ * Line 2 declares the string variables strTestFile, strNameToTest, and strMsg.
+ * Line 3 then displays an input box prompting the user to enter a filename and path; VBA assigns the result of the input box to strNameToTest.
+ * Line 4 compares strNameToTest to a blank string (which means the user clicked the Cancel button in the input box or clicked the OK button without entering any text in the text box) and uses an End statement to end the procedure if it gets a match.
+ * Line 5 assigns to strTestFile the result of running the Dir function on the strNameToTest string. If Dir finds a match for strNameToTest, strTestFile will contain the name of the matching file; otherwise, it will contain an empty string.
+ * Line 6 begins an If... Then statement by testing the length of the strTestFile string. If the length is 0, the statement in line 7 assigns to strMsg text saying that the file doesn't exist; otherwise, VBA branches to the Else statement in line 8 and runs the statement in line 9, assigning text to strMsg saying that the file does exist. Line 10 ends the If statement.
+ * Line 11 displays a message box containing strMsg. Line 12 ends the procedure.
+
+* * *
+
+Garbage In, Garbage Out
+
+The code shown in Listing 9.3 isn't bulletproof because Dir is designed to work with wildcards as well as regular characters. As long as you're working with a simple text filename in strNameToTest, you'll be fine because Dir compares that text to the existing filenames on the hard drive and the result lets you know whether you have a match. But if strNameToTest contains wildcards (say it's c:\temp\*.*, the asterisks specifying _any filename_ ), Dir reports that the file exists. However, there's no file by that name, just one or more files that match the wildcard. You can guard against this by checking whether the name Dir returns in line 5 exactly matches the filename you asked about, using a case-insensitive comparison (Windows filenames aren't case sensitive); a sketch of that check follows this sidebar. This literalness of Dir is a nice illustration of GIGO (garbage in, garbage out)—from the computer's (and VBA's) point of view, it's doing what you asked it to, but the result is far from what you intended.
+
+* * *
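+
+Here's a minimal sketch of that extra check (the variable names follow Listing 9.3; strJustName is invented for illustration). Dir returns only a bare filename, so the InStrRev call first strips any path from the user's input before the comparison:
+
+    Dim strJustName As String
+    ' Dir returns just the filename, so strip any path from the input
+    strJustName = Mid(strNameToTest, InStrRev(strNameToTest, "\") + 1)
+    If StrComp(strTestFile, strJustName, vbTextCompare) = 0 Then
+        MsgBox "Exact match: no wildcards were involved."
+    End If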
+
+## Returning the Current Path
+
+You can find out the current path (the location on the hard drive to which the host application is currently pointed) on either the current drive or a specified drive by using the CurDir function. Often, you'll need to change the current path (using the ChDir statement) to make sure the user is saving files in, or opening files from, a suitable location.
+
+To return the current path, use CurDir without an argument:
+
+    CurDir
+
+To return the current path for a specified drive, enter the drive letter as an argument. For example, to return the current path on drive D, use this statement:
+
+    CurDir("D")
+
+# The Bottom Line
+
+**Understand what functions are and what they do.**
+
+A function is a unit of code, a procedure, that performs a task _and returns a value_.
+
+You can write your own functions by writing code between Function and End Function in the VBA Editor. Chapter 10, "Creating Your Own Functions," explores how to write such custom functions. But in addition to functions you might write, there are many functions already prewritten in VBA—ready for you to call them from your macros to perform various tasks.
+
+Master It
+
+A function is quite similar to a subroutine, but there is a significant difference. What is it?
+
+**Use functions.**
+
+In a macro, you can call a built-in function by merely typing in its name and providing any required arguments.
+
+Master It
+
+You can combine multiple functions in a single line of code. The MsgBox function displays a message box containing whatever data you request. The only required argument for this function is the _prompt_. The Now function returns the current date and time. Write a line of code that calls the MsgBox function and uses the Now function as its argument.
+
+**Use key VBA functions.**
+
+VBA offers the services of hundreds of built-in functions. You'll find yourself using some of them over and over. They are _key_ to programming.
+
+Master It
+
+What built-in function is used quite often to display information in a dialog box to the user while a procedure runs?
+
+**Convert data from one type to another.**
+
+It's sometimes necessary to change a value from one data type to another. Perhaps you used an input box to ask the user to type in a String variable, but then you need to change it into an Integer type so you can do some math with it. (You can't do arithmetic on pieces of text.)
+
+Master It
+
+What built-in function would you use to convert a string such as "12" (which, in reality, is two text _characters_ , the digits 1 and 2) into an Integer data type, the actual _number_ 12, that you can manipulate mathematically?
+
+**Manipulate strings and dates.**
+
+VBA includes a full set of functions to manage text and date data.
+
+Master It
+
+Which built-in function would you use to remove any leading and trailing space characters from a string? For example, you want to turn
+
+    " this "
+
+into
+
+    "this"
+
+Chapter 10
+
+Creating Your Own Functions
+
+In Chapter 9, "Using Built-in Functions," you learned how to use VBA's built-in functions. In this chapter, you'll learn how to create your own functions. You create a function the same way you create a subprocedure: by typing in the Code window. (You can't _record_ a function in Excel and Word—the applications that provide a Macro Recorder. Instead, you have to write functions yourself because the Recorder creates only subprocedures.)
+
+It's important to recall that, although both are procedures, functions differ from subs. The primary difference is that functions interact more with other procedures: They accept arguments (incoming data) from the procedure that calls them, and they return a value (outgoing data) back to the procedure that calls them.
Subs, by contrast, normally don't require arguments and _never_ return any data.
+
+But functions are used in VBA far less often than subs. Most macros are self-contained subs. That's because most macros are small, brief automations: They perform simple, quick jobs like inserting a date into a document or saving a document using a particular filename.
+
+But you aren't limited to brief macros. You are free to create more complex, larger, and more sophisticated programs in VBA. And if you do create a large project, you'll want to use multiple procedures, not just one sub. This allows you to divide your work into multiple logical units that can each be individually tested and more easily modified. When you're using multiple procedures, however, they must work together and need to communicate among themselves. This is why you often use functions in large projects. Remember that the key feature of a function is that it facilitates _communication_ —sending values back and forth—among multiple procedures.
+
+This chapter will cover several ways to employ functions with the various Office 2013 applications. I'll start by explaining the components of a function and showing you how to put them together. You'll then create some functions that work in any VBA host and some functions that are specific to Word, Excel, and PowerPoint.
+
+In this chapter you will learn to do the following:
+
+ * Understand the components of a function statement
+ * Create a generic function
+ * Create a function for Word
+ * Create a function for Excel
+ * Create a function for PowerPoint
+ * Create a function for Access
+
+# Components of a Function
+
+To create a function, you use a Function statement. This is essentially the same way you create a Sub: just type in the word **Function** followed by the name you're giving the function.
+
+The syntax for the Function statement is as follows:
+
+    [Public | Private] [Static] Function _function_name_ [( _argument_list_ )] [As _type_ ]
+    [ _statements_ ]
+    [ _function_name_ = _expression_ ]
+    [Exit Function]
+    [ _statements_ ]
+    [ _function_name_ = _expression_ ]
+    End Function
+
+This syntax, most of which is optional, breaks down like this:
+
+ * Public is an optional keyword that you can use to make the function publicly accessible—accessible to all other procedures in all loaded modules. (If you need to limit the function's scope to the project that contains it, you can override this public availability by putting an Option Private Module statement in the module that contains the function.)
+ * Private is an optional keyword that you can use to make the function accessible only to the other procedures in the module that contains it. The function is hidden from procedures in any other module.
+ * Static is an optional keyword that you can use to make local variables in the function retain their value between calls to the function.
+ * _function_name_ is required. It specifies a name for the function so you can refer to it elsewhere in your project (so you can _call_ the function—in other words, start it running). Functions follow the same naming rules as other VBA items, such as the rules for variable names: alphanumerics and underscores are fine, but no spaces, symbols, or punctuation. Note that a function _passes data back_ to whatever procedure called (executed) the function. It passes data back by assigning a value to its (the function's) name.
If the function's name is AddStateTax, you would do some calculations to add the tax, then assign the result to the function's name:
+
+    Function AddStateTax(SubTotal)
+
+    AddStateTax = SubTotal * 1.07 'do the math and assign the result
+                                  'to the function name so it gets passed back
+
+    End Function
+
+ * _argument_list_ is an optional argument supplying the list of variables that represent arguments passed to the function when it is invoked. _argument_list_ takes the syntax shown here:
+
+    [Optional] [ByRef | ByVal] [ParamArray] _variable_name_ [( )] [As _type_ ] [= _default_value_ ]
+
+Here's a description of the elements of the _argument_list_ :
+
+ * Optional is an optional keyword that you can use to denote that an argument is optional—in other words, that it is not required. Once you've used Optional to declare an optional argument, any subsequent arguments in the _argument_list_ also have to be optional. That means you must put any _required_ arguments before the optional arguments, the same way VBA does with its built-in functions' argument lists. Also, it's a good idea to give optional arguments a default value.
+ * ByRef is an optional keyword that you can use to specify that an argument be passed _by reference_ ; ByVal is an optional keyword that you can use to specify that an argument be passed _by value_. You can pass an argument either by reference or by value.
+ * ParamArray is an optional keyword you can use as the last argument in _argument_list_ to denote an optional array of Variants. You can't use ParamArray with ByVal, ByRef, or Optional.
+
+* * *
+
+You Can Pass to a Function Either a Value's Address or a Copy of the Actual Value
+
+When a procedure (either a function or a subroutine) passes an argument to a function _by reference_ , the recipient procedure gets access to the actual memory location where the original variable is stored and can thus _change_ the value held in the original variable. By contrast, when an argument is passed _by value_ , the function gets only a copy of the information in the variable and therefore can't change the value held in the original variable (the recipient procedure doesn't even know where the original variable is located). By reference is the default way to pass an argument, and there is rarely any reason to pass by value, so just use the default.
+
+* * *
+
+ * _variable_name_ is the name of the variable that you want to use for this argument. When the function is called and a value is supplied for this argument, this variable can be used in your code.
+ * _type_ is an optional keyword giving the data type of the argument (Byte, Boolean, Currency, Date, Decimal, Double, Integer, Long, Object, Single, variable-length String, or Variant). For nonoptional arguments, you can also specify an object type (for example, a Worksheet object) or a custom object (one you've created).
+ * _default_value_ is an optional _literal_ (the value itself spelled out, such as "Sacramento"), constant, or constant expression that you use to specify a default value for optional parameters. You'll see how to provide a default value shortly.
+
+ * _type_ is an optional argument specifying the data type of the value that the function returns: Byte, Boolean, Currency, Date, Decimal, Double, Integer, Long, Object, Single, variable-length String, Variant, or a custom type.
+ * _statements_ represents the statement or statements in the function (the code that does the job the function is supposed to accomplish).
In theory, _statements_ is optional, but in practice, most functions will need one or more statements.
+ * _expression_ represents the value the function returns. _expression_ is also optional.
+
+# Creating a Function
+
+The following sections walk you through the process of creating a function.
+
+## Starting a Function Manually
+
+The easiest way to start creating a function is to type into the VBA Code window the word **Function** followed by the name you want to give to the function and any necessary arguments in parentheses, and then press Enter. VBA automatically enters a blank line and an End Function statement for you and places the insertion point on the blank line ready for you to create the programming code inside the new function.
+
+For example, if you type the following line and press Enter, the Visual Basic Editor displays what you see in Figure 10.1:
+
+    Function MyFunction(MaxTemp, MinTemp)
+
+Figure 10.1 When you type a Function statement and press Enter, the Visual Basic Editor automatically inserts a blank line and an End Function statement for you.
+
+## Starting a Function by Using the Add Procedure Dialog Box
+
+If you like to make the Visual Basic Editor work for you as much as possible (and prefer the slow way of doing things), you can also start creating a new function by using the Add Procedure dialog box:
+
+1. Choose Insert ⇒ Procedure to display the Add Procedure dialog box (see Figure 10.2).
+
+Figure 10.2 You can also use the Add Procedure dialog box to specify elements of a new function.
+
+2. Type the name for the procedure in the Name text box.
+
+3. Select the Function option button in the Type group box.
+
+4. Select the Public option button or the Private option button (as appropriate) in the Scope group box.
+
+5. If you want all local variables in the function to be of the static type (which you usually won't), select the All Local Variables As Statics check box.
+
+6. Click OK to enter the stub for the function, and then enter any arguments for the function in the parentheses manually.
+
+## Passing Arguments to a Function
+
+The arguments that will be passed to a function are listed in parentheses, separated by commas. In the following example code, the function states that it requires an argument named MaxTemp and an argument named MinTemp. This data must be passed to (sent to) the function for it to work:
+
+    Function GetTemps(MaxTemp, MinTemp)
+
+If somewhere in your code you attempt to call this function without passing the data it requires, VBA will display the error message "Argument not optional."
+
+You can also specify the data type of the arguments if you want by including an As statement with the data type after the argument's name. For example, you could use the following statement to set the MaxTemp and MinTemp arguments to the Double numeric data type:
+
+    Function GetTemps(MaxTemp As Double, MinTemp As Double)
+
+Passing an argument by reference (the default) is useful when you want to manipulate the variable in the recipient procedure and then return the variable to the procedure from which it originated. Alternatively, passing an argument by value is useful when you want to use the information stored in the variable in the recipient procedure and at the same time ensure that the original information in the variable doesn't change (but this isn't typically necessary).
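+
+To see the difference concretely, here's a minimal sketch (the procedure and function names are invented) in which the ByRef version changes the caller's variable while the ByVal version leaves it untouched:
+
+    Sub ShowPassingStyles()
+        Dim curPrice As Currency, curResult As Currency
+        curPrice = 100
+        curResult = AddTaxByRef(curPrice)
+        MsgBox curPrice    ' Displays 107: the caller's variable was changed
+        curPrice = 100
+        curResult = AddTaxByVal(curPrice)
+        MsgBox curPrice    ' Displays 100: only a copy was changed
+    End Sub
+
+    Function AddTaxByRef(ByRef curAmount As Currency) As Currency
+        curAmount = curAmount * 1.07    ' Modifies the caller's variable
+        AddTaxByRef = curAmount
+    End Function
+
+    Function AddTaxByVal(ByVal curAmount As Currency) As Currency
+        curAmount = curAmount * 1.07    ' Modifies only the local copy
+        AddTaxByVal = curAmount
+    End Function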
Because _by reference_ is the default way of passing an argument, both of the following statements pass the argument MyArg by reference:

    Function PassByReference(MyArg)
    Function PassByReference(ByRef MyArg)

As you see, you can omit the default ByRef keyword. However, to pass an argument by value, you must use the ByVal keyword. The following statement passes the ValArg argument by value:

    Function PassByValue(ByVal ValArg)

If necessary, you can pass some arguments for a procedure by reference and others by value. The following statement passes the MyArg argument by reference and the ValArg argument by value:

    Function PassBoth(ByRef MyArg, ByVal ValArg)

In practice, though, you're likely to simply use the default ByRef approach for most, if not all, of your programming.

## Declaring the Data Types of Arguments

You can explicitly declare the data types of arguments. This conserves memory (although this is rarely an issue anymore) and ensures that the outside (calling) procedures are passing the correct type of information to your function. For this second reason, it's always a good idea to specify the data type. You avoid some kinds of errors that way.

When passing an argument, you want to ensure that the data type of the argument you're passing matches the data type expected in the procedure. For example, if you declare a string and try to pass it as an argument when the receiving function specifies that it is expecting a Variant, VBA displays an error message.

To declare the data type of an argument, just include the usual data-type declaration in the argument list. The following statement declares MyStrArg, specifying with As that a string must be passed, and declares VarArg as a Variant:

    Function PassType(MyStrArg As String, VarArg As Variant)

## Specifying an Optional Argument

You can specify an optional argument by using the Optional keyword:

    Function PassBoth(ByRef MyArg As String, ByVal ValArg As Variant, _
        Optional ByVal strName As String)

When you specify an optional argument, it's a good idea to assign a default value to it. Doing so makes the code less susceptible to errors and gives the programmer a clue as to what kind of information is used here. To assign the default value, type an equal sign after the variable's definition, and then type the default value (use double quotation marks for a String value). For example, the following function statement declares the strName optional argument and assigns the default value if no value is passed:

    Function PassBoth(ByRef MyArg As String, ByVal ValArg As Variant, _
        Optional ByVal strName As String = "Sacramento")

What happens here is that this macro is being used by a company located in Sacramento, so they most often use that city's name for the literal value in this particular macro. Your default literal will differ, depending on what your macro is supposed to accomplish.

## Controlling the Scope of a Function

Like a subroutine, a function can have private or public scope. Private scope makes the function available only to procedures in the module that contains it, and public scope makes the function available to procedures in all modules in your project.

If you don't specify whether a function is private or public, VBA makes it public by default, so you don't need to specify the scope of a function unless you want it to have private scope.
However, if you do use explicit Public declarations for those functions you want to be public, your code will be somewhat easier to read:

    Private Function MyFunction(MaxTemp, MinTemp)
    Public Function AnotherFunction(Industry, Average)

# Examples of Functions for Any VBA-Enabled Office Application

This part of the chapter contains two examples of functions that will work in any application that hosts VBA. That's because these functions don't access objects particular to any specific Office application.

Later in this chapter, you'll see examples of functions that employ resources or features particular to a specific Office application.

To start, first declare the function and its arguments. The following statement declares a function named NetProfit:

    Function NetProfit(Gross As Double, Expenses As Double) As Double

NetProfit uses two arguments, Gross and Expenses, declaring each as the Double data type, a floating-point number (one that can have a decimal point).

At the end of this statement, we have specified that our function returns a Double value. It's important to explicitly specify the variable types of the arguments and the type of the value that the function returns to the caller. This avoids unpleasant surprises (bugs) in your code because VBA catches and reports any attempt to pass the wrong data type to the function or send the wrong type of data back to whatever code called (executed) your function.

Armed with the arguments (and their type, if you _explicitly type_ them as I'm suggesting you do), you _call_ (execute) your NetProfit function the same way you would execute a prewritten function that's built into VBA (like MsgBox). You simply use the function's name and supply the two arguments it needs, like this:

    MyProfit = NetProfit(44000, 34000)

Here, the variable MyProfit is assigned the value of the NetProfit function. In other words, after this function finishes its job and execution resumes in the _caller_ (the procedure that invoked the function), the returned value is assigned to the variable MyProfit.

In this example, the NetProfit function is provided with a Gross argument of 44000 and an Expenses argument of 34000.

Once you've created a function, the Visual Basic Editor displays its argument list when you type the name of the function in a caller procedure, as shown in Figure 10.3.

Figure 10.3 The Visual Basic Editor displays an Auto Quick Info ToolTip for functions you create as well as for its built-in functions.

Listing 10.1 contains an example of calling a function: The ShowProfit procedure calls the NetProfit function and displays the result in a message box.

**Listing 10.1**: How to call a function

     1. Sub ShowProfit()
     2.     MsgBox (NetProfit(44000, 34000)),, "Net Profit"
     3. End Sub
     4.
     5. Function NetProfit(Gross As Double, Expenses As Double) As Double
     6.     NetProfit = (Gross - Expenses) * 0.9
     7. End Function

In Listing 10.1, lines 1 through 3 contain the ShowProfit procedure, which simply calls the NetProfit function in line 2, passes it the arguments 44000 for Gross and 34000 for Expenses, and displays the result in a message box titled Net Profit. Notice that in line 2 we have employed a shortcut: using the function call inside an argument list. Line 2 does the same thing as this longer version:

    Dim Result As Double
    Result = NetProfit(44000, 34000)
    MsgBox (Result),, "Net Profit"

Lines 5 through 7 contain the NetProfit function.
Line 5 declares the function as working with two Double arguments, Gross and Expenses, telling VBA what to do with the two arguments that line 2 has passed to the function.

Line 6 calculates NetProfit to be 90 percent (0.9) of the value of Gross minus Expenses.

## How Functions Return Information

It's important to notice what else happens in line 6: the information calculated by the function is being assigned to the name of the function. This is how _the information gets passed back_ to the ShowProfit procedure that called the function.

To make this process a bit clearer, let's write the code in a more verbose way. We'll do this the long way, without using the shortcut of doing both the calculating and assigning all on the same line. Here's how a function goes about its business.

There are three main steps: calculation, assignment, and return. They are labeled as lines 1, 2, and 3 in the following listing.

The function first does some computing—in this case, calculating a net profit. Then, second, it assigns the results of the calculation to its own name (NetProfit in this case). This assignment is how the data gets passed back to the caller. And finally, third, with the End Function statement, it sends the results back to whatever procedure called the function:

    Function NetProfit(Gross As Double, Expenses As Double) As Double
    Dim Result As Double

    1. Result = (Gross - Expenses) * 0.9 'do the calculating

    2. NetProfit = Result 'store the information to be sent back

    3. End Function 'send the information back

## Returning Text Data from a Function

Listing 10.2 contains a function that returns a String value.

**Listing 10.2**: A function that returns a string

     1. Sub TestForSmog()
     2.     Dim intCYear As Integer, strThisCar As String
     3. BadValueLoop:
     4.     On Error GoTo Bye
     5.     intCYear = InputBox("Enter the year of your car.", _
            "Do I Need a Smog Check?")
     6.     strThisCar = NeedsSmog(intCYear)
     7.     If strThisCar = "Yes" Then
     8.         MsgBox "Your car needs a smog check.", _
                vbOKOnly + vbExclamation, "Smog Check"
     9.     ElseIf strThisCar = "BadValue" Then
    10.         MsgBox "The year you entered is in the future.", _
                vbOKOnly + vbCritical, "Smog Check"
    11.         GoTo BadValueLoop
    12.     Else
    13.         MsgBox "Your car does not need a smog check.", _
                vbOKOnly + vbInformation, "Smog Check"
    14.     End If
    15. Bye:
    16. End Sub
    17.
    18. Function NeedsSmog(CarYear As Integer) As String
    19.     If CarYear > Year(Now) Then
    20.         NeedsSmog = "BadValue"
    21.     ElseIf CarYear <= Year(Now) - 3 Then
    22.         NeedsSmog = "Yes"
    23.     Else
    24.         NeedsSmog = "No"
    25.     End If
    26. End Function

Listing 10.2 contains the procedure TestForSmog (lines 1 through 16) and the NeedsSmog function (lines 18 through 26). The TestForSmog procedure calls the NeedsSmog function, which returns a value indicating whether the user's car needs a smog check. TestForSmog uses this value to display a message box (see Figure 10.4) informing users whether or not their car needs a smog check.

Figure 10.4 The TestForSmog procedure prompts for the car's year and then displays a message box stating whether the car needs a smog test.

Here's how the code works:

  * TestForSmog starts by declaring the Integer variable intCYear and the String variable strThisCar in line 2.
  * Line 3 contains the BadValueLoop label, to which execution returns from line 11 if the user has entered an unsuitable value for the year of the car.
We'll want to display the input box again, to see if they can get it right this time. Note that if you want execution to jump to a particular zone in your code, you just type in a name for the location, such as BadValueLoop here, and end with a colon. This name-plus-colon is called a _label_ , and it provides a way for you to transfer execution to a specific location within your macro. Then elsewhere in your code you can transfer execution to this label by using the GoTo command like this:

    GoTo BadValueLoop

  * Line 4 contains an On Error statement to transfer execution to the Bye label in line 15 if an error occurs. An error occurs if the user cancels the upcoming input box or clicks its OK button with no value entered in its text box.
  * Line 5 displays an input box prompting the user to enter the year of the car. This line assigns to the intCYear variable the value the user enters in the input box.
  * Line 6 then sets the value of the String variable strThisCar to the result of the NeedsSmog function running on the intCYear integer variable.
  * Execution now shifts to the NeedsSmog function (line 18), which evaluates intCYear and returns the value for strThisCar. Line 18 declares the function as returning a String value. The function takes one argument, CarYear, which is declared as the Integer data type.
  * Line 19 checks to see whether CarYear is greater than the value of the current year using Year(Now). If so, line 20 sets the value of NeedsSmog to BadValue, which is used to indicate that the user has entered a date in the future. If not, the ElseIf statement in line 21 runs, checking if the value of CarYear is less than or equal to Year(Now) - 3, the current year minus three. If so, line 22 sets the value of NeedsSmog to Yes; if not, the Else statement in line 23 runs, and line 24 sets the value of NeedsSmog to No. Line 25 ends the If statement, and line 26 ends the function.
  * Execution then returns to the calling line (line 6) in the TestForSmog procedure. The NeedsSmog function returns the value it has assigned to its own name, and line 6 stores that value in the strThisCar variable.
  * The rest of the TestForSmog procedure then works with the strThisCar variable. Line 7 compares strThisCar to Yes. If it matches, line 8 displays a message box stating that the car needs a smog check. If strThisCar doesn't match Yes, line 9 compares strThisCar to BadValue. If it matches, line 10 displays an alert-message box, and line 11 returns execution to the BadValueLoop label in line 3. If strThisCar doesn't match BadValue, the Else statement in line 12 runs, and line 13 displays a message box stating that the car doesn't need a smog check.
  * Line 14 ends the If statement, line 15 contains the Bye label, and line 16 ends the procedure.

Functions can be more complex than the simple, stand-alone examples shown here. For instance, you can include a function as part of a larger expression. You could add the results of the functions NetProfit and CurrentBalance (which takes a single argument) by using a statement such as this:

    CurrentEstimate = NetProfit(44000, 33000) + CurrentBalance(MainAccount)

# Creating a Function for Word

Functions such as those shown in the previous section work in any VBA-hosting application because they do not call any application-specific features. This section and the following three sections show you examples of functions that are specific to applications.
The task accomplished by the example program shown in Listing 10.3 is to remove some special types of formatting (hyperlinks, bookmarks, and fields) but retain any text in those special zones.

* * *

Creating Custom Function Libraries

Some programmers like to keep functions they write (that aren't application-specific) in separate modules in the Editor. These little libraries represent your own collections of tested, useful, generic procedures. Need to calculate sales tax? Don't reinvent the wheel. Just import your library of math functions, which includes just such a procedure. You can export a module as a file with the .bas filename extension and import it into whichever application needs the functions. Choose File ⇒ Export File (or press Ctrl+E). For example, you might maintain separate modules that contain your math equations, your string-manipulation functions, and other custom functions that work in any VBA host. A .bas file is merely an ordinary text file containing a module's source code (its subroutines and functions). You can read it in Notepad, but you can also use the File Import feature to add it to a VBA project. When imported, it will appear in the Project Explorer as a new module.

* * *

The function shown in Listing 10.3 is for Word and—unusually for a function—returns no information (technically, because nothing is ever assigned to the function's name, it returns an empty value). The function's main purpose is to perform several operations on the specified document. So no data needs to be returned to the caller.

**Listing 10.3**: A function that returns no value

     1. Option Explicit
     2.
     3. Function Strip_Hyperlinks_Bookmarks_Fields()
     4.     Dim myLink As Hyperlink
     5.     Dim myBookmark As Bookmark
     6.     Dim myField As Field
     7.     With ActiveDocument
     8.         For Each myLink In .Hyperlinks
     9.             myLink.Delete
    10.         Next myLink
    11.         For Each myBookmark In .Bookmarks
    12.             myBookmark.Delete
    13.         Next myBookmark
    14.         For Each myField In .Fields
    15.             myField.Unlink
    16.         Next myField
    17.     End With
    18. End Function
    19.
    20. Sub Clean_Up_Document_for_Conversion()
    21.     Call Strip_Hyperlinks_Bookmarks_Fields
    22.     'other cleanup functions here
    23. End Sub

Here's how the code works:

  * Line 1 contains the Option Explicit statement for the module to force explicit declarations of all variables. Line 2 is a spacer.
  * Line 3 starts the function named Strip_Hyperlinks_Bookmarks_Fields, which removes all hyperlinks, bookmarks, and fields from the active document. The function continues until the End Function statement in line 18.
  * Line 4 declares a variable named myLink as being of the Hyperlink type. Line 5 declares a variable named myBookmark as being of the Bookmark type. Line 6 declares a variable named myField as being of the Field type.
  * Line 7 begins a With statement that works with the ActiveDocument object and continues until the End With statement in line 17. This With statement contains three For Each...Next loops.
  * The first For Each...Next loop, in lines 8 through 10, goes through each myLink object in the current document's Hyperlinks collection. Line 9 uses the Delete method to delete each of the links in turn. Deleting a hyperlink removes the link from the document but leaves the text that was displayed for the hyperlink.
  * The second For Each...Next loop, in lines 11 through 13, works with each myBookmark object in the Bookmarks collection. Line 12 uses the Delete method to delete each of the bookmarks in turn.
Deleting a bookmark removes the marker from the document but leaves any text or other object that the bookmark contained.
  * The third For Each...Next loop, in lines 14 through 16, works with each myField object in the Fields collection. Line 15 uses the Unlink method to unlink each of the fields in turn. Unlinking a field leaves the field's contents in the document as text or as an object but removes the field link.
  * Line 17 contains the End With statement that ends the With statement, and line 18 contains the End Function statement that ends the function. Line 19 is a spacer.
  * Lines 20 through 23 contain a short subprocedure that simply calls the Strip_Hyperlinks_Bookmarks_Fields function. Line 22 contains a comment stating that the subprocedure would call other cleanup functions. But the code to call other functions hasn't yet been written. It's a reminder.

# Creating a Function for Excel

This section shows you a function for Excel. The function in Listing 10.4 checks whether a workbook contains any unused sheets.

**Listing 10.4**: An Excel function

     1. Option Explicit
     2.
     3. Function BlankSheetsInWorkbook(ByRef WorkbookToTest As Workbook) As Boolean
     4.     Dim objWorksheet As Worksheet
     5.     BlankSheetsInWorkbook = False
     6.     For Each objWorksheet In WorkbookToTest.Worksheets
     7.         If Application.WorksheetFunction.CountBlank _
                (objWorksheet.Range("A1:IV65536")) = 16777216 Then
     8.             BlankSheetsInWorkbook = True
     9.             Exit Function
    10.         End If
    11.     Next objWorksheet
    12. End Function
    13.
    14. Sub Check_Workbook_for_Blank_Worksheets()
    15.     If BlankSheetsInWorkbook(ActiveWorkbook) = True Then
    16.         MsgBox "This workbook contains one or more blank worksheets." & _
                vbCr & vbCr & "Please remove all blank worksheets before" & _
                " submitting the workbook.", vbOKOnly + vbExclamation, _
                "Check Workbook for Blank Worksheets"
    17.     End If
    18. End Sub

Here's how the code works:

  * Line 1 contains the Option Explicit statement for the module to force explicit declarations of all variables. Line 2 is a spacer.
  * Line 3 starts the function named BlankSheetsInWorkbook, which it declares as a Boolean function. The function works on an object named WorkbookToTest, which has the type Workbook—in other words, it's a workbook.
  * Line 4 declares a variable named objWorksheet that is of the Worksheet type.
  * Line 5 sets the value of the BlankSheetsInWorkbook function to False.
  * Line 6 starts a For Each...Next loop that runs for each objWorksheet object (each worksheet) in the Worksheets collection in the WorkbookToTest object—that is, with each worksheet in the workbook that is passed to the function.
  * Line 7 uses the CountBlank worksheet function to count the number of blank cells in the range A1:IV65536 in the worksheet being tested by the loop. If the number of blank cells is 16777216, the worksheet is blank, because that is the total number of cells in this range. (A1:IV65536 is the full grid of Excel 2003 and earlier; worksheets in later versions have a larger grid, so any content outside this range won't be detected by this test.) Line 8 then sets the value of the BlankSheetsInWorkbook function to True, and line 9 uses an Exit Function statement to exit the function. This is because there is no need to test any more worksheets once the function has found that at least one worksheet is blank.
  * Line 10 contains the End If statement that ends the If statement. Line 11 contains the Next objWorksheet statement that ends the For Each...Next loop. And line 12 contains the End Function statement that ends the function. Line 13 is a spacer.
  * Line 14 begins a short subprocedure named Check_Workbook_for_Blank_Worksheets.
Line 15 runs the BlankSheetsInWorkbook function on the ActiveWorkbook object, which represents the active workbook in the Excel session. If the BlankSheetsInWorkbook function returns True, line 16 displays a message box that points out to the user that the workbook contains one or more blank worksheets and tells the user to remove them.

# Creating a Function for PowerPoint

This section includes an example function for PowerPoint. The function in Listing 10.5 checks that all the text on a slide is at least the minimum font size specified and displays an error-message box if any font is too small. (If, when you press Alt+F11 to open the VBA Editor, you see nothing in the Code window, choose Insert ⇒ Module so you'll have a container for your code.)

**Listing 10.5**: A function in PowerPoint

     1. Option Explicit
     2.
     3. Function CheckMinFontSize(objPresentation As Presentation) As Boolean
     4.
     5.     Dim objSlide As Slide
     6.     Dim objShape As Shape
     7.
     8.     CheckMinFontSize = True
     9.
    10.     For Each objSlide In objPresentation.Slides
    11.         objSlide.Select
    12.         objSlide.Shapes.SelectAll
    13.         For Each objShape In Windows(1).Selection.ShapeRange
    14.             If objShape.Type = msoPlaceholder Then
    15.                 If objShape.TextFrame.TextRange.Font.Size < 14 Then
    16.                     CheckMinFontSize = False
    17.                     Exit Function
    18.                 End If
    19.             End If
    20.         Next objShape
    21.     Next objSlide
    22. End Function
    23.
    24. Sub Font_Check()
    25.     If CheckMinFontSize(ActivePresentation) = False Then
    26.         MsgBox "Some of the fonts in this presentation are too small." _
                & vbCr & vbCr & "Please change all fonts to 14 points or larger.", _
                vbCritical + vbOKOnly, "Font Size Check"
    27.     End If
    28. End Sub

Here's how the code works:

  * Line 1 contains the Option Explicit statement for the module to force explicit declarations of all variables. Line 2 is a spacer.
  * Line 3 declares the function named CheckMinFontSize as Boolean and specifies that it works on a variable named objPresentation, which is of the Presentation type. Line 4 is a spacer.
  * Line 5 declares a variable named objSlide that is of the Slide type. Line 6 declares a variable named objShape that is of the Shape type. Line 7 is a spacer.
  * Line 8 sets the value of the CheckMinFontSize function to True. This indicates that the font sizes are the minimum size or larger. Line 9 is a spacer.
  * Line 10 starts a For Each...Next loop that continues until line 21 and works with each objSlide object in the Slides collection in the objPresentation object. This loop makes the function examine each of the Slide objects in the presentation that is passed to the function.
  * Line 11 selects the current objSlide object, and line 12 uses the SelectAll method of the Shapes collection to select all the shapes on that slide.
  * Line 13 starts a nested For Each...Next loop that runs once for each of the objShape objects in the ShapeRange object in the Selection object in the first window, using Windows(1). The ShapeRange object contains all of the Shape objects within the selection. Here, the Shape objects are represented by the objShape variable.
  * Line 14 uses an If statement to see if the Type property of the current Shape object is msoPlaceholder, the type that indicates a placeholder used for text. If the shape is a placeholder, line 15 checks if the font size used in the TextRange object within the TextFrame object within the Shape object is smaller than 14 points.
If so, line 16 assigns the value False to the CheckMinFontSize function, and line 17 uses an Exit Function statement to stop execution of the function. This is because once a font smaller than the minimum permitted size has been found, there is no need to check further.
  * Line 18 contains the End If statement that ends the nested If structure, and line 19 contains the End If statement that terminates the outer If structure.
  * Line 20 contains the Next objShape statement that ends the nested For Each...Next loop, and line 21 contains the Next objSlide statement that ends the outer For Each...Next loop.
  * Line 22 contains the End Function statement that ends the function. Line 23 is a spacer.
  * Lines 24 through 28 contain a subroutine named Font_Check that runs the CheckMinFontSize function on the ActivePresentation object. If the function returns False, the subprocedure displays a message box alerting the user to the problem.

# Creating a Function for Access

You can create functions for Access the same way you do for any other VBA-enabled Office 2013 application—just type in the word **Function** and give this function a name.

However, Access often has special ways of programming, and it has several unique aspects to its object model. The first thing you'll notice is a general-purpose object named DoCmd. This object has no properties, but it has lots of methods that accomplish such common tasks as launching other applications, locating records, and opening reports and forms.

Before we create a macro to illustrate how to use the DoCmd object, it's necessary to have a little database set up that you can experiment with. Access comes with several templates, so we'll use one of them. Follow these steps:

1. Run Access.

2. In Office 2013, the various applications such as Word and Access display on startup a set of common templates.

3. Double-click _Desktop_ Contacts to open that database template. (Don't choose the Contacts template, which includes online features that will complicate this example.)

4. If you see a security warning message (a yellow strip below the Ribbon), click the Enable Content button.

5. Click Create. If you see an offer to watch some videos from Microsoft at this point, click the x in the upper-right corner to close that window.

6. Type in some random data by clicking the (New) link in the Open column on the left side, as shown in Figure 10.5. A Contact Details dialog box opens (not shown in the figure).

Figure 10.5 Type in some data—any data will do—so you can experiment with Access's DoCmd object.

7. Click the Save And New button in the Contact Details dialog box each time you add a new contact. Add about three contacts.

Now you can use the DoCmd object to move to a particular record (in this case, a new, blank record). Press Alt+F11 to open the Visual Basic Editor in Access; then right-click the database name (it's the one in boldface) in the Project Explorer. Choose Insert ⇒ Module from the context menu. In your new module, type the following code, which will move the insertion pointer to a new record:

    1. Function MoveToNew()
    2.
    3.     DoCmd.OpenForm "Contact List"
    4.     DoCmd.GoToRecord , , acNewRec
    5.
    6. End Function

To test this macro, click somewhere in one of the existing records so the blinking insertion cursor is located above the New record line. Then switch to the VBA Editor and click inside the MoveToNew function to place the Editor's cursor there. Press F5.
Then go back to Access, and you should see that the blinking cursor has moved to the New record.

Here's how the code works:

  * Line 3 ensures that the correct form is open. Because you've just started working with this Contacts database and filled in some information in the Contact List form, the correct form is open and has the focus. However, it's possible that later additional forms will be added. It's always a good idea to specify which form, table, or other object you want to work with. You can't assume that a macro will always be executed in a specific context (such as with the correct form having the focus). In other words, if you omit line 3, this macro will act on whatever form is currently open in Access.
  * Line 4 employs the GoToRecord method of the DoCmd object. The acNewRec constant specifies a new, rather than an existing, record.

# The Bottom Line

**Understand the components of a function statement.**

Arguments can be passed from the calling code to a function in one of two ways: by reference or by value.

**Master It**

Describe the difference between passing data by reference and passing data by value.

**Create a generic function.**

You can write, and save (File ⇒ Export File), sets of generic functions that work in any VBA-enabled application.

**Master It**

Create a function that displays the current year in a message box. This function will require no arguments, nor will it return any value.

**Create a function for Word.**

Word contains a whole set of objects and members unique to word-processing tasks. Functions that are specific to Word employ one or more of these unique features of the Word object model.

**Master It**

Write a function that displays the number of hyperlinks in the currently active document. Use Word's Hyperlinks collection to get this information.

**Create a function for Excel.**

Excel uses an ActiveWorkbook object to represent the currently selected workbook. You can employ a full set of built-in methods to manipulate the features of any workbook.

**Master It**

Using the Sheets collection of Excel's ActiveWorkbook object, write a function that displays the number of sheets in the current workbook.

**Create a function for PowerPoint.**

PowerPoint's object model includes an ActivePresentation object, representing the currently selected presentation. Functions can make good use of this object and its members.

**Master It**

Write a function that returns how many slides are in a presentation. Pass the ActivePresentation object as an argument to this function; then display the number of slides the presentation contains. Call this function from a subroutine.

**Create a function for Access.**

Access often works a little differently from other VBA-enabled Office applications. For example, some common tasks are carried out by using methods of the special DoCmd object rather than methods of a Form or Table object.

**Master It**

Write a function that closes Access by using the DoCmd object's Quit method. Ensure that all data is saved by employing the acQuitSaveAll constant as an argument for the Quit method.

Chapter 11

Making Decisions in Your Code

Computers behave intelligently in large part because programming languages include commands that test conditions. Then, based on the results of that test, the code jumps ( _branches_ ) to an appropriate area within the program. This is similar to human decision-making: if it's raining, then take an umbrella. If not, leave it at home.
This chapter covers what are called _conditional_ expressions. VBA uses these to create decision structures to direct the flow—the path of execution—of your procedures.

By using decision structures, you can cause your procedures to branch to different sections of code depending on such things as the value of a variable or expression or whether the user clicks the OK or Cancel button in a message box.

VBA offers two types of decision structures: If blocks and Select Case blocks. There is a set of various kinds of If statements suitable for making typical decisions. For more complicated decision-making, you'll want to use the heavy-duty Select Case block structure. It's more efficient when working with truly involved decisions.

The chapter starts by introducing you to the comparison operators and logical operators you can use when building conditional expressions and logical expressions. Then it covers the different types of If blocks, which take up the bulk of the chapter. At the end of the chapter, you'll learn how to use Select Case.

In this chapter you will learn to do the following:

  * Use comparison operators
  * Compare one item with another
  * Test multiple conditions
  * Use If blocks
  * Use Select Case blocks

# How Do You Compare Things in VBA?

To compare things in VBA, you use _comparison operators_ to specify what type of comparison you want: whether one variable or expression is equal to another, whether one is greater than another, whether one is less than or equal to another, and so on.

VBA supports the comparison operators shown in Table 11.1.

Table 11.1 VBA's comparison operators

**Operator** | **Meaning** | **Example**
---|---|---
= | Equal to | If strMyString = "Hello" Then
<> | Not equal to | If x <> 5 Then
< | Less than | If y < 100 Then
> | Greater than | If strMyString > "handle" Then
<= | Less than or equal to | If intMyCash <= 10 Then
>= | Greater than or equal to | If Time >= #12:00:00 PM# Then MsgBox "It's afternoon." Else MsgBox "It's morning."
Is | Is the same object variable as | If Object1 Is Object2 Then

The first six comparison operators shown in Table 11.1 are straightforward. Numeric expressions are evaluated as you would expect. Alphabetical expressions are evaluated in alphabetical order: for example, because _ax_ comes before _handle_ in alphabetical order, it's considered "less than" _handle_.

So, "ax" < "handle" would evaluate to True. And whether an evaluation results in True or False determines what happens in an If...Then block. (In other words, the code in the Then section is executed when something is True. And it is not executed if something is False. Think If it's raining, Then take an umbrella. Otherwise, don't.)

Mixed expressions (numbers and letters) are evaluated in alphabetical order as well: _Office 97_ is "greater than" _Office 2013_ because 9 is greater than 2.

Is, the seventh comparison operator, is less familiar and less often used. You use Is to compare object variables to establish whether two object variables represent the same object (a named object, not an object such as a document or a range).

For example, the following statements declare two objects—objTest1 and objTest2—and assign to each ActiveDocument.Paragraphs(1).Range, the range consisting of the first paragraph in the active document in Word.
The next statement then compares the two objects to each other, returning False in the message box because the two objects are different even though their contents are the same:

    Dim objTest1 As Object
    Dim objTest2 As Object
    Set objTest1 = ActiveDocument.Paragraphs(1).Range
    Set objTest2 = ActiveDocument.Paragraphs(1).Range
    'the next statement returns False because the objects are different
    MsgBox objTest1 Is objTest2

However, if both object variables refer to the same object, the Is comparison returns True, as in the following example, in which both objTest1 and objTest2 refer to the object variable objTest3:

    Dim objTest1 As Object
    Dim objTest2 As Object
    Dim objTest3 As Object
    Set objTest3 = ActiveDocument.Paragraphs(1).Range
    Set objTest1 = objTest3
    Set objTest2 = objTest3
    'the next statement returns True because
    'objTest1 and objTest2 refer to the same object
    MsgBox objTest1 Is objTest2

When using Is, keep in mind that it isn't the specific _contents_ of the object variables that are being compared, but which _object_ they refer to.

# Testing Multiple Conditions by Using Logical Operators

Often, you'll need to test two or more conditions before taking an action: If statement X is True and statement Y is True, then do this; if statement X is True or statement Y is True, then do the other; if statement X is True and statement Y isn't True, then find something else to do; and so on. For example, if it's raining _and_ you have a cold, put on your warmest rain gear.

To test multiple conditions, you use VBA's logical operators to link the conditions together. Table 11.2 lists the logical operators that VBA supports, with short examples and comments.

Table 11.2 VBA's logical operators

**Operator** | **Meaning** | **Example**
---|---|---
And | Conjunction: True if both conditions are True | If x > 5 And y < 10 Then
Or | Disjunction: True if either condition (or both) is True | If x > 5 Or y < 10 Then
Not | Negation: turns True into False and False into True | If Not blnFound Then
Xor | Exclusion: True if one condition is True and the other is False | If blnA Xor blnB Then
Eqv | Equivalence: True if both conditions have the same value | If blnA Eqv blnB Then
Imp | Implication: False only when the first condition is True and the second is False | If blnA Imp blnB Then

Of these six logical operators, you'll probably use the conjunction (And), disjunction (Or), and negation (Not) operators the most, with the other three thrown in on special (in other words, rare) occasions. (If the Imp logical operator doesn't make sense to you at this point, you probably don't need to use it.)

* * *

**VBA Doesn't Do Short-Circuit Evaluation**

Here's something to beware of when evaluating multiple conditions: VBA doesn't do short-circuit evaluation in logical expressions (unlike other programming languages, such as C and C++).

_Short-circuit evaluation_ is the formal term for a simple logical technique most people use several times a day when making decisions in their daily lives: If the first of two or more dependent conditions is false, you typically don't waste time evaluating any other conditions contingent upon it.

For example, suppose your most attractive coworker says they'll take you to lunch if you get the product out on time _and_ get a promotion. If you don't get the product out on time, you've blown your chances—it doesn't much matter if you get the promotion because even if you do, your lunch will still be that brown bag you forgot to put in the department fridge. There's no point in evaluating the second condition because it depends on the first, and the first condition wasn't met. You can just short-circuit any further condition testing.

VBA doesn't think that way. It evaluates the second condition (and any subsequent conditions) whether or not it needs to. Evaluating all conditions takes a little more time (which isn't usually an issue), but it _can_ introduce unexpected complications in your code (which can be an issue).
For example, the following snippet produces an error when the selection is only one character long. The error occurs because the code ends up calling the Mid function with a starting position of 0 (the one-character selection's length minus one), which VBA doesn't allow. The error appears even though you wouldn't expect this condition to be evaluated when the first condition is not met (because the length of the selection is not greater than 1):

    Dim strShort As String
    strShort = Selection.Text
    If Len(strShort) > 1 And _
        Mid(strShort, Len(strShort) - 1, 1) = "T" Then
        MsgBox "The second-last character is T."
    End If

To avoid problems such as this, use _nested_ If blocks. In the following code example, the first condition isn't met (again, for a one-character selection), so the second condition isn't evaluated. Notice that one of the If blocks here is nested within (contained within) the other If block:

    If Len(strShort) > 1 Then
        If Mid(strShort, Len(strShort) - 1, 1) = "T" Then
            MsgBox "The second-last character is T."
        End If
    End If

* * *

* * *

Using Not to Toggle Boolean Properties

Here's a useful tip. The Not command is a handy way of turning True to False and False to True. By using Not with a Boolean variable or property, you can toggle the state of the variable or property without even needing to check what the current state is. For example, in Excel, you could create an If structure to toggle the value of the Boolean property Saved (which controls whether Excel thinks the document in question contains unsaved changes) by using code such as this:

    If ActiveWorkbook.Saved = True Then
        ActiveWorkbook.Saved = False
    Else
        ActiveWorkbook.Saved = True
    End If

But you can achieve the same toggling effect much more simply by using Not as shown in the following code:

    ActiveWorkbook.Saved = Not ActiveWorkbook.Saved

* * *

## _If_ Blocks

As in most programming languages, If blocks in VBA are among the most immediately useful and versatile commands for making decisions.

In the sections that follow, you'll look at three variations on the If statement:

  * If...Then
  * If...Then... Else
  * If...Then... ElseIf... Else

## _If...Then_

If...Then statements tell VBA to make the simplest of decisions. If the condition is met, execute the following statement (or statements). If the condition isn't met, skip to the line immediately following the _conditional block_.

An If statement block begins with If and concludes with End If. (However, a short If...Then statement can be written entirely on a single line, in which case the End If is omitted.)

### Syntax

Simple If...Then statements can be expressed entirely on a single line. A one-line If...Then statement looks like this:

    If _condition_ Then Code to be executed goes here

If the condition is met, VBA executes the statement or statements that follow _on that same logical line_. If the condition isn't met, VBA doesn't execute the statement or statements.

But you can also write multi-line If...Then blocks. A multiple-line If...Then statement (the lines of code between If and End If are more properly known as a _block_ ) looks like this:

    If _condition_ Then
        Code to be executed goes here
    End If

If the condition is met, VBA executes all the code within the block (the statements enclosed between the If and End If). If the condition isn't met, VBA skips over the enclosed line or lines of code and resumes execution at the line after the End If statement.
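As a quick sketch that combines the block form with the logical operators from the previous section (the variable name and values here are invented for illustration), the following If block runs its MsgBox statement only when both conditions are True:

    Dim sngTemp As Single
    sngTemp = 72.5
    If sngTemp >= 65 And sngTemp <= 80 Then
        MsgBox "The temperature is comfortable."
    End If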
* * *

Single-Line If...Then Statements Don't Use End If

Remember that a single-line If...Then statement has no End If to end it, whereas the If block requires an End If. VBA knows that a single-line If condition will end on the same line on which it starts. But an If block needs to have its end clearly specified so VBA knows which code to skip over if the condition evaluates to False. If blocks tend to be easier for humans to read.

* * *

### Examples

In the previous chapters, you've already encountered a number of If blocks—they're so necessary in programming (not to mention in life itself) that it's hard to get anything done without them. The following sections show you some further examples.

#### _One-Line_ If _Statements_

Here's an example of a one-line If statement:

    Dim bytAge As Integer
    bytAge = InputBox("Enter your age.", "Age")
    If bytAge < 21 Then MsgBox "You may not purchase alcohol.",, "Underage"

The first line declares the Integer variable bytAge. The second line prompts the user to enter their age in an input box and stores the answer in the variable. The third line checks the value held in bytAge and displays an Underage message box if bytAge is less than 21.

You can include multiple statements on a single line if you separate the statements by a colon. A single-line If statement can sometimes be a good candidate for a multi-statement line of code. What you are doing is specifying that more than one action should be taken if the expression in the If...Then statement evaluates to True.

For example, let's say you wanted to halt the macro after displaying the Underage message box. You could include the End statement after a colon on the same line, as shown here:

    If bytAge < 21 Then MsgBox "You may not purchase alcohol.",, "Underage": End

VBA executes this as follows:

1. First, it evaluates the condition.

2. If the condition is met, it executes the first statement after Then—in this case, it displays the Underage message box. Then it proceeds to execute any further statements on that line. Notice that _all_ statements on a single-line If structure are conditional based on (depend on) that If statement. They are executed (or not) based on whether the condition is true or false.

3. Once the user has dismissed the Underage message box (by clicking the OK button, the only button it has), VBA executes the statement after the colon: End.

If you wanted, you could even add several more statements on the same "logical" line, separated by colons. End would have to be the last one because it ends the procedure. (By the way, a _logical_ line means that VBA sees this as a single line of code to be executed, no matter how many real-world, _physical_ lines the code takes up on your monitor.)

You could even add another If statement if you felt like it:

    If bytAge < 21 Then If bytAge >= 18 Then MsgBox _
        "You may vote but you may not drink.",, "Underage": End

As you'll see if you're looking at this line in the Visual Basic Editor, there are a couple of problems with this approach:

  * First, you need to break long lines of code with the line-continuation character or else they go off the edge of the Code window in the Editor, forcing you to scroll horizontally to read the ends of each line. You _could_ hide all windows except the Code window, use a minute font size for your code, or buy a larger monitor, but you're probably still not going to have any fun working with long lines of code.
So, in practice, you don't want to pile statements onto a single code line. The brief End statement is probably the most you'll want to add.
  * Second, long lines of code (broken or unbroken) that involve a number of statements tend to become visually confusing. Even if everything is obvious to you when you're entering the code, you may find the code hard to read when you have to debug it a few months later. Usually it's better to use If blocks rather than complex one-line If statements.

#### If _Blocks_

Block If constructions work the same way as one-line If statements except blocks contain multiple lines—typically with one command to each line—and they require an End If statement at the end. For example, the one-line If statement from the previous section could also be constructed as an If block like this:

    If bytAge < 21 Then
        MsgBox "You may not purchase alcohol.",, "Underage"
        End
    End If

If the condition in the first line (the line with the If command) is True, VBA executes the statements within the block If. VBA displays the message box and then executes the End statement.

As you can see from this example, If blocks are much easier to read (and thus easier to debug) than one-line If statements. This is especially true when you nest If statements within one another, which you'll need to do fairly often.

To make If blocks easier to read, the convention is to press the Tab key to indent the lines within the block (VBA ignores the indentation during execution). You can see this indentation in the previous code example.

With short If blocks, like the ones shown in this section, indentation doesn't make a great deal of difference. But with complex If statements, it can make all the difference between clarity and incomprehensibility, as you'll see in "Nesting If Blocks" later in this chapter.

## _If...Then... Else_ Statements

If...Then statements are good for taking a single course of action based on a condition, but often you'll need to decide between two courses of action. To do so, you use the If...Then... Else statement.

By using an If...Then... Else statement, you can take one course of action if a condition is True and another course of action if it's False. It's the equivalent of ordinary language, such as If it's raining, Then take an umbrella, Else wear sunscreen.

For example, If...Then... Else statements are a great way to deal with two-button message boxes. If the user clicks the OK button, the code will do one thing. If they click the Cancel button, it will do something different.

* * *

Use If...Then... Else with Clear-Cut True/False Situations

The If...Then... Else statement is best used with clear-cut binary conditions—those that lend themselves to a true/false analysis. (Recall that a binary condition is like a two-position light switch—if it's not switched on, it must be switched off.) For more complex conditions, such as switches that can have three or more positions, you need to use a more complex logical statement, such as If...Then... ElseIf... Else or Select Case. We'll get to these structures later in this chapter.

* * *

### Syntax

The syntax for the If...Then... Else statement is as follows:

    If _condition_ Then
        _statements1_
    Else
        _statements2_
    End If

If the condition is True, VBA executes _statements1_ , the first group of statements. If the condition is False, VBA moves execution to the Else line and executes _statements2_ , the second group of statements.
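For instance, a two-button message box maps naturally onto this structure. As a minimal sketch (the prompt and the actions taken here are invented for illustration), MsgBox displays Yes and No buttons and returns vbYes or vbNo, and the If...Then... Else block branches on that result:

    Dim lngAnswer As Long
    lngAnswer = MsgBox("Apply the changes now?", vbYesNo + vbQuestion, _
        "Apply Changes")
    If lngAnswer = vbYes Then
        MsgBox "Changes applied.", vbOKOnly, "Apply Changes"
    Else
        MsgBox "No changes were made.", vbOKOnly, "Apply Changes"
    End If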
Again, you have the option of creating one-line If...Then... Else statements or block If...Then... Else statements. However, it makes more sense to create block If...Then... Else statements because they're much easier to read and debug and because an If...Then... Else structure is inherently longer than an If...Then structure and thus certain to result in an awkwardly long line.

### Example

As a straightforward example of an If...Then... Else statement, consider the Electronic_Book_Critic procedure shown in Listing 11.1.

**Listing 11.1**: A simple If...Then...Else example

     1. Sub Electronic_Book_Critic()
     2.
     3.     Dim intBookPages As Integer
     4.
     5.     intBookPages = InputBox _
            ("Enter the number of pages in the last book you read.", _
            "The Electronic Book Critic")
     6.     If intBookPages > 1000 Then
     7.         MsgBox "That book is seriously long.", vbOKOnly _
                + vbExclamation, "The Electronic Book Critic"
     8.     Else
     9.         MsgBox "That book is not so long.", vbOKOnly _
                + vbInformation, "The Electronic Book Critic"
    10.     End If
    11.
    12. End Sub

Here's what happens in Listing 11.1:

  * Line 1 starts the procedure, and line 12 ends it. Lines 2, 4, and 11 are spacers.
  * Line 3 declares the Integer variable intBookPages. Line 5 then assigns to intBookPages the result of an input box prompting users to enter the number of pages in the last book they read.
  * Line 6 checks to see if intBookPages is greater than 1000. If it is, the statement in line 7 runs, displaying a message box that states that the book is long.
  * If intBookPages is not greater than 1000, VBA branches to the Else statement in line 8 and executes the statement following it, which displays a message box telling the user that the book wasn't so long.
  * Line 10 ends the If condition.

## _If...Then... ElseIf... Else_ Statements

The last variation of the If command that you'll look at here is the If...Then... ElseIf... Else block, which you can use to help VBA decide between multiple courses of action. You can use any number of ElseIf lines, depending on how complex the condition is that you need to check.

Again, you could create either one-line If...Then... ElseIf... Else statements or If...Then... ElseIf... Else blocks. However, in almost all cases, If...Then... ElseIf... Else blocks are easier to construct, to read, and to debug. As with the other If statements, one-line If...Then... ElseIf... Else statements don't need an End If statement, but If...Then... ElseIf... Else blocks do need one.

### Syntax

The syntax for If...Then... ElseIf... Else is as follows:

    If _condition1_ Then
        _statements1_
    ElseIf _condition2_ Then
        _statements2_
    [ElseIf _condition3_ Then
        _statements3_ ]
    [Else
        _statements4_ ]
    End If

If the condition expressed in _condition1_ is True, VBA executes _statements1_ , the first block of statements, and then resumes execution at the line after the End If clause. If _condition1_ is False, VBA branches to the first ElseIf clause and evaluates the condition expressed in _condition2_. If this is True, VBA executes _statements2_ and then moves to the line after the End If line; if it's False, VBA moves to the next ElseIf clause (if there is one) and evaluates its condition (here, _condition3_ ) in turn.

If _all_ the conditions in the ElseIf statements prove False, VBA branches to the Else statement (if there is one) and executes the statements after it (here, _statements4_ ).
The End If statement then terminates the conditional statement, and execution resumes with the line after the End If.

The Else clause is optional, although in many cases it's a good idea to include it to let VBA take a different course of action if none of the conditions specified in the If and ElseIf clauses turns out to be True.

You can have any number of ElseIf clauses in an If block, each with its own condition. But if you find yourself needing to use If statements with large numbers of ElseIf clauses (say, more than 5 or 10), you may want to try using the Select Case command instead, which you'll look at toward the end of the chapter.

### Examples

This section shows you two examples of If...Then... ElseIf... Else statements:

  * A simple If...Then... ElseIf... Else statement for taking action based on which button the user clicks in a three-button message box
  * An If...Then... ElseIf statement without an Else clause

#### _A Simple_ If...Then... ElseIf... Else _Statement_

A simple If...Then... ElseIf... Else statement, as used in Listing 11.2, is perfect for dealing with a three-button message box.

**Listing 11.2**: Understanding the If...Then...ElseIf...Else structure

     1. Sub Creating_a_Document()
     2.
     3.     Dim lngButton As Long
     4.     Dim strMessage As String
     5.
     6.     strMessage = "Create a new document based on the " & _
            "VP Report project?" & vbCr & vbCr & _
            "Click Yes to use the VP Report template." & vbCr & _
            "Click No to use a blank document." & vbCr & _
            "Click Cancel to stop creating a new document."
     7.
     8.     lngButton = MsgBox _
            (strMessage, vbYesNoCancel + vbQuestion, "Create New Document")
     9.
    10.     If lngButton = vbYes Then
    11.         Documents.Add Template:="z:\public\template\vpreport.dotm"
    12.     ElseIf lngButton = vbNo Then
    13.         Documents.Add
    14.     Else 'lngButton is vbCancel
    15.         End
    16.     End If
    17.
    18. End Sub

The Creating_a_Document procedure in Listing 11.2 displays a Yes/No/Cancel message box inviting the user to create a new document based on the VP Report project. The user can choose the Yes button to create such a document, the No button to create a blank document, or the Cancel button to cancel out of the procedure without creating a document at all.

Here's what happens:

  * Line 1 starts the procedure, and line 18 ends it.
  * Line 2 is a spacer, after which line 3 declares the Long variable lngButton and line 4 declares the String variable strMessage. Line 5 is another spacer.
  * Line 6 assigns to the String variable strMessage a long string that contains all the text for the message box. Line 7 is another spacer.
  * Line 8 displays the message box, using strMessage as the prompt, specifying the vbYesNoCancel constant to produce a Yes/No/Cancel message box, and applying a suitable title (Create New Document). It assigns the result of the message box to the Long variable lngButton. Line 9 is a spacer.
  * Line 10 starts the If...Then... ElseIf... Else statement, comparing the value of lngButton to vbYes.
  * If line 10 matches, line 11 uses the Add method of the Documents object to create a new document based on the vpreport.dotm template. If not, the ElseIf condition in line 12 is evaluated, comparing the value of lngButton to vbNo. If you run this procedure and choose the Yes button in the message box, you will need to have a template named vpreport.dotm in the folder z:\public\template\ for line 11 to run. If you don't have the template, you'll get an error.
Given that you're unlikely to have this template, you might want to change the path and filename to a template that you do have.

  * If this second comparison matches, line 13 uses the Add method of the Documents object to create a new blank document. If not, the Else statement in line 14 is activated because the user must have chosen the Cancel button in the message box. The End statement in line 15 ends execution of the procedure.
  * Line 16 ends the If statement. Line 17 is a spacer.

This example is a little unusual in that the If structure is limited to three possible branches because that's the number of possible responses from a message box—Yes, No, and Cancel.

Because the If statement checks for the vbYes response and the ElseIf statement checks for the vbNo response, only the vbCancel response will trigger the Else statement.

In other circumstances, the Else statement can serve as a catchall for _anything_ not caught by the If and ElseIf statements above the Else, so you need to make sure the If and ElseIf statements cover all the contingencies you want evaluated _before_ the Else statement kicks in. So, put the Else statement at the bottom of the block. For example, if you quiz the reader about the colors of the US flag, you must provide If and ElseIf statements for red, white, and blue. If you omit, for example, _white_ (one of the possibilities), and the user types in _white_ , your code will fall through to the Else statement, which might display an incorrect message such as "The color you entered is not on the flag."

#### _An_ If...Then... ElseIf _Statement without an_ Else _Clause_

You can use an If...Then... ElseIf statement without an Else clause when you don't need to take an action if none of the conditions in the If statement proves True. In the previous example, the situation had three clearly defined outcomes: the user could choose the Yes button, the No button, or the Cancel button in the message box. So you were able to use an If clause to test whether the user chose the Yes button, an ElseIf clause to test whether the user chose the No button, and an Else clause to test whether neither was chosen (meaning that the Cancel button was chosen). (Clicking the close button [x] on the title bar of a message box is the equivalent of choosing the Cancel button in the message box.)

As an example of a situation in which you don't need to take action if no condition is True, consider the If statement in the Check_Password procedure in Listing 11.3. This procedure checks to ensure that the password a user enters to protect an item is of a suitable length.

**Listing 11.3**: Taking no action when no condition is true

     1. Sub Check_Password()
     2.
     3.     Dim strPassword As String
     4.
     5. BadPassword:
     6.
     7.     strPassword = InputBox _
            ("Enter the password to protect this item from changes:", _
            "Enter Password")
     8.
     9.     If Len(strPassword) = 0 Then
    10.         End
    11.     ElseIf Len(strPassword) < 6 Then
    12.         MsgBox "The password you chose is too short." & vbCr _
                & vbCr & "Please choose a password between " & _
                "6 and 15 characters in length.", _
                vbOKOnly + vbCritical, "Unsuitable Password"
    13.         GoTo BadPassword
    14.     ElseIf Len(strPassword) > 15 Then
    15.         MsgBox "The password you chose is too long." & vbCr _
                & vbCr & "Please choose a password between " & _
                "6 and 15 characters in length.", _
                vbOKOnly + vbCritical, "Unsuitable Password"
    16.         GoTo BadPassword
    17.     End If
    18.
End Sub + +This procedure forces users to enter an acceptable password. Here's what happens: + + * Line 1 starts the procedure, and line 19 ends it. + * Line 2 is a spacer, after which line 3 declares the String variable strPassword. + * Line 4 is a spacer. Line 5 contains a label, BadPassword, to which VBA will loop if the password the user enters proves to be unsuitable. Line 6 is another spacer. + * Line 7 displays an input box prompting the user to enter a password, which VBA stores in the variable strPassword. Line 8 is a spacer. + * Line 9 checks strPassword to see if its length is zero, which means it's an empty string. This could mean that either the user clicked the Cancel button in the input box or the user clicked the OK button without entering any text in the text box of the input box. Either of these actions causes VBA to branch to line 10, where it executes the End statement that ends execution of the procedure. + * If the length of strPassword isn't zero (that is, the user has entered text into the text box of the input box and clicked the OK button), the If clause in line 9 is False and VBA moves to line 11, where it checks to see if the length of strPassword is less than 6 characters. + * If the length of strPassword is less than 6 characters, VBA executes the code in lines 12 and 13. Line 12 displays a message box telling the user that the password is too short and specifying the length criteria for the password. This message box contains only an OK button, so when the user clicks it to continue, VBA continues with line 13, which returns execution to the BadPassword label on line 5. From there the procedure repeats itself, redisplaying the input box so that the user can try again. + * If the length of strPassword isn't less than 6 characters, execution passes from line 11 to the second ElseIf clause in line 14, where VBA checks to see if the length of strPassword is more than 15 characters. + * If the length of strPassword is more than 15 characters, VBA executes the code in lines 15 and 16: Line 15 displays a message box (again, with only an OK button) telling the user that the password is too long, and line 16 returns execution to the BadPassword label, again displaying the input box. + +There's no need for an Else statement in this case because once the user has supplied a password that doesn't trigger the If clause or either of the ElseIf clauses, execution moves out of the If block and continues at the line after the End If statement. + +## Creating Loops with _If_ and _GoTo_ + +So far in this book, you've seen several examples of For... Next loops and For Each... Next loops. (Chapter 12, "Using Loops to Repeat Actions," shows you how to construct these types of loops and other types, such as Do loops.) You can also create loops with If statements and the GoTo statement, as you did in the last example. + +Many teachers and programmers frown upon making loops with If and GoTo. It's bad practice because If... GoTo loops can create "spaghetti code" (execution paths that jump around and are hard to visualize). Such paths can be not only grotesque in themselves, but also a nightmare to debug. + +However, _simple_ versions of If and GoTo loops can work perfectly well, so even if you choose not to use this technique yourself, you should at least know how such loops work. Whether or not to ban GoTo from your code is a matter of personal preference, company policy, or your teacher's beliefs.
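+ +For comparison, here's a minimal GoTo-free sketch of the same password check. This is my illustration rather than one of the chapter's listings; it uses a Do loop (Do loops are covered in Chapter 12) and assumes the same length rules as Listing 11.3: + + Sub Check_Password_NoGoTo() + 'Loop until the user enters a 6-15 character password or cancels + Dim strPassword As String + Do + strPassword = InputBox _ + ("Enter the password to protect this item from changes:", _ + "Enter Password") + If Len(strPassword) = 0 Then End 'Cancel button or empty entry + If Len(strPassword) >= 6 And Len(strPassword) <= 15 Then Exit Do + MsgBox "Please choose a password between " & _ + "6 and 15 characters in length.", _ + vbOKOnly + vbCritical, "Unsuitable Password" + Loop + End Sub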
+ +If nothing else, you might one day be responsible for working with someone else's code—someone whose standards aren't as rigorous as yours regarding the notorious GoTo command. So let's take a brief look at how GoTo can be used. + +### Syntax + +The GoTo statement is straightforward, and can be useful—it's already been used several times in the examples you've looked at so far in this book (in Listings 7.2 and 9.2, for example). The syntax is as follows: + + GoTo _line_ + +Here, the line argument can be a line label (or, rarely these days, a line number) within the current procedure. + +A line number is simply a number placed at the beginning of a line to identify it. For example, consider this demonstration of GoTo: + + Sub Demo_of_GoTo() + 1 + If MsgBox("Go to line 1?", vbYesNo) = vbYes Then + GoTo 1 + End If + End Sub + +The second line here contains only the line number 1, which identifies the line. The third line displays a message box offering the choice of going back to line 1; if the user chooses the Yes button, VBA executes the GoTo 1 statement and returns to the line labeled 1, after which it displays the message box again. (If the user chooses the No button, the If block is exited.) + +However, it's usually better to use a line _label_ than a line number. A line label is a name for a line. A label starts with a letter and ends with a colon. Between the letter and the colon, the label can consist of letters, numbers, and underscores (no spaces or other punctuation). For example, earlier in this chapter you saw the label BadPassword: used to loop back to an earlier stage in a procedure when certain conditions were met. Perhaps the quintessential example of a label is the Bye: label traditionally placed at the end of a procedure for use with this GoTo statement: + + GoTo Bye + +When this label is placed just above the End Sub statement, it simply exits the macro. + +GoTo is usually used with a condition. If you use it without a condition to go back to a line earlier in the code than the GoTo statement, you're apt to create an _infinite loop_ (this bug is discussed in Chapter 12). And if you were to use the GoTo Bye statement without a condition, you would guarantee that your procedure would stop executing—no statement after this line would ever be executed. You would be jumping to the end of the macro. + +### Example + +As an example of a GoTo statement with a condition, you might use a GoTo Bye statement together with a message box that makes sure that the user wants to run a certain procedure: + + Response = MsgBox("Do you want to create a daily report for " & _ + "the head office from the current document?", _ + vbYesNo + vbQuestion, "Create Daily Report") + If Response = vbNo Then GoTo Bye + +If the user chooses the No button in the message box that the first line displays, VBA executes the GoTo Bye statement, branching to the Bye: label located at the end of the subroutine. + +## Nesting _If_ Blocks + +You can _nest_ If blocks (put one inside another) as needed to manage any contortions required in your code. Each nested If block must be complete in and of itself. (This means each nested block must start with an If and conclude with its own End If.) + +For example, if you nest one If block within another If block (but forget the End If that concludes the nested If), VBA assumes that the End If line for the outer If actually pairs with the nested If. The result is a compile error or, worse, logic that doesn't do what you intended. + +To make your If blocks easy to read, indent them to different levels. This is particularly important when nesting If blocks.
Indenting provides you with visual cues, making it clear which If line is paired with each End If line. In other words, indentation makes the various If blocks stand out. + +To see how this is done, check out the following nested If statements: + + 1. **If** condition1 Then 'start of first If + 2. **If** condition2 Then 'start of second If + 3. **If** condition3 Then 'start of third If + 4. _statements1_ + 5. ElseIf condition4 Then 'ElseIf for third If + 6. _statements2_ + 7. Else 'Else for third If + 8. _statements3_ + 9. **End If** 'End If for third If + 10. Else 'Else for second If + 11. **If** condition5 Then 'start of fourth If + 12. _statements4_ + 13. **End If** 'End If for fourth If + 14. **End If** 'End If for second If + 15. **Else** 'Else for first If + 16. _statements5_ + 17. **End If** 'End If for first If + +By following the layout, you can easily trace the flow of execution. For example, if condition1 in line 1 is False, VBA branches to the Else statement in line 15 and continues execution from there. If _condition1_ in line 1 is True, VBA evaluates the nested _condition2_ in line 2, and so on. + +The indentation is for visual clarity only—VBA pays no attention to it—but it can be a great help to the human reader. The previous nested If commands are also annotated with comments so that you can see which Else, ElseIf, and End If line belongs with which If line. However, with the indentation, commenting is unnecessary. + +By contrast, check out the unindented version of these nested blocks. This version is hard for the human eye to follow—and is even harder when it's buried in a morass of other code: + + 1. If condition1 Then + 2. If condition2 Then + 3. If condition3 Then + 4. _statements1_ + 5. ElseIf condition4 Then + 6. _statements2_ + 7. Else + 8. _statements3_ + 9. End If + 10. Else + 11. If condition5 Then + 12. _statements4_ + 13. End If + 14. End If + 15. Else + 16. _statements5_ + 17. End If + +There's seldom a pressing need to nest multiple If blocks. Often, you'll need only to nest a simple If...Then statement within an If...Then... Else statement or within an If...Then... ElseIf... Else statement. Listing 11.4 shows an example using Word. + +**Listing 11.4**: Nesting an If...Then block + + 1. Selection.HomeKey Unit:=wdStory + 2. Selection.Find.ClearFormatting + 3. Selection.Find.Style = ActiveDocument.Styles("Heading 5") + 4. Selection.Find.Text = "" + 5. Selection.Find.Execute + 6. If Selection.Find.Found Then + 7. lngResponse = MsgBox("Make this into a special note?", _ + vbOKCancel, "Make Special Note") + 8. If lngResponse = vbOK Then + 9. Selection.Style = "Special Note" + 10. End If + 11. End If + +The code in Listing 11.4 searches through the active document for the Heading 5 style and, if it finds the style, displays a message box offering to make it into a special note by applying the Special Note style. Here's what happens: + + * Line 1 starts by returning the insertion point to the beginning of the document. + * Line 2 clears formatting from the Find command (to make sure that it isn't searching for inappropriate formatting). + * Line 3 sets Heading 5 as the style for which the Find command is searching, and Line 4 sets the search string to an empty string (""), which causes Find to search for formatting only. + * Line 5 then runs the Find operation. + * Lines 6 through 11 contain the outer If...Then block. Line 6 checks to see if the Find operation in line 5 found a paragraph in Heading 5 style. If it did, VBA runs the code in lines 7 through 10.
+ + * Line 7 displays a message box asking if the user wants to make the paragraph into a special note. + * Line 8 begins the nested If...Then statement and checks the user's response to the message box. + * If the user's response is vbOK—if the user chose the OK button—VBA executes the statement in line 9, which applies the Special Note style (which I'll assume is included in the styles available to the current document or template) to the paragraph. + * Line 10 contains the End If statement for the nested If...Then block, and line 11 contains the End If statement for the outer If...Then block. + +If you expect a document to contain more than one instance of the Heading 5 style, use a Do While... Loop loop to search for each instance. See Chapter 12 for details on Do While... Loop loops. + +# _Select Case_ Blocks + +The Select Case block provides an effective alternative to complex multiple If...Then blocks or multiple ElseIf statements. Select Case offers the same decision-making capability as If constructions but with tighter and more readable code. + +Use the Select Case statement when the decision you need to make is complicated because it involves more than two or three different values to evaluate. + +Select Case blocks are easier to read than complex If...Then blocks, mostly because there's less code. This also makes Select Case blocks easier to modify: when you need to adjust one or more of the values used, you have less code to wade through. + +## Syntax + +The syntax for Select Case is as follows: + + Select Case _TestExpression_ + Case _Expression1_ + _Statements1_ + [Case _Expression2_ + _Statements2_ ] + [Case Else + _StatementsElse_ ] + End Select + +Here's how the syntax breaks down: + + * Select Case starts the block, and End Select ends it. + * _TestExpression_ is the expression that determines which of the Case statements executes. + * _Expression1, Expression2,_ and so on are the expressions against which VBA matches TestExpression. + +For example, you might test to see which of a number of buttons in a user form the user chose. The _TestExpression_ would be tied to a button that's been chosen; if it were the first button, VBA would match that to _Expression1_ and would run the statements in the lines following Case _Expression1;_ if it were the second button, VBA would match that to _Expression2_ and would run the statements in the lines following Case _Expression2;_ and so on for the rest of the Case blocks. + +Case Else is similar to the Else clause in an If block. Case Else is an optional clause that (if it's included) runs if none of the given expressions is matched. + +## Example + +As an example of a Select Case block, consider Listing 11.5, which prompts users to enter their typing speed and then displays an appropriate response. + +**Listing 11.5**: Working with a Select Case structure + + 1. Sub Check_Typing_Speed() + 2. + 3. Dim varTypingSpeed As Variant + 4. Dim strMsg As String + 5. + 6. varTypingSpeed = InputBox _ + ("How many words can you type per minute?", "Typing Speed") + 7. **Select Case** varTypingSpeed + 8. **Case** "" + 9. End + 10. **Case Is** < 0, 0, 1 To 50 + 11. strMsg = "Please learn to type properly before " & _ + "applying for a job." + 12. **Case** 50 To 60 + 13. strMsg = "Your typing could do with a little brushing up. " + 14. **Case** 60 To 75 + 15. strMsg = "We are satisfied with your typing speed." + 16. **Case** 75 To 99 + 17. strMsg = "Your typing is more than adequate. " + 18. **Case** 100 To 200 + 19.
strMsg = "You wear out keyboards with your blinding speed." + 20. **Case Is** > 200 + 21. strMsg = "I doubt that's true." + 22. **End Select** + 23. + 24. MsgBox strMsg, vbOKOnly, "Typing Speed" + 25. + 26. End Sub + +Here's what happens in the Check_Typing_Speed procedure in Listing 11.5: + + * Line 1 starts the procedure, and line 26 ends it. + * Line 2 is a spacer. Line 3 declares the Variant variable varTypingSpeed, and line 4 declares the String variable strMsg. Line 5 is another spacer. + * Line 6 displays an input box prompting the user to enter their typing speed. It stores this value in the variable varTypingSpeed. + * Line 7 begins the Select Case block, predicating it on the variable varTypingSpeed. + * Next, VBA evaluates each of the Case clauses in turn until it finds one that proves True. The first Case clause, in line 8, compares varTypingSpeed to an empty string (" ") to see if the user chose the Cancel button in the input box or clicked the OK button without entering a value in the text box. If Case " " is True, VBA executes the End statement in line 9, ending the procedure. + * If Case " " is False, VBA moves execution to the next Case clause—line 10 in this example—where it compares varTypingSpeed to three items: less than 0 (Is < 0), 0, and the range 1 to 50 words per minute. Notice three things here: + +1. You can include multiple comparison items in the same Case statement by separating them from each other with commas. + +2. Using the Is keyword with the comparison operator (here, the _less than_ operator) checks the relation of two numbers to each other. + +3. The To keyword denotes the range of values. + + * If varTypingSpeed matches one of the comparison items in line 10, VBA assigns to the String variable strMsg the text on line 11 and then continues execution at the line after the End Select statement. + * If varTypingSpeed isn't within this range, VBA moves to the next Case clause and evaluates it in turn. When VBA finds a Case clause that's True, it executes the statement following that clause (in this case, assigning a text string to the strMsg variable) and then continues execution at the line after the End Select statement. + * For any case other than that in line 8 (which ends the procedure), line 24 displays a message box containing the text stored in the statement strMsg. + +A Select Case block can be a good way of specifying which action to take based on the user's choice from a ListBox or ComboBox control (these controls are explored in Chapter 14, "Creating Simple Custom Dialog Boxes"). Typically, a list box or combo box displays a list of many different options, such as all the states in the USA. After the user clicks to select an item within a ListBox or ComboBox control, the chosen item appears in the control's Value property. Your macro could then check this Value property as the test expression in your Select Case block and take action accordingly. + +## When Order Matters + +One final point about complex test structures. You need to ensure that your Select Case and If...Then... Else statements (or other multiple If structures) evaluate their test conditions in the appropriate order. This means that each condition to be evaluated must _exclude_ all the conditions that follow it. + +Let's say you're asking the user how old they are. And you set up your test cases like this: + + 1. Age = InputBox ("How old are you?") + 2. + 3. Select Case Age + 4. + 5. **Case < 50** + 6. strMsg = "You're nearing retirement." + 7. + 8. **Case < 12** + 9. 
strMsg = "Hello, youngster." + +This is a logic bug. And a bad one. Line 8 can never execute because everyone under 50, including those younger than 12, will trigger line 5. (The expression "less than 50" _includes_ "less than 12.") + +To work properly, these tests must be reversed, like this: + + **Case < 12** + strMsg = "Hello, youngster." + + **Case < 50** + strMsg = "You're nearing retirement." + +You can avoid this problem entirely by testing for equality or a range, as illustrated in Listing 11.5: + + Case 50 To 60 + +# The Bottom Line + +**Use comparison operators.** + +Comparison operators compare items using such tests as _greater than_ or _not equal to_. + +Master It + +Write a line of code that uses a _less than_ comparison to test whether a variable named Surplus is less than 1200. + +**Compare one item with another.** + +You can compare strings using _less than_ and _more than_ comparison operators. + +Master It + +What symbol do you use to determine if VariableA is lower in the alphabet than VariableB? + +**Test multiple conditions.** + +To test multiple conditions, you use VBA's _logical operators_ to link the conditions together. + +Master It + +Name two of the most commonly used logical operators. + +**Use If blocks.** + +If blocks are among the most common programming structures. They are often the best way to allow code to make decisions. To test two conditions, use If... Else... EndIf. + +Master It + +Write an If... Else... End If block of code that displays two message boxes. If the temperature (the variable Temp) is greater than 80, tell the user that it's hot outside. Otherwise, tell the user that it's not that hot. + +**Use Select Case blocks.** + +Select Case structures can be a useful alternative to If blocks. + +Master It + +When should you use a Select Case structure? +Chapter 12 + +Using Loops to Repeat Actions + +As in life, so in macros. Sometimes, you'll want to repeat an action a predetermined number of times: break six eggs to make an omelet, or create two new documents. + +More often, though, you'll just repeat an action until a certain condition is met: break eggs until the pan is full, or buy two lottery tickets a week until you hit it big, or subtract five from every instance of a value in an Excel spreadsheet. In these situations, you don't know in advance when you'll triumph against the wretched odds of the lottery, or how many times the value will appear in the spreadsheet—your code must simply carry on until the condition is met. + +In VBA, you use _loops_ to repeat actions. VBA provides a number of ways to use loops in your code. In this chapter, you'll learn about the different types of loops and typical uses for each. + +In this chapter you will learn to do the following: + + * Understand when to use loops + * Use For... loops for fixed repetitions + * Use Do... loops for variable numbers of repetitions + * Nest one loop within another loop + * Avoid infinite loops + +# When Should You Use a Loop? + +To repeat an action or a series of actions in VBA, you could record the repetition itself into a macro by using the Macro Recorder (if the application you're using supports the Macro Recorder—remember that only Word and Excel do). + +Or you could copy some code and paste it back into the macro multiple times to repeat the behavior. 
For example, you could record a macro containing the code for creating a new Word document based on the default template, open the macro in the Visual Basic Editor, and then copy this new-document code and paste it five times to create a procedure that makes six new documents. + +It's almost always much better, however, to just write a loop block (structure) to repeat the commands as necessary. + +Loops have several straightforward advantages over repetitive, redundant code: + + * Your procedures are shorter—they contain less code and fewer instructions—and are thus easier to understand. + * Your procedures are more flexible: instead of hard-coding the number of repetitions, you can vary the number as necessary. (_Hard-coding_ means writing fixed code as opposed to flexible, variable code, such as _Create 6 new documents_ versus _Create x number of new documents_, thereby allowing the user or the code to supply the value of _x_.) + * Your procedures are easier to test, debug, and modify, particularly for people other than you. + +That said, if you just need to repeat one or more actions two or three times in a procedure and that procedure will always need to repeat the action this same number of times, there's nothing wrong with hard-coding the procedure by repeating the code. It'll work fine, it's easy to do, and you won't have to spend time considering the logic of loops. The code will be longer and a tad harder to maintain, but that's no big deal in simple situations. + +# Understanding the Basics of Loops + +In VBA, a loop is a structure (block of code) that repeats a number of statements, looping back to the beginning of the structure once it has finished executing them. Each cycle of execution of a loop is called an _iteration_. + +There are two basic categories of loops: + + * _Fixed-iteration loops_ repeat a set number of times (six eggs). + * _Indefinite loops_ repeat a flexible number of times (enough eggs to fill whatever pan is being used). + +The execution of either type of loop is controlled by the _loop invariant_, also called the _loop determinant_. This can be either a numeric expression or a logical expression. Fixed-iteration loops typically use numeric expressions, whereas indefinite loops typically use logical expressions. For example, a fixed-iteration loop might specify that the loop will iterate five times, while an indefinite loop might continue iterating until the end of a document is reached. + +Table 12.1 explains the types of loops that VBA provides. + +Table 12.1 VBA's loop types + +**Loop** | **Type** | **Explanation** +---|---|--- +For...Next | Fixed | Repeats an action or a sequence of actions a given number of times. +For Each... Next | Fixed | Repeats an action or a sequence of actions once for each object in a VBA collection. +Do While... Loop | Indefinite | Performs an action or a sequence of actions if a condition is True and continues to perform it until the condition becomes False. +While... Wend | Indefinite | Performs an action or a sequence of actions if a condition is True and continues to perform it until the condition becomes False. This type of loop is similar to Do While... Loop but is now almost obsolete. +Do Until... Loop | Indefinite | Performs an action or sequence of actions while a condition is False and continues to perform it until the condition becomes True. +Do... Loop While | Indefinite | Performs an action or a sequence of actions once and then repeats it while a condition is True until it becomes False. +Do...
Loop Until | Indefinite | Performs an action or a sequence of actions once and repeats it while a condition is False until it becomes True. + +# Using For... Loops for Fixed Repetitions + +For... loops execute a fixed number of times. For...Next loops repeat for the number of times of your choosing, while For Each... Next loops execute once for each element in a specified VBA collection. + +## _For...Next_ Loops + +A For...Next loop repeats an action or a sequence of actions a given number of times. How many times it loops is specified by a _counter variable_. The counter variable can be hard-coded into the procedure, passed from an input box or dialog box, or passed from a value generated either by a different part of the procedure or by a different procedure. + +### Syntax + +The syntax for For...Next loops is as follows: + + For _counter_ = _start To end_ [Step _stepsize_ ] + [ _statements_ ] + [Exit For] + [ _statements_ ] + Next [ _counter_ ] + +Here's what happens in a For...Next loop (refer to the syntax): + +1. When VBA enters the loop at the For statement, it assigns the _start_ value to _counter_. It then executes the statements in the loop. When it reaches the Next statement, it increments _counter_ by 1 or by the specified _stepsize_ and loops back to the For statement. + +2. VBA then checks the _counter_ variable against the _end_ variable. When _stepsize_ is positive, if _counter_ is greater than _end_, VBA terminates the loop and continues execution of the procedure with the statement immediately after the Next statement (which could be any action or the end of the procedure). If _counter_ is less than or equal to _end_, VBA repeats the statements in the loop, increases _counter_ by 1 or by _stepsize_, and loops back to the For statement again. (For a loop in which _stepsize_ is negative, the loop continues while _counter_ is greater than or equal to _end_ and ends when _counter_ is less than _end_. In other words, when the stepsize is negative, the loop counts _down_ rather than up.) + +3. The Exit For statement exits the For... loop early. You'll look at how to use the Exit For statement, and examples of the different uses of For...Next loops, later in this chapter. + +Table 12.2 explains the components of the For...Next loop syntax. As usual, brackets enclose optional items and italicized words are placeholders—elements in the code that are to be replaced by you, the programmer. + +Table 12.2 Components of the syntax for a For...Next loop + +**Component** | **Description** +---|--- +_Counter_ | A numeric variable or an expression that produces a number. By default, VBA increases the counter value by an increment of 1 with each iteration of the loop, but you can change this increment by using the optional Step keyword and _stepsize_ argument. _counter_ is required in the For statement and is optional in the Next statement, but it's a good idea to also include _counter_ in the Next statement to make your code easy to read. This is particularly important when you're using multiple For...Next statements in the same procedure or nesting one For...Next statement within another. +_Start_ | A numeric variable or numeric expression giving the starting value for _counter_. +_End_ | A numeric variable or numeric expression giving the ending value for _counter_. +_Stepsize_ | A numeric variable or numeric expression specifying how much to increase or decrease the value of _counter_. To use _stepsize_, use the Step keyword and specify the _stepsize_ variable.
_stepsize_ is 1 by default, but you can use any positive or negative value. +Exit For | A statement for exiting a For... loop. +Next | The keyword indicating the end of the loop. Again, you can specify the optional _counter_ here to make your code clear. + +### Straightforward _For...Next_ Loops + +In a simple For...Next loop, you first specify a _counter_ variable and the starting and ending values for it: + + Dim i As Integer + For i = 1 To 200 + +Here, i is the _counter_ variable, 1 is the starting value, and 200 is the ending value. Because VBA by default increases the _counter_ variable by 1 with each iteration of the loop, the counter variable in this example will count 1, 2, 3, and so on up to 200. Once the value in counter reaches 201, the looping ends and execution continues at the line below the loop's Next statement. + +You can also use the Step keyword to specify a different increment, either positive or negative; more on this in the next section. + +* * * + +i Is the Traditional Counter Variable Name for For...Next Loops + +i is the classic integer _counter_ variable used in a For...Next loop; after using i, the convention is to use j, k, l, m, and n for any subsequent _counter_ variables (if you're nesting loops within the i loop). These short variable names derive from the days of key-card computation, when memory was at a premium and longer names represented a significant extravagance. These days, computer memory is abundant, so using long variable names is common practice for _most_ variables. But not with loop counters. Using i as the loop counter is pervasive, even in languages like Java and C++. So stick with i. + +* * * + +After the previous two statements (Dim and For), you specify whatever actions you want carried out within the loop, followed by the Next keyword to end the loop: + + Application.StatusBar = _ + "Please wait while Excel checks for nonuniform prices: " & i & "..." + Next i + +This code displays (on the status bar) Excel's progress in checking your spreadsheet for improbable values. + +As another example, say you need to check every paragraph in Word documents you receive from contributors to make sure there's no unsuitable formatting. By using a loop that runs from 1 to the number of paragraphs in the active document, you can check each paragraph in turn and let the user view the progress in the status bar. The number of paragraphs in a document is stored in the Count property of the Paragraphs collection in the ActiveDocument object: + + Dim i As Integer + **For** i = 1 To ActiveDocument.Paragraphs.Count + + ' CheckParagraphForIllegalFormatting + + DoEvents + + Application.StatusBar = _ + "Please wait while Word checks the formatting in " _ + & "this document: Paragraph " & i & " out of " _ + & ActiveDocument.Paragraphs.Count & "..." + Selection.MoveDown Unit:=wdParagraph, _ + Count:=1, Extend:=wdMove + **Next** i + +This code snippet executes a CheckParagraphForIllegalFormatting procedure. We've not yet written this procedure, so I just wrote a comment line indicating that the procedure needs to be called from inside this loop. + +Next we use the DoEvents command. This allows multitasking. It interrupts the loop to see if something else is going on in the computer (the user typing something, the status bar in Word being updated, or whatever). This prevents your loop from hogging the computer's microprocessor. + +Then the loop continues executing.
The message is displayed in the status bar, indicating which paragraph out of the total number it's working on, and then the loop moves down a paragraph. When VBA reaches the Next statement, it increases the i counter by the default value, 1 (because no _stepsize_ variable is specified in the For statement) and loops back to the For statement, where it compares the value of i to the value of ActiveDocument.Paragraphs.Count. The procedure continues to loop until i has reached the value of ActiveDocument.Paragraphs.Count, which is the final iteration of the loop. Notice here how the counter variable is used twice: once to keep track of the loop's iterations, and again _within_ the loop to display the current paragraph number: + + "Paragraph " & **i** & " out of " + +In a similar way you could use a simple For...Next loop to quickly build the structure of a timesheet or work log in Excel. The following statements use a For...Next loop to insert the labels 1:00 through 24:00 in the current column in the active sheet of the active workbook: + + Dim i As Integer + For i = 1 To 24 + ActiveCell.FormulaR1C1 = **i** & ":00" + ActiveCell.Offset(RowOffset:=1, ColumnOffset:=0).Select + Next i + +Here, the ActiveCell.FormulaR1C1 statement inserts the current value of the counter i together with a colon and two zeroes (to create a time format). The ActiveCell.Offset(RowOffset:=1, ColumnOffset:=0).Select statement selects the cell in the next row and the same column. The loop runs from i = 1 to i = 24 and stops when the automatic increase takes i to 25. Again, the counter variable is used within the loop. This is quite common. + +### For...Next Loops with Step Values + +If increasing the _counter_ variable by the default 1 doesn't suit your purpose, you can use the Step keyword to specify a different increment or decrement. + +For example, the following statement increases the _counter_ variable by 20, so the sequence is 0, 20, 40, 60, 80, 100: + + For i = 0 To 100 Step 20 + +You can also decrement by specifying a negative Step value: + + For i = 1000 To 0 Step -100 + +This statement produces the sequence 1000, 900, 800, and so on, down to 0. + +Instead of the "x out of y" progress message given in the example in the previous section, you could produce a countdown running from ActiveDocument.Paragraphs.Count to zero: + + Dim i As Integer + For i = ActiveDocument.Paragraphs.Count To 0 **Step -1** + ' CheckParagraphForIllegalFormatting + Application.StatusBar = _ + "Please wait while Word checks the formatting in this document: " & i + Selection.MoveDown Unit:=wdParagraph, Count:=1, Extend:=wdMove + Next i + +### Using an Input Box to Drive a For...Next Loop + +Sometimes you'll be able to hard-code the number of iterations into a For...Next loop (six eggs). You'll know the number of iterations when writing your code, so you can just type in the end condition number, like the 100 here: + + For i = 0 To 100 + +Other times, though, you can't know in advance how many loop iterations are needed. This information only becomes available during program execution (called _runtime_) rather than when you're writing the code (called _design time_). + +Often you'll take a number from another operation during execution, such as the ActiveDocument.Paragraphs.Count property in the previous example. + +You'll want to use this macro with many documents in the future, and the number of paragraphs varies from document to document. So you can't know when writing your code how many times it should loop.
Your macro itself has to gather that information at runtime. + +Frequently you ask the user to specify the number of loop repetitions. The easiest way of doing this is to display an input box, requesting the user to enter a value. + +For example, Listing 12.1 contains a simple procedure named CreatePresentations that displays an input box prompting users to enter the number of presentations they want to create. It then uses a For...Next loop to create the documents in PowerPoint. + +**Listing 12.1**: Letting the user specify the number of iterations + + 1. Sub CreatePresentations() + 2. Dim intPresentations As Integer + 3. Dim i As Integer + 4. intPresentations = InputBox _ + ("Enter the number of presentations to create:", _ + "Create Presentations") + 5. For i = 1 To intPresentations + 6. Presentations.Add + 7. Next i + 8. End Sub + +Here's what happens in the CreatePresentations procedure in Listing 12.1 + + * Line 2 declares the Integer variable intPresentations, and line 3 declares the Integer variable i. + * Line 4 displays an input box prompting users to enter the number of presentations they want to create. + * Lines 5 through 7 contain a For...Next loop that runs from i = 1 to i = intPresentations with the default increment of 1 per iteration. Each iteration of the loop executes the Presentations.Add statement in line 6, creating a new presentation based on the default template. + +* * * + +**Control a For...Next Loop with User Input via a Dialog Box** + +An input box returns only a single value. Sometimes you need multiple values from the user. So, for those occasions when an input box won't suffice, you can easily get input from a dialog box to drive a For...Next loop. This book hasn't yet shown you how to create dialog boxes, but in this section you'll get a sneak preview by looking at a procedure named Create_Folders. You aren't expected to build and test this example; just read the code to get an idea of how it accepts user input and then employs that information in the loop. + +This example procedure reduces the tedium of creating multiple folders with predictable names, such as when I had to create 31 folders, a folder for each chapter in this book. + +Say that you're using a four-digit number to identify the project, the letter _s_ for section, and a two-digit number to identify the section. So you'd end up with folders named 1234s01, 1234s02, 1234s03, and so on—simple enough to create manually, but tedious if you need more than a dozen or so. + +In its simplest form, this dialog box would provide a text box for the number of folders to be created (though you could also use a drop-down list for this, or even a spinner control) and a text box for the project number. The following illustration is an example of how this dialog box might look. + +You display a dialog box by using the Show method in a separate macro, perhaps using a Load statement first, like this: + + Sub makefolders() + + Dialogs(wdDialogFileSaveAs).Show + + **Load** frmCreateFolders + + frmCreateFolders. **Show** + + End Sub + +You might have noticed the Dialogs command in this code. It's quite useful, but we'll discuss it at the end of this sidebar. For now, our focus is on looping techniques. + +I named the example dialog box frmCreateFolders. However, any valid VBA name will work. The first text box—identified with the Number Of Folders To Create label—is named txtFolders; the second text box is named txtProjectNumber. 
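+ +Assuming the form is named frmCreateFolders and the text boxes keep the names just described, you could give both controls sensible defaults in the form's Initialize event so that the user can often just accept them. This sketch is mine, not part of the example, and the default values are hypothetical: + + Private Sub UserForm_Initialize() + 'Hypothetical defaults; adjust to suit your own projects + txtFolders.Value = "10" + txtProjectNumber.Value = "1234" + End Sub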
+ +The Cancel button here has an End statement attached to its Click event so that if the user clicks it, VBA ends the procedure: + + Private Sub cmdCancel_Click() + End + End Sub + +The OK button in the dialog box has the following code attached to its Click event: + + 1. Private Sub cmdOK_Click() + 2. + 3. Dim strMsg As String + 4. Dim strFolder As String + 5. Dim i As Integer + 6. + 7. frmCreateFolders.Hide + 8. Unload frmCreateFolders + 9. strMsg = "The Create_Folders procedure has created " _ + & "the following folders: " & vbCr & vbCr + 10. + 11. **For** i = 1 To **txtFolders.Value** + 12. strFolder = **txtProjectNumber.Value** & "s" & Format(i, "0#") + 13. MkDir strFolder + 14. strMsg = strMsg & " " & strFolder & vbCr + 15. **Next** i + 16. + 17. MsgBox strMsg, vbOKOnly + vbInformation, _ + "Create Folders" + 18. + 19. End Sub + +Let's pause here a minute for a pep talk. You might read the preceding code and say, "Hey! I'll never be able to remember all this stuff about Format and Hide and vbCr and vbOKOnly." Don't pout. Nobody memorizes all the variations of the Format command, or all the vb constants like vbCr. Remember, there are tons of sample code examples on the Internet and in books like this one. What's more, the VBA Editor itself displays lists of constants and object members as you type in a line of code. (Look up "Auto List Members" in this book's index. Or search the VBA Editor's Help index to locate online resources.) + +Now back to our regular programming. Notice that the Value properties of the two text boxes are used in this loop. The value in txtFolders specifies the loop's number of iterations. The value in txtProjectNumber specifies the first part of the name for each newly created folder. + +The cmdOK_Click procedure runs when the user clicks the OK button in the dialog box: + + * Line 1 declares the cmdOK_Click subroutine, and line 19 ends it. Line 2 is a spacer. + * Line 3 declares the String variable strMsg, which is used to contain a string to display in a message box at the end of the procedure. + * Line 4 declares the String variable strFolder, which will contain the name of the current folder to create in each iteration of the loop. + * Line 5 declares the Integer variable i, which will be the _counter_ variable for the For...Next loop. + * Line 6 is a spacer. + * Line 7 hides frmCreateFolders. + * Line 8 unloads frmCreateFolders from memory. + * Line 9 assigns some introductory text to strMsg, ending it with a colon and two vbCr carriage-return characters to make the start of a list. + * Line 10 is a spacer. + * Lines 11 through 15 contain the For...Next loop that creates the folders. Line 11 causes the loop to run from i = 1 to i = txtFolders.Value, the value supplied by the user in the Number Of Folders To Create text box. Line 12 assigns to the strFolder String variable the Value property of the txtProjectNumber text box, the letter _s_, and the value of i formatted via the Format function to include a leading zero if it's a single digit (so that 1 will appear as 01, and so on). Line 13 uses the MkDir command with strFolder to create a folder (that is, make a directory—the old DOS command mkdir lives on in VBA) of that name. Line 14 adds some spaces (for an indent), the contents of strFolder, and a vbCr character to strMsg. Line 15 then loops back to the For statement, incrementing the i counter. VBA then compares the i counter to txtFolders.Value and repeats the loop as necessary.
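+ +To see exactly what the Format call contributes, here's a quick sketch (mine, not part of the example) that prints the folder names the loop would build for a project numbered 1234; the output appears in the Immediate window: + + Sub Demo_Format_Counter() + Dim i As Integer + For i = 1 To 5 + '"0#" pads single digits with a leading zero: 01, 02, ... + Debug.Print "1234" & "s" & Format(i, "0#") + Next i + End Sub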
+ +This procedure creates a set of new subfolders within whatever is the current folder, without giving the user a choice of location. Chances are you won't want to do this in real-life situations. You might want to change a folder to a set location (so as to keep all the project files together), but more likely you'll want to let the user choose a suitable location—for example, by displaying a common dialog box, such as the Save As dialog box used by most Windows applications. These built-in dialog boxes can be very useful because everyone who uses Windows is familiar with them and because they contain quite a bit of functionality. You display, for example, the classic Windows SaveAs dialog box like this: + + Dialogs(wdDialogFileSaveAs).Show + +When the user closes this dialog box, whatever folder the user specifies becomes the current folder and the document is automatically saved. You can find out more about how to use common dialog boxes in Chapter 14, "Creating Simple Custom Dialog Boxes," and also at this Microsoft web page: + + + +I wanted you to be aware that common dialog boxes exist, but in this example, perhaps a more direct way of allowing the user to specify the path for the new directories would be to use the ChDir (change directory) command, like this: + + Dim strDir As String + + strDir = InputBox("Type the full path where you want new folders to be stored") + + **ChDir** (strDir) + +* * * + +## For Each... Next Loops + +The For Each... Next loop, which is unique to the various versions of Visual Basic, including VBA, is similar to the For...Next loop. With For Each, however, the iterations are based on the number of objects in a collection, such as the Slides collection in a presentation or the Documents collection of Word documents. So, using For Each means that you, the programmer, don't necessarily know the number of loop iterations in advance, but VBA will know during execution because it will query an object's Count property. + +For example, you can choose to take an action for each Slide object in a presentation. During design time while writing your macro you don't need to know how many slides are in the collection. (If there are none, nothing happens.) + +### Syntax + +The syntax for the For Each... Next statement is straightforward: + + For Each _object_ In _collection_ + [ _statements_ ] + [Exit For] + [ _statements_ ] + Next [ _object_ ] + +VBA starts by evaluating the number of objects in the specified collection. It then executes the statements in the loop for the first of those objects. When it reaches the Next keyword, it loops back to the For Each line, reevaluates the number of objects, and performs further iterations as appropriate. + +Here's an example: The Documents collection contains the open documents in Word. So you could create a straightforward procedure to close all the open documents by using a For Each... Next loop like this: + + Dim Doc As Document + **For Each** Doc in Documents + Doc.Close SaveChanges:=wdSaveChanges + **Next** + +VBA closes each open document in turn by using the Close method. The statement uses the wdSaveChanges constant for the SaveChanges argument to specify that any unsaved changes in the document be saved when the document is closed. As long as there are open documents in the Documents collection, VBA repeats the loop, so it closes all open documents and then terminates the procedure. + +This example provides a straightforward illustration of how a For Each... 
Next loop works, but you probably wouldn't want to use the example in practice. Instead, you'd probably use the Close method with the Documents collection (this collection contains all the open documents) to close all the open documents. It's a simpler approach. However, you might use a For Each... Next loop to check each document for certain characteristics before closing it. + +## Using an _Exit For_ Statement + +As you saw earlier in this chapter when looking at the syntax for For statements, you can use one or more Exit For statements to exit a For... loop if a certain condition is met. Exit For statements are optional and are seldom necessary. If you find yourself needing to use Exit For statements in all your procedures, there's probably something wrong with the way you're constructing your loops. That said, you may sometimes find Exit For statements useful—for example, to respond to an error that happens within a loop or if the user chooses to cancel a procedure. + +On those occasions when you do need Exit For statements to exit a loop early, you'll typically use them with straightforward conditions. For example, in Word, if you wanted to close open windows until you reached a certain document that you knew to be open, you could use an Exit For statement like this: + + Dim Doc As Document + For Each Doc In Documents + If Doc.Name = "Document1" Then **Exit For** + Doc.Close + Next Doc + +This For Each... Next statement checks the Name property of the document to see if it's Document1; if it is, the Exit For statement causes VBA to exit the loop. Otherwise, VBA closes the document and returns to the start of the loop. + +* * * + +Use Multiple Exit For Statements If You Wish + +You can also use multiple Exit For statements if you need to. For example, you might need to check two or more conditions during the actions performed in the loop. + +* * * + +# Using _Do..._ Loops for Variable Numbers of Repetitions + +Do loops give you more flexibility than For... loops in that you can test for conditions and direct the flow of the procedure accordingly. VBA includes several types of Do loops: + + * Do While... Loop + * Do... Loop While + * Do Until... Loop + * Do... Loop Until + +These loops break down into two categories: + + * Loops that test a condition at the start of the loop, before executing any of the statements contained inside the loop. Do While... Loop and Do Until... Loop loops fall into this category. In other words, if the test fails, the code within the loop block will not execute even once. + * Loops that test a condition at the end of the loop. This type of loop executes the code within the loop block before testing a condition. Do... Loop While and Do... Loop Until fall into this category. This type of loop will execute at least once. + +The difference between the two types of loop in each category is that each While loop repeats itself _while_ a condition is True (until the condition becomes False), whereas each Until loop repeats itself _until_ a condition becomes True (while the condition remains False). + +This means that you can get by to some extent using only the While loops or only the Until loops—you'll just need to set up some of your conditions the other way around. For example, you could use a Do While... Loop loop with a condition of x < 100 or a Do Until... Loop loop with a condition of x = 100 to achieve the same effect. Put another way: _loop while_ x _is less than 100_ is equivalent to _loop until_ x = _100_ —as long as you start looping below 100.
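+ +Here's a minimal sketch of that equivalence (my illustration, not one of the chapter's listings); both loops count x up from 0 and leave it at 100: + + Dim x As Integer + x = 0 + Do While x < 100 'loop while x is less than 100 + x = x + 1 + Loop + + x = 0 + Do Until x = 100 'loop until x equals 100 + x = x + 1 + Loop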
+ +The following sections describe all the different kinds of Do loops so that you can know when to use each. + +## _Do While... Loop_ Loops + +In a Do While... Loop loop, you specify a condition that has to remain True for the actions (statements) inside the loop to be executed. If the condition isn't True, the actions aren't executed and the loop ends. When a loop ends, the code _below_ the loop block then executes. + +For example, you might want to search a document for an instance of a particular word or phrase and take action after you find it. Figure 12.1 shows a Do While... Loop loop. + +Figure 12.1 Do While... Loop loop tests for a condition before performing the actions contained in the loop. + +### Syntax + +The syntax for the Do While... Loop loop is straightforward: + + Do While _condition_ + [ _statements_ ] + [Exit Do] + [ _statements_ ] + Loop + +While the _condition_ is met (Do While the condition remains True), the statements in the loop are executed. The Loop keyword returns execution to the Do While line, which is then reevaluated. If the _condition_ is still True, the loop continues—it iterates again. + +However, if the _condition_ is False, execution jumps to the code below the loop block, starting with the statement on the line after the Loop keyword. + +You can use one or more optional Exit Do statements if you want to exit the loop without waiting until the condition turns False. + +Say you wanted to construct a glossary from a lengthy Word document that highlights the main terms by italicizing them. These terms are located in the body text as well as within bulleted or numbered lists. However, you want to avoid picking up italicized terms used in other elements of the document, such as headings or captions. In this situation, body text is in the Times New Roman font, but the captions and headlines are in other fonts. + +You could command Word to search for Times New Roman text with the italic attribute. If Word found instances of the text, it would take the appropriate actions, such as selecting the sentence containing the term, together with the next sentence (or the rest of the paragraph), and copying it to the end of another document. Then it would continue the search, performing the loop until it no longer found instances of italic Times New Roman text. + +Listing 12.2 shows an example of how such a procedure might be constructed with a Do While... Loop structure. This listing includes a number of commands that you haven't learned about yet, but you should easily be able to see how the loop works. + +**Listing 12.2**: Understanding how Do While works + + 1. Sub GenerateGlossary() + 2. + 3. Dim strSource As String + 4. Dim strDestination As String + 5. Dim strGlossaryName As String + 6. + 7. strSource = ActiveWindow.Caption + 8. strGlossaryName = InputBox _ + ("Enter the name for the glossary document.", _ + "Create Glossary") + 9. If strGlossaryName = "" Then End + 10. + 11. Documents.Add + 12. ActiveDocument.SaveAs FileName:=strGlossaryName, _ + FileFormat:=wdFormatDocument + 13. strDestination = ActiveWindow.Caption + 14. Windows(strSource).Activate + 15. + 16. Selection.HomeKey Unit:=wdStory + 17. Selection.Find.ClearFormatting + 18. Selection.Find.Font.Italic = True + 19. Selection.Find.Font.Name = "Times New Roman" + 20. Selection.Find.Text = "" + 21. Selection.Find.Execute + 22. + 23. **Do While** Selection.Find.Found + 24. Selection.Copy + 25. Selection.MoveRight Unit:=wdCharacter, _ + Count:=1, Extend:=wdMove + 26. Windows(strDestination).Activate + 27. 
Selection.EndKey Unit:=wdStory + 28. Selection.Paste + 29. Selection.TypeParagraph + 30. Windows(strSource).Activate + 31. Selection.Find.Execute + 32. **Loop** + 33. + 34. Windows(strDestination).Activate + 35. ActiveDocument.Save + 36. ActiveDocument.Close + 37. + 38. End Sub + +The GenerateGlossary procedure in Listing 12.2 copies italic items in the Times New Roman font from the current document and inserts them in a new document that it creates and saves. Here's what happens: + + * Line 1 begins the procedure, and line 2 is a spacer. + * Lines 3, 4, and 5 declare the String variables strSource, strDestination, and strGlossaryName, respectively. Line 6 is a spacer. + * Line 7 assigns the Caption property of the active window to the String variable strSource. The procedure uses this variable to activate the document when it needs to work with it. + * Line 8 displays an input box asking the user to enter a name for the document that will contain the glossary entries pulled from the current document. It stores the string the user enters in the String variable strGlossaryName. + * Line 9 then compares strGlossaryName to an empty string ("") to make sure the user hasn't clicked the Cancel button to cancel the procedure or clicked the OK button in the input box without entering a name in the text box. If strGlossaryName is an empty string, line 9 uses an End statement to terminate execution of the procedure. + * Provided line 9 hasn't stopped the procedure in its tracks, the procedure rolls on. Line 10 is a spacer. Line 11 then creates a new blank document. (This document is based on the Normal.dotm global template because no Template argument is used to specify a different template.) This document will become the glossary document. + * Line 12 saves the document with the name the user specified in the input box. + * Line 13 stores the Caption property of this document in the strDestination variable, again making it available to activate this document as necessary throughout the procedure. You now have the source document identified by the strSource variable and the destination document identified by the strDestination variable. + * Line 14 uses the Activate method to activate the strSource window. Line 15 is a spacer. + * Line 16 uses the HomeKey method of the Selection object with the wdStory unit to move the insertion point to the beginning of the document, which is where the procedure needs to start working to catch all the italicized words in Times New Roman. + * Lines 17 through 20 detail the _Find_ operation the procedure needs to perform: Line 17 removes any formatting applied to the current Find item, line 18 sets the Find feature to find italic formatting, line 19 sets _Find_ to find Times New Roman text, and line 20 specifies the search string, which is an empty string ("") that causes _Find_ to search only for the specified formatting. + * Line 21 then performs the Find operation by using the Execute method. Line 22 is a spacer. + * Lines 23 through 32 implement the Do While... Loop loop. Line 23 expresses the condition for the loop: While Selection.Find.Found (while the Find operation is able to find an instance of the italic Times New Roman text specified in the previous lines). While this condition is met (is True), the commands contained in the loop will execute. + * Line 24 copies the selection (the item found with italic Times New Roman formatting).
+ * Line 25 moves the insertion point one character to the right, effectively deselecting the selection and getting the procedure ready to search for the next instance in the document. You need to move the insertion point off the selection to the right so that the next _Find_ operation doesn't find the same instance. (If the procedure were searching up through the document instead of down, you'd need to move the insertion point off the selection to the left instead by using a Selection.MoveLeft statement.) + * Line 26 activates the strDestination window, putting Word's focus on it. + * Line 27 then moves the insertion point to the end of the glossary document, and line 28 pastes the copied item in at the position of the insertion point. Moving to the end of the document isn't strictly necessary here, provided that the Normal.dotm global template doesn't contain any text—if Normal.dotm is empty, the new document created in line 11 will be empty too, and the start and end of the document will be in the same position. And after each paste operation, Word positions the insertion point after the pasted item. However, if Normal.dotm _does_ contain text, then this step is necessary. + * Line 29 uses the TypeParagraph method of the Selection object to enter a paragraph after the text inserted by the paste operation. + * Line 30 activates the strSource document once more, and line 31 repeats the Find operation. + * The Loop statement in line 32 then loops execution of the procedure back to line 23, where the Do While Selection.Find.Found condition evaluates whether this latest Find operation was successful (True). + * If it was successful, the loop continues; if it wasn't, execution of the procedure continues at line 34, which activates the glossary document again. Line 35 saves the active document (the glossary document, because it was just activated), and line 36 closes it. + * Line 37 is a spacer, and line 38 ends the procedure. + +## _Do... Loop While_ Loops + +A Do... Loop While block is similar to a Do While... Loop, except that in the Do... Loop While loop, the statements contained within the loop are executed at least once. + +Whether the condition is True or False, the loop executes at least the first time through because the condition isn't tested until the end of the loop block. + +If the condition is True, the loop continues to run until the condition becomes False. Figure 12.2 shows a Do... Loop While loop. + +Figure 12.2 In a Do... Loop While loop, the actions in the loop run once before the condition is tested. + +The Do While... Loop block described earlier probably made immediate sense to you, but this Do... Loop While block may seem odd. You're going to execute the contained statements _before_ checking the condition? + +But you'll find that Do... Loop While loops can be very useful, although they lend themselves to different situations than Do While... Loop loops. + +Consider the lottery example from the beginning of the chapter. In that situation, you execute the action before you check the condition that controls the loop. First you buy a lottery ticket, and then you check to see if you've won. If you haven't won, or you've won only a small sum, you loop back and buy more tickets for the next lottery. (Actually, this is logically a Do... Loop Until loop rather than a Do... Loop While loop because you continue the loop while the condition is False; when you win a suitably large amount, the condition becomes True.) 
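+ +Sketched in code (my illustration; compare Listing 12.3 later in this chapter, which tests its condition at the top instead), the lottery logic looks like this: + + Dim intWin As Integer + Do + intWin = Rnd * 2100 'take the action first: buy a ticket + Loop Until intWin > 2000 'then check whether you have won enough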
+ +Likewise, in programming it's not uncommon to take an action and then check whether you need to repeat it. For example, you might want to apply special formatting to the current paragraph and then check to see if other paragraphs need the same treatment. + +### Syntax + +The syntax for a Do... Loop While loop is as follows: + + Do + [ _statements_ ] + [Exit Do] + [ _statements_ ] + Loop While _condition_ + +VBA performs the statements included in the loop, after which the Loop While line evaluates the condition. If it's True, VBA returns execution to the Do line and the loop continues to execute; if it's False, execution continues at the line after the Loop While line. + +As an example of a Do... Loop While loop, consider this crude password checker that you could use to prevent someone from executing a macro without supplying the correct password: + + Dim varPassword As Variant + **Do** + varPassword = InputBox _ + ("Enter the password to start the procedure:", _ + "Check Password 1.0") + Loop While varPassword <> "CorrectPassword" + +Here the Do... Loop While loop first displays an input box for the user to enter the password. The Loop While line compares the value from the input box, stored in varPassword, against the correct password (here, CorrectPassword). If the two aren't equal (varPassword <> ″CorrectPassword″), the loop continues, displaying the input box again. + +This loop is just an example—you wouldn't want to use it as it is in real life. Here's why: Choosing the Cancel button in an input box causes it to return a blank string, which also doesn't match the correct password, causing the loop to run again. The security is perfect; the problem is that the only way to end the loop is for users to supply the correct password. If they're unable to do so, they will see the input box again and again. There's no way out of the loop. This is called an _endless loop_ and it's really bad programming. The user can get hopelessly trapped with the code repeating endlessly (in this case if they can't remember the password). Such loop stalls are also called _infinite loops_. More on these at the end of this chapter. + +You should build a more friendly password-checking procedure. You might specify a number of incorrect password guesses that the user could enter (perhaps three) and then if they still haven't gotten it right, make the procedure terminate itself. Or you could simply use an End statement to terminate the procedure if the user entered a blank string, like this: + + Do + varPassword = InputBox _ + ("Enter the password to start the procedure:", _ + "Check Password 1.0") + If varPassword = "" Then **End** + Loop While varPassword <> "CorrectPassword" + +## _Do Until... Loop_ Loops + +A Do Until... Loop loop is similar to a Do While... Loop loop. The difference is how the condition works. In a Do Until... Loop loop, the loop runs while the condition is False and stops running when it's True. So this is the opposite of the way that the condition works in a Do While... Loop loop. + +Figure 12.3 shows a Do Until... Loop loop. + +Figure 12.3 A Do Until... Loop loop runs while the condition is False and stops running when the condition becomes True. + +* * * + +Do Until...Loop Blocks Execute Until a Condition Becomes True + +Note that Do Until... Loop loops are useful if you prefer to work with a condition that's False and keep it looping until the condition becomes True. Otherwise, you can achieve the same effects using Do While...
Loop loops and inverting the condition. In other words, these two approaches to looping are functionally the same; it's just a matter of how you want to manage the condition. It's the difference between "sweep the porch _until_ it's clean" versus "sweep the porch _while_ it's still dirty." Same idea, expressed differently. + +* * * + +### Syntax + +The syntax for Do Until... Loop loops is as follows: + + Do Until _condition_ + _statements_ + [Exit Do] + [ _statements_ ] + Loop + +When VBA enters the loop, it checks the _condition_. If the _condition_ is False, VBA executes the statements in the loop, encounters the Loop keyword, and loops back to the beginning of the loop, reevaluating the _condition_ as it goes. If the _condition_ is True, VBA terminates the loop and continues execution at the statement after the Loop line. + +For example, here's the lottery example once again, but now employing a Do...Until loop in Listing 12.3. + +**Listing 12.3**: Using Do...Until loops + + 1. Sub Lottery_1() + 2. Dim intWin As Integer + 3. Do Until intWin > 2000 + 4. intWin = Rnd * 2100 + 5. MsgBox intWin, , "Lottery" + 6. Loop + 7. End Sub + +Here's how Listing 12.3 works: + + * Line 2 declares the Integer variable intWin. Line 3 then starts a Do Until... Loop loop with the condition that intWin > 2000—the value of the intWin variable must be larger than 2000 for the loop to end. Until then, the loop will continue to run. + * Line 4 assigns to intWin the result of 2100 multiplied by a random number produced by the Rnd function, which generates random numbers between 0 and 1. (This means that the loop needs to receive a random number of a little more than .95 to end—a chance of a little less than 1 in 20, considerably better than most lotteries.) + * Line 5 displays a simple message box containing the current value of the intWin variable so that you can see how lucky you are. + * Line 6 contains the Loop keyword that completes the loop. + * Line 7 ends the procedure. + +Listing 12.4 shows a more useful example of a Do Until... Loop loop in Word. + +**Listing 12.4**: A practical example showing how to employ Do Until in Word + + 1. Sub FindNextHeading() + 2. **Do Until** Left(Selection.Paragraphs(1).Style, 7) = "Heading" + 3. Selection.MoveDown Unit:=wdParagraph, _ + Count:=1, Extend:=wdMove + 4. **Loop** + 5. End Sub + +Listing 12.4 contains a short procedure that moves the insertion point to the next heading in the active document in Word. Here's how it works: + + * Line 2 starts a Do Until... Loop loop that ends with the Loop keyword in line 4. The condition for the loop is that the seven leftmost characters in the name of the style for the first paragraph in the current selection—Left(Selection.Paragraphs(1).Style, 7)—match the string Heading. This will match any of the Heading styles (the built-in styles Heading 1 through Heading 9, or any style the user has defined whose name starts with _Heading_ ). + * Until the condition is met, VBA executes the statement in line 3, which moves the selection down by one paragraph. + +## _Do... Loop Until_ Loops + +The Do... Loop Until loop is similar to the Do Until... Loop structure except that in the Do... Loop Until loop, the statements contained within the loop block are executed at least once, whether the condition is True or False. If the condition is False, the loop continues to run until the condition becomes True. Figure 12.4 shows a Do... Loop Until loop. + +Figure 12.4 In a Do...
Loop Until loop, the actions in the loop are run once before the condition is tested. + +### Syntax + +The syntax for Do... Loop Until loops is as follows: + + Do + [ _statements_ ] + [Exit Do] + [ _statements_ ] + Loop Until _condition_ + +VBA enters the loop at the Do line and executes the _statements_ in the loop. When it encounters the Loop Until line, it checks the _condition_. If the condition is False, VBA loops back to the Do line and again executes the _statements_. If the condition is True, VBA terminates the loop and continues execution at the line after the Loop Until line. + +As an example, say you want to repeatedly display an input box that adds new worksheets to a workbook until the user clicks the Cancel button or enters an empty string in the text box. You could use code like that shown in Listing 12.5. + +**Listing 12.5**: Use Do Loop to execute the code at least once + + 1. Sub Create_Worksheets() + 2. Dim strNewSheet As String + 3. **Do** + 4. strNewSheet = InputBox _ + ("Enter the name for the new worksheet " _ + & "(31 characters max.):", "Add Worksheets") + 5. If strNewSheet <> "" Then + 6. ActiveWorkbook.Worksheets.Add + 7. ActiveSheet.Name = strNewSheet + 8. End If + 9. **Loop Until** strNewSheet = "" + 10. End Sub + +Here's what happens in the Create_Worksheets procedure: + + * Line 2 declares the String variable strNewSheet. + * Line 3 begins a Do... Loop Until loop. + * Line 4 displays an input box asking the user to enter the name for the new worksheet. + * Line 5 uses an If statement to make sure that strNewSheet is not an empty string. If it's not, line 6 adds a new worksheet to the active workbook, and line 7 assigns the value of strNewSheet to the Name property of the active sheet (the new sheet). Line 8 ends the If statement. + * Line 9 contains a Loop Until strNewSheet=″″ statement that causes the procedure to loop back to the Do line until the user enters an empty string in the input box. The user can enter an empty string either by leaving the text box in the input box blank and clicking the OK button or by clicking the Cancel button. + * Line 10 ends the procedure. + +## Using an _Exit Do_ Statement + +As with an Exit For statement in a For...loop, you can use an Exit Do statement to exit a Do loop without executing the statements below the Exit line. The Exit Do statement is optional, and you'll probably seldom want to use Exit Do statements in your loops—at least if the loops are properly designed. + +When you do need an Exit Do statement, you'll generally use it with its own condition. The example shown in Listing 12.6 makes the lottery a little more interesting by adding an If condition with an Exit Do statement to take effect if the win is less than $500. + +**Listing 12.6**: How to use the Exit Do command + + 1. Sub Lottery_2() + 2. Dim intWin As Integer + 3. Do Until intWin > 2000 + 4. intWin = Rnd * 2100 + 5. If intWin < 500 Then + 6. MsgBox "Tough luck. You have been disqualified.", _ + vbOKOnly + vbCritical, "Lottery" + 7. **Exit Do** + 8. End If + 9. MsgBox intWin, , "Lottery" + 10. Loop + 11. End Sub + +The procedure in Listing 12.6 works in the same way as the example in Listing 12.3 except that line 5 introduces a new If condition. If the variable intWin is less than 500, the statements in lines 6 and 7 run. Line 6 displays a message box announcing that the player has been disqualified from the lottery, and line 7 exits the Do loop. + +## Is the _Exit Do_ Statement Bad Practice?
+ +Some programmers consider using an Exit Do statement to exit a Do loop a tactic of last resort, or at least clumsy programming. Others disagree. Many reckon that it's always acceptable to use an Exit Do statement to respond to an error or to the user clicking a cancel button. + +VBA executes Exit Do statements with no problem, so it's there if you want to use it. However, you can often rewrite your code to avoid using an Exit Do statement. + +For example, a condition that you check in the middle of the loop to decide whether to exit the loop can often be built into the main condition of the loop by using an operator such as And, Or, or Not, as shown in Listing 12.7: + +**Listing 12.7**: How to avoid the Exit Do command + + 1. Sub Lottery_3() + 2. + 3. Dim intWin As Integer + 4. + 5. Do + 6. intWin = Rnd * 2100 + 7. MsgBox intWin, , "Lottery" + 8. **Loop Until intWin > 2000 Or intWin < 500** + 9. + 10. + 11. If intWin < 500 Then + 12. MsgBox "Tough luck. You have been disqualified.", _ + 13. vbOKOnly + vbCritical, "Lottery" + 14. End If + 15. + 16. End Sub + +Listing 12.7 is a revision of the example in Listing 12.6. Listing 12.7 shows you how to use the Or operator to specify two conditions for ending the loop. In this way, you can omit the Exit Do command entirely. + +In line 8 of Listing 12.7, we are saying that the loop should end if the variable is greater than 2000 Or less than 500. This makes it somewhat clearer what the loop is doing. + +We must also make two other changes. First, we have to move the condition test from the top of the loop to the bottom. The Do Until command in Listing 12.6 must be changed to the Loop Until command in Listing 12.7. If we leave the condition test at the top of the loop, the condition will _always_ prevent the loop from executing. This is because the intWin variable will always hold zero when this loop first executes. So we move the condition test to the bottom of the loop, allowing the variable to be assigned some value in line 6. + +The final change we need to make is to move the If...Then block down to the bottom of the procedure. + +If the code is simple like this example, you might be better off rewriting it to employ an operator. But if the code is complex and lengthy, there's no good reason to force yourself to use operators when an Exit Do statement will do the trick instead. + +# _While... Wend_ Loops + +In addition to the For...Next loop, the For Each... Next loop, and the four flavors of Do loops examined so far in this chapter, VBA includes the While... Wend loop. While... Wend is VBA's version of the While... Wend looping structure used by earlier programming languages, such as the WordBasic programming language used with versions of Word up to and including Word 95. VBA includes While... Wend more for compatibility with those earlier versions than as a recommended technique. But you can use it if you choose to. The various Do loops have replaced While... Wend, but While... Wend still works fine. + +The syntax of a While... Wend loop is as follows: + + While _condition_ + [ _statements_ ] + Wend + +While the _condition_ is True, VBA executes the _statements_ in the loop. When it reaches the Wend keyword (which is a contraction of While End), it returns to the While statement and evaluates the _condition_ again. When the _condition_ evaluates as False, the statements in the loop are no longer executed and execution moves to the statement after the Wend statement. + +The following statements create a simple While...
Wend loop for Word: + + While Documents.Count < 10 + Documents.Add + Wend + +While the number of documents in the Documents collection (measured here by the Count property of the Documents collection) is smaller than 10, the loop runs. Each time through, the Documents.Add statement in the second line creates a new document based on the Normal template (because no other template is specified). After the new document is created, the Wend statement in the third line returns execution to the first line, where the While condition is evaluated again. + +* * * + +Avoid Branching into the Middle of a While...Wend Loop + +If you do use a While... Wend loop, make sure the only way to enter the loop is by passing through the gate of the While condition. Branching into the middle of a While... Wend loop (for example, by using a label and a GoTo statement) can cause errors. + +* * * + +# Nesting Loops + +You can nest one or more loops within another loop to create the pattern of repetition you need: You can nest one For...loop inside another For...loop, a For...loop inside a Do loop, a Do loop inside a For...loop, or a Do loop inside a Do loop. + +* * * + +VBA Permits up to 16 Levels of Nesting, but Who Could Understand Such Complexity? + +You can nest up to 16 levels of loops in VBA, but you'll be hard-pressed to comprehend even half that number of levels as you read over your code. If you find your code becoming this complicated, consider whether you can take a less tortuous approach to solve the problem. + +* * * + +For example, if you need to create a number of folders, each of which contains a number of subfolders, you could use a variation of the Create_Folders procedure you looked at earlier in the chapter. But such a task cries out for nesting. + +The dialog box for the procedure will need another text box to contain the number of subfolders to create within each folder. The new dialog box is named frmCreateFoldersAndSubFolders and the text box for the number of subfolders is named txtHowManySubFolders. Figure 12.5 shows the dialog box. + +Figure 12.5 The dialog box to create folders and subfolders + +Listing 12.8 shows the code triggered by the Click event on the cmdOK button of the form. + +**Listing 12.8**: Employing a nested loop + + 1. Private Sub cmdOK_Click() + 2. + 3. Dim strStartingFolder As String + 4. Dim strFolderName As String + 5. Dim strSubfolderName As String + 6. Dim intSubfolder As Integer + 7. Dim intLoopCounter As Integer + 8. + 9. frmCreateFoldersAndSubfolders.Hide + 10. Unload frmCreateFoldersAndSubfolders + 11. + 12. strStartingFolder = CurDir + 13. + 14. **For intLoopCounter** = 1 To txtHowManyFolders.Value + 15. strFolderName = txtProjectNumber.Value & "s" & _ + Format(intLoopCounter, "0#") + 16. MkDir strFolderName + 17. ChDir strFolderName + 18. **For intSubfolder** = 1 To txtHowManySubfolders.Value + 19. strSubfolderName = "Subsection" & intSubfolder + 20. MkDir strSubfolderName + 21. **Next intSubfolder** + 22. ChDir strStartingFolder + 23. **Next intLoopCounter** + 24. + 25. End Sub + +Here's what the code in Listing 12.8 does: + + * Line 1 begins the procedure, and line 25 ends it. Line 2 is a spacer. + * Lines 3 through 5 declare three String variables, strStartingFolder, strFolderName, and strSubfolderName, respectively. + * Line 6 declares the Integer variable intSubfolder, and line 7 declares the Integer variable intLoopCounter. Line 8 is a spacer. + * Line 9 hides the user form, and line 10 unloads it. Line 11 is a spacer.
+ * Line 12 stores the name of the current folder in the String variable strStartingFolder. You'll need this variable to make sure everything happens in the appropriate folder later in the procedure. Line 13 is another spacer. + * Lines 14 through 16 and line 23 are essentially the same as in the previous procedure. They build the folder name out of the Value property of the txtProjectNumber text box, the letter _s_ , and a two-digit number formatted from the intLoopCounter variable, and then use the MkDir statement to create the folder. + * Line 17 uses a ChDir statement to change folders to the folder that was just created, strFolderName. + * In line 18, the nested For...Next loop starts. This loop is controlled by the loop counter intSubfolder and will run from intSubfolder = 1 to intSubfolder = txtHowManySubFolders.Value, which is the value entered by the user in the Number Of Subfolders To Create text box in the dialog box. + * Line 19 builds the String variable strSubfolderName out of the word _Subsection_ and the value of the intSubfolder _counter_ variable. For this procedure, you can assume that there will be fewer than 10 subsections for each of the sections, so single-digit numbering is adequate. + * Line 20 creates the subfolder by using a MkDir statement with the strSubfolderName String variable. + * Line 21 uses the Next intSubfolder statement to loop back to the beginning of the nested For...Next loop. VBA reevaluates the condition and repeats the loop as necessary. + * Line 22 changes folders back to strStartingFolder for the next iteration of the outside loop. (Otherwise, the next folder would be created within the current folder, strFolderName.) + * Line 23 then loops back to the beginning of the outer loop. + +* * * + +Use the Counter Variable with Next when Nesting for...loops + +Using counter variables with the Next command is optional (in Listing 12.8, the counter variables are named intLoopCounter and intSubfolder). You could simply use Next by itself and VBA will figure out what you mean. But when nesting For...loops, it's a good idea to include a counter variable to make it easier to see which loop is ending with the Next command (in other words, use Next intLoopCounter, for example, rather than just the shorthand version Next). Using a counter variable makes your procedures much easier to read and may prevent unpleasant surprises (bugs). Your nested loops must end in the exact reverse order of their starting, and the counters need to match. + +* * * + +# Avoiding Infinite Loops + +If you create an infinite (aka endless) loop in a procedure, it will happily run forever, unless the user presses Ctrl+Break, presses Ctrl+Alt+Del to use the Task Manager to shut down the frozen application, restarts the computer, or pulls the plug. + +For example, one type of loop you haven't yet encountered is the Do... Loop. As you can see in the example in Listing 12.9, without a condition attached to it, this structure is an infinite loop. There's no condition that can stop the looping. + +**Listing 12.9**: An example of an endless loop + + 1. Sub InfiniteLoop() + 2. Dim x + 3. x = 1 + 4. Do + 5. Application.StatusBar = _ + "Your computer is stuck in an endless loop: " & x + 6. x = x + 1 + 7. Loop + 8. End Sub + +In Listing 12.9, line 2 declares the variable _x_ , and line 3 assigns it the value 1. Line 4 begins the Do loop, which displays a status-bar message and increases the value of _x_ by 1.
The effect of this loop is to display a message and an ever-increasing number on the status bar until you press Ctrl+Break to stop the procedure or until the value overflows the variable's maximum value. This is all thoroughly pointless (except perhaps as a way to burn in a new computer) and is perhaps a good reason not to use the Do... Loop structure—at least not without a condition attached to one end of it. + +No matter what type of loop you use, to avoid creating an infinite loop, you need to make sure the condition that will terminate the loop can be satisfied at some point. For example, for an editing or cleanup procedure, you'll often want to perform an action until the end of the document is reached and then stop. Or you'll want to include some form of counting mechanism to make sure a Do loop doesn't exceed a certain number of iterations. + +# The Bottom Line + +**Understand when to use loops.** + +Loops come in very handy when you need to perform a repetitive task, such as searching through a document for a particular word. + +Master It + +What is the alternative to looping if you are carrying out repetitive tasks in a macro? + +**Use For...loops for fixed repetitions.** + +For...loops are the most common loop structures in programming. You specify the number of iterations the loop must make, and the loop is exited when that number is reached. + +Master It + +Write a For...Next loop that counts up to 100, but use the Step command to increment by twos. + +**Use Do... loops for variable numbers of repetitions.** + +A Do... loop iterates until or while a condition exists, then exits from the loop when the condition no longer exists. + +Master It + +There are two categories of Do... loops. Do While... Loop and Do Until... Loop loops test a condition before performing any action. What is the other category? + +**Nest one loop within another loop.** + +You can put loops inside other loops. + +Master It + +Think of a programming task where nested loops would be useful. + +**Avoid infinite loops.** + +An infinite (or endless) loop causes your macro to continue execution indefinitely—as if the macro had stopped responding and was "frozen." + +Master It + +How do you avoid creating an infinite loop? +Part 4 + +Using Message Boxes, Input Boxes, and Dialog Boxes + + * **Chapter 13: Getting User Input with Message Boxes and Input Boxes** + * **Chapter 14: Creating Simple Custom Dialog Boxes** + * **Chapter 15: Creating Complex Forms** + +Chapter 13 + +Getting User Input with Message Boxes and Input Boxes + +This chapter shows you how to start adding a user interface to recorded or written code in order to increase the power and functionality of your macros or applications. + +You'll learn the three easiest ways of communicating with the user of your code, the two easiest ways of enabling the user to make decisions in a procedure, and the easiest way of soliciting input from the user. Along the way, you'll see how to decide what is the best way to communicate with the user in any given set of circumstances. This will set the scene for starting an examination of more complex interactions with the user via custom dialog boxes, later in the book. + +In most Office applications, VBA offers you a choice of up to five ways of communicating with the user of a procedure: + + * Displaying a message on the status bar at the bottom of the window (if the application provides a status bar). This is a bit limited, but it can be an effective way of communicating with the user. 
And it's not intrusive—users can easily ignore the status bar if they wish. + * Displaying a message box (usually in the middle of the screen). Message boxes are useful both for providing some information to users and for giving them the means to make a single choice based on the information you give them. You'll spend the bulk of this chapter working with message boxes. + * Displaying an input box (again, usually in the middle of the screen). You can use input boxes the same way you use message boxes—to communicate some information to users. But the primary purpose of an input box is input: to solicit one item of information from the user. Input boxes also provide users with the means of making a single choice to direct the flow of a procedure, although the mechanism for presenting this choice is much more limited than that in a message box. You'll look at input boxes toward the end of this chapter. + * Displaying a dialog box (once again, usually in the middle of a screen). You can use dialog boxes both to display information to the user and to let them make a variety of choices that are communicated back to your code. Dialog boxes are best reserved for those times when other forms of communication won't suffice; in other words, there's no point in using a dialog box when a simple message box or input box will do. You'll look at creating your own custom dialog boxes by using VBA user forms later in the book. + * Displaying an application's built-in dialog box, such as Word's FileOpen dialog box. This approach is explored in Chapter 14, "Creating Simple Custom Dialog Boxes." + +In this chapter you will learn to do the following: + + * Display messages on the status bar + * Display message boxes + * Display input boxes + * Understand the limitations of message boxes and input boxes + +# Opening a Procedure to Work On + +Make sure you're all set for editing in the Code window in the VBA Editor: + +1. Start the application for which you're creating code. + +2. Launch the Visual Basic Editor from the host application by pressing Alt+F11. + +3. Open a procedure for editing in the Code window: Use the Project Explorer to navigate to the module that holds the procedure, and then either scroll to the procedure in the Code window or choose it from the Procedures drop-down list in the Code window. + +* * * + +You Can Locate Procedures Using the Macro Dialog Box + +Alternatively, in the VBA Editor, choose Tools ⇒ Macros to display the Macros dialog box. Or to display this dialog box from an application such as Word, click the Developer tab on the Ribbon, then click the Macros icon. Once the Macros dialog box is open, you can select a procedure you've created from the Macro Name list box and click the Edit button to display the Visual Basic Editor with the procedure open in the Code window. + +If you've opened an existing procedure, test its code by using the F8 key to step through the statements or by pressing F5 (the Run Sub/UserForm command) to run it without stepping. (You can also run it by typing the procedure's name into the Editor's Immediate window and pressing Enter.) + +* * * + +Nevertheless, it's probably best to work in a new procedure rather than in an existing one because that way you won't do any damage to a macro you may want to use in the future. + +Create a new procedure in the Visual Basic Editor Code window by typing the Sub keyword, giving the procedure a name on a blank line in a module, and then pressing Enter. VBA adds the parentheses and End Sub statement.
For example, you could type the following and press the Enter key: + + Sub Experimentation_Zone + +VBA adds the parentheses and End Sub statement, together with a separator line to separate the procedure from any adjacent procedures in the Code window: + + Sub Experimentation_Zone() + End Sub + +# Displaying Status-Bar Messages in Word and Excel + +Word and Excel let you display information on the status bar. This is often a convenient way to tell the user what's happening in a procedure without halting execution of the code (or, more important, without interrupting the user's work and requiring them to click a button to get rid of your message box). + +By displaying status information on the status bar as the procedure works, you can indicate to the user not only what the procedure is doing, but also that it's still, in fact, running. Of course, the user might not _notice_ the status bar. So if you are displaying crucial information, you must use a message box or one of the other types of boxes, like an input box. These force the user to pay attention; no further work can be done within the application until that box is dismissed. + +* * * + +How to Avoid Alarming the User + +A problem you'll sometimes encounter is that the user thinks a procedure has frozen, crashed, gone into an infinite loop, or failed to work because no changes are visible onscreen, whereas in fact your procedure is working properly in the background. If you have a procedure that takes a long time to execute, updates on the status bar let the user see that the procedure is still working. To see example code that illustrates how to update the status bar, take a look at the sidebar entitled "i Is the Traditional Counter Variable Name for For...Next Loops" in Chapter 12, "Using Loops to Repeat Actions." + +* * * + +But remember that the main disadvantage of displaying messages on the status bar is that users may miss them if they're not paying attention, if they've hidden the status bar, or if they're not expecting to see messages there. + +* * * + +How to Hide the Status Bar + +When I mentioned hiding the status bar in the previous paragraph, you might have launched an effort to do just that. You looked all over the Ribbon, paying particular attention to the View tab. Then you clicked the File tab to open the Options dialog box. But you didn't find a way, anywhere, to hide the status bar. Well, this is yet one more reason to learn VBA. As I've mentioned, you can do things with VBA that are not possible any other way. Here's the code that will hide the status bar: + + Sub HideStatusBar() + Application.CommandBars("Status Bar").Visible = False + End Sub + +* * * + +If an application uses the status bar extensively to give the user information (as Word and Excel do), this might not be a problem for attentive users. But if there's any doubt, notify the user that information will be displayed on the status bar. For example, you might display a message box at the beginning of a procedure to tell the user to watch the status bar for updates. + +To display a message on the status bar in Word or Excel, you set the StatusBar property of the Application object to an appropriate string of text. The following example displays the status-bar information shown in Figure 13.1: + + Application.StatusBar = "Word is formatting the report. Please wait..." + +Figure 13.1 In some applications, you can display information on the status bar.
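+ +If the procedure does lengthy work, you can refresh such a message from inside the loop that does the work so that the user can watch the progress. Here's a minimal sketch (the procedure name, the 100-item count, and the message text are placeholders for illustration): + + Sub ShowProgress() + Dim i As Long + For i = 1 To 100 + ' Tell the user how far the procedure has gotten + Application.StatusBar = _ + "Processing item " & i & " of 100..." + ' ... the real work for item i would go here ... + Next i + ' Clear the message when the work is done + Application.StatusBar = "" + End Sub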
+ +Typically, any information you display on the status bar remains displayed there until you change it, until the user clicks something, or until the application displays a message there itself. + +For example, if you display a message on the status bar and then invoke the Copy command in Excel, Excel displays its normal Copy message, "Select destination and press ENTER or choose Paste," on the status bar, wiping out your message. Application messages trump user-created messages. + +If you display a message on the status bar in the course of a procedure, you should update it later in the procedure to avoid leaving a now-obsolete and potentially misleading message on the status bar after the procedure has finished running. For example, you might display another message saying that the procedure has finished or clear the status bar by displaying a blank string on it. + +To clear the status bar, assign an empty string to it, as in the following statement: + + Application.StatusBar = "" + +To see the effect of this statement, run it from the Visual Basic Editor (click the Restore Down button near the upper-right corner to ensure that the Editor window isn't maximized) with the Word or Excel window (or at least its status bar) visible at the same time. You'll see the effect best if you run a statement that displays information on the status bar (such as Application.StatusBar = "Hello, World!") first so that the status bar has information for the Application.StatusBar = "" statement to clear: + + Application.StatusBar = "Hello, World!" + Application.StatusBar = "" + +* * * + +Progress Indicators Can Be Written Various Ways + +It's especially helpful to display a progress indicator on the status bar during longer processes so that the user can tell that they're still running and that they're making progress. Progress indication is usually coded within a loop block. For example, you might display a readout of the progress, such as "Excel is working on sheet 9 out of 150." Even more simply, adding increasing numbers of periods to the end of the status message gives an indication of progress, although it doesn't give an idea of how much longer the task will take. Here's how you can add periods to a string: + + strPeriod = strPeriod & "." + +* * * + +# Message Boxes + +Another way to display information to the user is the message box; you've probably seen examples of it in almost every Windows application you've used. Message boxes are simple and limited, but they play an important role. + +Here are some typical uses of message boxes: + + * Telling users what a procedure is about to do (and giving them the chance to exit the procedure if it isn't what they thought it was). + * Presenting users with an explanation of what a procedure will do next and asking them to make a simple decision (usually, to let it proceed or to send it on a different course). + * Warning users of an error that the procedure encountered and allowing them to take action on it. + * Informing users that a procedure ran successfully and that it has finished. This message is particularly useful for procedures that turn off screen updating or otherwise hide from users what they are doing. Such procedures may leave users unsure of whether they are still running or have finished. You can also use the message box to report what a procedure has done—for example, that it changed particular items, made a certain number of changes, or discovered problems in the document that require attention. A minimal sketch of such a completion report follows this list.
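+ +For instance, that last use—reporting what a procedure has done—might look something like the following sketch (the procedure name and the count are invented for illustration; a real procedure would tally the count as it works): + + Sub ReportCleanupResults() + Dim lngChanges As Long + lngChanges = 14 ' in a real procedure, counted during the work + MsgBox "The cleanup procedure has finished. " & _ + "Changes made: " & lngChanges + End Sub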
+ +This chapter shows you how to create a message box suitable for each of these tasks. In later chapters, you'll create specific message boxes to enhance various procedures. + +## The Pros and Cons of Message Boxes + +These are the advantages of using a message box: + + * Users can't miss seeing the message box. Users are prevented from continuing to use the application until they close the message box. (If you want, you can even display a message box that the user can't escape by pressing Alt+Tab to switch to another application. You'll look at this a little later in the chapter.) + * You can present the user with a simple choice among two or three options. + +These are the disadvantages of using a message box: + + * A message box can present only one, two, or three buttons, which means it can offer only a limited set of options to the user. + * The buttons in message boxes are predefined in sets—you can't put a custom button in a message box. (For that, you have to use a dialog box.) + * You can't use features such as text boxes, group boxes, or list boxes within message boxes. + +## Message-Box Syntax + +The basic syntax for message boxes is as follows: + + MsgBox( _prompt_ [, _buttons_ ] [, _title_ ][, _helpfile, context_ ]) + +Here's what the elements of this syntax mean: + +**MsgBox** + +The function that VBA uses to display a message box. You typically use it with a number of arguments enclosed in parentheses after it. + +**_prompt_** + +A required argument for the MsgBox function that specifies what text is displayed in the message box. _prompt_ is a String argument, meaning you need to type in the text of your choice; it can be up to approximately 1,024 characters long, although it's usually a good idea to be more concise than this. (Any longer prompt is truncated to the limit without warning.) + +**_buttons_** + +An optional argument that controls the type of message box that VBA displays by specifying which buttons it contains. For example, as you'll see in a couple of pages, you can display a message box with just an OK button; with OK and Cancel buttons; with Abort, Retry, and Ignore buttons; and so on. You can also add arguments to the _buttons_ argument that control the icon in the message box and the modality of the message box. You'll also look at these options later in this chapter. + +**_title_** + +An optional argument that controls the title bar of the message box. This too is a String argument. If you don't specify _title_ , VBA uses the application's title—Microsoft Word for Word, Microsoft Excel for Excel, Microsoft PowerPoint for PowerPoint, and so on. Usually, it's best to specify the title because the application name on its own isn't helpful (unless the user has become confused as to which application is running the procedure). + +**_helpfile_** + +An optional argument that controls which Help file VBA displays when the user presses F1 within the message box to get help (or clicks the Help button in a message box that contains a Help button). + +**_context_** + +An optional argument that controls which topic in the Help file VBA jumps to. If you specify the helpfile argument, you must specify the _context_ argument as well. + +In the following sections, you'll first look at how you can build the simplest of message boxes and then explore how to add arguments to it to make it more complex.
+ +## Displaying a Simple Message Box + +You can display the simplest message box by specifying only the prompt as a text string enclosed in double quotation marks: + + MsgBox "This is a simple message box." + +Run from Excel, this statement produces the simple message box shown in Figure 13.2. With _prompt_ as the only argument supplied, VBA produces a message box with only an OK button and with the application's name in the title bar. This message box does nothing except display information. + +Figure 13.2 When you use only the _prompt_ argument to display a simple message box, VBA uses the application's name as the title. + +You can enter this MsgBox statement on any blank line within a procedure. After you type the MsgBox keyword, VBA's Auto Quick Info feature prompts you with the syntax of the function, as shown in Figure 13.3. + +Figure 13.3 VBA's Auto Quick Info feature prompts you with the syntax for the message box. + +Once you've entered the MsgBox statement with its required argument ( _prompt_ ), you can display the message box by stepping through the code (by pressing the F8 key or clicking the Step Into button on the editor's Debug toolbar) or by running the procedure (by pressing the F5 key, by clicking the Run Sub/UserForm button, or by choosing Run ⇒ Run Sub/UserForm). + +Instead of entering a literal text string for the _prompt_ argument, you can use a String variable. The following example uses a String variable named strMsg: + + Dim strMsg As String + strMsg = "This is a simple message box." + MsgBox strMsg + +This approach can be useful when you're working with long strings (you can build a big string by concatenating several shorter strings with the & operator). Using a variable is also useful when you need to display a string that has been defined earlier in the procedure or a string dynamically created by the procedure (for example, after having gotten the user's name via an input box). + +## Displaying a Multiline Message Box + +By default, VBA displays short message strings as a single line in a message box and wraps longer strings onto two or more lines as necessary, up to the limit of 1,024 characters in a string. + +You can deliberately break a string into more than one line by including line-feed and carriage-return characters in the string as follows: + + * Chr(13) or vbCr represents a carriage return. + * Chr(10) or vbLf represents a line feed. + * Chr(13) + Chr(10) or vbCrLf represents a carriage-return/line-feed combination. + +In message boxes, these three characters all have the same effect—moving down one line. Your code is easier to read if you use a built-in constant (vbCr, vbLf, or vbCrLf) rather than the corresponding Chr() construction; it's also quicker to type. Usually, it's clearest to use the vbCr constant. + +You can add a tab to a string by using Chr(9) or vbTab. Again, vbTab is easier to read and to type. + +The following code displays the Word message box shown in Figure 13.4. Note that each part of the text string is enclosed in double quotation marks (to tell VBA that they're part of the string). The Chr(149) characters are bullets, so the text after them starts with a couple of spaces to give the bullets some room: + + Dim strMsg As String + strMsg = "Word has finished formatting the report you requested." _ + & vbCr & vbCr & "You can now run the following procedures:" & vbCr _ + & vbCr & Chr(149) & " Distribute_Report will email the report to " _ + & "the head office." _
& vbCr & vbCr & Chr(149) & _ + " Store_Report will copy the report to the holding directory." _ + & vbCr & vbCr & Chr(149) & " Backup_Report will create a backup " _ + & "of the report on the file server." + MsgBox strMsg + +Figure 13.4 You can display a multiline message box by using line-feed and carriage-return characters within the prompt string. + +* * * + +VBA Automatically Helps You Punctuate Your Code + +You'll notice that in this example, a space appears on either side of each of the ampersands (&) and the equal sign. You can enter these spaces yourself or have VBA enter them for you when you move the insertion point to another line by pressing Enter or clicking the mouse. (Moving the insertion point to another line causes VBA to check the line you've just been working on and make various automatic changes if necessary. For example, some characters may be capitalized, or if you typed EndIf, VBA will make it two words as it's supposed to be.) + +* * * + +## Choosing Buttons for a Message Box + +The _buttons_ argument controls which buttons a message box contains. VBA offers the types of message boxes shown in Table 13.1, controlled by the _buttons_ argument. + +Table 13.1 Message-box types, controlled by the _buttons_ argument + +**Value** | **Constant** | **Buttons** +---|---|--- +0 | vbOKOnly | OK +1 | vbOKCancel | OK, Cancel +2 | vbAbortRetryIgnore | Abort, Retry, Ignore +3 | vbYesNoCancel | Yes, No, Cancel +4 | vbYesNo | Yes, No +5 | vbRetryCancel | Retry, Cancel + +You can specify these message-box types in your code by using either the numeric value or the constant. For example, you can specify either 1 or vbOKCancel to produce a message box with OK and Cancel buttons. The value is easier to type; the constant is easier to read. Either of the following statements produces the message box shown in Figure 13.5 when run from PowerPoint: + + Dim lngR As Long + lngR = MsgBox("Apply standard formatting to the slide?", vbYesNo) + lngR = MsgBox("Apply standard formatting to the slide?", 4) + +Figure 13.5 The vbYesNo constant produces a message box with Yes and No buttons. + +From VBA's point of view, it doesn't matter whether you use values or constants in the message boxes for your procedures. For the human, though, the text constants are far preferable. Even if you're the only person who ever sees your code, the code is much easier to read if you use the constants. + +## Choosing an Icon for a Message Box + +You can also add an icon to a message box by including the appropriate value or constant argument. Table 13.2 shows the options. + +Table 13.2 Arguments for message-box icons + +**Value** | **Constant** | **Displays** +---|---|--- +16 | vbCritical | Stop icon +32 | vbQuestion | Question-mark icon +48 | vbExclamation | Exclamation-point icon +64 | vbInformation | Information icon + +Again, you can refer to these icons by using either the value or the constant: Either 48 or vbExclamation will produce an exclamation-point icon. Again, the constant is much easier to read. + +To link the value or constant for the message box with the value or constant for the icon, use a plus sign ( **+** ). For example, to produce a message box containing Yes and No buttons together with a question-mark icon (see Figure 13.6), you could enter **vbYesNo + vbQuestion** (or **4 + 32, vbYesNo + 32** , or **4 + vbQuestion** ): + + lngR = MsgBox("Apply standard formatting to the slide?", _ + vbYesNo + vbQuestion) + +Figure 13.6 Adding an icon gives a message box greater visual impact. 
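+ +Because the _buttons_ argument is simply a number, the constants and their values are interchangeable once added together. This minimal sketch (the procedure name is invented for illustration) shows that the constant form and the literal sum produce identical message boxes: + + Sub ConfirmFormatting() + Dim lngR As Long + ' vbYesNo (4) + vbQuestion (32) add up to 36, so these two + ' statements display exactly the same message box + lngR = MsgBox("Apply standard formatting to the slide?", _ + vbYesNo + vbQuestion) + lngR = MsgBox("Apply standard formatting to the slide?", 36) + End Sub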
+ +## Setting a Default Button for a Message Box + +As usual in the Windows interface, the user is cued to a default button in a message box. It's the one with a blue border around its outside and a dotted line around its text area. (See the Yes button in Figure 13.6.) The user can move the selection to another button by using Tab or Shift+Tab or the →, ←, ↑, or ↓ key. + +However, you can specify in your code which button you want to be the default. + +* * * + +**The Practical Use of Default Buttons** + +You can set a default button for a message box by specifying a particular button in the MsgBox statement. Specifying a default button can be a wise move when you give procedures that take drastic action to users who may be unfamiliar with what's going to happen. (The user might accidentally hit the Enter key or click the highlighted button—the default button.) + +For example, consider a procedure that deletes the current document without the user having to close it and then switches to a file-management program (such as Windows Explorer) or messes around in one of the common dialog boxes (such as the Open or the Save dialog box). Common dialog boxes are demonstrated in the Real World Scenario sidebar titled "Control a For...Next Loop with User Input via a Dialog Box" in Chapter 12. + +Because such procedures can destroy someone's work if they run it inadvertently, you'd probably want to set a default button of No or Cancel in a confirmation message box so that the user has to actively choose to run the rest of the procedure. The message box halts execution, allows the user to agree or disagree with the action, and then carries out the user's wishes based on which button is clicked in the message box. + +Why does VBA include a default button at all? This makes it easy for the user to choose the ordinary VBA default button (captioned Yes or OK) by simply pressing Enter. Having the appropriate default button on a message box or dialog box can help the user deal with the message box or dialog box more quickly. But you as the programmer should decide if there is a different, more appropriate, default button. VBA automatically sets the first button in a message box to be the default button. But there are times that you will want to specify that the default button be a different button than the first. If you are doing something potentially dangerous in a macro—such as deleting the current document without saving it—it would be a good idea to make the second button (the No button) the default. This way, if the user simply presses Enter, nothing happens; the macro exits without deletion. Using this technique, you force the user to make a deliberate decision to move the mouse and click the Yes button. Table 13.3 shows you how to adjust which button is the default by using various built-in constants. And the short code example that ends the section demonstrates this technique. + +* * * + +Table 13.3 lists the arguments for default buttons. + +Table 13.3 Arguments for default message-box buttons + +**Value** | **Constant** | **Effect** +---|---|--- +0 | vbDefaultButton1 | The first button is the default button. +256 | vbDefaultButton2 | The second button is the default button. +512 | vbDefaultButton3 | The third button is the default button. +768 | vbDefaultButton4 | The fourth button is the default button. 
+ +All the message boxes mentioned so far have only one, two, or three buttons, but you can add a Help button to any of the message boxes, making for a fourth button on those boxes that already have three buttons (such as vbYesNoCancel). You'll see how to add the Help button in the section "Adding a Help Button to a Message Box" later in this chapter. + +In VBA, unless you specify otherwise, the first button on each of the message boxes is automatically the default button: for example, the OK button in a vbOKCancel message box, the Abort button in a vbAbortRetryIgnore message box, the Yes button in a vbYesNoCancel message box, the Yes button in a vbYesNo message box, and the Retry button in a vbRetryCancel message box. VBA counts the buttons in the order they're presented in the constant for the type of message box (which in turn is the left-to-right order in which they appear in the message box onscreen). So in a vbYesNoCancel message box, Yes is the first button, No is the second button, and Cancel is the third button. + +To make a different button the default, specify the value or constant as part of the _buttons_ argument. When run in PowerPoint, this statement produces the message box shown in Figure 13.7: + + Dim lngQuery As Long + lngQuery = MsgBox("Do you want to delete this presentation?", _ + vbYesNo + vbCritical + vbDefaultButton2) + +Figure 13.7 Specify a default button to steer the user toward a particular button in a message box. + +## Controlling the Modality of a Message Box + +VBA can display both application-modal message boxes and system-modal message boxes—at least in theory. _Application-modal_ message boxes stop you from doing anything in the current application until you dismiss them, whereas _system-modal_ message boxes stop you from doing anything _on your entire computer_ until you dismiss them. + +Most message boxes are application modal, allowing the user to switch to another application by pressing Alt+Tab (or switching via the Taskbar). The user can then work in the other application even though they haven't gotten rid of the message box. This gives them freedom and flexibility. In contrast, some message boxes (most often used during an installation process) are system modal, insisting that users concentrate their attention on them and them alone. Windows's critical system errors and "you must restart your computer now" messages are system modal to prevent you from avoiding them. + +You probably know from your own experience how frustrating system-modal message boxes can be. So when designing procedures, use system-modal message boxes only when absolutely necessary—for example, when an action might result in data loss or system instability. For most conventional purposes, application-modal message boxes will do everything you need them to—and won't confuse or vex your users. + +In theory, you can control the modality of a message box by using the two _buttons_ arguments shown in Table 13.4. + +Table 13.4 Arguments for message-box modality + +**Value** | **Constant** | **Result** +---|---|--- +0 | vbApplicationModal | The message box is application modal. +4096 | vbSystemModal | The message box is system modal. + +In practice, even if you use the vbSystemModal argument, the user can switch to another application (provided that one is running) and continue working. However, the message box does stay "on top," remaining displayed—enough to annoy users but not totally prevent them from accessing another application. 
+ +By default, message boxes are application modal, so you need to specify modality only on those rare occasions when you need a system-modal message box. When you do, add the vbSystemModal constant or 4096 value to the _buttons_ argument: + + Response = MsgBox("Do you want to delete this document?", _ + vbYesNo + vbCritical + vbDefaultButton2 + vbSystemModal) + +Please note that system-modal message boxes look the same as application-modal message boxes. + +## Specifying a Title for a Message Box + +The next component of the message box is its title bar, which is controlled by the optional _title_ argument. If you omit _title_ , VBA displays the application's name as the title, but users of your procedures will benefit from your providing a more helpful title. + +_title_ is a string expression and can be up to 1,024 characters in length, in theory (longer strings are truncated with no warning or error message), but in practice, any title longer than about 75 characters gets truncated with an ellipsis. If you want people to read the title bars of your message boxes, 25 characters or so is a reasonable maximum. + +## Title Bars Can Provide Useful Information + +The title bar is usually the first part of a message box that the user notices, so make your title bars as helpful as possible. Conventional etiquette is to put the name of the procedure in the title bar of a message box and then use the prompt argument to explain what actions the buttons in the message box will trigger. + +In addition, if you expect to revise your procedures, you may find it helpful to include their version number in the title so that users can easily check which version of the procedure they're using (and update to a more current version as appropriate). For instance, the Delete Workbook procedure is identified as version 12.39 in the message box shown in Figure 13.8. + +Figure 13.8 Usually, you'll want to specify the title argument for your message boxes. You may also want to include a version number. + +Specify the _title_ argument after the _buttons_ argument like this: + + Dim lngQuery As Long + lngQuery = MsgBox("Do you want to delete this workbook?", vbYesNo _ + + vbCritical + vbDefaultButton2, "Delete Workbook 12.39") + +You can use a string variable as the _title_ argument. For example, you could declare a single string variable and use it to supply the title for each message box that a procedure calls. Or you might need to display in the title of the message box a string created or stored in the procedure. + +* * * + +**Avoid Using Special Characters in Titles** + +Don't try putting line-feed, carriage-return, or tab characters in the title argument. VBA just ignores them. + +* * * + +## Adding a Help Button to a Message Box + +To add a Help button to a message box, use the vbMsgBoxHelpButton constant. You add this argument to whichever buttons you're specifying for the message box: + + lngQuery = MsgBox("Do you want to delete this workbook?", vbYesNo _ + + vbCritical + vbDefaultButton2 + **vbMsgBoxHelpButton** , _ + "Delete Workbook") + +Adding the vbMsgBoxHelpButton argument simply places the Help button in the message box—it doesn't make the Help button display a Help file until you specify which Help file and topic it should use (see the next section for details). Figure 13.9 shows the message box that this statement produces. + +Figure 13.9 Use the vbMsgBoxHelpButton constant to add a Help button to a message box. 
+ +## Specifying a Help File for a Message Box + +The final arguments you can use for a message box are the helpfile and _context_ arguments: + + * The helpfile argument is a string argument specifying the name and location of the Help file that VBA displays when the user summons help from the message box. + * The _context_ argument is a Help context number within the Help file. The Help context number controls which Help-file topic is displayed. + +The helpfile and _context_ arguments are primarily useful if you're writing your own Help files, because otherwise it's difficult to access the Help context numbers, which are buried in the official Help files. + +If you're writing your own Help files, the syntax for specifying the helpfile and _context_ arguments is simple: + + Dim lngQuery As Long + lngQuery = MsgBox("Do you want to delete this workbook?", vbYesNo _ + + vbCritical + vbDefaultButton2 + vbMsgBoxHelpButton, _ + "Delete Workbook", "c:\Windows\Help\My_Help.chm", 1012) + +In this case, the Help file is specified as My_Help.chm in the \Windows\Help\ folder. VBA displays the Help topic numbered 1012. + +When the user clicks the Help button in the message box, VBA displays the specified topic in the Help file. The message box stays onscreen so that when users have finished consulting the Help file, they can make their choice in the message box. + +The Help context number for the opening screen of a Help file is 0. Use 0 when you need to display a Help file for which you don't know the Help context number. Users must then locate the information they need on their own. + +* * * + +**Three Unusual Constants for Special Effects** + +VBA provides three special constants for use with message boxes. You probably won't need to use these often, but if you do, they'll come in handy. Specify them as part of the _buttons_ argument: + +vbMsgBoxSetForeground + +Tells VBA to make the message box the foreground window. You shouldn't need to use this constant often, because message boxes are displayed in the foreground by default (so that you can see them). + +vbMsgBoxRight + +Tells VBA to right-align the text in the message box. + +vbMsgBoxRtlReading + +Tells VBA to arrange the text from right to left on Hebrew and Arabic systems. It has no effect on non-BiDi (bidirectional) systems. + +* * * + +## Using Some Arguments without Others + +When displaying a message box, you can either specify or omit optional arguments. If you want to specify arguments later in the argument list without specifying the ones before them, use a comma to indicate each unused optional argument. (This technique can be used with any argument list.) For example, if you wanted to display the message box shown in the previous example without specifying _buttons_ and _title_ arguments, you could use the following statement: + + Response = MsgBox("Do you want to format the report?",,, _ + "c:\Windows\Help\Procedure Help.chm", 1012) + +Here, the triple comma indicates that the _buttons_ and _title_ arguments are omitted (which will cause VBA to display defaults—a vbOKOnly message box with a title bar containing the application's name), preventing VBA from confusing the helpfile argument with the _buttons_ argument.
Alternatively, you could use named arguments, which makes for less-concise but easier-to-read code:

    Response = MsgBox("Do you want to format the report?", _
        HelpFile:="c:\Windows\Help\Procedure Help.chm", Context:=1012)

## Retrieving a Value from a Message Box

If you display a vbOKOnly message box, you know which button the user clicks because the message box contains only an OK button. But when you use one of the other message-box styles, which can have two, three, or four buttons, you must retrieve a value that tells you which button the user clicked. You can then branch execution to respond appropriately to the user's choice.

To retrieve a value from a message box, declare a variable for it. You can do so quite simply by telling VBA that the variable name is equal to the message box (so to speak), like this:

    Dim lngResponse As Long
    lngResponse = MsgBox("Do you want to create the daily report?", _
        vbYesNo + vbQuestion, "Create Daily Report")

As in the examples throughout this chapter, you first declare a variable of the appropriate type (here, a Long) to contain the user's choice. When you run the code, VBA stores the button the user clicked as a value in the variable. You can then check the value and take action accordingly.

Table 13.5 shows the full list of buttons the user may choose. You can refer to the buttons by either the constant name or the value number. As usual, the constant is easier to read than the value.

Table 13.5 Constants for selected buttons

**Value** | **Constant** | **Button Selected**
---|---|---
1 | vbOK | OK
2 | vbCancel | Cancel
3 | vbAbort | Abort
4 | vbRetry | Retry
5 | vbIgnore | Ignore
6 | vbYes | Yes
7 | vbNo | No

For example, to check a vbYesNo message box to see which button the user chose, you can use a straightforward If... Then... Else statement:

    Dim lngUserChoice As Long
    lngUserChoice = MsgBox("Do you want to create the daily report?", _
        vbYesNo + vbQuestion, "Create Daily Report")
    If lngUserChoice = vbYes Then
        GoTo CreateDailyReport
    Else
        GoTo Bye
    End If

Here, if the user chooses the Yes button, VBA goes to the line of code identified by the CreateDailyReport label and continues running the procedure from there; if not, it terminates the procedure by going to the Bye label at the end. The If condition checks the response generated by the choice the user made in the message box to see if it's a vbYes (generated by clicking the Yes button or pressing Enter with the Yes button selected). The Else statement runs if the response was not vbYes—that is, if the user clicked the No button or pressed Esc.

# Input Boxes

Message boxes tell VBA which button the user clicked. But sometimes you want the user to supply your macro with some text, such as their name or birthday.

When you want to retrieve one simple piece of text information from the user, use an input box. You'll be familiar with input boxes by sight if not by name: they usually look something like the example shown in Figure 13.10.

Figure 13.10 Use an input box to retrieve a single piece of information from the user.

* * *

Create Custom Boxes for Complex Interaction

To retrieve two or more pieces of information from the user, you could use two or more input boxes in succession, but it's usually easier for the user if you create a custom dialog box. You'll start building custom dialog boxes in Chapter 14.
* * *

## Input-Box Syntax

The syntax for displaying an input box is straightforward and similar to the syntax for a message box:

    InputBox( _prompt_ [, _title_ ] [, _default_ ] [, _xpos_ ] [, _ypos_ ] [, _helpfile, context_ ])

Here's what the arguments mean:

**_prompt_**

A required string that specifies the prompt that appears in the input box. As with MsgBox, _prompt_ can be up to about 1,024 characters long, and you can use the carriage-return constant (vbCr) to force separate lines. Like the MsgBox _prompt_ argument, the InputBox _prompt_ automatically wraps if the prompt is longer than about 35 characters.

**_title_**

A string that specifies the text in the title bar of the input box. If you don't specify a _title_ argument, VBA supplies the application's name.

**_default_**

A string that you can use to specify text that will appear in the text box. Entering a _default_ argument can be a good idea both when the default text is likely to be suitable (so the user can just press Enter to accept that default) and when you need to display sample text so that the user can understand what type of response you're looking for.

Here's an example of suitable default text to cue the user: if you display an input box asking for the user's name, you could supply the Name value by fetching it from the BuiltInDocumentProperties collection of the ActiveDocument object, like this:

    Dim strAuthor As String
    strAuthor = _
        ActiveDocument.BuiltInDocumentProperties(wdPropertyLastAuthor)

**_xpos_** and **_ypos_**

These are optional numeric values for specifying the onscreen position of the input box. _xpos_ governs the horizontal position of the left edge of the input box from the left edge of the screen (not of the Word window), whereas _ypos_ governs the vertical position of the top edge of the input box from the top of the screen. Each measurement is in _twips_, described in the sidebar "Input Boxes Are Usually Best Displayed in the Center of the Screen" in this chapter. If you omit these two arguments, VBA displays the input box at the default position of halfway across the screen and one-third of the way down it. (A short sketch of these arguments in use appears at the end of this section.)

**_helpfile_** and **_context_**

Optional arguments for specifying the Help file and the context in the Help file to jump to if the user summons help from the input box. If you use _helpfile_, you must also use _context_.

* * *

Input Boxes Are Usually Best Displayed in the Center of the Screen

A twip is 1/1440 inch. An average computer screen uses 96 dots per inch (dpi), so there are 15 twips per pixel, and a computer screen at 1024 × 768 resolution is 15,360 × 11,520 twips. If you need to position your input boxes and dialog boxes precisely, experiment with twips at different screen resolutions until you achieve satisfactory results. Generally, though, it's most effective to display an input box in the default center position, because your users are likely to have a variety of screen resolutions.

* * *

You can omit any of the optional arguments for an input box. But if you want to use another argument later in the syntax sequence, remember that you need to indicate the omission with a spacer comma (or use named arguments as described earlier in this chapter).

Unlike message boxes, input boxes come with a predefined set of buttons—OK and Cancel, plus a Help button if you specify the _helpfile_ and _context_ arguments—so there's no need to specify the main buttons for an input box.
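Here's the promised minimal sketch of the _xpos_ and _ypos_ arguments in use. The twip values are arbitrary and chosen only for illustration; note the spacer comma standing in for the omitted _default_ argument:

    Dim strName As String
    ' Display the input box 3000 twips from the left edge of the screen
    ' and 2000 twips down from the top of the screen
    strName = InputBox("Type your name:", "Example", , 3000, 2000)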
The following example declares the String variable strWhichOffice and assigns to it the result of the input box shown in Figure 13.11:

    Dim strWhichOffice As String
    strWhichOffice = InputBox( _
        "Enter the name of the office that you visited:", _
        "Expense Assistant", "Madrid", , , _
        "c:\Windows\Help\Procedure Help.chm", 0)

Figure 13.11 The input box comes with a predefined set of buttons.

## Retrieving Input from an Input Box

To retrieve the user's input from an input box, declare the String variable (or, if you expect the user to type a number, the numeric variable) that will contain it. Here, the variable strWhichOffice will contain what the user types into the input box:

    Dim strWhichOffice As String
    strWhichOffice = _
        InputBox("Enter the name of the office that you visited:", _
        "Expense Assistant 2000", "Madrid", , , _
        "c:\Windows\Help\Procedure Help.chm", 0)

Once the user has entered a value or a string and clicked the OK button, your code can then use the returned value as usual in VBA. To make sure the user has clicked the OK button, check that the input box hasn't returned a zero-length string (which it returns if the user clicks Cancel, and also if the user chooses the OK button with the text box empty), and take action accordingly:

    strWhichOffice = InputBox _
        ("Enter the name of the office that you visited:", _
        "Expense Assistant 2000", "Madrid", , , _
        "c:\Windows\Help\Procedure Help.chm", 0)
    If strWhichOffice = "" Then End

# Forms: When Message Boxes and Input Boxes Won't Suffice

As you've seen in this chapter, a message box can greatly enhance a procedure by enabling the user to make a choice at a turning point or by presenting the user with important information. But once you've used message boxes for a while, you're apt to start noticing their shortcomings:

  * You can present only a limited amount of information, and you're constrained in the way you can display it (to whatever layout you can conjure up with new paragraphs, line breaks, tabs, and spaces).
  * You can use only seven sets of buttons, which limits the amount of information that a user can return to your code via message boxes.

While you _can_ get creative and enter complex messages in message boxes to make the most use of the buttons they offer, you'll usually do better to just create a custom dialog box instead. As you'll see in Chapters 14 and 15, custom dialog boxes are relatively simple to create, and they are more powerful and flexible than message boxes.

You'll also want to avoid writing procedures that present the user with a number of choices via a _sequence_ of message boxes. Similarly, input boxes are useful for retrieving a single piece of information from the user, but beyond that, their limitations quickly become apparent too. If you find yourself planning to use two or more input boxes in immediate succession, create a custom dialog box instead. That way you display a single form for the user to fill in all the needed information, instead of several boxes. You'll see how to create forms in Chapter 14.

# The Bottom Line

**Display messages on the status bar.**

The information bar at the bottom of the window in many applications is a useful, unobtrusive way of communicating with the user. The status bar is frequently used by applications to indicate the current page, zoom level, active view (such as _datasheet_ in Access), word count, and so on. However, you, too, can display information on the bar.

Master It

Write a small sub in the Visual Basic Editor that displays the current date and time in the status bar.
**Display message boxes.**

Message boxes are commonly used to inform or warn the user. By default, they appear in the middle of the screen and prevent the user from interacting with the host application until a button on the message box is clicked, thereby closing it.

Master It

Write a small sub in the Visual Basic Editor that displays the current date and time using a message box.

**Display input boxes.**

An input box is similar to a message box, except that an input box can gather more information from the user. An input box allows the user to type in a string, which is more data than the simple information provided by which button the user clicked in a message box.

Master It

Write a small sub in the Visual Basic Editor that asks users to type in their name. Use the InStr function to see if there are any space characters in the returned string. If not, it means either they are Madonna or they have typed in only one name—so display a second input box telling them to provide both their first and last names.

**Understand the limitations of message boxes and input boxes.**

For even moderately complex interaction with the user, message and input boxes are often too limited. They return to the VBA code, for example, only a single user response: a button click or a single piece of text. So you can't conveniently use an input box to ask for multiple data—such as an address _and_ a phone number—without displaying multiple input boxes. That's ugly and disruptive.

Master It

In addition to the limitations on the amount of information you can retrieve from the user, what are the two other major limitations of message boxes and input boxes?

Chapter 14

Creating Simple Custom Dialog Boxes

In this chapter, you'll start looking at Visual Basic for Applications' tools for creating custom dialog boxes that interact with the user. The terms _dialog box_ and _form_ (or _user form_) are generally used interchangeably. Technically, a dialog box is a quite simple, small window, such as a message box or input box. Forms, generally, are larger windows featuring richer and more complex interaction with the user. These terms, though, are equivalent in common usage.

Dialog boxes and forms are among the most powerful and feature-packed elements of VBA. We will spend quite a bit of time exploring their uses as the primary communication path between users and procedures.

This chapter covers the most straightforward form components and how to manipulate them. The next chapter shows you how to create more elaborate forms, such as those with tabbed pages and those that update themselves when the user clicks a control.

In this chapter you will learn to do the following:

  * Understand what you can do with a custom dialog box
  * Create a custom dialog box
  * Add controls to a dialog box
  * Link dialog boxes to procedures
  * Retrieve the user's choices from a dialog box

# When Should You Use a Custom Dialog Box?

You'll often want to use a _form_ (another word for dialog box or window) when simpler methods of interacting with the user fall short. Sometimes, because of the limited selection, the buttons provided in message boxes are insufficient for getting needed information from the user. Similarly, the single text field available in an input box would be inadequate if you need the user to provide multiple data (name, address, phone number, and so on). In other words, sometimes you need the user to fill in a _form_.
You'll also want to use a custom dialog box for specialized input: when you need the user to choose nonexclusive options by selecting or clearing check boxes, to choose from among mutually exclusive choices via option buttons (also called radio buttons), or to select an item within a list displayed in a list box. Or perhaps you need to show users a picture. In other words, simple message boxes or input boxes cannot handle complex user input.

Custom dialog boxes can include the full range of interface elements the user is probably familiar with from working with Windows applications. You can create custom dialog boxes that look and function almost exactly like the dialog boxes built into applications (such as the File Save dialog box). Or you can create even larger constructions that approach the sophistication of typical application windows.

You'll use forms often in your more sophisticated macros. For example, when the user starts a procedure, you can have the procedure display a form presenting options—such as choosing the files for the procedure to manipulate. The user's choices determine what the procedure will then do.

You can also create dialog boxes that VBA triggers in response to events in the computer system: for example, an event that runs at a specific time or runs when the user takes a specific action (such as creating, opening, or closing a document).

Making your own dialog boxes is not that _hard_, but it can be time-consuming if you're building a complicated form. Because creating forms is not the fastest programming job, you might want to consider any practical alternatives to using them.

You've already looked at message boxes and input boxes, which provide a simple alternative for some of the relatively easy tasks for which you might want to create a custom dialog box.

Also, some applications, such as Word and Excel, even let you use their built-in dialog boxes for your own purposes. If users are familiar with the application, they're probably familiar with these built-in dialog boxes and can immediately use them to perform standard actions—for example, to open or save files. These are called _common dialog boxes_. How to use common dialog boxes in your macros is demonstrated briefly in the Real World Scenario titled "Control a For...Next Loop with User Input via a Dialog Box" in Chapter 12, "Using Loops to Repeat Actions," and more fully later in this chapter in the section titled "Using an Application's Built-in Dialog Boxes from VBA."

# Creating a Custom Dialog Box

If you want to employ a custom dialog box or window in VBA, you use a visual object called a _user form_. A user form (also sometimes just referred to as a _form_) is a blank sheet on which you can place _controls_ (such as check boxes, buttons, and text boxes) to create a made-to-order dialog box.

As you'll see, a user form contains its own code page where you, the programmer, write code to manage the various controls in the form. You can attach code to any of the controls, or to the user form itself, and that code is stored in the user form's _code sheet_. You can display the user form's code sheet in the Code window of the Visual Basic Editor and work with it as you would any other code. You can also run and test a user form as you would any other procedure (for example, by pressing F5 with the user form selected), and the VBA Editor will execute the code behind the form.
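For instance, the code behind a form typically consists of event procedures like the following minimal sketch, in which cmdOK is a hypothetical command button placed on the form:

    Private Sub cmdOK_Click()
        ' Runs automatically when the user clicks the OK button;
        ' this procedure is stored in the user form's code sheet
        Me.Hide   ' dismiss the form so code can read the user's choices
    End Sub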
You can display a user form (a dialog box) for the user to interact with, and you can then retrieve information from the user form and manipulate it with VBA code. It's in this sense that code supporting a form is said to be _behind_ a form. The user sees and interacts with a form, but behind the scenes you have written code to intelligently react to whatever the user might input.

Each user form is itself an object and can contain a number of other objects that you can manipulate individually.

* * *

User Forms Aren't Always Dialog Boxes

You can also create user forms that aren't dialog boxes. The distinction between a dialog box and a full window is imprecise, but it's usually easiest to define a resizable form as a window (you can resize it by dragging its borders or by clicking its Maximize button), while a dialog box has a fixed size. Some dialog boxes, such as the Find And Replace dialog box in Word, have an initially hidden part that the user can display (in the case of the Find And Replace dialog box, by clicking a More button).

But apart from this simple resizing, the bounds of the dialog box are fixed—you can't grab the corner of the dialog box with the mouse and drag it to enlarge it. But remember that you, the programmer, can create very large user forms that have the complexity and dimensions of a typical application window.

* * *

For example, you could create a simple dialog box with two option buttons, an OK button, and a Cancel button. Each option button would be an object, the OK button would be a third object, and the Cancel button would be a fourth object. You could set properties individually for each object—such as the action to take when the Cancel button was clicked or the ScreenTip (also called a ToolTip) to display when the user moved the mouse pointer over each of the option buttons. (ToolTips help make the components of your form understandable for the user.) The point is to consider the components of a form—the _controls_ you place on the form—as _objects_. This is another use of the concept of objects. Controls are _visual_ objects, but like purely programmatic objects, controls have members such as _properties_.

You can specify most properties of an object either at design time (when you're creating the user form) or at runtime (while the code is executing, either before or after you display the user form). For example, you can set the Value property of a check-box control to True to display the check box in its selected state or to False to display the check box in its cleared state. You can set the Value property in three different ways:

  * When building the user form, you can use the Editor's Properties window to specify values. For example, you can make a check box that will default to its selected (checked) state each time the user form is displayed.
  * You can write code that sets the check box while the macro is running, before the form is displayed to the user.
  * You can write code that sets the check box while the user is interacting with the form. Note that the user can click the check box to toggle it between its selected and deselected states. But your code can also do this.

The next sections explain the process of creating a dialog box. Later in this chapter, you'll find examples that step through creating a procedure and adding a dialog box to it.

## Designing a Dialog Box

It's possible to whip together a half-decent dialog box without much planning.
Some programmers like to just "sketch" the user interface in a dialog box by dragging and dropping controls onto it, then positioning them so they look good and modifying their properties.

Other programmers prefer to adopt a more methodical approach and plan what they need to include in the dialog box before they start creating it. If you fall into this latter category, consider the intended purpose of the dialog box and list the elements it will need in order to achieve this goal. Then sketch on paper a rough diagram of the dialog box to get an approximate idea of where you'll want to locate each of the elements (the controls you place on the form).

* * *

Try Basing Custom Dialog-Box Designs on Existing Dialog Boxes

Another option is to base the design for your custom dialog box on an existing dialog box—either a dialog box built into an application (called a common dialog box) or a custom dialog box that your company or organization has already implemented. Leveraging previous development efforts can not only help you avoid reinventing the wheel, but also produce a custom dialog box that users find familiar and intuitive.

* * *

## Inserting a User Form

Once you have a design in mind, the first step in creating a custom dialog box is to insert a user form in the appropriate template or document:

1. Press Alt+F11 to display the Visual Basic Editor if it's not already open.

2. In the Project Explorer window, right-click the appropriate project and choose Insert ⇒ UserForm from the context menu.

* * *

Other Ways to Add a User Form

You can also insert a user form by clicking the Insert UserForm button on the far left of the Editor's Standard toolbar.

* * *

The Visual Basic Editor opens a new user form like that shown in Figure 14.1, named UserForm1 (or the next available number if the project already contains other user forms).

Figure 14.1 The first step in creating a new dialog box is to start a new user form. The Visual Basic Editor displays the Toolbox when a user form is the active window.

The Visual Basic Editor also displays the _Toolbox_. (If you've previously hidden the Toolbox while working on a user form, the Visual Basic Editor doesn't display it. Choose View ⇒ Toolbox or click the Toolbox button on the far right of the Standard toolbar.)

VBA automatically inserts the user form in the Forms object (the collection of forms) for the project. If the project you chose didn't already contain a Forms collection, VBA adds one to contain the new user form. You'll see the Forms object displayed in the Project Explorer.

### Choosing User-Form Grid Settings

The Visual Basic Editor displays a grid in each user form to help you place controls relative to the dialog box and to align controls relative to each other so they look neat instead of random.

I don't know why you would want to do this, but to switch off the display of this grid or to switch off the Visual Basic Editor's automatic alignment of controls to the grid, follow these steps:

1. Choose Tools ⇒ Options to display the Options dialog box.

2. Click the General tab to display the General page (see Figure 14.2).

Figure 14.2 The General page of the Options dialog box includes options for toggling the display of the grid, resizing the grid, and toggling whether VBA aligns the controls to the grid.

3. Choose the settings you want:

a. Clear the Show Grid check box if you want to turn off the display of the grid.
(The grid continues to function, but the dots are not displayed.)

b. Clear the Align Controls To Grid check box if you want to stop using the grid whether it's visible or not. This feature is usually a timesaver, but if the grid is too coarse for the layout you're trying to achieve, just reduce the sizing of the grid from the default 6 to perhaps 3 or 4.

c. Change the number of points in the Width and Height text boxes to adjust the sizing of the grid's units.

4. Click the OK button to close the Options dialog box and apply your choices.

* * *

Naming Conventions in Visual Basic for Applications

Naming controls in VBA is similar to naming variables. Names for controls can be up to 40 characters long, must begin with a letter, and after that can be any combination of letters, numbers, and underscores. You can't use spaces or symbols in the names, and each name must be unique in its context—for example, each user form must have a unique name within a project, but within any user form or dialog box, a control can have the same name as another control in a different form.

Those are the rules; you can also use conventions to make the names of your VBA objects as consistent and easy to understand as possible. Recall the conventions you've used in previous chapters for identifying the variable type with a prefix: str, lng, int, and so on. The prefixes widely used when naming controls identify the control. For example, by using the convention of prefixing a text box control's name with txt, you can be sure that anyone else reading your code will immediately identify the name as belonging to a text box—and that you yourself will easily identify the name when you revisit your old code.

Here's an example showing conventional prefixes for several controls:

    Private Sub cmbSelectEmployee_Change()
        lblEmployeeName = cmbSelectEmployee.Text
        fraStep2.Enabled = True
        lblInstructions = "Enter text in the Step 2 text box. " & _
            "For example, you might include brief biographical " & _
            "information on the employee, details of their position, " & _
            "or your hopes for their contribution to the company."
        cmdClearEmployeeName.Enabled = True
    End Sub

Some popular naming conventions for the most-used VBA objects are shown in the following list. You'll encounter the naming conventions for other VBA objects later in the book. This list includes the control's name, the standard prefix, and finally an example showing how the control can be named in code:

**Check box**

The standard prefix is chk, as in chkReturnToPreviousPosition.

**Command button**

The standard prefix is cmd, as in cmdOK.

**Form (user form)**

The standard prefix is frm, as in frmMoveParagraph.

**Frame**

The standard prefix is fra, as in fraMovement.

**List box**

The standard prefix is lst, as in lstConferenceAttendees.

**Combo box**

The standard prefix is cmb, as in cmbColor.

**Menu**

The standard prefix is mnu, as in mnuProcedures.

**Option button**

The standard prefix is opt, as in optSpecialDelivery.

**Label**

The standard prefix is lbl, as in lblUserName.

**Text box**

The standard prefix is txt, as in txtUserDescription.

Just as with variable names, the naming convention for controls begins with three lowercase letters and then starts the rest of the object's name with an uppercase letter to make it a little easier to read. For example, a text box in which the users are to type their last names might be named txtLastName.
Naming conventions tend to seem awkwardly formal at first, and there's a strong temptation to avoid them. But if you plan to distribute your macros or expect others to work with them, it's usually worth the trouble to follow the naming conventions. Plus they help you when debugging. It's just another way to make reading code easier for everybody.

* * *

## Renaming a User Form

Next, change the user form's Name property from the default (UserForm1) to a more descriptive name. The following steps show how to do this. (For advice on choosing names, refer to the sidebar "Naming Conventions in Visual Basic for Applications" in this chapter.)

1. If the Properties window isn't displayed, press F4 to display it. Figure 14.3 shows the two pages of the Properties window: Alphabetic and Categorized. Alphabetic displays an alphabetical listing of the properties of the currently selected object; Categorized displays the same properties but separated into categories, such as Appearance, Behavior, Font, Misc., Picture, and Position. (Some controls have more categories than those listed here.) You can expand a category by clicking the plus (+) sign beside it to display the properties it contains, and collapse it by clicking the resulting minus (–) sign. If the Alphabetic tab isn't selected, click it to select it.

Figure 14.3 You can choose either an alphabetized or a categorized list in the Properties window.

The Categorized option is not, in my view, very helpful because many of the properties are simply too difficult to fit into categories that make any sense. The Caption property, for example, is assigned to the Appearance category, but the (Name) property is contained in the Misc. category. The very existence of a "miscellaneous" category demonstrates that the categorization effort has broken down. I suggest you stick with the default Alphabetic option instead.

2. Make sure the drop-down list (at the top of the Properties window) is displaying the default name of the user form. If it isn't, select the user form from the drop-down list.

3. Select the user form's default name (such as UserForm1 or UserForm2) in the cell to the right of the Name cell (you can double-click the name to select it quickly). Now type a new, more descriptive name for the user form. This name can be anything you want, with the standard VBA limitations:

  * It must start with a letter.
  * It can contain letters, numbers, and underscores but no spaces or symbols.
  * It can be up to 40 characters long.

4. Click the Caption cell to select the user form's default name and type the caption for the user form—that is, the text label that you want the user to see in the title bar of the dialog box. This name has no restrictions beyond the constraints imposed by the length of the title bar. You can enter a name longer than will fit in the title bar, but VBA truncates it with an ellipsis at its maximum displayable length. As you type, the name appears in the user-form title bar as well, so it's easy to see what's an appropriate length—at least, for the current size of the user form.

5. Press Enter or click elsewhere in the Properties window (or elsewhere in the Visual Basic Editor) to set (make official) the user form's name. (Naming controls works the same way as naming forms.)
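The name you set here is how your code will refer to the form. As a minimal sketch (the form name and caption are invented for illustration), a procedure in an ordinary module can display the renamed form like this:

    Sub ShowDeleteWorkbookForm()
        ' frmDeleteWorkbook is the (Name) you typed in the Properties window
        frmDeleteWorkbook.Caption = "Delete Workbook"   ' title-bar text
        frmDeleteWorkbook.Show                          ' display the dialog box
    End Sub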
* * *

Dealing with the "Name Conflicts with Existing Module" Error

If you run into the "Name _name_ conflicts with existing module, project, or object library" error (shown here), chances are you've just tried to give a user form the same name already assigned to something else: for example, you've tried to reuse the name of a VBA project or object library.

* * *

## Adding Controls to the User Form

Now that you've renamed the user form, you're ready to add controls to it from the Toolbox, shown in Figure 14.4. VBA automatically displays the Toolbox when a user form is active, but you can also display the Toolbox when no user form is active by choosing View ⇒ Toolbox.

Figure 14.4 Use the Toolbox to add controls to the user form.

* * *

Removing the "Roaming Office" Control

In what is obviously an oversight on Microsoft's part, the VBA 2013 Toolbox includes an obscure and—even in MSDN—essentially ignored control called the RoamingOffice control (not shown in Figure 14.4). Its use is beyond the scope of this book, not to mention beyond the scope of the VBA Help system and even Google. Perhaps Microsoft intends to make it useful in the future. For now, though, it clearly doesn't belong among the default controls on the Toolbox. It's the small gray crosshatched square icon next to the Image control. If you wish, you can remove the RoamingOffice control from your Toolbox by right-clicking its crosshatched icon, then choosing Delete RoamingOffice from the context menu.

* * *

Here's what the buttons on the Toolbox do:

**Select Objects**

This first control has a very specialized purpose, and you might never need to use it. It's not an ordinary control (it doesn't appear on a form; you can't drag and drop it onto a form). Its job is to restore the mouse pointer to _selection mode_. However, the mouse pointer _automatically_ returns to selection mode after you've dropped a control onto a form, so usually you'll need to click the Select Objects button only when you've selected another control, then changed your mind and decided not to use it, and therefore need to restore the pointer to its normal state. Alternatively, if you double-click a control (such as the check box), you trigger a technique that allows you to quickly add multiple versions of the same control repeatedly. (Every time you click in the form, a new check box is added to it, for example, while the Editor is in this state. To stop this repetitive behavior, you click the Select Objects button.)

**Label**

Creates a _label_, which is text used to identify a part of the dialog box or to explain information the user needs to know in order to use the dialog box effectively.

**TextBox**

Creates a text box (also sometimes called an _edit box_), a field into which the user can type text. You can also use a text box to display text to the user or to provide text for the user to copy and paste elsewhere. A text box can contain either one line (the default) or multiple lines and can display a horizontal scroll bar, a vertical scroll bar, or both.

**ComboBox**

Creates a combo box, a control that combines a text box with a list box. The user can either choose a value from the list box or enter a new value in the text box.

**ListBox**

Creates a list box, a control that lists a number of values. Users can pick one value from the list but can't enter a new value of their own (unlike with a combo box). The list box is good for presenting closed sets of data.
**CheckBox**

Creates a check box and an accompanying label. The user can select or clear the check box to turn the associated action on or off.

**OptionButton**

Creates an option button (also known as a _radio button_) and an accompanying label to identify the purpose of the button. This button is usually a circle that contains a black dot when selected. The user can select only one option button out of any group of option buttons. (The name radio button comes from radios with push buttons for stations; you can select only one button at a time. Push one, and the others pop out.)

**ToggleButton**

Creates a toggle button, a button that shows whether or not an item is selected. A toggle button can be defined with any two settings, such as On/Off or Yes/No. You can add a picture to a toggle button, which provides a graphical way of letting a user choose between options.

**Frame**

Creates a frame, an area of a user form or dialog box surrounded by a thin line and an accompanying label. You can use frames (also known as _group boxes_) to group related elements in your forms. As well as cordoning off elements visually, frames can separate elements logically. For example, VBA treats a group of option buttons contained within a frame as separate from option buttons in other frames or option buttons loose in the dialog box. This separation makes it easier to use multiple sets of option buttons in a form.

**CommandButton**

Creates a command button. This is the typical, ordinary Windows button that users click to communicate their wishes. Most dialog boxes contain command buttons such as OK and Cancel, or Open and Cancel, or Save, or Apply and Close.

**TabStrip**

Creates a tab strip for displaying multiple sets of data in the same set of controls. Tab strips are especially useful for presenting records in a database for review or modification: Each record in the database contains the same fields for information, so they can be displayed in the same group of controls. The tab strip provides an easy way of navigating between records.

**MultiPage**

Creates a multipage control for displaying multipage dialog boxes that have different layouts on each of their tabs. An example of a multipage dialog box is the Options dialog box (Tools ⇒ Options), which has multiple pages (often referred to incorrectly as tabs) in most of the Office applications.

**ScrollBar**

Creates a stand-alone scroll bar. Stand-alone scroll bars are of relatively little use in dialog boxes. Combo boxes and list boxes have built-in scroll bars.

**SpinButton**

Creates a spin-button control for attaching to another control. Spin buttons (also known as _spinners_) are typically small, rectangular buttons with one arrow pointing up and one down (or one arrow pointing left and the other pointing right). Spin buttons are useful for presenting sequential values with consistent intervals within an understood range, such as times or dates. For example, if you want the user to increment or decrement a price in a text box in 25-cent steps, you could use a spinner to adjust the price rather than letting the user type directly into the text box.

**Image**

Creates an image control for displaying a picture within a form. For example, you might use an image control to show a corporate logo or a picture of some sort. (If you want to display a photo, texture, or other graphic on the background of the form itself, set the form's Picture property.)
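To give a sense of how several of these controls are driven from code, here's a minimal sketch of a UserForm_Initialize procedure; the control names are hypothetical, following the naming conventions discussed earlier. List boxes, in particular, usually receive their items from code like this rather than at design time:

    Private Sub UserForm_Initialize()
        ' Populate the list box before the form appears
        lstOffices.AddItem "Madrid"
        lstOffices.AddItem "London"
        lstOffices.AddItem "Tokyo"
        lstOffices.ListIndex = 0            ' select the first item by default
        chkConfirmDeletions.Value = True    ' display the check box selected
    End Sub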
* * *

Adding Controls to the Visual Basic Editor Toolbox

The Toolbox shown in Figure 14.4 contains the basic set of tools provided by VBA. As discussed in "Customizing the Toolbox" in Chapter 2, "Getting Started with the Visual Basic Editor," you can customize the Toolbox in various ways: by adding other controls to it, creating additional pages for the controls, moving controls from page to page, and creating customized controls of your own making so that you can avoid having to repeatedly adjust properties each time you add those controls.

* * *

Click one of the controls in the Toolbox to select it. Then click in the user form to insert the control on the form, as illustrated in Figure 14.5. VBA places the top-left corner of the control where you click. As you place a control, it snaps to the grid on the user form (unless you've turned off the Align Controls To Grid feature as described in "Choosing User-Form Grid Settings," earlier in this chapter).

Figure 14.5 When you click in the user form, VBA places a standard-size control of the type you chose. If the Align Controls To Grid feature is switched on (as it is by default), VBA automatically aligns the control with the grid on the user form.

You can resize the control as desired by selecting it and then clicking and dragging one of the selection handles (the white squares) that appear around it, as shown in Figure 14.6. The mouse pointer changes to a double-arrow icon when you've correctly positioned it to drag. When you drag a corner handle, VBA resizes the control on both sides of the corner; when you drag the handle at the midpoint of one of the control's sides, VBA resizes the control only in that dimension. In either case, VBA displays a dotted outline indicating the size that the control will be when you release the mouse button.

Figure 14.6 Once you've placed a control, you can resize it as necessary by dragging one of its selection handles.

To resize the user form itself, click its title bar, or click in any blank space in the form (anywhere outside a control). This selects the user form. Then click and drag one of the selection handles that appear around the form.

To delete a control, right-click it in the user form and choose Delete from the context menu. Alternatively, click it to select it and then press the Delete key or choose Edit ⇒ Delete. Restore it by pressing Ctrl+Z.

* * *

Random Additional Default Toolbox Controls

Now and then Microsoft adds application-specific or novel controls to the default Toolbox. This not only causes confusion, but it also means that the VBA Editor's Toolboxes are not standardized across the Office applications. This is a recent development, and unwelcome. Word 2013 arbitrarily includes a "Roaming Office" control. For more on this peculiar feature, see the sidebar "Removing the 'Roaming Office' Control" earlier in this chapter.

Excel's VBA Editor includes a RefEdit control that mimics Excel's reference-edit boxes.

Nobody objects to Microsoft providing additional controls to us programmers. (You can easily add controls to the Toolbox by right-clicking within the Toolbox and choosing Additional Controls from the context menu.) What's problematic is the randomness of what's now being included in the default Toolboxes.

* * *

## Grouping Controls

Sometimes it's quite efficient to temporarily select several controls as a group in the Editor. This allows you to manipulate all the grouped controls as a unit.
For example, if you want to change the font size of three text boxes, two option buttons, and four labels, just group them and change the font-size property in the Properties window only _once_. The whole group will have all their font sizes changed automatically. (This trick is not related to grouping controls within a Frame control as described earlier in this chapter.) + +We'll explore this useful grouping technique later in this chapter in the section titled "Working with Groups of Controls." For now, I'll just briefly introduce the concept. + +To delete, move, resize, or change the properties of multiple controls at once, first select them into a group. You can then delete them all at once by using the methods just described. Or you can move, resize, or modify the properties of the group as a whole. + +Here's how to group controls: + + * To select multiple contiguous controls, click the first control, hold down Shift, and then click the last control in the sequence. + * To select multiple noncontiguous controls—or to add additional controls to a group after you've selected multiple contiguous controls by using the Shift key—hold down the Ctrl key as you click each additional control. (With the Ctrl key pressed, you can deselect any control in a group by clicking it a second time.) + * To select multiple controls in the same area of the user form, click in the form's background outside the controls and drag the resulting selection box until it encompasses at least part of each control. When you release the mouse button, the Visual Basic Editor selects the controls as a group. + +## Renaming Controls + +As with user forms, VBA automatically gives each control that you add to a form a default name consisting of the type of control plus a sequential number. When you add the first text box in a user form, VBA names it TextBox1; when you add another text box, VBA names it TextBox2; and so on. (Each control in a dialog box must have a unique name so that you can refer to it specifically in code.) + +You'll usually want to change the controls' default names to names that describe their purpose so you can remember what they do for the macro. + +For example, if TextBox2 is used for entering the user's organization name, you might want to rename it txtOrganizationName, txtOrgName, txtO_Name, or something similar. + +To rename a control, follow these steps: + +1. Click the control in the user form to select it and thereby display its properties in the Properties window. + + * If the Properties window is already displayed, you can, if you prefer, select the control from the drop-down list at the top of the Properties window instead of selecting it in the user form. VBA then visually highlights (selects) the control in the user form, which helps you make sure that you've selected the control you want to affect. + * If the Properties window isn't displayed, you can quickly display it with the properties for the appropriate control by right-clicking the control in the user form and choosing Properties from the context menu. + +2. In the Properties window, double-click to select the default name in the cell to the right of the Name property. + +3. Type the new name for the control. + +4. Press Enter to set the control name, or click elsewhere in the Properties window or in the user form. + +* * * + +If You Rename a Control, You May Have to Modify Your Code + +You can rename a control anytime. But if you do, you must also change any existing references to it in the code that drives the user form. 
This gives you a strong incentive to choose suitable names for your controls before you write the code.

* * *

## Moving a Control

To move a control, click anywhere in it to select it, and then drag it to where you want it to appear, as shown in Figure 14.7.

Figure 14.7 If a control isn't currently selected, you can move it by clicking it and dragging it.

To move a selected control, move the mouse pointer over the selection border around it so that the mouse pointer turns into a four-headed arrow (as shown in Figure 14.8), and then click and drag the control to where you want it to appear.

Figure 14.8 If a control is selected, move the mouse pointer over its selection border, and then click and drag the control.

* * *

Useful Copy-and-Paste Techniques with Controls

You can use the Cut, Copy, and Paste commands (from the Standard toolbar, the Edit menu, or the context menu, or by using the easiest approach, the keyboard, such as pressing Ctrl+X and Ctrl+V) to move a control.

Copy and Paste isn't that efficient when moving a _single_ control; the Paste command places the control right in the middle of the user form, so you have to drag it to its new position anyway.

However, when creating multiple, similar control sets—such as a group of text boxes with accompanying labels—copying and pasting can be quite useful. It's a quick way to build a whole set of fields for the user to fill in, for example. This way, you don't have to position and align each label/text box pair. Nor do you have to adjust each control's properties, because they are copied too. Align the first label/text pair, set the Font property the way you want it (usually larger, changing it from the default 8 pt. size to 11), resize the controls as you want them, change any other properties to suit yourself, and then copy and paste (clone) the pair as often as necessary by repeatedly pressing Ctrl+V.

Be aware, though, that the VBA Editor unfortunately places each new clone directly on the center of the form, thereby hiding any other clones you've just added. In other words, when you paste, you can't actually see the new clone—it's in a pile on the center of the form. So you have to drag the clones away from the center to reveal the others beneath.

Here's a related technique: Sometimes you want to copy the entire set of controls from one form to another. Select all the controls on Form1, then press Ctrl+C to copy them, then click Form2 to select it, and press Ctrl+V to paste the entire set of controls into the new form.

The advantage of using Copy and Paste for creating new controls is that the new controls inherit all the characteristics of the original controls, so you can save time by creating a control, setting its properties, and then cloning it.

You don't even need to change the names of the copies you paste to another user form—they just need to be named suitably for the code with which they work.

As an alternative to using the Copy and Paste commands, you can also copy a control by holding down the Ctrl key as you click and drag the control. VBA displays a + sign attached to the mouse pointer to indicate that you're copying the control rather than moving it. Drop the copy where you want it to appear on the user form.

* * *

## Changing the Caption on a Control

Some controls—such as option buttons and check boxes—have built-in text captions to let the user understand their purpose. You can change these captions like this:

1. Click the control to select it.

2.
Click the caption itself to select it. VBA displays the blinking insertion cursor and a faint dotted border around the text, as shown in Figure 14.9.

Figure 14.9 To change the caption on a control, select the control, and then click in the text so that it displays this faint dotted border.

* * *

Double-Clicking Opens the Code Window Rather Than Selecting a Control

When you click a label to select it and click again to position the insertion point to change the caption, make sure you click slowly enough that Windows doesn't interpret this as a double-click. A double-click displays the code sheet for the user form and automatically adds a procedure for the Click event of the control. If this happens, you can easily get back to viewing the form (it's called Design view, as opposed to Code view). Just press Shift+F7, double-click the user form's name in the Project Explorer, or choose View ⇒ Object to view the form again.

* * *

3. Now click in the label to position the insertion point for editing it, or drag through the label to select all of it.

4. Edit the text of the label as desired.

5. Press Enter or click elsewhere in the user form to effect the change to the label. (You can alternatively change the label by changing its Caption property in the Properties window.)

* * *

When Should You Set Properties of a Control?

You can set (specify) many properties of a control either at design time (while you're creating the user form) or at runtime (while the form's code is executing). There's a time and a place for each approach.

Generally speaking, the more static the property, the more likely you'll want to set it at design time. Some properties, such as the Name property of a user form, _have_ to be set at design time—you can't change such properties at runtime for a user form. You'll also usually want to name your controls at design time, though you can add controls at runtime and set their Name properties during execution.

In most cases, you'll want to set the properties that govern the position and size of the user form itself and its controls at design time. The advantages are clear: you can make sure that the user form looks as you intend it to, that it's legible, and so on.

Occasionally, you may want to change the properties of a user form or the size or position of some of the controls on it at runtime. For example, you might need to add a couple of option buttons to the form to take care of eventualities not included in the basic design of the form. Alternatively, you might create a form that had two groups of option buttons sharing the same space—one group, in effect, positioned on top of the other. At runtime, you could modify their Visible properties in your code and thus display one group and hide the other group, as sketched below. If each group contained the same number of option buttons, you could even make do with only one group of option buttons, assigning the appropriate properties to each at runtime. However, there's no particular advantage in trying to simultaneously make just the one group do double duty like that. It can make your code more confusing.

Given the flexibility that many properties of controls provide, you can often design your user forms to handle several circumstances by displaying and hiding different groups of controls at runtime rather than having to add or remove controls at runtime.
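Here's a minimal sketch of the Visible-property technique just described; the frame and option-button names are invented for illustration:

    Private Sub optInternational_Click()
        ' Swap which group of controls the user sees
        fraInternational.Visible = True
        fraDomestic.Visible = False
    End Sub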
Creating the complete set of controls for a user form at design time avoids most of the difficulties that can arise from adding extra controls at runtime. That said, you may sometimes need to create a user form on the fly to present information about the situation in which users have placed themselves. + +As you'll see as you continue to work with controls, you have to set values for _some_ controls at runtime. For example, you sometimes can't assign the list of items to a list box or combo box at design time. If a list displays items from a database, the list can vary depending on which data set the user selects. So you would have to write code that fills the list box during execution. (Often, you'll fill a list box during a UserForm_Initialize procedure that runs as the user form is being initialized for display.) The set of items in some lists can be known in advance and specified in your code during design time, such as a list box displaying all the countries in the world, from which the user selects the country of residence. + +* * * + +## Key Properties of the Toolbox Controls + +The following sections discuss the key properties of the controls in the default Toolbox. + +First, I'll explain the common properties used to manipulate many of the controls effectively. After that, I'll go through the controls one by one, listing the properties particular to each control. + +If you're new to VBA and find this section heavy going, just skip it for the time being and return to it when you're creating code and need to reference information about the properties of the controls. + +### Common Properties + +Table 14.1 lists the properties shared by all or most controls, grouped by category. + +Table 14.1 Properties common to most or all controls + +**Property Information** | **Applies To** | **Explanation** +---|---|--- +**General Properties** | | +BoundValue | All controls except Frame, Image, and Label | Contains the value of the control when the control receives the focus in the user form. +HelpContextID | All controls except Image and Label | Returns the context identifier of the Help file topic associated with the control. +Name | All controls | Contains the name for the control. +Object | All controls | Enables you to assign to a control a custom property or method that uses the same name as a standard property or method. +Parent | All controls | Returns the name of the user form that contains the control. +Tag | All controls | Used for assigning extra information to the control. This is rarely used. +Value | CheckBox, ComboBox, CommandButton, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton | One of the most varied properties, Value specifies the current state or value of the control. A CheckBox, OptionButton, or ToggleButton can have an integer value of −1 (True), indicating that the item is selected, or a value of 0 (False), indicating that the item is cleared. A ScrollBar or SpinButton returns a value containing the current value in the control. A ComboBox or ListBox returns the currently selected row's (or rows') BoundColumn value. A MultiPage returns an integer indicating the active page, and a TextBox returns the text in the text box. +| | The value of a CommandButton is False because choosing the command button triggers a Click event. However, you can set the value of a CommandButton to True, which has the same effect as clicking it. 
In other words, the Value property is similar to the value of a variable, but the property's possible values are highly specific to each control.
**Size and Position** | |
Height | All controls | The height of the control, measured in points.
LayoutEffect | All controls except Image | Indicates whether a control was moved when the layout of the form was changed.
Left | All controls | The distance of the left border of the control in points from the left edge of the form or frame that contains it.
OldHeight | All controls | The previous height of the control, measured in points.
OldLeft | All controls | The previous position of the left border of the control, measured in points.
OldTop | All controls | The previous position of the top border of the control, measured in points.
OldWidth | All controls | The previous width of the control, measured in points.
Top | All controls | The distance of the top border of the control in points from the top edge of the form or frame that contains it.
Width | All controls | The width of the control, measured in points.
**Appearance** | |
Alignment | CheckBox, OptionButton, ToggleButton | Specifies how the caption is aligned to the control.
AutoSize | CheckBox, ComboBox, CommandButton, Image, Label, OptionButton, TextBox, ToggleButton | A Boolean (True or False only) property that controls whether the object resizes itself automatically to accommodate its contents. The default setting is False, which means that the control doesn't automatically resize itself.
BackColor | All controls | The background color of the control. This property contains a number representing the color.
BackStyle | CheckBox, ComboBox, CommandButton, Frame, Image, Label, OptionButton, TextBox, ToggleButton | Specifies whether the background of the object is transparent (fmBackStyleTransparent) or opaque (fmBackStyleOpaque, the default). You can see through a transparent control—anything behind it on the form will show through. You can use transparent controls to achieve interesting effects—for example, by placing a transparent command button on top of an image or another control.
BorderColor | ComboBox, Image, Label, TextBox, ListBox | Specifies the color of the control's border. You can choose a border color from the System drop-down list or the palette or enter BorderColor as an eight-digit integer value (such as 16711680 for mid-blue). VBA stores the BorderColor property as a hexadecimal value (for instance, 00FF0000). For BorderColor to take effect, BorderStyle must be set to fmBorderStyleSingle.
BorderStyle | ComboBox, Frame, Image, Label, ListBox, TextBox, UserForm | Specifies the style of border on the control or user form. Use BorderStyle with the BorderColor property to set the color of a border.
Caption | CheckBox, CommandButton, Label, OptionButton, ToggleButton | A text string containing the description that appears for a control—the text that appears in a label, on a command button or toggle button, or next to a check box or option button.
Font (object) | All controls except Image, SpinButton, and ScrollBar | Font—an object rather than a property—controls the font in which the label for the object is displayed. For TextBox, ComboBox, and ListBox controls, Font controls the font in which the text in the control is displayed.
ForeColor | All controls except Image | The foreground color of the control (often the text on the control). This property contains a number representing the color.
+Locked | CheckBox, ComboBox, CommandButton, ListBox, OptionButton, TextBox, ToggleButton | A Boolean property that specifies whether the user can change the control. When Locked is set to True, the user can't change the control, though the control can still receive the focus (that is, be selected) and trigger events. When Locked is False (the default value), the control is open for editing. +MouseIcon | All controls except MultiPage | Specifies the image to display when the user moves the mouse pointer over the control. To use the MouseIcon property, the MousePointer property must be set to 99, fmMousePointerCustom. +MousePointer | All controls except MultiPage | Specifies the type of mouse pointer to display when the user moves the mouse pointer over the control. +Picture | CheckBox, CommandButton, Frame, Image, Label, OptionButton, Page, ToggleButton, UserForm | Specifies the picture to display on the control. By using the Picture property, you can add a picture to a normally text-based control, such as a command button. +PicturePosition | CheckBox, CommandButton, Label, OptionButton, ToggleButton | Specifies how the picture is aligned with its caption. +SpecialEffect | CheckBox, ComboBox, Frame, Image, Label, ListBox, OptionButton, TextBox, ToggleButton | Specifies the visual effect to use for the control. For a CheckBox, OptionButton, or ToggleButton, the visual effect can be flat (fmButtonEffectFlat) or sunken (fmButtonEffectSunken). For the other controls, the visual effect can be flat (fmSpecialEffectFlat), raised (fmSpecialEffectRaised), sunken (fmSpecialEffectSunken), etched (fmSpecialEffectEtched), or a bump (fmSpecialEffectBump). +Visible | All controls | Indicates whether the control is visible; expressed as a Boolean value. +WordWrap | CheckBox, CommandButton, Label, OptionButton, TextBox, ToggleButton | A Boolean property that specifies whether the text in or on a control wraps at the end of a line. For most controls, WordWrap is set to True by default; you'll often want to change this property to False to prevent the text from wrapping inappropriately. If the control is a TextBox and its MultiLine property is set to True, VBA ignores the WordWrap property. +**Behavior** | | +Accelerator | CheckBox, CommandButton, Label, OptionButton, Page, Tab, ToggleButton | The accelerator key (or _access key_ , or _mnemonic_ ) for the control—the key the user presses (typically in combination with Alt) to access the control. For example, in many dialog boxes, the user can access the Cancel button by pressing Alt+C. The accelerator key for a label applies to the next control in the tab order rather than to the label itself. The accelerator character must be one of the characters in the control's text caption, usually the first (The _C_ in Cancel, for example). Once you specify the accelerator character, VBA automatically underlines that character in the caption to cue the user that they can press, for example, Alt+C to select the Cancel button. For additional information on tab order, see the section titled "Adjusting the Tab Order of a Form" later in this chapter. +ControlSource | CheckBox, ComboBox, ListBox, OptionButton, ScrollBar, SpinButton, TextBox, ToggleButton | The cell or field used to set or store the Value of the control. The default value is an empty string (""), indicating that there is no control source for the control. +ControlTipText | All controls | The text of the ScreenTip displayed when the user holds the mouse pointer over the control. 
The default value of ControlTipText is a blank string, which means that no ScreenTip is displayed. +Enabled | All controls | A Boolean value that determines whether the control can be accessed (either interactively or programmatically). +TabIndex | All controls except Image | The position of the control in the tab order of the user form, expressed as an integer from 0 (the first position) through the number of controls on the user form. +TabStop | All controls except Image and Label | A Boolean value establishing whether the user can select the control by pressing the Tab key. If TabStop is set to False, the user can select the control only with the mouse. The TabStop setting doesn't change the tab order of the dialog box. + +### Label + +The Label control simply displays text on the screen. It's most often used to identify the purpose of another control, so you frequently see a Label control placed on a form to the left of a textbox whose purpose the label describes. Use the Caption property to type in the text that you want the label to display. Use the TextAlign property as shown in Table 14.2 to align the text of the label with the borders of the Label control. + +Table 14.2 TextAlign property values for the Label control + +**Constant** | **Value** | **Text Alignment** +---|---|--- +fmTextAlignLeft | 1 | With the left border of the control +fmTextAlignCenter | 2 | Centered on the control's area +fmTextAlignRight | 3 | With the right border of the control + +### TextBox + +The TextBox is one of the most common controls. Recall that it can be a single-line control (often employed to display a field the user must fill in) or a multiline control, for displaying lots of text, as in a diary program where the user determines how much they want to write. Adjust this feature with the MultiLine property. Also, the defaults for a TextBox are a size of 8 pt. (too small usually) and a sans-serif font called Tahoma (sans-serif type is generally thought more appropriate for headlines than body text). So you'll usually find yourself employing the Font property to choose a larger font size and more readable font (such as Times New Roman). + +Table 14.3 lists the key properties of the TextBox control. + +Table 14.3 Key properties of the TextBox control + +**Property** | **Description** +---|--- +AutoTab | A Boolean property that determines whether VBA automatically moves to the next field when the user has entered the maximum number of characters in the text box or combo box. +AutoWordSelect | A Boolean property that determines whether VBA automatically selects a whole word when the user drags the mouse through text in a text box or a combo box. +DragBehavior | Enables or disables drag-and-drop for a text box or combo box: fmDragBehaviorDisabled (0) disables drag-and-drop; fmDragBehaviorEnabled (1) enables drag-and-drop. +EnterFieldBehavior | Determines whether VBA selects the contents of the edit area of the text box or combo box when the user moves the focus to the text box or combo box: fmEnterFieldBehaviorSelectAll (0) selects the contents of the text box or current row of the combo box; fmEnterFieldBehaviorRecallSelection (1) doesn't change the previous selection. +EnterKeyBehavior | A Boolean property that determines what VBA does when the user presses Enter with the focus on a text box. If EnterKeyBehavior is True, VBA creates a new line when the user presses Enter; if EnterKeyBehavior is False, VBA moves the focus to the next control on the user form. 
If MultiLine is False, VBA ignores the EnterKeyBehavior setting. +HideSelection | A Boolean property that determines whether VBA displays any selected text in a text box or combo box. If HideSelection is True, VBA displays the text without indicating the selection when the control doesn't have the focus. If HideSelection is False, VBA indicates the selection both when the control has the focus and when it doesn't. +IMEMode | Determines the default runtime mode of the Input Method Editor (IME). This property is used only in Far Eastern applications (for example, those using Japanese hiragana or katakana or Korean hangul). +IntegralHeight | A Boolean property that determines whether a list box or a text box resizes itself vertically to display any rows that are too tall to fit into it at its current height (True) or not (False). +MultiLine | A Boolean property that determines whether the text box can contain multiple lines of text (True) or only one line (False). When MultiLine is True, the text box adds a vertical scroll bar when the content becomes more than will fit within the current dimensions of the text box. VBA defaults to Multiline = False. +PasswordChar | Specifies the placeholder character to display in place of the characters the user types (so somebody peeping won't see the actual password). The common password character is the asterisk *. This property is normally used for entering passwords and other information that needs to be obscured so that it cannot be read. +ScrollBars | Specifies which scroll bars to display on the text box. Usually, you'll do best to set the WordWrap property to True and let VBA add the vertical scroll bar to the text box as needed rather than using the ScrollBars property. +SelectionMargin | A Boolean property that determines whether the user can select a line of text in the text box or combo box by clicking in the selection bar to the left of the line. +ShowDropButtonWhen | Determines when to display the drop-down button for a combo box or a text box. fmShowDropButtonWhenNever (0) never displays the drop-down button and is the default for a text box. fmShowDropButtonWhenFocus (1) displays the drop-down button when the text box or combo box has the focus. fmShowDropButtonWhenAlways (2) always displays the drop-down button and is the default for a combo box. +TabKeyBehavior | A Boolean property that specifies whether the user can enter tabs in the text box. If TabKeyBehavior is True and MultiLine is True, pressing Tab enters a tab in the text box. If MultiLine is False, VBA ignores a TabKeyBehavior setting of True. If TabKeyBehavior is False, pressing Tab moves the focus to the next control in the tab order. + +### ComboBox and ListBox + +From the user's point of view, a key distinction is that a list box simply provides a list of options the user can choose from, whereas a combo box offers that list and also includes a field where the user can type in items. + +Table 14.4 shows the key properties of the ComboBox control and the ListBox control. These two controls are similar and share many properties. They do, however, differ somewhat in behavior and features; these differences are described in the entries marked "List box only" and "Combo box only" in the table. + +Table 14.4 Key properties of the ComboBox and ListBox controls + +**Property** | **Description** +---|--- +AutoTab | See Table 14.3. +AutoWordSelect | See Table 14.3. 
+
BoundColumn | A Variant property that determines the source of data in a combo box or a list box that has multiple columns. The default setting is 1 (the first column). To assign another column, specify the number of the column (columns are numbered from 1, the leftmost column). To assign the value of ListIndex to BoundColumn, use 0. +
ColumnCount | A Long (data type) property that sets or returns the number of columns displayed in the combo box or list box. If the data source is unbound, you can specify up to 10 columns. To display all available columns in the data source, set ColumnCount to –1. +
ColumnHeads | A Boolean property that determines whether the combo box or list box displays headings on the columns (True) or not (False). +
ColumnWidths | A String (data type) property that sets or returns the width of each column in a multicolumn combo box or list box. +
ListRows | (Combo box only.) A Long (data type) property that sets or returns the number of rows displayed in the combo box. If the number of items in the list is greater than the value of ListRows, the combo box displays a scroll bar so that the user can scroll to the unseen items. +
ListStyle | Determines the visual effect the list uses. For both a combo box and a list box, fmListStylePlain displays a regular, unadorned list. For a combo box, fmListStyleOption displays an option button to the left of each entry, allowing the user to select one item from the list. For a list box, fmListStyleOption displays option buttons for a single-select list and check boxes for a multiselect list. +
ListWidth | (Combo box only.) A Variant property that sets or returns the width of the list in a combo box. The default value is 0, which makes the list the same width as the text area of the combo box. +
MatchEntry | Determines which type of matching the combo box or list box uses when the user types characters with the focus on the combo box or list box. fmMatchEntryFirstLetter (0) matches the next entry that starts with the letter or character typed: if the user types _t_ twice, VBA selects the first entry beginning with _t_ and then the second entry beginning with _t_. fmMatchEntryComplete (1) matches each letter the user types: if the user types _te_, VBA selects the entry that starts with _te_. fmMatchEntryNone (2) specifies no matching: the user can't select an item by typing in the list box or combo box but must use the mouse or the arrow keys instead. The default MatchEntry setting for a combo box is fmMatchEntryComplete. The default setting for a list box is fmMatchEntryFirstLetter. +
MatchRequired | (Combo box only.) A Boolean property determining whether the user must select an entry from the combo box before leaving the control (True) or not (False). This property is useful for making sure that if the user types a partial entry into the text-box area of the combo box, they don't forget to complete the selection in the drop-down list area. If MatchRequired is True and the user tries to leave the combo box without making a selection, VBA displays an "Invalid Property Value" message box. +
MultiSelect | (List box only.) Controls whether the user can make a single selection in the list or multiple selections. fmMultiSelectSingle (0) lets the user select only one item. fmMultiSelectMulti (1) lets the user select multiple items by clicking with the mouse or by pressing the spacebar. fmMultiSelectExtended (2) lets the user use Shift+click, Ctrl+click, and Shift with the arrow keys to extend or reduce the selection.
+
RowSource | A String property that specifies the source of a list to be displayed in a combo box or a list box. +
SelectionMargin | See Table 14.3. +
ShowDropButtonWhen | See Table 14.3. +
### CheckBox +
Check boxes are similar to option buttons—a set of choices presented to the user. However, option buttons permit the user to select only one from among the displayed options (like a set of radio pushbuttons). By contrast, users can select as many check boxes as they wish. +
Most of the properties of the CheckBox control have been discussed already. The key property of the CheckBox that you haven't come across yet is TripleState, which is a feature of the OptionButton and ToggleButton controls as well. +
TripleState is a Boolean property that determines whether the check box, option button, or toggle button can have a null state as well as True and False states. When a check box or other control is in the null state, it appears with a small black square in its box. (You'll find a short code sketch at the end of this section that shows how to read all three states.) +
You can see the null state in the Font dialog box in Word when one of the check-box-controlled properties—such as the Strikethrough check box in Figure 14.10—is true for _some_ but not all of the current selection. For example, select a word (or any amount of selected text) that is only partly struck through, and you trigger the null state for the Strikethrough check box, as shown in Figure 14.10. Normally, a check box is either checked or not, but when in a null state, it contains a small black square, indicating it's neither true nor false. (In earlier versions of Office, the null state in a check box was indicated by filling the box with gray or black.) +
Figure 14.10 By setting the TripleState property of a check box to True, you can display a check box in a null state. Here Word's Font dialog box shows the Strikethrough check box in a null state (containing a small black square, but not checked). +
A couple of properties described briefly in the context of other controls deserve more detail here: +
 * The SpecialEffect property controls the visual appearance of the check box. The default value is fmButtonEffectSunken (2), which displays a sunken box—the norm for 3D Windows dialog boxes. You can also choose fmButtonEffectFlat (0) to display a box with a flat effect, but why? To me, it doesn't look as good as the default 3D, shadowed box. The flat version is less subtle; crude, actually. But it fits in well with the new "Modern" aesthetic promoted by Microsoft in Windows 8—no gradients, opacity, dimensional effects, shadows, subtle colors, highlights, reflections, serif typefaces, and so on. In other words, flatland. +
Figure 14.11 shows a sunken check box and a flat check box. The Value property, which indicates whether the check box is selected (True) or cleared (False), is the default property of the check box. Recall that the default property need not be specified in code; it's assumed. Thus, you can write either CheckBox1.Value or just CheckBox1. The following three statements have the same effect: +
    If CheckBox1.Value = True Then
    If CheckBox1 = True Then
    If CheckBox1 Then +
Figure 14.11 Use the SpecialEffect property to display a flat check box (bottom) rather than the traditional sunken check box. +
 * The Accelerator property provides quick access to the check box. Assign a unique accelerator key to each check box so that the user can swiftly toggle it on and off from the keyboard.
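Before moving on, here's a minimal sketch of how the three states of a TripleState check box come back to your code (the chkStrike and btnApply names are just examples, and the sketch assumes the check box's TripleState property has been set to True). Because a comparison with Null never returns True, the sketch uses VBA's built-in IsNull function to detect the null state before testing the two definite states: +
    Private Sub btnApply_Click()
        'chkStrike has TripleState set to True, so its Value can be
        'True, False, or Null. Test for Null first with IsNull;
        'comparing Null to True with = won't work.
        If IsNull(chkStrike.Value) Then
            MsgBox "The check box is in the null (mixed) state."
        ElseIf chkStrike.Value = True Then
            MsgBox "The check box is selected."
        Else
            MsgBox "The check box is cleared."
        End If
    End Sub +
You'll see more about testing for the null state in the sidebar "You Can't Directly Test for a Null Value" later in this chapter.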
+ +### OptionButton + +A _group_ of OptionButtons provides a set of mutually exclusive options from which the user can choose. Only one of the buttons in a group can be selected. For instance, you could have two OptionButtons under the heading Sex: Male and Female. (Recall that a set of CheckBoxes permits multiple options to be chosen simultaneously. CheckBoxes are useful for choosing more complex options. For example, under the heading Typeface, you could have Italic, Bold, and Underlined options, all of which could be selected simultaneously.) + +Like the CheckBox, the OptionButton control has a straightforward set of properties, almost all of which you've seen already in this chapter. This section shows you the GroupName property, which is unique to the OptionButton, and some of the key properties for working with option buttons. + +The GroupName property is a String data type that assigns the option button to a group of option buttons. Alternatively, you can create a group by placing a set of option buttons on a Frame control. The key idea here is that, once grouped, the buttons become mutually exclusive. However, there can be more than one group (or set) on a form—as long as you employ a Frame control or the GroupName property to isolate the various groups of buttons. + +The default setting for GroupName is a blank string (""), which means that an option button isn't assigned to a group until you explicitly assign it. When you enter the group name, the group is created. By using the GroupName property, you can have multiple groups of option buttons on the same form without using frames to specify groups, but you must somehow distinguish the logical groups of option buttons from each other so that the user can tell which option buttons constitute a group. Using a Frame control is the easiest way of segregating groups of option buttons both visually and logically—but it's useful to have the flexibility that GroupName provides when you need it. Also, a Frame has a built-in Caption property you can use to describe the group's purpose. + +These are the other key properties of the OptionButton control: + + * The Value property, which indicates whether the option button is selected (True) or cleared (False), is the default property of the option button. So you can set or return the state of the option button by setting either the OptionButton object or its Value to True or False, as appropriate. Setting the Value of one OptionButton to True sets the Value of all other OptionButton controls in the same group or frame to False. + * The Accelerator property provides quick access to the option button. Assign a unique accelerator key to each option button so that the user can toggle it on and off from the keyboard. + * The SpecialEffect property controls the visual appearance of the option button. The default value of fmButtonEffectSunken (2) displays a sunken button, while fmButtonEffectFlat (0) displays a flattened button. Figure 14.11 shows a sunken option button and a flat option button. + * The TripleState property (discussed in the previous section, "CheckBox") lets you create an option button that has three states: selected (True), cleared (False), and null (which appears selected but grayed out). The TripleState property is disabled so that the user can't set the null state interactively, but you can set it programmatically as needed. + +### ToggleButton + +When it's not selected, the ToggleButton control appears raised, but it looks pushed in when it's selected. 
The key properties for the ToggleButton control are the same as those for the CheckBox and CommandButton: + + * The Value property is the default property of the ToggleButton. + * The TripleState property lets you create a ToggleButton that has three states: selected (True), cleared (False), and null. The user can set a triple-state ToggleButton to its null state by clicking it. In its null state, a ToggleButton appears selected, but gray. + * The Accelerator property provides quick access to the toggle button. + +### Frame + +The Frame control is relatively straightforward, but it has several properties worth mentioning; they're shown in Table 14.5. The Frame control shares a couple of these properties with the Page object. + +Table 14.5 Properties of the Frame control + +**Property** | **Description** +---|--- +Cycle | Determines the action taken when the user leaves the last control in the frame or on the page. fmCycleAllForms (0) moves the focus to the next control in the tab order for the user form or page, whereas fmCycleCurrentForm (2) keeps the focus within the frame or on the page until the focus is explicitly moved to a control in a different frame or on a different page. This property applies to the Page object as well. +InsideHeight | A read-only property that returns the height (measured in points) of the area inside the frame, not including the height of any horizontal scroll bar displayed. This property applies to the Page object as well. +InsideWidth | A read-only property that returns the width (in points) of the area inside the frame, not including the width of any vertical scroll bar displayed. This property applies to the Page object as well. +KeepScrollBarsVisible | A property that determines whether the frame or page displays horizontal and vertical scroll bars when they aren't required for the user to be able to navigate the frame or the page. fmScrollBarsNone (0) displays no scroll bars unless they're required. fmScrollBarsHorizontal (1) displays a horizontal scroll bar all the time. fmScrollBarsVertical (2) displays a vertical scroll bar all the time. fmScrollBarsBoth (3) displays a horizontal scroll bar and a vertical scroll bar all the time. fmScrollBarsNone is the default for the Frame object, and fmScrollBarsBoth is the default for the Page object. This property applies to the Page object as well. +PictureTiling | A Boolean property that determines whether a picture displayed on the control is tiled (True) so that it takes up the whole area covered by the control or not (False). To set the tiling pattern, you use the PictureAlignment and PictureSizeMode properties. This property applies to the Page object and the Image control as well. +PictureSizeMode | Determines how to display the background picture. fmPictureSizeModeClip (0), the default setting, crops (removes) any part of the picture too big to fit in the page, frame, or image control. Use this setting to show the picture at its original dimensions and in its original proportions. fmPictureSizeModeStretch (1) stretches the picture horizontally or vertically to fill the page, frame, or image control. This setting is good for colored backgrounds and decorative effects but tends to be disastrous for pictures that need to be recognizable; it also overrides the PictureAlignment property setting. 
fmPictureSizeModeZoom (3) zooms the picture proportionately until the horizontal dimension or the vertical dimension reaches the edge of the control but doesn't stretch the picture so that the other dimension is maximized as well. This is good for maximizing the size of a picture while retaining its proportions, but you'll need to resize the nonmaximized dimension to remove blank spaces. This property applies to the Page object and the Image control as well. +PictureAlignment | Determines where a picture is located. fmPictureAlignmentTopLeft (0) aligns the picture with the upper-left corner of the control. fmPictureAlignmentTopRight (1) aligns the picture with the upper-right corner of the control. fmPictureAlignmentCenter (2), the default setting, centers the picture in the control (both horizontally and vertically). fmPictureAlignmentBottomLeft (3) aligns the picture with the lower-left corner of the control. fmPictureAlignmentBottomRight (4) aligns the picture with the lower-right corner of the control. This property applies to the Page object and the Image control as well. + +### CommandButton + +The CommandButton is used quite often. This control has three unique properties, listed in Table 14.6. + +Table 14.6 Unique properties of the CommandButton control + +**Property** | **Description** +---|--- +Cancel | A Boolean property that determines whether the command button is the Cancel button for the user form (True) or not (False). The Cancel button for a user form can bear any name; what distinguishes it is that its Cancel property is set to True. The Cancel button is activated by the user's pressing Esc, or clicking the button, or putting the focus on the button and pressing Enter. Only one command button on a form can be the Cancel button at any given time. Setting the Cancel property for a command button to True causes VBA to set the Cancel property to False for any button for which it was previously set to True. +Default | A Boolean property that determines whether the command button is the default button for the user form (True) or not (False). Only one command button on a form can be the default button at any given time. Setting the Default property for a command button to True causes VBA to set the Default property to False for any button for which it was previously set to True. The default button is activated by the user pressing Enter when the focus isn't on any other command button. +TakeFocusOnClick | A Boolean property that determines whether the command button takes the focus when the user clicks it (True) or not (False). The default setting for this property is True, but you may want to set it to False when you need the focus to remain on another control in the user form even when the user clicks the command button. However, if the user uses the Tab key or the arrow keys to move to the command button, the command button will take the focus even if the TakeFocusOnClick property is set to False. + +Note that it's useful to set the Accelerator property for each command button on a form. This way, the user can quickly access it from the keyboard. + +* * * + +Sometimes the Cancel Button Should Be the Default Button + +Sometimes you'll be tempted to make the Cancel button the default on a form. This offers an obvious benefit for forms that offer irreversible actions, such as deleting text or deleting a file, but it can confuse accessibility aids (such as screen readers) and make it difficult for users with cognitive difficulties to work with the form. 
For these reasons, it's usually best to make the default button on a form a different button than the Cancel button. +
* * * +
### TabStrip and MultiPage +
The TabStrip and MultiPage controls allow you to create a tabbed, multipage dialog box. To see what tabbed pages look like, click the Home tab in Word and then click the small arrow icon in the lower-right corner of the Font area on the Ribbon. Word's Font dialog box will open and you'll see a two-tab dialog box. One tab is labeled _Font_ and the other tab is labeled _Advanced_. This is a good way to organize a dialog box when you have quite a few options to present to the user. +
The TabStrip control has several unique properties and a number of properties that it shares with the MultiPage control. Table 14.7 lists these properties. +
Table 14.7 Properties of the TabStrip and MultiPage controls +
**Property** | **Description** +
---|--- +
ClientHeight | (Tab strip only.) A Single (data type) property that sets or returns the height of the display area of the tab strip, measured in points. +
ClientLeft | (Tab strip only.) A Single property that returns the distance, measured in points, between the left border of the tab strip and the left border of the control inside it. +
ClientTop | (Tab strip only.) A Single property that returns the distance, measured in points, between the top border of the tab strip and the top border of the control inside it. +
ClientWidth | (Tab strip only.) A Single property that sets or returns the width of the display area of the tab strip, measured in points. +
SelectedItem | Sets or returns the tab currently selected in a tab strip or the page currently selected in a MultiPage control. +
TabFixedHeight | A Single property that sets or returns the fixed height of the tabs, measured in points. Set TabFixedHeight to 0 to have the tabs automatically size themselves to fit their contents. +
TabFixedWidth | A Single property that sets or returns the fixed width of the tabs, measured in points. Set TabFixedWidth to 0 to have the tabs automatically size themselves to fit their contents. +
TabOrientation | Determines the location of the tabs in the tab strip or multipage. fmTabOrientationTop (0), the default, displays the tabs at the top of the tab strip or multipage. fmTabOrientationBottom (1) displays the tabs at the bottom of the tab strip or multipage. fmTabOrientationLeft (2) displays the tabs at the left of the tab strip or multipage, and fmTabOrientationRight (3) displays the tabs at the right of the tab strip or multipage. +
### ScrollBar and SpinButton +
A SpinButton allows the user to easily increment or decrement numbers, dates, and so on. The ScrollBar and SpinButton share a number of properties that you haven't yet encountered. Table 14.8 lists these properties. +
Table 14.8 Properties of the ScrollBar and SpinButton controls +
**Property** | **Description** +
---|--- +
Delay | A Long (data type) property that sets the delay in milliseconds between clicks registered on the control when the user clicks and holds down the mouse button. The default delay is 50 milliseconds. The control registers the first click immediately, the second click after Delay x 5 (the extra delay is to assist the user in clicking only once), and the third and subsequent clicks after Delay. +
LargeChange | (Scroll bar only.) A Long property that determines how much the item is scrolled when the user clicks in the scroll bar between the thumb (the small square within the scroll bar) and the scroll bar's arrow. Set the LargeChange property after setting the Max and Min properties of the scroll bar.
+SmallChange | A Long property that determines how much movement occurs when the user clicks a scroll arrow in a scroll bar or spin button. SmallChange needs to be an integer value; the default value is 1. +Max | A Long property that specifies the maximum value for the Value property of the scroll bar or spin button. Max must be an integer. The default value is 1. +Min | A Long property that specifies the minimum value for the Value property of the scroll bar or spin button. Min must be an integer. The default value is 1. +ProportionalThumb | (Scroll bar only.) A Boolean property that determines whether the thumb is a fixed size (False) or is proportional to the size of the scrolling region (True), thereby giving the user an approximate idea of how much of the scrolling region is currently visible. The default setting is True. + +### Image + +By now, you've seen all the properties of the Image control. Most of the time when you use an Image control, you'll want to adjust the following properties: + + * Use the Picture property to assign the picture file you want to appear in the Image control. Click in the Picture row in the Properties window, and then click the ellipsis button (...) that the text box displays. In the Load Picture dialog box, select the picture and click the OK button to add it. The Image control can display .BMP, .CUR (cursor), .GIF, .ICO (icon), .JPG, and .WMF files, but not other graphics files, such as .TIF. Most graphics applications, however, can easily convert one graphics file type into another. + +* * * + +An Easy Way to Capture a Graphic Image + +The easiest way to display part of a Windows screen in an Image control is to capture it by pressing the Print Screen key (to capture the entire screen) or the Alt+Print Screen key combination (to capture the currently active window). Then paste it into an application such as the Windows Paint accessory, trim (crop) it there as necessary, and save it as a .BMP file. Windows 8 provides a third option: Press the Windows key plus the Print Screen key to capture and automatically save the screen to disk. The captured image will be saved in your Libraries folder in a subfolder named Screenshots. The image is saved as a .PNG graphics filetype—widely considered to be the best way to grab screen images. + +* * * + + * Use the PictureAlignment property to set the alignment of the picture. + * Use the PictureSizeMode property to set whether the picture is clipped, stretched, or zoomed to fill the Image control. Adjust the height and width of the Image control as necessary. + * Use the PictureTiling property if you need to tile the image to take up the full space in the control. + +### Page + +The Page object is one of the pages contained within a MultiPage object. You've already seen all its properties (in the context of other controls) except for the Index property, which it shares with the Tab object. + +The Index property is an Integer data type that determines the position of the Page object in the Pages collection in a MultiPage control or the position of a Tab object in the Tabs collection in a TabStrip. The first Page object or Tab object is numbered 0 (zero); the second Page or Tab object is numbered 1; and so on. You can change the Index property of a tab or page to change the position in which the tab or page appears in the collection. + +### Tab + +The Tab object is one of the tabs contained within a TabStrip object. You've already learned about all its properties in the context of other controls. 
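To make the Index property concrete, here's a minimal sketch that lists each page's position and caption and then moves one page to the front by assigning it a new Index (the MultiPage1, pgLast, and btnListPages names are just examples): +
    Private Sub btnListPages_Click()
        'Walk the Pages collection; Index numbering starts at 0
        Dim pg As MSForms.Page
        Dim strMsg As String
        For Each pg In MultiPage1.Pages
            strMsg = strMsg & pg.Index & ": " & pg.Caption & vbCr
        Next pg
        MsgBox strMsg
        'Assigning a new Index repositions the page in the collection
        MultiPage1.Pages("pgLast").Index = 0  'pgLast becomes the first page
    End Sub +
The same numbering applies to the Tabs collection of a TabStrip, so you can reorder tabs in the same way.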
+ +## Working with Groups of Controls + +As mentioned briefly earlier in this chapter, when you are designing a form, it's often handy to group controls. By grouping two or more controls, you can work with them as a single unit to size, reposition, format, or delete them. (Recall that this form-design grouping technique has nothing to do with creating a set of option buttons within a Frame control. That creates a mutually exclusive collection of radio buttons to display to the user during runtime.) + +### Grouping Controls + +To group controls, select them by Shift+clicking, Ctrl+clicking, or dragging around them, and then right-click and choose Group from the context menu. Alternatively, select the controls, and then click the Group button on the UserForm toolbar (you'll need to display this toolbar—it's not displayed by default) or choose Format ⇒ Group. VBA creates a new group containing the controls and places a shaded border with handles around the whole group, as shown on the right in Figure 14.12. + +Figure 14.12 You can work with multiple controls simultaneously by grouping them. VBA indicates a group of controls by placing a border around the entire group, as shown on the right. + +When you merely select a set of controls (by Shift+clicking, Ctrl+clicking, or dragging around them), you have only temporarily grouped them. You can still manipulate them as a group, but as soon as you deselect them—by, for example, clicking the background of the form itself—the grouping disappears. However, when you right-click and choose Group from the context menu, they will remain grouped until you right-click and choose Ungroup. + +### Ungrouping Controls + +To ungroup controls, right-click any of the controls contained in the group and then choose Ungroup from the context menu. Alternatively, select the group of controls by clicking in any control in the group and then click the Ungroup button on the UserForm toolbar, or choose Format ⇒ Ungroup. VBA removes the shaded border with handles from around the group and displays the normal border and handles around each individual control. + +### Sizing Grouped Controls + +You can quickly size all controls in a group by selecting the group and then dragging the sizing handles on the surrounding border. For example, you could select the middle handle on the right side and drag it inward to shorten the controls, as shown in Figure 14.13. The controls will be resized proportionately to the change in the group outline. + +Figure 14.13 You can resize all the controls in a group by dragging a sizing handle on the surrounding border. + +When the controls are grouped, you can then use the Properties window to quickly modify any properties they have in common (such as Font). But resizing a group can present problems—the results can be ugly. Generally speaking, resizing works fine when you've grouped a number of controls of the same type, as in Figure 14.13. For example, sizing a group that consists of several command buttons or option buttons works well, whereas sizing a group that consists of a text box, a command button, and a combo box is seldom a good idea. + +### Deleting Grouped Controls + +You can quickly delete a whole group of controls by right-clicking any of them and choosing Delete from the context menu or by selecting the group and pressing the Delete key. + +### Working with One Control in a Group + +Even after you've grouped a number of controls, you can still work with them individually if necessary. 
To do this, first click any control in the group to select the group as a whole, as shown on the left in Figure 14.14. Then click the control you want to work with. As shown on the right in Figure 14.14, VBA displays a dark shaded border around the group (indicating that the group still exists) and displays the lighter shaded border around the individual control, indicating that that control is selected. + +Figure 14.14 To work with one control in a group, start by selecting the group (as shown on the left) and then select the control (as shown on the right). + +You can then modify the selected individual control as if it were not grouped. Change its ForeColor property to blue, for instance, and only the caption in that particular control will turn blue. When you've finished working with it, click another control in the group to individually select it, or click elsewhere in the user form to deselect all individual controls and restore the group. + +## Aligning Controls + +Even if you use the Snap To Grid feature, you'll often need to align controls manually. They must be ungrouped for this feature to work. The easiest way to align controls is to select two or more, then right-click in any one of them and choose an option from the Align submenu: Lefts, Centers, Rights, Tops, Middles, Bottoms, or To Grid. These options work as follows: + +**Lefts** aligns the left borders of the controls. + +**Centers** aligns the horizontal midpoints of the controls. + +**Rights** aligns the right borders of the controls. + +**Tops** aligns the tops of the controls. + +**Middles** aligns the vertical midpoints of the controls. + +**Bottoms** aligns the bottoms of the controls. + +**To Grid** aligns the controls to the grid. + +VBA aligns the borders or midpoints to the current position of that border or midpoint on the dominant control—the control that has white sizing handles around it rather than black sizing handles. After selecting the controls you want to align, make dominant the one that is already in the correct position by clicking it so that it takes on the white sizing handles. Then choose the alignment option you want. + +* * * + +Ensure That You Choose Appropriate Alignment Options + +Make sure the alignment option you choose makes sense for the controls you've selected. VBA will happily align controls in an inappropriate way if you tell it to. For example, if you select a number of option buttons or text boxes and choose Tops from the Align submenu, VBA will obligingly stack all the controls on top of each other, rendering them unusable. (To recover from such minor mishaps, press Ctrl+Z.) + +* * * + +## Placing Controls + +The VBA Editor offers several placement commands on the Format menu: + + * On the Format ⇒ Make Same Size submenu, use the Width, Height, and Both commands to make two or more controls the same size in one or both dimensions. + * Use the Format ⇒ Size To Fit command to have VBA decide on a suitable size for an element based on the size of its label. This works well for, say, a toggle button with a medium-length label, but VBA will shrink an OK button to a size so small as to be unusable. + * Use the Format ⇒ Size To Grid command to increase or decrease the size of a control to the nearest gridpoints. + * On the Format ⇒ Horizontal Spacing and Format ⇒ Vertical Spacing submenus, use the Make Equal, Increase, Decrease, and Remove commands to set the horizontal spacing and vertical spacing of two or more controls. 
The Remove option removes extra space from between controls, which works well for, say, a vertical series of option buttons (which look good close together) but isn't a good idea for command buttons (which need a little space between them). + * On the Format ⇒ Center In Form submenu, use the Horizontally and Vertically commands to center a control or a group of controls in the form. Centering controls vertically is seldom a good idea, but you'll often want to center a frame or a group of command buttons horizontally. + * On the Format ⇒ Arrange Buttons submenu, use the Bottom and Right commands to reposition command buttons in a form quickly. + +## Adjusting the Tab Order of a Form + +The _tab order_ of a user form (or of a frame control within a form) is the order in which VBA selects controls in the form or frame when the user moves through them by pressing the Tab key (to move forward) or the Shift+Tab key combination (to move backward). + +Put another way, it's a Windows convention that when the user presses the Tab key, the _focus_ moves to the next control in a window. + +Only one control at a time can have the focus. For example, if a form has five text boxes, only one of these text boxes, the one that currently has the focus, will display characters as the user types. In addition, a button in a set of buttons can also have the focus, and when the user presses the Enter key, the button with the focus will be triggered. Or the user can click a different button to move the focus to that button. + +VBA displays a visual cue to indicate which control currently has the focus. You'll see a dotted frame around a button or option button and a blinking insertion cursor in a text box. + +Each frame you add to a user form has a separate tab order for the controls it contains: The frame itself appears in the tab order for the form, and the controls within the frame appear in the tab order for the frame. + +Set the tab order for a form or a frame to make it as easy as possible for the user to work with your form. Generally, for English-speaking users, it's best to arrange the tab order from left to right and from top to bottom in the dialog box or frame. For international users, you may want to arrange the tab order from right to left. You may also need to arrange the tab order to move from one control to a related control that would not normally be next in the tab order. + +The whole point of managing the tab order is that you simplify things for your user. Employing the Tab key in this way allows the user to fill in a whole form without once having to move their hand off the keyboard to keep selecting, with a mouse click, each next text box. + +This kind of tabbing is particularly useful when the user is asked to fill in several fields by typing into multiple text boxes (such as Name, Address, Phone, and so on). As soon as users finish filling in one field, they can press Tab to move on to the next. (Even easier, pressing the Enter key while in a text box moves users to the next control in the tab order.) At the end, after they've filled in the last field, they can quickly close the dialog box if you make the OK button the next control in the tab order. + +VBA assigns the tab order to the controls in a dialog box or frame on a first-come, first-served basis as you add the controls. 
Unless you add all the controls in perfect order, this default order will seldom produce the optimal tab order for a dialog box, so usually you'll want to adjust the tab order—or at least check to ensure that it's right. You're likely to place fewer controls on a frame than on a form, so you have a better chance of adding them in a suitable order, but you should check these controls too before unleashing the dialog box on users. + +Just press F5 and then repeatedly press the Tab key to examine your current tab order. Alternatively, you can open a Tab Order dialog box (shown in Figure 14.15) by right-clicking in the open space in the background of the form or frame and choosing Tab Order from the context menu. Or select the user form or frame and then choose View ⇒ Tab Order. + +Figure 14.15 Use the Tab Order dialog box to arrange the controls in your user form or frame into a logical order for the user. + +The time to adjust the tab order is after you've finished creating your form (adding a control later will require that you go back and modify the tab order). Here's how to change the tab order in a dialog box or frame: + +1. Rearrange the controls into the order in which you want them to appear by selecting them in the Tab Order list box and clicking the Move Up button or Move Down button as appropriate. You can Shift+click or drag to select a range of controls, or Ctrl+click to select two or more noncontiguous controls. (Or just change the controls' TabIndex properties in the Properties window.) + +2. Click the OK button to close the Tab Order dialog box. + +# Linking a Form to a Procedure + +Designing a custom form is only the first step in getting it to work in a procedure. The other step is writing the code to display the form to the user and make it perform its tasks. + +Typically, the code for a form consists of the following: + + * A macro procedure that displays the dialog box by loading it and using the Show method. Usually, this procedure can be assigned to a Quick Access Toolbar button or to a shortcut key combination so that the user can conveniently invoke it. However, a procedure can also be designed to run automatically in response to a system event (such as running at a specified time or when a worksheet is opened). + * The user form that represents the form and its controls. + * The code attached to the user form. This code consists of procedures for designated controls. For example, for a simple dialog box containing two option buttons and two command buttons (an OK button and a Cancel button), you'd typically write one procedure for the OK button and one for the Cancel button. The procedure for the OK button is executed when the user either clicks the button with the mouse or presses the Enter key while the focus is on that button. Either of these user actions triggers the button's Click event, and whatever code you, the programmer, have written within this event is then executed. Remember that the easiest way to create an event (procedure) for a control is to just double-click the control on the form. The editor then switches to Code view and writes the necessary Sub...End Sub envelope for that event, like this: + + Private Sub btnOK_Click() + + End Sub + +Notice that the Editor automatically combines the Name property of the control with the name of the event as the procedure's name, separated by an underscore character: btnOK_Click. + +* * * + +In Static Dialog Boxes, Click Events Are Usually Employed with Command Buttons + +Most controls have quite a few events. 
Some of them might seem inappropriate or useless at first. For example, option buttons have a Click event. But why? It makes sense to _trap_ (to respond in code to an event such as a user's mouse click) using command buttons in a static dialog box. (A static dialog box is the most common type. The controls don't change or move.) However, as you'll see in the next chapter, in a dynamic dialog box, you may want to trap the click on an option button and display further controls to get additional input from the user. + +* * * + +Once the code attached to a button has run, execution returns to the form (if it's still displayed) or to the procedure that called the form. + +Note that code that runs directly in response to an event is called an _event procedure_ or _event handler_. An event procedure can call other procedures as necessary, so multiple procedures can be run indirectly when a single event handler Sub is triggered. + +## Loading and Unloading a Form + +You load a form by using the Load statement, and unload it by using the Unload statement. The Load statement loads the form into memory so that it's available to the program but doesn't display the form; for that you use the Show method (discussed in the next section). The Unload statement unloads the form from memory and releases any memory associated with that object. If the form is displayed when the Unload statement runs, VBA removes the form from the screen. + +The syntax for the Load and Unload statements is straightforward: + + Load _UserForm1_ + Unload _UserForm1_ + +Here, _UserForm1_ is the name of the user form or dialog box. For example, the following statement loads the dialog box named frmMyDialog: + + Load frmMyDialog + +## Displaying and Hiding a Form + +To display a form, you use the Show method; to hide a form, you use the Hide method. For example, the following statement displays the form named frmMyDialog: + + frmMyDialog.Show + +If you execute a procedure containing this line, the frmMyDialog form appears onscreen so the user can interact with it: enter text in its text boxes, select or clear its check boxes, use its drop-down lists, click its buttons, and so on. + +When the user closes the form (by clicking the Close button on its title bar or by clicking a command button that dismisses it), the form disappears from the screen and the procedure continues to run. But until you retrieve settings from the form and take action on them, the form has no effect beyond its graphical display. + +You can display a form by using the Show method without explicitly loading the form with a Load command first; VBA takes care of the implied Load command for you. There's no particular advantage to including the Load command, but it might make your code easier to read and to debug. For example, the two procedures shown here have the same effect: + + Sub Display_Dialog() + Load frmMyDialog 'loads the form into memory + frmMyDialog.Show 'displays the form + End Sub + + Sub Display_Dialog() + frmMyDialog.Show 'loads the form into memory and displays it + End Sub + +If you run a Hide method without having loaded the form into memory by using the Load statement or the Show method, VBA loads the form but does not display it onscreen. + +Once you've displayed the form, take a moment to check its tab order by pressing F5 and then moving through it using the Tab key. When you first open the form, is the focus on the appropriate control, the control the user is most likely to want to interact with first? 
When you move forward from that control, is the next control that is selected the next control that the user will typically need to use? Adjust the tab order as necessary, as described in "Adjusting the Tab Order of a Form" earlier in this chapter. +
## Setting a Default Command Button +
To specify a default command button in a form, set that command button's Default property to True. VBA selects the default button when it displays the form so that if the user simply presses the Enter key to dismiss the dialog box, this button receives the keystroke. +
Only one button can be the default button at any given time. If you set the Default property of any button to True, VBA automatically changes to False the Default property of any other button previously set to True. +
# Retrieving the User's Choices from a Dialog Box +
To make a form do something, your code will usually respond to the user's input. The following sections first cover the VBA commands for retrieving information from a dialog box. Then you'll see how to retrieve the user's choices, first from a relatively simple dialog box and then from a more complex form. +
## Returning a String from a Text Box +
To _return_ (retrieve) a string from a text box, your code can check its Value property or Text property after the user has clicked an OK or Cancel button or otherwise dismissed the dialog box. +
For example, if you have a text box named txtMyText, you could return its value and display it in a message box by using the following line: +
    MsgBox txtMyText.Value +
* * * +
The Text Property of a Text Box Is Unique +
For a text box, the Value property and the Text property return the same information; for most other VBA objects, the Value property and the Text property return different information. +
* * * +
Recall that VBA supports both one-line and multiline text boxes. To create a multiline text box, select the text box in the user form or in the drop-down list in the Properties window and set its MultiLine property to True. The user can then enter multiple lines in the text box and start new lines by pressing Shift+Enter. +
* * * +
Quick Changes for Two-State Properties +
Here's a tip: If you're changing a Boolean (two-state, True versus False) property—like Enabled, Visible, or MultiLine—just double-click the value in the Properties window. For example, to change the default False setting for MultiLine, double-click False in the Properties window. It changes to True. (This doesn't work with the Value property.) +
* * * +
To add a horizontal or vertical scroll bar to a text box, set its ScrollBars property to 1 - fmScrollBarsHorizontal (for a horizontal scroll bar), 2 - fmScrollBarsVertical (for a vertical scroll bar, which is usually more useful), or 3 - fmScrollBarsBoth (for both). +
## Returning a Value from an Option Button +
A regular option button is a binary control, so it can have only two values: True and False. True indicates that the button is selected, False that it's unselected. You can check an option button's value with a simple If... Then structure.
For example, if you have two option buttons, named optSearchForFile and optUseThisFile, you can check their values and find out which was selected by using the following code: +
    If optSearchForFile = True Then
        'optSearchForFile was selected; take action on this
    Else 'optSearchForFile was not selected, so optUseThisFile was
        'take action for optUseThisFile
    End If +
Remember that Value is the default property of the OptionButton control. The previous code checks the value of the default property of the control, so you need not specify the property in your code. Default properties can be omitted as a kind of shorthand programming. The first line of code could be written out more fully as If optSearchForFile.Value = True Then. In the code example, I chose the shorter form, leaving the Value property implied. You could be more succinct still and leave = True implied as well: If optSearchForFile Then. +
With more than two option buttons, use an If... Then... ElseIf condition or a Select Case statement to determine which option button is selected. +
* * * +
You Can't Directly Test for a Null Value +
This is a bit esoteric, but as you saw earlier in this chapter, an option button or a check box can also have a null value if its TripleState property is set to True. Null means basically "neither completely true nor false"—the selected paragraph is _partially_ boldface, so its FontStyle is both bold and regular. If you allow your option buttons or check boxes to have a null state, you'll need to check for that as well in your procedures. You can't directly check for the control's value being Null (a comparison such as opt1.Value = Null yields Null, never True), so use the IsNull function, or use an If statement or Select Case statement to test True and False first. If the Value of the control is neither True nor False, it must be Null. +
* * * +
## Returning a Value from a Check Box +
Like an option button, a regular check box can only be either True or False, so you can use an If... Then structure to check its value. Here's an example: +
    If chkDisplayProgress = True Then
        'take actions for chkDisplayProgress
    End If +
Again, you're checking the default property of the control here—the Value property. The first line of code could also be written as If chkDisplayProgress.Value = True Then. +
Sometimes you'll need to take an action if the check box was cleared (deselected) rather than selected. For example, if the user clears the check box, you may need to turn off a configuration option. +
## Returning a Value from a List Box +
List boxes start out empty. So, before you can ask the user to choose an item in a list box, you must first fill the box with items from which the user can choose—you must tell VBA which items to display. To do so, you create a procedure to _initialize_ (prepare) the user form and add the items to the list box before displaying it: +
1. Right-click the name of the user form in the Project Explorer and choose View Code from the context menu to display (in the Code window) the code for the controls assigned to the dialog box. Or you can just double-click somewhere in the background on the user form to go to Code view. Recall that you can toggle between the Code window (press F7) and the form-design window (Shift+F7). +
2. In the Object drop-down list (on the top left of the Code window), make sure UserForm is selected. +
3. Choose Initialize from the Procedure drop-down list (on the top right of the Code window).
The Visual Basic Editor creates a new procedure named Private Sub UserForm_Initialize for you, inserting it at the end of the procedures currently displayed in the Code window:

    Private Sub UserForm_Initialize()
    End Sub

Here's a tip: VBA runs a UserForm_Initialize procedure every time the user form is brought to life. This procedure is a good place to add items to a list box or combo box or to set properties of other controls on the user form. In other words, this Initialize event is where you write code to do any necessary preliminary housekeeping before displaying the form to the user.

4. To add items to a list box, you can use the AddItem method for the list box object (here the box is named lstBatteries) with a text string in quotation marks to display the ID number of each battery in the list box:

    lstBatteries.AddItem "Battery #A4601"
    lstBatteries.AddItem "Battery #A4602"
    lstBatteries.AddItem "Battery #A4603"
    lstBatteries.AddItem "Battery #A4604"

* * *

The Initialize Event Is Flexible

By adding items when you initialize the form, you can add different numbers of items as appropriate. For example, if you wanted the user to pick a document from a particular folder, you could create a list of the documents in that folder on the fly in your code during runtime and fill the list box with the documents' names.

* * *

To retrieve the user's choice from a single-select-style list box, check the Value property in your code, as in this example:

    MsgBox "You chose this entry from the list box: " & lstBatteries.Value

Single-select list boxes are like a set of option buttons—the user is allowed to select only one of them.

When you use the MultiSelect property to create a list box capable of multiple selections, you can no longer use the Value property to return the items selected in the list box. When MultiSelect is set to True, Value always returns a null value. Instead, you use the Selected property to determine which rows in the list box are selected and the List property (it's an array) to return the contents (the values) of each selected row.

The following statements use a For... Next loop to build a string named strMsg containing the entries selected from a multiselect list box:

    strMsg = "You chose the following entries from the list box: " & vbCr
    For i = 1 To lstBatteries.ListCount
        If lstBatteries.Selected(i - 1) = True Then
            strMsg = strMsg & lstBatteries.List(i - 1) & vbCr
        End If
    Next i
    MsgBox strMsg

## Returning a Value from a Combo Box

To return a value from a combo box (a control that is, in effect, a combination list box and text box), you add items to the combo box list in an Initialize procedure and then check the Value of the combo box after the user has dismissed the dialog box. (The combo box control doesn't offer multiple-selection capabilities, so Value is the property to check.)

For example, you would use the following code to add items to a combo box named cmbColor:

    Private Sub UserForm_Initialize()
        cmbColor.AddItem "Red"
        cmbColor.AddItem "Blue"
        cmbColor.AddItem "Yellow"
    End Sub

To return the item the user chose in the combo box, retrieve the Value property:

    Result = cmbColor.Value

The item retrieved from a combo box can be either one of the items assigned in the Initialize procedure or one that the user has typed into the text-box portion of the combo box.
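If you want the combo box to start out with a default choice already selected rather than empty, you can also set its ListIndex property in the Initialize procedure. Here's a minimal sketch that extends the cmbColor example above (choosing the first item as the default is just an assumption for illustration):

    Private Sub UserForm_Initialize()
        With cmbColor
            .AddItem "Red"
            .AddItem "Blue"
            .AddItem "Yellow"
            'Preselect the first item ("Red") so Value isn't empty
            .ListIndex = 0
        End With
    End Sub

Because ListIndex is zero-based, 0 selects the first item in the list; the default setting of -1 leaves the combo box empty.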
+ +# Examples of Connecting Forms to Procedures + +The following sections show you two examples of how you can create a procedure and then design a form that works with it to make the procedure more useful and powerful. In the first example, you'll record a macro in Word and then link a form to that code. In the second example, which will work with any VBA-enabled application, you'll create a user form and its associated code from scratch. + +## Word Example: The Move-Paragraph Procedure + +This first example moves the current paragraph up or down within the document by one or two paragraphs in Word. + +### Recording the Procedure + +Start by recording a procedure in Word to move the current paragraph. In the procedure, you need to record the commands for the following actions: + + * Selecting the current paragraph + * Cutting the selection and then pasting it + * Moving the insertion point up and down the document + * Inserting a bookmark, moving the insertion point to it, and then deleting the bookmark + +We want our finished procedure to display a dialog box with option buttons for moving the current paragraph up one paragraph, up two paragraphs, down one paragraph, or down two paragraphs. The dialog box should also include a check box that indicates the user wants the insertion point returned to its original position at the end of the procedure. Because this is presumably desirable default behavior for the procedure, this check box is selected by default. Users can clear the check box if they don't want to return the insertion point to its original position. + +First, start Word and create a new, blank, scratch document (press Ctrl+N), and enter three or four paragraphs of text—just about anything will do, but it'll be easier to have recognizable text so that you can make sure the procedure is moving paragraphs as it should. Then place the insertion point in one of the paragraphs you've just entered and start recording a macro as discussed in Chapter 1, "Recording and Running Macros in the Office Applications": + +1. Click the Record Macro icon on the status bar or the Record Macro icon in the code section of the Ribbon's Developer tab. Either way, you see the Record Macro dialog box. + +2. Type the name for the macro, **Move_Paragraph** , in the Macro Name text box and a description in the Description text box. + +3. Choose a template or document, if necessary, in the Store Macro In drop-down list. (You probably don't want to add this to the global NewMacros module in the Normal.dotm file. Why clutter it up with practice macros?) + +4. If you want, use the Button or Keyboard button to create a Quick Access Toolbar button or keyboard shortcut for the macro. + +5. Click the OK button to start recording the macro. + +Record the following actions in the macro: + +1. Insert a bookmark at the current position of the insertion point by clicking the Bookmark icon in the Links section of the Ribbon's Insert tab. This displays the Bookmarks dialog box. Enter a name for the bookmark, and click the Add button. In this example, the bookmark is named Move_Paragraph_Temp to indicate that it's a temporary bookmark used for the Move_Paragraph procedure. + +2. Select the current paragraph by pressing F8 four times. The first press of F8 activates Extend mode, the second selects the current word, the third selects the current sentence, and the fourth selects the current paragraph. Press the Esc key to turn off Extend mode once the paragraph is selected. + +3. 
Cut the selected paragraph by using one of the variations of the Cut command (for example, press either Ctrl+X or Shift+Delete, or click the Cut icon in the Ribbon's Clipboard section).

4. Move the insertion point up one paragraph by pressing Ctrl+ ↑.

5. Paste the cut paragraph back in by using a Paste command (for example, press Ctrl+V or Shift+Insert, or click the Paste button on the Home tab of the Ribbon).

6. Move the insertion point down one paragraph by pressing Ctrl+ ↓.

7. Move the insertion point up two paragraphs by pressing Ctrl+ ↑ twice.

Note that if you started with the insertion point at the beginning of the first paragraph in the document, you'll only be able to move the insertion point up one paragraph. This doesn't matter—press the keystroke anyway to record it. If Word beeps at you, ignore it.

8. Move the insertion point down two paragraphs by pressing Ctrl+ ↓ twice. (If in doing so you hit the end of the document after the first keystroke, don't worry—perform the second keystroke anyway to record it. Word may sound a beep.)

9. Open the Bookmarks dialog box again (click the Bookmark icon in the Links section of the Ribbon's Insert tab), select the Move_Paragraph_Temp bookmark, and click the Go To button to go to it. Then click the Delete button to delete the Move_Paragraph_Temp bookmark. Click the Close button to close the Bookmarks dialog box.

10. Stop the Macro Recorder by clicking the Stop Recording icon on the status bar or the Stop Recording icon in the code section of the Ribbon's Developer tab.

Open the recorded macro in the Visual Basic Editor by pressing Alt+F8, selecting the macro's name in the Macros dialog box, and clicking the Edit button.

You should see a macro that looks something like this:

    1. Sub Move_Paragraph()
    2. '
    3. ' Move_Paragraph Macro
    4. ' Move a paragraph up or down
    5. '
    6. With ActiveDocument.Bookmarks
    7. .Add Range:=Selection.Range, Name:="Move_Paragraph_Temp"
    8. .DefaultSorting = wdSortByName
    9. .ShowHidden = False
    10. End With
    11. Selection.Extend
    12. Selection.Extend
    13. Selection.Extend
    14. Selection.Extend
    15. Selection.EscapeKey
    16. Selection.Cut
    17. Selection.MoveUp Unit:=wdParagraph, Count:=1
    18. Selection.Paste
    19. Selection.MoveDown Unit:=wdParagraph, Count:=1
    20. Selection.MoveUp Unit:=wdParagraph, Count:=2
    21. Selection.MoveDown Unit:=wdParagraph, Count:=2
    22. Selection.GoTo What:=wdGoToBookmark, Name:="Move_Paragraph_Temp"
    23. ActiveDocument.Bookmarks("Move_Paragraph_Temp").Delete
    24. With ActiveDocument.Bookmarks
    25. .DefaultSorting = wdSortByName
    26. .ShowHidden = False
    27. End With
    28. End Sub

You can probably read this macro code easily enough by now:

 * Line 1 starts the macro, and line 28 ends it. Lines 2 and 5 are blank comment lines around the comment lines showing the macro's name (line 3) and description (line 4).
 * Lines 6 through 10 contain a With statement that adds the Move_Paragraph_Temp bookmark. Lines 8 and 9 are unnecessary here, but the Macro Recorder records all the settings in the Bookmarks dialog box, including the setting for the Sort By option button and the Hidden Bookmarks check box.
 * Lines 11 through 14 use the Extend Selection feature to select the current paragraph, and line 15 records the Esc keypress that turns off Extend mode.
 * Lines 17, 19, 20, and 21 record the syntax for moving the insertion point up and down one paragraph and two paragraphs, respectively.
 * Line 16 records the Cut command and line 18 the Paste command.
 * Line 22 moves the insertion point to the Move_Paragraph_Temp bookmark, and line 23 deletes the bookmark. Lines 24 through 27 again record the settings in the Bookmarks dialog box, which you don't need here either.

If you wish, you can quickly delete unnecessary lines of code, and collapse the first With structure, to create a more succinct, more easily understood version of the code:

    1. Sub Move_Paragraph()
    2. ActiveDocument.Bookmarks.Add Range:=Selection.Range, _
       Name:="Move_Paragraph_Temp"
    3. Selection.Extend
    4. Selection.Extend
    5. Selection.Extend
    6. Selection.Extend
    7. Selection.EscapeKey
    8. Selection.Cut
    9. Selection.MoveUp Unit:=wdParagraph, Count:=1
    10. Selection.Paste
    11. Selection.MoveDown Unit:=wdParagraph, Count:=1
    12. Selection.MoveUp Unit:=wdParagraph, Count:=2
    13. Selection.MoveDown Unit:=wdParagraph, Count:=2
    14. Selection.GoTo What:=wdGoToBookmark, _
        Name:="Move_Paragraph_Temp"
    15. End Sub

### Creating the Dialog Box

Next, create the dialog box for the procedure (see Figure 14.16):

Figure 14.16 The Move Current Paragraph dialog box that you will connect to the Move_Paragraph macro

1. Start a user form in the Visual Basic Editor by clicking the Insert button's drop-down list and choosing UserForm (or just click the Insert button if it's already showing the UserForm icon) or by choosing Insert ⇒ UserForm.

2. Use the Properties window for the user form to set its Name and Caption properties. Click in the cell next to the Name cell and enter the Name property there, and then click in the cell next to the Caption cell and enter the Caption property. The example user form is named frmMoveParagraph and has the caption Move Current Paragraph so that the name of the form is closely related to the text the user will see in the title bar of the dialog box but different from the procedure name (Move_Paragraph).

3. Place two frames in the user form, as shown in Figure 14.17, to act as group containers in the dialog box:

a. Double-click the Frame tool in the Toolbox, and then click and drag in the user form to place each frame.

b. Align the frames by selecting them both and choosing Format ⇒ Align ⇒ Lefts.

c. With the frames still selected, verify that they are the same width by choosing Format ⇒ Make Same Size ⇒ Width. (Don't choose Format ⇒ Make Same Size ⇒ Height or Format ⇒ Make Same Size ⇒ Both. The top frame will need to be taller than the bottom frame.)

d. Caption the top frame **Movement** and the bottom frame **Insertion Point** by selecting each in turn and then setting the Caption property in the Properties window. Then name the top frame **fraMovement** and the bottom frame **fraInsertionPoint**.

Figure 14.17 Start by placing two frames in the user form.

4. Place four option buttons in the Movement frame, as shown in Figure 14.18:

Figure 14.18 Place four option buttons in the Movement frame like this.

a. Double-click the OptionButton tool in the Toolbox, and then click in the Movement frame to place each option button. This time, don't click and drag—just click to place a normal-width option button.

b. When you've placed the four option buttons, click the Select Objects button in the Toolbox to restore the selection pointer. Then select the four option buttons and align them with each other by choosing Format ⇒ Align ⇒ Lefts. Even out any disparities in spacing by choosing Format ⇒ Vertical Spacing ⇒ Make Equal.
If necessary, use the other items on the Format ⇒ Vertical Spacing submenu—Increase, Decrease, and Remove—to adjust the amount of space between the option buttons. (You can do all these things freehand if you prefer by just eyeballing. Drag them around until you have them neatly positioned and sized.) + +c. Change the caption for each option button by setting the Caption property in the Properties window. Caption them as illustrated in Figure 14.18: **Up one paragraph** , **Up two paragraphs** , **Down one paragraph** , and **Down two paragraphs**. These option buttons will control the number of paragraphs the procedure moves the current paragraph. + +d. If you need to resize the option buttons to make all the text in the captions visible, select them and group them by right-clicking and choosing Group from the context menu, by choosing Format ⇒ Group, or by clicking the Group button on the UserForm toolbar. Then select the group and drag one of the handles to resize all the option buttons evenly. For example, to reveal hidden text that's cut off on the right side, drag the handle at the right midpoint of the group outward. + +e. Name the option buttons **optUpOne** , **optUpTwo** , **optDownOne** , and **optDownTwo** , respectively, by changing the Name property of each in turn in the Properties window. + +* * * + +Option Buttons Are Mutually Exclusive + +By default, all the option buttons on a user form (if they're not contained within a frame) are part of the same option group. This means that only one of these option buttons can be selected at any given time. If you want to provide more than one group of option buttons on a user form, you need to specify the separate groups. The easiest way to do this is to position each group within a separate Frame control as you did in this example. Alternatively, you can specify a different GroupName property for each option button. + +* * * + +f. Next, set the first option button's Value property to True by selecting the default False value in the Properties window and entering **True** instead. Doing so will select the option button in the user form you're designing, and when the dialog box is displayed, that option button will be selected as the default choice for the option group. Set its accelerator key to _U_ by entering **U** as its Accelerator property. Set the Accelerator property of the second option button to _t_ , the third to _D_ , and the fourth to _w_. The Accelerator property is case sensitive only when the caption for the control contains both the uppercase and lowercase versions of the same letter. + +5. Place a check box in the Insertion Point frame, as shown in Figure 14.19: + +Figure 14.19 Place a check box in the Insertion Point frame. + +a. Click the CheckBox tool in the Toolbox and then click in the Insertion Point frame in the user form to place a check box of the default size. + +b. In the Properties window, set the name of the check box to **chkReturnToPreviousPosition** (a long name but a descriptive one). Then set its Caption property to **Return to previous position**. Set its accelerator key to _R_ by entering **R** as its Accelerator property. Finally, set the check box to be selected by default by entering **True** as its Value property. + +6. Next, insert the command buttons for the form (see Figure 14.20): + +Figure 14.20 Add two command buttons and set their properties. + +a. Double-click the CommandButton tool on the Toolbox and click to place the first command button at the bottom of the user form. 
Click to place the second command button, and then click the Select Objects button to restore the selection mouse pointer.

b. Size and place the command buttons by using the commands on the Format menu. For example, group the buttons, and then use the Format ⇒ Center In Form ⇒ Horizontally command to center the pair horizontally. You must group the buttons before doing this—if you simply select both of them, VBA centers one button on top of the other so that only the uppermost button is visible.

c. Set properties of the command buttons as follows: For the left-hand button (which will become the OK button), set the Name property to **cmdOK** , the Caption property to **OK** , the Accelerator property to **O** (that's _O_ as in _OK_ , not a zero), and the Default property to **True**. For the right-hand button (which will become the Cancel button), set the Name property to **cmdCancel** , the Accelerator property to **A** , the Caption property to **Cancel** , and the Cancel property to **True**. Leave the Default property set to False.

7. Now we attach our code to this form. Dive down into the Code window by double-clicking the Cancel button to display a procedure associated with it:

    Private Sub cmdCancel_Click()

    End Sub

Recall that the Editor creates a procedure for the most common event of whatever control (or the form) you double-click to get down into the Code window. For most controls, this will be the Click event, as it is for the CommandButton control.

Type an End statement between the lines:

    Private Sub cmdCancel_Click()

    End
    End Sub

This End statement removes the form from the screen and ends the current procedure—in this case, the Move_Paragraph procedure.

Now you'll attach code to the OK button, which is where things get interesting. When the user clicks the OK button, the procedure needs to continue executing and do all of the following:

 * Remove the dialog box from display by hiding it or by unloading it (or, preferably, both). As discussed earlier in the chapter, the choice is yours, but using both commands is usually clearest.
 * Check the Value property of the check box to see whether it was selected or cleared.
 * Check the Value property of each option button in turn to see which of them was selected when the OK button was clicked.

Now continue creating the Move Current Paragraph dialog box:

8. Double-click the OK button to display the code attached to it. (If you're still working in the Code window, select cmdOK in the Object drop-down list on the top left of the Code window.) The editor automatically creates the Click event procedure for this button.

First, enter the following two lines between the Private Sub and End Sub lines:

    frmMoveParagraph.Hide
    Unload frmMoveParagraph

The frmMoveParagraph.Hide line activates the Hide method for the frmMoveParagraph user form, hiding it from display on the screen. The Unload frmMoveParagraph line unloads the dialog box from memory.

* * *

Removing a Form Can Prevent Confusion

It isn't necessary to hide or unload a form to continue execution of a procedure, but if you don't, users may become confused. For example, if you click the OK button on a Print dialog box in a Windows application, you expect the dialog box to disappear and the Print command to be executed.
If the dialog box didn't disappear (but it launched the printing job in the background), you'd probably think it hadn't registered your click, so you'd click again and again until it went away. Then you'd end up printing multiple copies.

* * *

9. Next, the procedure needs to check the Value property of the chkReturnToPreviousPosition check box to find out whether to insert a bookmark in the document to mark the current position of the insertion point. To do this, enter a straightforward If... Then statement:

    If chkReturnToPreviousPosition = True Then

    End If

If the chkReturnToPreviousPosition check box is set to True—that is, if the check box is selected—the code in the lines following Then runs. That code consists of the lines for inserting a bookmark that you recorded earlier. Cut these lines from the recorded procedure and paste them into the If... Then statement like this:

    If chkReturnToPreviousPosition = True Then

        With ActiveDocument.Bookmarks
            .Add Range:=Selection.Range, Name:="Move_Paragraph_Temp"
        End With
    End If

If the check box is selected, the procedure inserts a bookmark; if the check box is cleared, the procedure passes over these lines.

**10.** Next, right after the End If, paste in the code for selecting the current paragraph and cutting it to the Clipboard:

    Selection.Extend
    Selection.Extend
    Selection.Extend
    Selection.Extend
    Selection.Cut

11. After this, you need to retrieve the Value properties from the option buttons to see which one was selected when the user chose the OK button in the dialog box. For this, you can again use an If condition—this time, an If... Then ElseIf... Else condition, with the relevant insertion-point-movement lines from the recorded procedure pasted in:

    If optUpOne = True Then

        Selection.MoveUp Unit:=wdParagraph, Count:=1
    ElseIf optUpTwo = True Then
        Selection.MoveUp Unit:=wdParagraph, Count:=2
    ElseIf optDownOne = True Then
        Selection.MoveDown Unit:=wdParagraph, Count:=1
    Else
        Selection.MoveDown Unit:=wdParagraph, Count:=2
    End If
    Selection.Paste

Here, optUpOne, optUpTwo, optDownOne, and optDownTwo (which uses the Else statement here and therefore isn't specified by name in the listing) are the four option buttons from the dialog box, representing the choice to move the current paragraph up one paragraph, up two paragraphs, down one paragraph, or down two paragraphs, respectively.

The condition is straightforward: If optUpOne is True (that is, if the option button is selected), the first Then condition runs, moving the insertion point up one paragraph from its current position (after the current paragraph is cut, the insertion point will be at the beginning of the paragraph that was after the current one). If optUpOne is False, the first ElseIf condition is evaluated; if the condition evaluates to True, the second Then condition runs; and if the condition evaluates to False, the next ElseIf condition is evaluated. If that condition, too, turns out to be False, the Else code is run. In this case, the Else statement means that the optDownTwo option button was selected in the dialog box, so the Else code moves the insertion point down two paragraphs.

Wherever the insertion point ends up, based on which option button the user chose, the next line of code (Selection.Paste) pastes in the cut paragraph from the Clipboard.
+ +**12.** Finally, the procedure must return the insertion point to where it was originally if the chkReturnToPreviousPosition check box is selected. Again, you can test for this with a simple If... Then condition that incorporates the go-to-bookmark and delete-bookmark lines from the recorded procedure: + + If chkReturnToPreviousPosition = True Then + + Selection.GoTo What:=wdGoToBookmark, _ + Name:=" Move_Paragraph_Temp" + ActiveDocument.Bookmarks("Move_Paragraph_Temp").Delete + End If + +If the chkReturnToPreviousPosition check box is selected, VBA moves the insertion point to the temporary bookmark and then deletes that bookmark. + +Listing 14.1 shows the full listing for the cmdOK button. + +**Listing 14.1**: The full listing + + 1. Private Sub cmdOK_Click() + 2. frmMoveParagraph.Hide + 3. Unload frmMoveParagraph + 4. If chkReturnToPreviousPosition = True Then + 5. With ActiveDocument.Bookmarks + 6. .Add Range:=Selection.Range, _ + Name:="Move_Paragraph_Temp" + 7. End With + 8. End If + 9. Selection.Extend + 10. Selection.Extend + 11. Selection.Extend + 12. Selection.Extend + 13. Selection.Cut + 14. If optUpOne = True Then + 15. Selection.MoveUp Unit:=wdParagraph, Count:=1 + 16. ElseIf optUpTwo = True Then + 17. Selection.MoveUp Unit:=wdParagraph, Count:=2 + 18. ElseIf optDownOne = True Then + 19. Selection.MoveDown Unit:=wdParagraph, Count:=1 + 20. Else + 21. Selection.MoveDown Unit:=wdParagraph, Count:=2 + 22. End If + 23. Selection.Paste + 24. If chkReturnToPreviousPosition = True Then + 25. Selection.GoTo What:=wdGoToBookmark, _ + Name:="Move_Paragraph_Temp" + 26. ActiveDocument.Bookmarks("Move_Paragraph_Temp").Delete + 27. End If + 28. End Sub + +Go ahead and try it. To test this example properly, you should remove the bookmark you inserted while recording the macro earlier in this chapter. To remove it, click the Bookmark item in the Links section in the Insert tab on Word's Ribbon. In the Bookmarks dialog box that opens, select Move_Paragraph_Temp and click the Delete button. + +Now open the scratch document in Word that you created earlier in this chapter and filled with several paragraphs of text. Press Alt+F11 to open the Visual Basic Editor. Double-click frmMoveParagraph in the Project Explorer to display the user form. Press F5 to run this procedure. Click the OK button in your user form and observe that the paragraphs were rearranged in the document. + +## General Example: Opening a File from a List Box + +This next example displays a user form that employs a list box from which the user can select a file to open. The user form is simple, as is its code. The macro includes a loop and an array to gather the names of the files in a folder and then displays the filenames in the list box. The user gets to select a file and click the Open button to open it. Figure 14.21 shows the user form in action, displaying Excel files. + +Figure 14.21 The user form you'll build in this example contains a list box that gives the user quick access to all current files. + +You can adapt this example to any of the Office 2013 applications discussed in this book by changing the filename to an appropriate type for that application and also modifying a couple of the key statements. The version of this example we'll look at now shows you how to create the procedure in Excel. + +### Building the User Form + +Follow these steps to build the user form: + +1. Start the application you want to work in. The example uses Excel. + +2. 
Display the Visual Basic Editor by pressing the Alt+F11 key or by clicking the Visual Basic icon in the Ribbon's Developer tab.

3. In the Project Explorer, right-click the project to which you want to add the user form and choose Insert ⇒ UserForm from the context menu to insert a default-size user form in the project.

4. Drag the handle at the lower-right corner of the user form to the right to make the user form a bit wider.

5. Set the Name property of the form to **frmOpen_a_Current_File** and its Caption to **Open a Current File**. Check the Width property. You want it to be about 350 points wide.

6. Click the Label button in the Toolbox, and then click in the upper-left corner of the user form to place a default-size label there. Activate the Properties window and set the properties of the label as shown in Table 14.9.

Table 14.9 Set these properties of your label

**Property** | **Value**
---|---
(Name) | lblInfo
AutoSize | True
Caption | Choose the file to open and click the Open button.
Left | 10
Top | 6
WordWrap | False

7. Click the ListBox button in the Toolbox, and then click below the label in the user form to place a default-size list box there. Set its properties as shown in Table 14.10.

Table 14.10 Set these properties of the ListBox

**Property** | **Value**
---|---
(Name) | lstFiles
Height | 100
Left | 10
Top | 25
Width | 300

8. Double-click the CommandButton button in the Toolbox, and then click twice at the bottom of the user form to place two default-size command buttons there. Set their properties as shown in Table 14.11.

Table 14.11 Set these properties of the command buttons

**Property** | **First Button Value** | **Second Button Value**
---|---|---
(Name) | cmdOpen | cmdCancel
Cancel | False | True
Caption | Open | Cancel
Default | True | False
Height | 21 | 21
Width | 55 | 55

9. Arrange the command buttons as follows:

a. Click the cmdCancel button to select it, and then drag it close to the cmdOpen button.

b. With the cmdCancel button still selected, Ctrl+click the cmdOpen button to add it to the selection.

c. Choose Format ⇒ Group to group the buttons.

d. Choose Format ⇒ Center In Form ⇒ Horizontally to center the buttons horizontally in the form.

e. Drag the group up or down as necessary.

(Or just drag them around and eyeball them into a pleasing position.)

## Creating the Code for the User Form

Follow these steps to create the code for the user form:

1. With the user form selected, press the F7 key to display the user form's code sheet.

2. In the declarations portion of the code sheet (just keep pressing the up-arrow key until you move to the very top of the Code window), enter an Option Base 1 statement to make the array numbering start at 1 instead of at 0:

    Option Base 1

3. Make sure that UserForm is selected in the Object drop-down list (top left of the code sheet), and then pull down the Procedure drop-down list (top right) and choose Initialize from it. The Visual Basic Editor enters the stub of an Initialize procedure in the code sheet, like this:

    Private Sub UserForm_Initialize()

    End Sub

4. Enter the statements for the Initialize procedure shown in Listing 14.2.

5. In the Object drop-down list, select cmdCancel. The Visual Basic Editor enters the stub of a Click procedure, as shown here. (Click is the default event for the CommandButton control, so the Visual Basic Editor assumes that you want to create a Click procedure.)
    Private Sub cmdCancel_Click()

    End Sub

6. Enter the statements for the cmdCancel_Click procedure shown in Listing 14.2.

7. In the Object drop-down list, select cmdOpen. The Visual Basic Editor enters the stub of a Click procedure.

8. Enter the statements for the cmdOpen_Click procedure shown in Listing 14.2.

9. Customize line 9 (in the Initialize procedure) and line 32 (in the cmdOpen_Click procedure) so that the code will work with the application you're using, as shown in the following list. The procedure as shown is set up to run for Excel, but you'll probably need to change the path to reflect where the target files are on your computer.

 * For Word, change the Workbooks.Open statement to Documents.Open:

    If lstFiles.Value <> "" Then Documents.Open _
        Filename:="c:\transfer\" & lstFiles.Value

 * For PowerPoint, change the Workbooks.Open statement to Presentations.Open:

    If lstFiles.Value <> "" Then Presentations.Open _
        Filename:="c:\transfer\" & lstFiles.Value

Listing 14.2 shows the full version of the code behind the Open a Current File user form.

**Listing 14.2**: Using a ListBox to open a file

    1. Option Base 1
    2.
    3. Private Sub UserForm_Initialize()
    4.
    5. Dim strFileArray() As String
    6. Dim strFFile As String
    7. Dim intCount As Integer
    8.
    9. **strFFile = Dir("c:\transfer\spreads\*.xlsb")**
    10. intCount = 1
    11.
    12. Do While strFFile <> ""
    13. If strFFile <> "." And strFFile <> ".." Then
    14. ReDim Preserve strFileArray(intCount)
    15. strFileArray(intCount) = strFFile
    16. intCount = intCount + 1
    17. strFFile = Dir()
    18. End If
    19. Loop
    20.
    21. lstFiles.List() = strFileArray
    22.
    23. End Sub
    24.
    25. Private Sub cmdCancel_Click()
    26. Me.Hide
    27. Unload Me
    28. End Sub
    29.
    30. Private Sub cmdOpen_Click()
    31. Me.Hide
    32. **If lstFiles.Value <> "" Then Workbooks.Open _**
        **Filename:="c:\transfer\spreads\" & lstFiles.Value**
    33. Unload Me
    34. End Sub

Listing 14.2 contains all the code that appears on the code sheet for the frmOpen_a_Current_File user form: a declarations section and three event procedures.

In the declarations section, line 1 contains the Option Base 1 statement, which makes any array used on the code sheet begin at 1 rather than at 0. Line 2 is a spacer.

Here's what happens in the UserForm_Initialize procedure (lines 3 to 23):

 * Line 3 begins the Initialize procedure for the user form. Line 4 is a spacer.
 * Line 5 declares the String array variable strFileArray. Line 6 declares the String variable strFFile. Line 7 declares the Integer variable intCount. Line 8 is a spacer.
 * Line 9 assigns to strFFile the result of a directory operation on the designated folder (here, c:\transfer\spreads\), but substitute your own path to a folder on your computer that contains files with an .xlsb filename extension. Enter your own path in line 32 as well.
 * Line 10 sets the intCount counter to 1. Note that if you don't use the Option Base 1 declaration for this procedure, you need to set intCount to 0 (or to the corresponding value for a different option base that you use). The first call to Dir, which specifies the pathname in an argument, returns the first file it finds in the folder (assuming it finds at least one file). Each subsequent call without the argument returns the next file in the folder, until Dir finds no more files.
 * Line 11 is a spacer.
 * Lines 12 through 19 contain a Do While... Loop loop that runs while strFFile isn't an empty string (""):
 * Line 13 makes sure that strFFile isn't a folder by comparing it to the single period and double period used to denote folders. If strFFile isn't a folder, line 14 uses a ReDim Preserve statement to increase the dimensions of the strFileArray array to the number in intCount while retaining the current information in the array, thus building the list of files in the folder.
 * Line 15 assigns to the intCount index of the strFileArray array the current contents of strFFile.
 * Line 16 then adds 1 to intCount, and line 17 sets strFFile to the result of the Dir function (the next filename matching the *.xlsb pattern in the designated folder, or an empty string when no more files match).
 * Line 18 ends the If condition. Line 19 contains the Loop keyword that will continue the loop as long as the Do While statement is True.
 * When the loop ends, line 21 sets the List property of the lstFiles list box in the dialog box to the contents of strFileArray, which now contains a list of all the files in the folder.
 * Line 22 is a spacer, line 23 ends the procedure, and line 24 is another spacer.

Here's what happens in the cmdCancel_Click procedure (lines 25 through 28):

 * Line 25 starts the cmdCancel_Click procedure, and line 28 ends it.
 * Line 26 hides the user form, using the Me keyword to reference it.
 * Line 27 unloads the user form from memory.

Here's what happens in the cmdOpen_Click procedure (lines 30 through 34):

 * Line 30 starts the cmdOpen_Click procedure, and line 34 ends it.
 * Line 31 hides the user form, again by using the Me keyword.
 * Line 32 checks to make sure the Value property of the lstFiles list box is not an empty string ("") and, if it is not, uses the Open method of the Workbooks collection to open the file selected in the list box. The statement appends the Value property of the list box to the path (c:\transfer\spreads\) to produce the full filename. Substitute your own path for c:\transfer\spreads\.
 * Line 33 unloads the user form from memory.

Remember that to test this example, you'll need to adjust lines 9 and 32 to include a file path on your machine where some XLSB files are stored. For Excel 2013, try this location: C:\Users\ _YourName_ \AppData\Roaming\Microsoft\Excel\XLSTART.

# Using an Application's Built-in Dialog Boxes from VBA

Some applications, such as Word and Excel, let you use their built-in dialog boxes via VBA. If a built-in dialog box offers the functionality you need, using it can be a great solution: you don't have to build a custom dialog box, just reference the built-in dialog box in your code.

You shouldn't even need to debug the dialog box, and users of your procedures will probably be familiar with the dialog box from their work in the application. These built-in dialog boxes are called _common dialog boxes_ , and we explored them briefly in the sidebar titled "Control a For...Next Loop with User Input via a Dialog Box" in Chapter 12.

## Displaying a Built-in Dialog Box

To display a built-in dialog box, you need to know its name and constant. You also must decide which method to use to display the dialog box.

### Finding the Dialog Box Name and Constant

Although Office 2013 no longer uses menus (with some exceptions, such as the Visual Basic Editor), built-in dialog boxes (in Word and other applications) are still identified by constants derived from the older, pre-Ribbon menu-style interface.
These constants start with the letters wdDialog (as in Word Dialog), followed by the name of the dialog box.

The name of the dialog box is derived from the menu commands that displayed the dialog box prior to the introduction of the Ribbon interface (with Office 2007). For example, to refer to the Open dialog box, you use the constant wdDialogFileOpen, because in previous versions of Word, you would have chosen File ⇒ Open to display that dialog box.

Or to display the Print dialog box (the old File ⇒ Print options), you use the constant wdDialogFilePrint, and to display the Options dialog box (Tools ⇒ Options), you use the constant wdDialogToolsOptions.

So, although the user interface has evolved beyond classic menus, the menu structure itself remains as part of the classification system for internal objects—such as these constants used to identify various dialog boxes.

Excel follows a similar but less rigid taxonomic convention. Built-in Excel dialog boxes are (for backward compatibility with older macro code) still identified by constants starting with the letters xlDialog followed by the name of the dialog box. The name of the dialog box is derived either from the classic menu commands that were required to display it or from the dialog box's title. For example, to refer to the Open dialog box, you use the constant xlDialogOpen (rather than xlDialogFileOpen).

The easiest way to find the name for the built-in dialog box you need is to search the Visual Basic Editor's Help system for "Built-in Dialog Box Argument Lists" in Word or Excel. (Access employs a different system for common dialog boxes, requiring you to import object libraries via its Visual Basic Editor's Tools ⇒ References menu and to use specialized objects.)

You can also view a list of Word or Excel built-in dialog boxes by displaying the Object Browser (press F2 in the Editor) and typing **wddialog** (for Word) or **xldialog** (for Excel) in the Search text box.

You use these constants with the Dialogs property, which returns the Dialogs collection object, which in turn contains all the built-in dialog boxes in the host application.

For example, to display Word's Save As dialog box, you use the Show method, as illustrated in the following statement:

    Dialogs(wdDialogFileSaveAs).Show

It's as simple as that. To display Word's Replace dialog box, just substitute wdDialogEditReplace for wdDialogFileSaveAs.

* * *

The Dialogs Collection Is Creatable in Word, but Not in Excel

In Word, the Dialogs collection is a "creatable object," meaning you can access it directly without going through the Application object. In Excel, however, the Dialogs collection is not creatable, so you must always include the Application object in this code, like this:

    **Application.**Dialogs(xlDialogOptionsGeneral).Show

* * *

### Choosing between the Show Method and the Display Method

VBA provides two methods of displaying built-in dialog boxes onscreen: Show and Display:

 * The Show method shows the specified Dialog object and then uses functions built into the Dialog object to carry out the user's requests. You don't need to write any code of your own. For example, if you use the Show method to display the wdDialogFileSaveAs dialog box and the user enters a name for the file in the File Name box and clicks the Save button, VBA itself automatically saves the file with the given name in the specified folder (and with any other options the user chose).
You didn't write any programming to save this file. + * The Display method merely displays the dialog box onscreen, but it does _not_ execute the actions the user requests in the dialog box. Instead, it allows you to fetch the settings (the user's requests and selections) from the dialog box once the user dismisses it, but then you must write your own code to carry out what the user requested. + +* * * + +Displaying a Particular Tab of a Word Dialog Box + +If the dialog box you want to display has tabs, you can display the tab of your choice by specifying the DefaultTab property. You refer to a tab by the name of the dialog box plus the word Tab and the name of the tab. For example, the constant for the Bullets And Numbering dialog box is wdDialogFormatBulletsAndNumbering, and the constant for its Outline Numbered tab is wdDialogFormatBulletsAndNumberingTabOutlineNumbered. Likewise, the Font dialog box is referred to as wdDialogFormatFont, and its Character Spacing tab is referred to as wdDialogFormatFontTabCharacterSpacing. You could display this tab by using the following statements: + + With Dialogs(wdDialogFormatFont) + .DefaultTab = wdDialogFormatFontTabCharacterSpacing + .Show + End With + +To get a list of all the tab constants, search for wdWordDialogTab in the Object Browser. + +* * * + +### Using the Show Method to Display and Execute a Dialog Box + +The Show method displays the specified dialog box and automatically responds to whatever actions the user takes in it. Show is useful when your user is merely going to perform a conventional interactive action. As a simple example, in a procedure that's supposed to perform certain formatting tasks on the current document, you could check to make sure a document was open before attempting to perform the formatting; then, if no document was open, you could display the built-in Open dialog box so that the user could open a file. (You might precede the Open dialog box with a message box explaining the problem.) Listing 14.3 shows the code for this part of the procedure. + +**Listing 14.3**: Using a common dialog box + + 1. If Documents.Count = 0 Then + 2. Proceed = MsgBox("There is no document open." _ + & vbCr & vbCr & _ + "Please open a document for the procedure to work on.", _ + vbOKCancel + vbExclamation, "Format Report") + 3. If Proceed = vbOK Then + 4. **Dialogs(wdDialogFileOpen).Show** + 5. If Documents.Count = 0 Then End + 6. Else + 7. End + 8. End If + 9. End If + 10. 'rest of procedure here + +Here's how the code works: + + * Line 1 checks the Count property of the Documents collection to see if no documents are open; if that's the case, the statements in lines 2 through 8 run. + * Line 2 displays a message box informing users that no document is open and asking them to open one for the procedure to work on. The message box has OK and Cancel buttons and stores the button chosen in the variable Proceed. + * Line 3 checks to see if the OK button was chosen; if it was, line 4 displays the Open dialog box so that users can select the file, which VBA will open when they click the Open button in the Open dialog box. + * Users can cancel the procedure at this point by clicking the Cancel button in the Open dialog box, so line 5 checks the Count property of the Documents collection again and uses an End statement to terminate execution of the procedure if there is still no document open. 
+ * If the OK button was not chosen, execution moves from line 3 to the Else statement in line 6, and the End statement in line 7 ends execution of the procedure. + * Line 8 contains the End If statement for the nested If statement, and line 9 contains the End If statement for the outer If statement. + * Line 10 contains a comment to indicate that you'd write more code here—the rest of the procedure would run from this point, which is reached only if a document is open. + +### Using the _Display_ Method to Display a Dialog Box + +Remember that unlike the Show method, the Display method displays a built-in dialog box but doesn't respond to any actions the user takes in the dialog box. Instead, you must write code that checks the settings that the user chose in the dialog box and then write more code to carry out the user's wishes. When you use the Display method, the user gets to work with familiar dialog boxes, but you totally control the behavior that results from that interaction. + +For example, you'll often need to find out which folder a procedure should be working in, such as when you need the location of a number of documents that the user wants to manipulate. To get the folder, you _could_ display a straightforward input box and prompt the user to type in the correct path to the folder—if the user knows the path and can type it in correctly. + +Perhaps a better solution is to display a list box containing the tree of drives, folders, and files on the user's hard drive, but to do this you need to dimension an array and fill it with the folders and filenames, _and_ you need to refresh the display every time the user moves up or down the tree—quite a lot of programming work. + +So why not just borrow all this functionality from a built-in common dialog box? It's already part of the Office applications. You can achieve the same result much more easily by using a built-in dialog box that has the tree built in (for example, the Open dialog box) and then retrieving the user's responses for your own purposes. + +If you need to execute the settings (user choices) in a built-in dialog box, you can use the Execute method. But you might want to check the user's selections in the dialog box before implementing them. If you find a problem, you could then, for example, display a dialog box of your own, such as an Input Box, asking for clarification. + +## Setting and Restoring Options in a Built-in Dialog Box + +Most of the built-in Word and Excel dialog boxes have arguments that you can use for retrieving or setting values in the dialog box. For example, the Open dialog box in Word has arguments for Name, ConfirmConversions, ReadOnly, LinkToSource, AddToMru (adding the document to the Most Recently Used document list on the Recent section of the File tab on the Ribbon), PasswordDoc, and more. Some of these are options that you'll see in the Open dialog box itself; others are associated options that you'll find on the various tabs of the Options dialog box. You can guess some argument names from the names of the corresponding controls in the dialog box, but other names aren't directly related. To learn the names, search for "Built-in Dialog Box Argument Lists" in the VBA Editor's Help system (choose MSDN on the Web, then search with Bing). 
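As a sketch of that fetch-then-act approach, the following statements (for Word) use the Display method with the Save As dialog box, retrieve the Name argument the user typed, and then call the Execute method to carry out the save. The return value of -1 means the user clicked OK; return values are listed in Table 14.12 later in this chapter.

    With Dialogs(wdDialogFileSaveAs)
        If .Display = -1 Then    'the user clicked OK rather than Cancel
            MsgBox "You entered this filename: " & .Name
            .Execute             'now carry out the save the user requested
        End If
    End With

Because Display doesn't carry out the save itself, you get the chance to inspect (or veto) the user's choices before calling Execute.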
+ +For example, the following statements set the contents of the File Name text box in the Save As dialog box in Word and then display the dialog box: + + With Dialogs(wdDialogFileSaveAs) + .Name = "Yellow Paint Primer" + .Show + End With + +Be aware that some arguments that applied to dialog boxes displayed by Office 2003 no longer apply to Office 2007, 2010, or 2013 dialog boxes. So you may need to experiment a bit to see if a particular legacy argument is still useful in the Office 2013 interface. + +If you change the settings in a dialog box that uses sticky (persistent) settings, it's a good idea to change them back at the end of your procedure so that users don't get unexpected results the next time they open the dialog box. + +## Which Button Did the User Choose in a Dialog Box? + +To find out which button the user clicked in a dialog box, check the return value of the Show method or the Display method. The return values are shown in Table 14.12. + +Table 14.12 Click return values + +**Return Value** | **Button Clicked** +---|--- +–2 | Close +–1 | OK +0 | Cancel +1 | The first command button +2 | The second command button +>2 (greater than 2) | Subsequent command buttons + +For example, you might want to cancel your whole procedure if the user clicks the Cancel button in a dialog box, like this: + + If Dialogs(wdDialogFileOpen).Show = 0 Then End + +## Specifying a Time-Out for a Dialog Box + +In some applications, including Word, you can display some built-in dialog boxes for a specified time rather than having them stay open until the user dismisses them by clicking OK or Cancel or some other button. To do so, you use the TimeOut Variant argument with the Show method or the Display method. You specify TimeOut as a number of units, each of which is approximately a thousandth of a second. (If the system is busy with many other tasks, the actual result might be a slightly longer delay.) So you could display the General page of the Word Options dialog box for about 10 seconds—long enough for the user to check the Name setting and change it if necessary—by using the following statements: + + With Dialogs(wdDialogToolsOptions) + .DefaultTab = wdDialogToolsOptionsTabUserInfo + .Show (10000) + End With + +* * * + +The TIMEOUT Argument Doesn't Work with Custom Dialog Boxes + +TimeOut doesn't work with custom dialog boxes you create, only with the built-in Word dialog boxes. Also, some built-in Word dialog boxes—such as the New dialog box (wdDialogFileNew) and the Customize dialog box (wdDialogToolsCustomize)—don't recognize the TimeOut option either. + +* * * + +Timing out a dialog box is especially useful for noncritical information like the username in this example because it allows the procedure to continue even if the user has left the computer. Likewise, you might want to time out a Save As dialog box in which the procedure suggested a viable filename but allowed users to override it if they were present. However, for a procedure in which the user's input is essential, you won't want to use the TimeOut argument. You want to compel the user to respond by at least clicking a button; in this context, the dialog box should not disappear all by itself via this timeout technique. + +# The Bottom Line + +**Understand what you can do with a custom dialog box.** + +Custom dialog boxes—user interfaces you design as forms in the Visual Basic Editor—are often needed in macros and other kinds of Office automation. 
You might, for example, want to display a dialog box that allows the user to specify whether to let a macro continue beyond a certain point in its code or cease execution. Perhaps your macro is searching through a document for a particular phrase; then when it finds that phrase, it displays a dialog box to users asking if they want to continue further. + +**Master It** + +Which VBA statement would you use to stop a macro from continuing execution? + +**Create a custom dialog box.** + +You use the Visual Basic Editor to both design a custom dialog box (form) and write code for macros. You can attach the various controls to a form and then enter code _behind_ the dialog box. + +**Master It** + +How do you switch between the form-design window (sometimes called the object window) and the Code window in the Visual Basic Editor? + +**Add controls to a dialog box.** + +It's easy in the Visual Basic Editor to add various controls—such as command buttons and text boxes—to a user form (a custom dialog box). + +**Master It** + +How do you add a command button to a custom dialog box? + +**Link dialog boxes to procedures.** + +Buttons, check boxes, option buttons—displaying various controls to the user is fine, but unless you write some code _behind_ these various user-interface objects, what's the point? Your macro's user shouldn't discover that clicking a button _does nothing_. + +Dialog boxes often display objects with which users can communicate their wishes to your code. Therefore, you write code that explores the values the user enters into controls and responds to whatever buttons the user might click. + +**Master It** + +Create a small custom dialog box that displays a message in a label control saying, "Would you like to know the current date and time?" Put an OK button and a Cancel button on this form. Write code that simply ends the procedure if the user presses the Cancel button but that displays the date and time in the label if the user clicks the OK button. If the user clicks OK a second time, end the procedure. + +**Retrieve the user's choices from a dialog box.** + +A major task of most dialog boxes is retrieving values that the user has specified in various controls by selecting check boxes and so on. Then you write code to carry out the user's wishes based on these retrieved values. This interaction via dialog box is the typical way that a user communicates with your procedures, and vice versa. + +**Master It** + +Create a new dialog box that contains three option buttons captioned Small, Medium, and Large and named optSmall, optMedium, and optLarge. Write code in each option button's Click procedure to change the button's caption to boldface when the button is clicked. +Chapter 15 + +Creating Complex Forms + +While simple dialog boxes tend to be static, more complex dialog boxes can be _dynamic_ : They can change when the user clicks certain elements in them. Such changes can include the following: + + * The application changes the information in the dialog box to reflect choices that the user has made. For example, if a user selects a particular check box, the application may make other check boxes unavailable (hidden or disabled) because the options offered by the other check boxes cannot be simultaneously chosen along with the first check box. + * The dialog box displays a hidden section of secondary, less frequently used options when the user clicks a button in the primary area of the dialog box. 
 * The application uses the dialog box to keep track of a procedure and to guide the user to the next step by displaying appropriate instructions and by activating relevant controls. In this chapter, you'll look at an example of this technique.

In this chapter, you'll start by investigating how to create dynamic forms. Such dialog boxes cost you a little more work than static dialog boxes, but they're a great way to both present information and allow the user to make choices. (Note that the terms _form_ and _dialog box_ can be used interchangeably, though dialog boxes tend to be smaller and simpler than forms.)

From dynamic dialog boxes you'll move on to multipage dialog boxes, which you use to present more information or options to the user than the eye and mind can comfortably encompass at once.

You'll then look at how to create a _modeless_ dialog box (one that users can leave onscreen while they continue to work in their application, much as Word's Research pane displays results from the thesaurus while you continue to edit the document).

The chapter ends by showing you how to work with the many events supported by the UserForm object and the controls you use on it. By using events, you can monitor what the user does and take action accordingly, or even prevent the user from doing something that doesn't seem like a good idea.

In this chapter you will learn to do the following:

 * Understand what a complex dialog box is
 * Reveal and hide parts of a dialog box
 * Create multipage dialog boxes
 * Create modeless dialog boxes
 * Explore all the form and control events

# Creating and Working with Complex Dialog Boxes

You should never use a complex dialog box when a simple one will do the trick and be easier for users to work with. If all a procedure needs is a pair of check boxes and a group of option buttons, there's no need to employ multiple pages of dynamically updating controls. But often, you will want to create complex dialog boxes (like the examples given at the beginning of this chapter) to provide users with the flexibility that your procedures demand.

## Updating a Dialog Box to Reflect the User's Choices

You'll find it relatively easy to change a form to reflect the options the user chooses. Your primary tool for doing this is the Click event, to which most controls placed on a form react and for which you can write code in the Code window that's "behind" (associated with) your form.

When you double-click a control on a form, the Code window for that form opens and a default Sub procedure is displayed. This procedure is associated with the clicked control. The procedure is automatically named after the control and the control's default event. If you double-click a command button, for example, the Code window opens with this button's default Click event:

    Private Sub CommandButton1_Click()

    End Sub

Whatever code you put into this procedure will be executed when the user clicks this particular command button.

Some controls have default events other than Click; you'll learn about the Change event as you work with complex dialog boxes, and you'll see the full slew of other events in the second half of the chapter.

Listing 15.1 in the next section shows you an example of code that updates a dialog box should the user click a button captioned More.

## Revealing a Hidden Part of a Form

Hiding part of a complex form is a great way to simplify the user's initial interaction with the dialog box.
Consider the Find And Replace dialog box in Word: When you first see it (by pressing Ctrl+H, or by clicking the Replace icon in the Editing section of the Ribbon's Home tab), you're shown only the part of the dialog box (see the top box in Figure 15.1) for the most common type of search and replace—just the target and the replacement, along with the option to replace them one by one, or _en masse_. + +Figure 15.1 Word's Find And Replace dialog box hides some of its options (top) until you click the More button to display its lower half (bottom). + +But, should you want to use the less common or more advanced options that the abbreviated version of the Find And Replace dialog box doesn't display by default, you can click the More button to reveal the bottom part of the dialog box, as shown at the bottom in Figure 15.1. Here are more rarely used options, such as matching prefix or case. + +You may want to take a similar approach with your own dialog boxes, hiding a subset of actions that most users won't need most of the time. To do so, you can use two techniques, either separately or in tandem: + + * Set the Visible property to False to hide controls that are located in a displayed part of the dialog box. Set the Visible property to True when you want to display these controls (after the user presses a More button or some such trigger). + * Increase the height or width (or both) of the dialog box to reveal an area containing further controls. The Find And Replace dialog shown in Figure 15.1 uses the technique of increasing the Height property of the box. + +As a simple example of the latter technique, consider the dialog box shown in Figure 15.2. When you display the dialog box, only the top part is visible; when you click the More button, the bottom part is displayed. Listing 15.1 contains the code behind the dialog box that makes all this happen. + +Figure 15.2 The top part of this Inventories form offers the most frequently used options. Clicking the More button reveals the rest of the dialog box (shown on the bottom), which contains less-often-used controls. + +**Listing 15.1**: Revealing part of a dialog box + + 1. Private Sub UserForm_Initialize() + 2. frmInventories.Height = 120 + 3. End Sub + 4. + 5. Private Sub cmdMore_Click() + 6. If cmdMore.Caption = "< < Less" Then + 7. cmdMore.Caption = "More > >" + 8. cmdMore.Accelerator = "M" + 9. frmInventories.Height = 120 + 10. Else + 11. frmInventories.Height = 240 + 12. cmdMore.Caption = "< < Less" + 13. cmdMore.Accelerator = "L" + 14. fraOptions.Enabled = True + 15. End If + 16. End Sub + 17. + 18. Private Sub chkArtNames_Click() + 19. If chkArtNames = True Then + 20. optFromDocument.Enabled = True + 21. optFromDocument = True + 22. optAutoNames.Enabled = True + 23. Else + 24. optFromDocument.Enabled = False + 25. optFromDocument = False + 26. optAutoNames.Enabled = False + 27. optAutoNames = False + 28. End If + 29. End Sub + 30. + 31. Private Sub cmdOK_Click() + 32. frmInventories.Hide + 33. Unload frmInventories + 34. 'create inventories here + 35. End Sub + 36. + 37. Private Sub cmdCancel_Click() + 38. End + 39. End Sub + +Listing 15.1 contains five short procedures that control the behavior of the dialog box: + +**UserForm_Initialize** + +Initializes the dialog box before it's displayed. + +**cmdMore_Click** + +Runs when the cmdMore button is chosen. This button bears the caption More when only the top half of the dialog box is displayed, and the caption Less when the full dialog box is displayed. 
+
+**chkArtNames_Click**
+
+Runs when the Enter Art Filenames check box is chosen.
+
+**cmdOK_Click**
+
+Runs when the OK button is chosen.
+
+**cmdCancel_Click**
+
+Runs when the Cancel button is chosen.
+
+Here's what happens in the code.
+
+ * The UserForm_Initialize procedure sets the Height property of the frmInventories user form to 120, which is enough to display only the top part of the dialog box. (To find the appropriate height for your dialog box, drag it to the height that looks right and note the Height property in the Properties window.) This procedure is necessary only if the user form is set to its full height at design time. By setting the user form to a height of 120 at design time, you could avoid having to use a UserForm_Initialize procedure. However, for a user form that has three or more different sizes—or for a user form with two different sizes, one of which needs to be chosen at runtime depending on environmental conditions—you'll need to use a UserForm_Initialize procedure.
+ * The cmdMore_Click procedure starts by checking in line 6 whether the Caption property of the cmdMore command button is < < Less, which means the full dialog box is currently displayed. If it is, line 7 sets the caption to More > >, the button text that will be used to display the bottom part of the dialog box again if necessary. Line 8 sets the Accelerator property of the cmdMore command button to M (to make the _M_ in _More_ the accelerator key for the button). Line 9 sets the Height property of frmInventories to 120, which is the depth required to show only the top part of the dialog box.
+
+* * *
+
+The Caption Property Works, But Using a State Variable Is Considered More Elegant
+
+Checking the Caption property of the cmdMore button is an effective way of determining the current state of this form (whether it's expanded or not), but this isn't the most elegant of techniques. It's a form of _hard coding_, considered by many to be a sleazy way of programming. Instead, you could maintain an internal state variable (a Static toggle) in which you store information about whether the dialog box is displayed in its full state or its partial state. Using an internal state variable avoids assuming that this caption will always remain the same. The code would fail to work correctly, for example, if the form were at some point _localized_ (adapted for a different language locale, where the words more and less are not used).
+
+* * *
+
+If the condition in line 6 is False, execution shifts from line 6 to the Else statement in line 10. This must mean that the Caption property of the cmdMore button is already set to More > >, so the dialog box is displayed in its smaller version and the More > > button is being clicked to expand the dialog box again. Line 11 sets the Height property of the user form to 240, thus displaying the lower part of the dialog box. Line 12 changes the Caption property of the cmdMore command button to < < Less. Line 13 sets the Accelerator property of the cmdMore command button to L.
+
+Line 14 enables the fraOptions frame (identified as Options in the dialog box and disabled in the user form, as are the optFromDocument option button and the optAutoNames option button), making it and the controls it contains available to the user. Line 16 ends the cmdMore_Click procedure.
+
+ * The chkArtNames_Click procedure (lines 18 to 29) runs when the Enter Art Filenames check box is clicked. This procedure enables and disables the option buttons below it, as appropriate. Line 19 checks to see if the chkArtNames check box is selected. If it is, the statements in lines 20 through 22 run. 
Line 20 sets the Enabled property of the optFromDocument option button (identified as From Document in the dialog box) to True, thus making it available, and line 21 selects this option button as the default choice. Line 22 enables optAutoNames, the option button identified as Automatic Naming in the dialog box.
+
+If the chkArtNames check box isn't selected, execution shifts to the Else statement in line 23, which directs execution to line 24. This line sets the Enabled property of the optFromDocument option button to False, disabling it. Line 25 then deselects this option button (whether it's selected or not). Line 26 disables the optAutoNames option button, and line 27 deselects it (again, whether it's selected or not). The End If statement in line 28 ends this If statement, and line 29 ends this procedure.
+
+ * The cmdOK_Click procedure (lines 31 to 35) shows the beginning of the code that runs once the OK button is clicked. Line 32 hides the Inventories dialog box, and line 33 unloads it from memory. Line 34 contains a comment indicating that the instructions for creating the inventories appear here.
+ * The cmdCancel_Click procedure contains only an End statement to end execution of the procedure if the user chooses the Cancel button.
+
+## Tracking a Procedure in a Form
+
+The next level of complexity in working with forms is using them to track the different stages of a procedure and to guide the user as to how to continue.
+
+Take a look at the Create New Employee Web Page dialog box shown in Figure 15.3. This dialog guides the user through a four-stage procedure to create a web page for a new employee. The first step is to identify the employee deserving of this honor by using either the drop-down list or the Select Other Employee command button in the step 1 frame. The second step is to enter suitable introductory, critical, or laudatory text about the employee. The third step is to select the most (or perhaps least) flattering photo of the employee to include in the web page. The fourth step is to save the web page to a folder on the company's intranet.
+
+Figure 15.3 The Create New Employee Web Page form provides users with instructions that are dynamically updated as they work their way through the procedure.
+
+When the user first displays the Create New Employee Web Page dialog box, they will see the version of the dialog box shown in Figure 15.3, with steps 2, 3, and 4 disabled and instructions for step 1 shown in the Instructions box at the top.
+
+When the user follows the instructions and selects the employee by using either the combo box drop-down list or the Select Other Employee command button, the code attached to the combo box drop-down list or the command button enables the step 2 frame, making its text box available to the user, as shown in Figure 15.4. Here is the code for the Change event of the cmbSelectEmployee combo box; the code for the Click event of the cmdSelectOtherEmployee command button is similar, although a little more complex.
+
+Figure 15.4 The second stage of the Create New Employee Web Page dialog box. Notice the changes from the first stage: the instructions in the Instructions frame have changed, and the use of the step 1 combo box drop-down list has enabled the step 2 frame.
+
+    Private Sub cmbSelectEmployee_Change()
+        lblEmployeeName = cmbSelectEmployee.Text
+        fraStep2.Enabled = True
+        lblInstructions = "Enter text in the Step 2 text box. 
" & _ + "For example, you might include brief biographical " & _ + "information on the employee, details of their position, " & _ + "or your hopes for their contribution to the company." + cmdClearEmployeeName.Enabled = True + End Sub + +* * * + +An Ellipsis Signals That a Dialog Box Can Be Displayed + +The Select Other Employee button in the Create New Employee Web Page dialog box ends with an ellipsis (...), as do some of the other command buttons. This ellipsis is the Windows convention for indicating that the choice (here a command button, but also other contexts) results in a dialog box being displayed rather than an action being taken immediately. + +* * * + +These are the changes that occur when the user completes step 1 of the dialog box: + + * The text of the label in the Instructions box at the top of the dialog box is changed to contain information about step 2 of the procedure. + * The name of the employee selected by the user is listed above the Employee label in the step 1 frame. + * The frame for step 2 is enabled (the text box it contains is enabled along with the frame). + +## Using Multipage Dialog Boxes and Tab Strip Controls + +VBA includes a MultiPage control, which enables you to create multipage dialog boxes, and a TabStrip control, which lets you create dialog boxes driven by tab strips (similar to the tabs on the Office applications' Ribbon). You've almost certainly used multipage dialog boxes (if you're not sure what they are, press Ctrl+D in Word to open the Font dialog box and see an example of one). You can access any page (one at a time) by clicking the tab at the top of the page. Each page contains a different set of controls and can have a different layout appropriate to the page's purpose. + +* * * + +A Tab Is Not a Page + +The tab is the little thing that sticks out from the top of the page, not the whole page itself. Many people refer to the pages as "tabs" because the tab is the part you click to access the page. It's perfectly okay to use these terms interchangeably, but this discussion uses _tab_ to mean only the tab component and _page_ to refer to the page qua page. + +* * * + +Multipage dialog boxes are great for packing a lot of information into a single form without having it take up the whole screen with a bewildering embarrassment of options. You'll need to divide the information into discrete sets of related information to fit it onto the pages. Each page can (and should) have a different layout of controls that govern the behavior of discrete items; the pages are normally separate in theme or purpose. Again, the Font dialog boxes in the Office applications have a Font tab and an Advanced tab. Look at the Tools ⇒ Options dialog box in the VBA Editor for another example. + +A dialog box that uses a tab strip differs from a multipage dialog box in that it contains a tab strip control containing multiple _tabs_ but not multiple _pages_. To the user, it looks as if different pages are being displayed, but the actual layout of the controls in the dialog box doesn't change. No matter which tab on the tab strip is selected, the set of controls remains the same, although the data displayed in the controls does change. This approach is useful for displaying records from a database. The tabs merely switch to a different record. + +Tab strips are useful when you need to display consistent sets of information, such as the records you might maintain on your company's customers. 
Each customer record has the same set of fields (analogous to the columns in a database): an account number, a name (perhaps several), an address, phone numbers, email addresses, URLs, an order history, an account balance, and so on. Therefore, you can use the same set of controls (text boxes and labels, for example) to display the information for each record. The tab strip control governs which customer's set of information is displayed in them. Because few databases have a small and fixed number of records, you'll need to populate the tab strip on the fly (during execution) with tabs and captions, but it works fine. + +Table 14.7 in Chapter 14, "Creating Simple Custom Dialog Boxes," explains the properties unique to the TabStrip control and MultiPage control. + +* * * + +**Limit the Number of Pages in Your Multipage Dialog Boxes** + +You can create dialog boxes containing dozens of tabs or dozens of pages. And if you run out of horizontal space to display the tabs, the VBA Editor adds a scroll bar to enable the user to scroll through the tabs. However, gigantic tab dialog boxes are impractical in the real world. As you doubtless know, not everything that's possible is also desirable. + +You'll probably want to avoid creating multipage dialog boxes with more than 10 or 12 pages because the wealth of information such a dialog box will contain is likely to overwhelm the user. + +If you need more than a dozen pages to organize the information in a dialog box, you're probably trying to present the user with too much data at once. Consider an alternative way of displaying it. Most likely, you should subdivide the information into smaller, easier-to-manage categories. For example, Microsoft spends countless hours spread over several years testing focus groups, quizzing users, and observing people's behavior when using Word. One result is that Microsoft's designers subdivide tasks and user interaction into various subcategories. Click the Page Layout tab in the Word Ribbon. Notice that the many tasks within this category are subdivided into logical areas: Page Setup, Paragraph, and Arrange. What's more, two of these subcategories—Page Setup and Paragraph—have small box icons you can click in the lower-right corner. Clicking these icons opens a separate dialog box with additional, less frequently used, options. + +Tabs are a different matter. If you use a tab strip to move through the records in a database recordset, you may need to use quite a few tabs in a given tab strip. Unless the number of tabs is absurdly large, this shouldn't normally be a problem. However, a better solution if you're attempting to manage a database might be to switch to one of the more robust, specialized database-related user interface controls available in Access, Visual Basic Express, or Visual Basic .NET. For more information, see + +www.microsoft.com/visualstudio/eng/products/visual-studio-express-products + +* * * + +### Multipage Dialog Boxes + +To create a multipage dialog box, click the MultiPage icon in the Toolbox, and then click in the user form where you want the control to appear. The VBA Editor places a MultiPage control with two pages, whose tabs have the labels Page 1 and Page 2. You can then move and size the control as usual. In typical usage, you'll want to create a MultiPage control that's only a little smaller than the user form it inhabits (like most of the multipage dialog boxes you'll see in Windows applications). 
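+
+You can also manage pages in code at runtime through the MultiPage control's Pages collection rather than through the design-time context menu described next. Here's a minimal sketch of the runtime approach, assuming a control named MultiPage1; the page name, caption, and control-tip text are hypothetical:
+
+    Private Sub UserForm_Initialize()
+        'add a third page at runtime and give it a caption and a control tip
+        Dim pg As MSForms.Page
+        Set pg = MultiPage1.Pages.Add("pgOptions", "Options")
+        pg.ControlTipText = "Less frequently used settings"
+    End Sub
+
+The Pages collection also provides Remove and Clear methods, so a form can grow or shrink to suit the data it displays.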
+ +Once you've created a MultiPage control, you work with a page on it by right-clicking its tab and using the resulting context menu: + + * To add a page, right-click the label and choose New Page from the context menu. VBA will add a new page of the default size and will name it Page _n_ , where _n_ is the next number after the current number of pages (even if the other pages have names other than Page1, Page2, and so on). + * To rename a page in a MultiPage control, right-click the label and choose Rename from the context menu. In the Rename dialog box (see Figure 15.5), enter the caption (the label text) for the page in the Caption text box, the accelerator key in the Accelerator Key text box, and any control-tip text (the tip the user sees when they move the mouse pointer over the tab for the page) in the Control Tip Text text box. Click the OK button to close the Rename dialog box. + +Figure 15.5 Use the Rename dialog box to set the caption, accelerator key, and control-tip text for a page. + + * To delete a page from a MultiPage control, right-click the label and choose Delete Page from the context menu. The VBA Editor will remove the page without prompting for confirmation. + * To move a page to a different place in the MultiPage control, right-click the label and choose Move from the context menu to display the Page Order dialog box (see Figure 15.6). In the Page Order list box, select the page or pages that you want to move (Shift+click to select multiple contiguous pages, Ctrl+click to select multiple noncontiguous pages), and then use the Move Up and Move Down buttons to rearrange the page or pages as desired. When you've finished, click the OK button to close the Page Order dialog box. + +Figure 15.6 Use the Move Up and Move Down buttons in the Page Order dialog box to change the order of pages in a MultiPage control. + + * To specify which page of a multipage dialog box to display by default, use the Value property of the MultiPage control. You can set this property either at design time or at runtime. For example, you could use an initialization procedure such as the one shown here to display the third page (identified by the value 2, because the page numbering starts at 0) of a dialog box with a MultiPage control called MyMulti at runtime: + + Sub UserForm_Initialize() + MyMulti.Value = 2 + End Sub + +Once you've created a multipage dialog box, you can populate its pages with controls using the techniques you learned in Chapter 14. Each control must have a unique name in the entire form (not just within the page on which it appears). + +When designing a multipage dialog box, keep the following issues in mind: + + * What's the best way to divide the information or options in the dialog box? What belongs on which page? Which information or options will the user expect to find grouped together? + * Which controls should appear on each page? Most dialog boxes need at least a pair of command buttons—such as OK and Cancel or OK and Close—available from each page to allow the user to dismiss the dialog box from whichever page they happen to end up on. In rare instances, you may want to force the user to return to a particular page in order to close a dialog box. In these cases, make sure each page that doesn't contain a command button to dismiss the dialog box tells the user where they will find such a command button. + * For settings, do you need to have an Apply button (as well as an OK button) to apply the changes on a particular page without closing the dialog box? 
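+
+If you do decide that an Apply button is warranted, its Click procedure typically carries out the same work as the OK button without dismissing the form. Here's a minimal sketch of the pattern; cmdApply, cmdOK, and the ApplySettings helper are hypothetical names rather than controls from this chapter's examples:
+
+    Private Sub cmdApply_Click()
+        'carry out the user's settings but leave the form onscreen
+        ApplySettings
+    End Sub
+
+    Private Sub cmdOK_Click()
+        'carry out the settings, then dismiss the form
+        ApplySettings
+        Me.Hide
+    End Sub
+
+    Private Sub ApplySettings()
+        'hypothetical helper: read the controls and act on their values
+    End Sub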
+
+Because each control in a multipage form has a unique name, when returning information from a multipage dialog box you need specify only the relevant object—you don't need to specify which page it's on.
+
+Figure 15.7 shows an example of a multipage dialog box. The first page contains the customer's personal contact information; the second, the customer's professional information; the third, the associations the customer belongs to; and the fourth, the certifications the customer holds.
+
+Figure 15.7 By using multiple pages in a dialog box, you can achieve a clean and uncluttered look that's also easily navigable.
+
+Most of the properties of the MultiPage control are straightforward, but a few deserve special mention:
+
+ * The Style property offers fmStyleTabs (the default setting, showing tabs for navigating between the pages), fmStyleButtons (which gives each page a rectangular button, with the button for the current page appearing pushed in), or fmStyleNone (which provides no means of navigating between the pages and no indication of the borders of the multipage dialog box). fmStyleNone can be useful for creating user forms that have two or more alternate layouts of which the user will only ever need to see one at a time. By including one set of controls on one page of the multipage dialog box and another set of controls on the other page, you can present two seemingly different dialog boxes by doing nothing more than changing which page of the MultiPage control is displayed. For example, you can use this approach to create a wizard that guides the user through a multistep process.
+ * The TabOrientation property controls where the tabs (or buttons) for the pages appear on the control. Your choices are fmTabOrientationTop (the default setting, placing the tabs at the top of the control), fmTabOrientationBottom, fmTabOrientationLeft, and fmTabOrientationRight. Experiment with the effects that the bottom, left, and right orientations offer, but unless they provide significant advantages over the more normal top orientation, use them sparingly if at all. Users won't thank you for deviating from the traditional, familiar interface unnecessarily.
+ * The MultiRow property controls whether a MultiPage control has one row of tabs for its pages (False) or multiple rows (True). When you have MultiRow set to True, the VBA Editor adds the second or subsequent rows of tabs when you run out of space on the first or current row.
+
+The MultiPage control doesn't have to take up the whole dialog box—in fact, most dialog boxes keep the key command buttons like OK and Cancel outside the multipage area so that they're available to the user no matter which page the user is on.
+
+That said, it is usually a good idea to make a MultiPage control the dominant part of a dialog box. In a complex and busy dialog box, a small MultiPage control can appear to be little more than a group box, and the user may miss the tabs, particularly if they're just skimming the controls looking for a particular option.
+
+### Using the Tab Strip Control
+
+Forms that use a tab strip are substantially different from multipage dialog boxes. A TabStrip control is used not to rearrange other controls but to change the data that appears in them as the user moves from one set of data to another. In other words, the layout of the controls remains static; just the values displayed in the controls change from page to page on the strip.
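+
+In skeleton form, the technique comes down to a Change event procedure that reloads the same controls for whichever tab the user selects. Here's a minimal sketch; the control names and the GetRecordName lookup are hypothetical, and Listing 15.2, later in this section, shows a complete working version:
+
+    Private Sub tabRecords_Change()
+        'same text box, different record: reload it for the selected tab
+        txtName.Text = GetRecordName(tabRecords.Value)
+    End Sub
+
+    Private Function GetRecordName(ByVal lngIndex As Long) As String
+        'hypothetical lookup: fetch this tab's record from your data source
+        GetRecordName = "Record " & (lngIndex + 1)
+    End Function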
+
+For instance, you might use a dialog box driven by a tab strip to view and update the records in a data source such as a Word table, an Excel spreadsheet, or an Access database. This example uses an Excel workbook in which information is stored on a number of worksheets. Figure 15.8 shows the DataSurfer dialog box, which is driven by a tab strip.
+
+The actual strip of tabs in a TabStrip control can appear above, below, or beside the controls that it contains. Above is the conventional—and default—position, just as it is in real-world recipe-card boxes and file drawers. But vertical and bottom tabs have shown up in eccentric Windows applications from time to time. As with the MultiPage control, use the TabOrientation property of the TabStrip control to specify whether the tab strip should appear at the top, bottom, left, or right of its control. But be sure to have some pretty good reason if you're departing from convention.
+
+Figure 15.8 Using a TabStrip control to create a multitab dialog box. The tab strip is used to control which set of information is displayed in the other controls in the dialog box.
+
+The tab strip can contain zero, one, or more tabs. For most purposes, there's little point in having only one tab on a tab strip, and even less in having no tab at all. But if you dynamically populate the tab strip with tabs in your procedures (as you're about to do in this next example) and create one tab for each record found, you may run into situations with only one record and thus a dialog box with only one tab—or even a tab strip without any tabs at all.
+
+Click the TabStrip button on the Toolbox, click in the user form to place the tab strip, and then drag it to an appropriate size. Bear in mind that a tab strip is only a visual display for the user's benefit. Unlike the MultiPage control, you establish the logical connection between the tab strip and the other controls through code. You can then add, rename, move, and delete tabs in the same way as you can pages in a MultiPage control.
+
+If you haven't placed the other controls for the dialog box, do so now.
+
+Once everything's in place, you write the code that will enable the tab strip to control the contents of the other controls. Listing 15.2 shows the code for the tab strip in the DataSurfer dialog box. This tab strip is named tabSurfer, and the code works with its Change event—the event procedure that _fires_ (is triggered and executes its code) when the user clicks a new tab on the strip.
+
+**Listing 15.2**: Programming a tab strip
+
+    1. Private Sub tabSurfer_Change()
+    2. If blnInitializing = False Then
+    3. With ActiveWorkbook.Sheets(tabSurfer.Value + 1)
+    4. 'load the contents of the worksheet that corresponds to the tab chosen
+    5. .Activate
+    6. txtFirstName.Text = .Cells(1, 2).Text
+    7. txtInitial.Text = .Cells(2, 2).Text
+    8. txtLastName.Text = .Cells(3, 2).Text
+    9. txtAddress1.Text = .Cells(4, 2).Text
+    10. txtAddress2.Text = .Cells(5, 2).Text
+    11. txtCity.Text = .Cells(6, 2).Text
+    12. txtState.Text = .Cells(7, 2).Text
+    13. txtZip.Text = .Cells(8, 2).Text
+    14. txtHomeArea.Text = .Cells(9, 2).Text
+    15. txtHomePhone.Text = .Cells(10, 2).Text
+    16. txtWorkArea.Text = .Cells(11, 2).Text
+    17. txtWorkPhone.Text = .Cells(12, 2).Text
+    18. txtWorkExtension.Text = .Cells(13, 2).Text
+    19. txtEmail.Text = .Cells(14, 2).Text
+    20. End With
+    21. End If
+    22. 
End Sub
+
+After specifying the worksheet, the code in Listing 15.2 essentially repeats itself for each of the text boxes that appear in the DataSurfer dialog box. This dialog box works with a data source implemented as Excel spreadsheets in the active workbook.
+
+Each worksheet in the workbook is one customer's record, with the name of the customer appearing on the worksheet's tab and the customer's data appearing in the second column: the first name in the first cell of the second column, the middle initial in the second cell, the last name in the third cell, and so on for the address, phone numbers (both home and work), and email address. So to get at any piece of information, you need to know the sheet of the record in question and the appropriate cell in the second column.
+
+Here's how the code works:
+
+ * Line 1 declares the procedure tabSurfer_Change, which executes automatically whenever the Change event of the tabSurfer tab strip fires. The Change event fires each time the user clicks a new tab, so you use this event to control the information displayed in the text boxes.
+ * The Change event also fires when a tab is added to (or removed from) the tab strip. Because the DataSurfer user form uses the Initialize event procedure to populate the tab strip with tabs (one per worksheet in the workbook), you do need to prevent the Change event procedure from running unnecessarily during the initialization phase of your program. So the user form declares a private Boolean variable named blnInitializing that the Initialize procedure sets to True while it's running and to False just before it ends. Line 2 of the Change event procedure checks to make sure that blnInitializing is False. If it's not, the Initialize procedure has fired the event, and the Change procedure does not need to load the information into the text boxes—so execution continues at line 21, just before the end of the procedure. But once the Initialize procedure has finished running, blnInitializing will be set to False, and the Change event procedure will run each time the user changes tabs in the tab strip.
+ * Line 3 begins a With statement that works with the appropriate worksheet in the active workbook: ActiveWorkbook.Sheets(tabSurfer.Value + 1). The Value property of the tabSurfer tab strip tells us which tab in the tab strip is selected. Because the first tab in the tab strip is numbered 0 and the first worksheet in the workbook is numbered 1, you need to add 1 to the Value of the tab strip to make the numbers match up.
+ * Line 4 is a comment. Line 5 uses the Activate method to activate the worksheet in question.
+ * Lines 6 through 19 then set the Text property of each text box in the user form to the contents of the corresponding cell in the second column on the worksheet. For example, line 6 sets the Text property of the txtFirstName text box (which appears under the First Name label in the dialog box) to the contents of the first cell in the second column: .Cells(1, 2).Text.
+ * Line 20 ends the With statement, line 21 ends the If statement, and line 22 ends the procedure.
+
+### Using Pictures in Forms
+
+VBA includes extensive graphics capabilities that allow you to make your forms look pretty much any way you want them to. This book doesn't go into design aesthetics in any detail, but there's much you can do to make your forms look good. You can fiddle with Format ⇒ Order to pile controls on top of each other. Controls like the command button have their own Picture properties, as do forms themselves. 
Take a look at Figure 15.9. It shows a photo inside an image control, a background texture in the form's picture property, and a command button that blends into the background because its BackStyle property is set to Transparent.
+
+Figure 15.9 VBA includes extensive graphics features—you can make your forms look any way you want them to.
+
+You can add a picture to a form by using an Image control. Click the Image button in the Toolbox, and then click in the user form where you want the Image control to appear. Once you've placed the Image control, you can size and move the picture just as you would any other control.
+
+* * *
+
+Ensure That You Include Any Necessary Graphics Files When You Deploy a Macro
+
+Make sure the picture you choose for an Image control or a user form's background is available to all computers that will display the dialog box. If the picture isn't available, it fails to appear in the dialog box, which spoils the effect.
+
+* * *
+
+To choose the picture that will appear in the Image control, select the Picture property in the Properties window and click the ellipsis button that then appears to the right of the entry. The VBA Editor displays the Load Picture dialog box. Select the picture file and choose the Open button. The Picture property in the Properties window registers the type of picture you selected—such as Bitmap—but not its filename, and the picture appears in the Image control so that you can see if it's an appropriate size.
+
+* * *
+
+Loading a Picture into an Image Control Programmatically
+
+When specifying the picture for an Image control _programmatically_ (the picture is loaded while the macro is executing, during runtime), you need to use a LoadPicture statement. Compare that to design time, when you can simply use the Properties window to assign a picture to the Picture property of the Image control. LoadPicture has the following syntax:
+
+    LoadPicture _filename_, [_WidthDesired_], [_HeightDesired_]
+
+_filename_ is a String argument specifying the name of the picture file to be loaded into the Image control. _WidthDesired_ is an optional Long argument specifying the width of the picture in twips, and _HeightDesired_ is an optional Long argument specifying the height of the picture, also in twips.
+
+For example, the following statement loads the picture named Rose.jpg that's located in the root directory of drive C:
+
+    Image1.Picture = LoadPicture("C:\Rose.jpg")
+
+* * *
+
+Once you've chosen the picture, you have various options for positioning it and formatting it:
+
+ * If necessary, set the alignment of the picture by using the PictureAlignment property. (If the picture fully fills the Image control—neither overlapping it nor leaving parts of it empty—you may not need to set the alignment for it.) Table 15.1 shows the constants and values for the PictureAlignment property.
+ * If necessary, clip, stretch, or zoom the picture by using the PictureSizeMode property: fmPictureSizeModeClip (0) clips the picture to fit the Image control; fmPictureSizeModeStretch (1) stretches or squeezes the picture so that it fits the Image control (this option often makes for strange effects); and fmPictureSizeModeZoom (2) enlarges or reduces the picture so that its nearest dimension exactly fits the width or height of the Image control without changing the picture's proportions (this option usually leaves an unfilled gap on the other side). 
+ * If you need to tile the image to take up the remaining space in the control, set the PictureTiling property to True. This option is rarely used with database work.
+ * If you need to adjust the position of the picture relative to its caption, set the PicturePosition property of the check box, command button, label, option button, or toggle button in question. Table 15.2 shows the constants and values for PicturePosition.
+
+Table 15.1 Constants and values for the PictureAlignment property
+
+**Constant** | **Value** | **Picture Alignment in Image Control**
+---|---|---
+fmPictureAlignmentTopLeft | 0 | Top left
+fmPictureAlignmentTopRight | 1 | Top right
+fmPictureAlignmentCenter | 2 | Centered
+fmPictureAlignmentBottomLeft | 3 | Bottom left
+fmPictureAlignmentBottomRight | 4 | Bottom right
+
+Table 15.2 Constants and values for the PicturePosition property
+
+**Constant** | **Value** | **Picture Position Relative to Caption**
+---|---|---
+fmPicturePositionLeftTop | 0 | Picture at the left, aligned with the top of the caption
+fmPicturePositionLeftCenter | 1 | Picture at the left, centered on the caption
+fmPicturePositionLeftBottom | 2 | Picture at the left, aligned with the bottom of the caption
+fmPicturePositionRightTop | 3 | Picture at the right, aligned with the top of the caption
+fmPicturePositionRightCenter | 4 | Picture at the right, centered on the caption
+fmPicturePositionRightBottom | 5 | Picture at the right, aligned with the bottom of the caption
+fmPicturePositionAboveLeft | 6 | Picture above the caption, left-aligned
+fmPicturePositionAboveCenter | 7 | Picture above the caption, centered
+fmPicturePositionAboveRight | 8 | Picture above the caption, right-aligned
+fmPicturePositionBelowLeft | 9 | Picture below the caption, left-aligned
+fmPicturePositionBelowCenter | 10 | Picture below the caption, centered
+fmPicturePositionBelowRight | 11 | Picture below the caption, right-aligned
+fmPicturePositionCenter | 12 | Picture in the center of the control, with the caption centered on the picture
+
+Once you've placed, sized, and formatted a picture, there are various possibilities for what you can do with it, such as using a picture's Click event to trigger an action. For example, you could display two graphics illustrating a choice of two formats for a document. Then the user could click the appropriate picture to signal their choice.
+
+## Creating a Modeless Dialog Box
+
+We're using VBA version 7, and ever since version 6 the language has offered the programmer an option to create a _modeless_ dialog box—one that users can leave onscreen while they continue to work in their application. In other words, they don't have to click an OK or Cancel button or otherwise dismiss the dialog box to regain the ability to interact with their application.
+
+You're doubtless familiar with modeless dialog boxes from working with Office. For example, the Find And Replace dialog box in Access, Word, and Excel is modeless, as is the Replace dialog box in PowerPoint.
+
+When you display a modeless dialog box, it takes the focus just as any modal dialog box does (its frame turns from gray to white and the X close icon in the upper right changes from dark gray to red, the indication in Windows 8's graphical scheme that the window has the focus).
+
+But you can click in the application window to transfer the focus back to that window. For example, you can continue typing in a Word document, even while the Find And Replace dialog box remains visible.
+
+Creating a modeless dialog box is as simple as setting the ShowModal property of the user form to False from its default setting of True.
+
+There are various situations where you might want to use a modeless dialog box rather than a modal one. As a simple example, you might create a procedure and dialog box in Word that collects information from the user for a memo or a report. By making the dialog box modeless, you could allow the user to copy information from an open document (or open other documents and gather information from them) and paste it into the dialog box—saving users from having to copy the information before invoking the dialog box and allowing them to copy multiple separate items easily. Likewise, you could create a modeless user form (perhaps shaped like a toolbar) that users could keep onscreen and use to automatically enter text into predefined sections of three or four other documents without losing their place in the current document.
+
+You can also use modeless dialog boxes to display complex sets of interrelated user forms in which the user needs to copy and paste information from one user form to another or at least to access different areas of two or more displayed user forms at the same time. 
Displaying multiple forms at once can be confusing to the user, but you may sometimes find it necessary.
+
+Most of the time, you'll probably want to use modal dialog boxes in your VBA procedures. With modal dialog boxes, users must deal with the dialog box before they can continue to work in the application, and there's no risk that they'll end up with multiple dialog boxes scattered around the screen in assorted states of disuse.
+
+* * *
+
+You Can Use Serial Modal Dialog Boxes
+
+You can't display both modal and modeless user forms at the same time, but you can display one modal dialog box from another modal dialog box. When users close the second modal dialog box, VBA returns them to the first modal dialog box by default. However, you can write code to make the second modal dialog box automatically close the first dialog box after it closes itself.
+
+* * *
+
+## Specifying a Form's Location Onscreen
+
+By default, VBA centers a dialog box in the middle of the application window as much as possible, which is the normal behavior for Windows applications. If you want to position a form elsewhere on the screen (for example, to avoid obscuring important data onscreen), set the StartUpPosition property for the user form. Table 15.3 explains the settings you can use.
+
+Table 15.3 StartUpPosition property settings
+
+**Setting** | **Value** | **Effect**
+---|---|---
+Manual | 0 | Displays the user form in the upper-left corner of the Windows Desktop.
+CenterOwner | 1 | Centers the user form horizontally and vertically in the _owner_ application—the application to which the user form belongs.
+CenterScreen | 2 | Centers the user form horizontally and vertically on the Desktop. In a multimonitor arrangement, this value centers the user form on the monitor containing the active window.
+WindowsDefault | 3 | Displays the user form in the default position for Windows dialog boxes.
+
+# Using Events to Control Forms
+
+This section discusses the events built into VBA for use with forms and with individual controls to give the programmer fine control over how user forms look and behave.
+
+So far in this chapter, you've used three of the most useful events:
+
+ * You used the Initialize event to add items to list boxes just before a form is displayed and to adjust the number of tabs on a tab strip.
+ * You used the Click event to take action when the user clicks a particular control in a user form. So far you've been using Click mostly for command buttons, but you can use it for just about any control—including the user form itself.
+ * You used the Change event to control what happens when the user changes the tab displayed on a tab strip.
+
+Table 15.4 lists the events that VBA supports and the objects and controls with which each can be used. 
+ +Table 15.4 Events that VBA supports and the objects and controls associated with them + +**Event** | **Occurs** | **Applies to These Controls and Objects** +---|---|--- +Activate | When the user form becomes the active window | UserForm +Deactivate | When the user form ceases to be the active window | UserForm +AddControl | When a control is added at runtime | Frame, MultiPage, UserForm +AfterUpdate | After the user has changed data in a control | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +BeforeDragOver | When the user is performing a drag-and-drop operation | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +BeforeDropOrPaste | When the user is about to release a dragged item or about to paste an item | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +BeforeUpdate | When the user has changed data in the control before the new data appears in the control | CheckBox, ComboBox, ListBox, OptionButton, ScrollBar, SpinButton, TextBox, ToggleButton +Change | When the Value property of a control changes | CheckBox, ComboBox, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton +Click | When the user clicks a control or object with the primary mouse button | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, TabStrip, ToggleButton, UserForm +DblClick | When the user double-clicks a control or object with the primary mouse button | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, TabStrip, TextBox, ToggleButton, UserForm +DropButtonClick | When the user displays or hides a drop-down list | ComboBox, TextBox +Enter | Just before one control on a user form receives the focus from another control | CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton +Exit | Just before one control on a user form loses the focus to another control | CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton +Error | When a control or object encounters an error | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +Initialize | After a user form is loaded but before it's displayed | UserForm +KeyDown | When the user presses a key on the keyboard | CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +KeyUp | When the user releases a key they've pressed on the keyboard | CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +KeyPress | When the user presses an ANSI key on the keyboard | CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +Layout | When the size of a frame, multipage, or user form changes | Frame, MultiPage, UserForm +MouseDown | When the user presses the primary mouse button | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, 
OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm
+MouseUp | When the user releases the primary mouse button (after pressing it) | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm
+MouseMove | When the user moves the mouse | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, TabStrip, TextBox, ToggleButton, UserForm
+QueryClose | When a user form is about to close | UserForm
+RemoveControl | When a control is deleted | Frame, MultiPage, UserForm
+Resize | When a user form is resized | UserForm
+Scroll | When the user moves the scroll box | Frame, MultiPage, ScrollBar, UserForm
+SpinDown | When the user clicks the down button on a SpinButton control | SpinButton
+SpinUp | When the user clicks the up button on a SpinButton control | SpinButton
+Terminate | When a user form has been unloaded from memory | UserForm
+Zoom | When the Zoom property of the control or user form changes | Frame, MultiPage, UserForm
+
+Many of these event procedures receive arguments passed with the ByVal keyword. With forms, such arguments include MSForms objects such as ReturnBoolean, ReturnEffect, ReturnInteger, and ReturnString; setting the Value property of one of these objects passes information back from the event procedure.
+
+As you can see, VBA's events fall into several categories, which are discussed in the following sections in descending order of usefulness:
+
+ * Events that apply only to the UserForm object
+ * Events that apply to the UserForm object and other container objects (such as the Frame control and the MultiPage control)
+ * Events that apply to many or most of the controls, sometimes including the UserForm object as well
+ * Events that apply only to a few controls
+
+## Events Unique to the UserForm Object
+
+This section discusses the events that are unique to the UserForm object. These are the Initialize, QueryClose, Activate, Deactivate, Resize, and Terminate events.
+
+### Initialize Event
+
+An Initialize event occurs when the user form is loaded but before it appears onscreen.
+
+VBA's syntax for the Initialize event is as follows, where _userform_ is a valid UserForm object:
+
+    Private Sub userform_Initialize()
+
+Typical uses for the Initialize event include retrieving information—from a database, a set of worksheets, or whatever—that the user form or application needs and assigning information to the controls on the user form (especially ListBox and ComboBox controls, to which you often need to add the information at runtime rather than at design time).
+
+Depending on the style and complexity of your user forms, you may also want to use the Initialize event to resize the user form, resize controls on the user form, display or hide particular controls, and in general make sure the user form is as closely suited as possible to the user's needs before displaying it.
+
+### QueryClose Event
+
+The QueryClose event applies to the UserForm object only. This event fires just before the user form closes.
+
+The syntax for the QueryClose event is as follows:
+
+    Private Sub UserForm_QueryClose(Cancel As Integer, CloseMode As Integer)
+
+Here, Cancel is an integer, typically 0 (zero). Setting Cancel to a nonzero value cancels the close operation, stopping the user form (and the application) from closing.
+
+CloseMode is a value or a constant giving the cause of the QueryClose event. Table 15.5 shows the values and constants for CloseMode. 
+
+Table 15.5 Values and constants for the CloseMode argument
+
+**Constant** | **Value** | **Cause of the QueryClose Event**
+---|---|---
+vbFormControlMenu | 0 | The user has closed the user form by clicking its close button or by invoking the Close command from the user form's control menu (for example, by right-clicking the title bar of the user form and choosing Close from the context menu).
+vbFormCode | 1 | An Unload statement in code has closed the user form.
+vbAppWindows | 2 | Windows is closing down and is closing the user form.
+vbAppTaskManager | 3 | The Task Manager is closing the application and thus is also closing the user form.
+
+At first glance, QueryClose may appear to have few uses beyond double-checking that users really want to close a user form that they're attempting to close. Say you've established that the user has entered a lot of data in a form they're about to close. You might want to check that they haven't clicked the user form's Close button or Cancel button by mistake, as illustrated in the following code fragment for Word:
+
+    Private Sub UserForm_QueryClose(Cancel As Integer, _
+        CloseMode As Integer)
+        'make sure the user wants to close the user form
+        'if they have entered information in it
+        Select Case CloseMode
+            Case 0
+                'user has clicked the close button or used the control menu
+                'if text box contains more than 5 characters, ask to save it
+                If Len(txtDescription.Text) > 5 Then
+                    If MsgBox("The Description text box contains " & _
+                        "a significant amount of text." & vbCr & _
+                        "Do you want to save this text?", vbYesNo + _
+                        vbQuestion, "Close Form") = vbYes Then
+                        Documents.Add
+                        Selection.TypeText txtDescription.Text
+                        ActiveDocument.SaveAs _
+                            "c:\temp\Temporary Description.docm"
+                        MsgBox "The contents of the Description text " & _
+                            "box have been saved in " & _
+                            "c:\temp\Temporary Description.docm.", _
+                            vbOKOnly + vbInformation, _
+                            "Form Information Saved"
+                    End If
+                End If
+        End Select
+    End Sub
+
+However, QueryClose comes into its own when the whole application, rather than just the user form, is closing. If the user form is modeless, users may not be aware that it's still open and that they're about to lose data they've typed into it or options they've selected in it.
+
+Sometimes you may be able to use QueryClose to save information from a user form when the application has stopped responding and is being closed by Windows or the Task Manager. Be warned that QueryClose's record isn't perfect on this—the code sometimes won't run.
+
+To stop an application from closing, set the Cancel argument of the QueryClose event procedure to True (or any other nonzero value).
+
+### Activate Event
+
+The Activate event fires when the user form becomes the active window. Typically, this means the event fires when the user form is displayed, occurring just after the Initialize event if the user form is loaded by a Show statement rather than a Load statement.
+
+Note that if the user form is loaded by using a Load statement before being displayed with the Show statement, the Initialize event fires after the Load statement. The Activate event, firing after the Show statement, fires later.
+
+However, the Activate event also fires when the user form is reactivated after having been deactivated. For example, if you create a modeless user form with an Activate event procedure, the code is executed each time the user reactivates the user form after having deactivated it (for example, by working in the application window). 
Likewise, if you display one user form from another and then close the second user form, returning the focus to the first user form and reactivating it, the Activate event fires again. + +The syntax for the Activate event is as follows: + + Private Sub UserForm_Activate() + +* * * + +Bug Alert: You May Face Problems Using Deactivate and Activate in Immediate Succession + +VBA can't always execute the event procedures for the Deactivate event of one user form and the Activate event of another user form in immediate succession. Sometimes things work as they should; more often, they don't. + +For example, say you have two user forms, named One and Two, each with an Activate event procedure and a Deactivate event procedure. If you display Two from One, the Deactivate event code from One should run, followed by the Activate event code from Two. This doesn't usually happen: Often, the Deactivate code of One will run, but the Activate code of Two won't. Run it again, and you may get the Activate code of Two to run but not the Deactivate code of One. However, if you remove or comment out the Deactivate event procedure from One and try again, Two's Activate code will run consistently each time One displays Two, indicating that the Activate event is firing but the Activate event procedure's code isn't running when the Deactivate event procedure is present. + +* * * + +### Deactivate Event + +The Deactivate event fires when the user form loses the focus after having been the active window, but it doesn't fire when the user form is hidden or unloaded. For example, if you display a user form that contains a Deactivate event procedure and then close the user form, the Deactivate event doesn't fire. However, if you display one user form from another, the Deactivate event for the first user form fires as the focus is transferred to the second user form. With modeless user forms, the Deactivate event is triggered each time the user leaves one user form by clicking on another. + +The syntax for the Deactivate event is as follows: + + Private Sub UserForm_Deactivate() + +See the previous sidebar for details on a bug in using the Deactivate and Activate events in immediate succession. + +### Resize Event + +The Resize event fires when a user form is resized either manually by the user or programmatically by you. + +The syntax for the Resize event is as follows: + + Private Sub UserForm_Resize() + +The main use for the Resize event is to move, resize, display, or hide controls to respond to a resized form. For example, you might resize a text box so that it occupies most of the width of the user form it lives on (see Figure 15.10) by using code such as that shown in Listing 15.3. + +Figure 15.10 You can use the Resize event of a user form to resize or reposition the controls it contains. + +**Listing 15.3**: Resizing via code + + 1. Private Sub cmdWidenForm_Click() + 2. With frmResize + 3. If .Width < 451 Then + 4. .Width = .Width + 50 + 5. If cmdNarrowForm.Enabled = False Then _ + cmdNarrowForm.Enabled = True + 6. If .Width > 451 Then _ + cmdWidenForm.Enabled = False + 7. End If + 8. End With + 9. End Sub + 10. + 11. Private Sub cmdNarrowForm_Click() + 12. With frmResize + 13. If .Width > 240 Then + 14. .Width = .Width - 50 + 15. If cmdWidenForm.Enabled = False Then _ + cmdWidenForm.Enabled = True + 16. If .Width < 270 Then _ + cmdNarrowForm.Enabled = False + 17. End If + 18. End With + 19. End Sub + 20. + 21. Private Sub cmdClose_Click() + 22. Unload Me + 23. End Sub + 24. + 25. 
Private Sub UserForm_Resize()
+    26. txt1.Width = frmResize.Width - 30
+    27. End Sub
+
+Listing 15.3 contains four short procedures: one for the Click event of the cmdWidenForm command button, one for the Click event of the cmdNarrowForm command button, one for the Click event of the cmdClose command button, and one for the Resize event of the user form.
+
+The cmdWidenForm_Click procedure shown in lines 1 through 9 increases the width of the user form by 50 points (1 point is 1/72 inch) when the user clicks the Widen Form button, as long as the Width property of the user form is less than 451 points. Line 5 enables the cmdNarrowForm command button if it isn't already enabled. (The cmdNarrowForm command button is disabled when the user form is displayed at its original narrow width.) Line 6 disables the cmdWidenForm command button if the Width property of the user form is more than 451 points.
+
+The cmdNarrowForm_Click procedure shown in lines 11 through 19 narrows the user form by 50 points as long as the Width of the user form is greater than 240 points (its original width), reenabling the cmdWidenForm button if it's disabled and disabling the cmdNarrowForm button if the Width of the user form is less than 270 points.
+
+The cmdClose_Click procedure shown in lines 21 through 23 simply unloads the user form (which it refers to by the Me keyword).
+
+The UserForm_Resize event procedure in lines 25 through 27 sets the Width property of txt1, the text box in the user form, to 30 points less than the Width of the user form. If you step through the code (repeatedly pressing F8) in the user form, you'll notice that the Resize event fires when the size of the user form changes. For example, when line 4 of the cmdWidenForm_Click procedure is executed, execution branches to the Resize event procedure in line 25, and this procedure is executed before the code in line 5.
+
+### Terminate Event
+
+The Terminate event fires when the user form has been unloaded—or, more precisely, when all references to an instance of the user form have been removed from memory or have gone out of scope.
+
+The syntax for the Terminate event is as follows:
+
+    Private Sub UserForm_Terminate()
+
+## Events That Apply to Both UserForms and Container Controls
+
+This section discusses the events that apply to the UserForm object _and_ to the container controls—the MultiPage control and the Frame control. Container controls can have other controls placed inside of them. (The Scroll event applies to the ScrollBar control as well as to MultiPage, Frame, and UserForm.) These events are Scroll, Zoom, Layout, AddControl, and RemoveControl.
+
+### Scroll Event
+
+The Scroll event applies to the Frame control, the MultiPage control, the ScrollBar control, and the UserForm object. This event occurs when the user moves the scroll box (the thumb) on a scroll bar on a frame, MultiPage control, scroll bar, or user form.
+
+The syntax for the Scroll event varies for the three controls and the UserForm object. 
The syntax for the Scroll event with the UserForm object is as follows:

    Private Sub UserForm_Scroll(ByVal ActionX As MSForms.fmScrollAction, ByVal ActionY As MSForms.fmScrollAction, ByVal RequestDx As Single, ByVal RequestDy As Single, ByVal ActualDx As MSForms.ReturnSingle, ByVal ActualDy As MSForms.ReturnSingle)

The syntax for the Scroll event with the ScrollBar control is as follows:

    Private Sub scrollbar_Scroll()

The syntax for the Scroll event with the MultiPage control is as follows:

    Private Sub multipage_Scroll(index As Long, ActionX As fmScrollAction, ActionY As fmScrollAction, ByVal RequestDx As Single, ByVal RequestDy As Single, ByVal ActualDx As MSForms.ReturnSingle, ByVal ActualDy As MSForms.ReturnSingle)

The syntax for the Scroll event with the Frame control is as follows:

    Private Sub frame_Scroll(ActionX As fmScrollAction, ActionY As fmScrollAction, ByVal RequestDx As Single, ByVal RequestDy As Single, ByVal ActualDx As MSForms.ReturnSingle, ByVal ActualDy As MSForms.ReturnSingle)

In these last three syntax statements, _scrollbar_ is a valid ScrollBar object, _multipage_ is a valid MultiPage object, and _frame_ is a valid Frame object.

Here are the arguments for the Scroll event:

**Index**

A required argument specifying the page of the MultiPage control with which the event procedure is to be associated.

**ActionX** and **ActionY**

Required arguments determining the user's horizontal and vertical actions (respectively), as shown in Table 15.6.

**RequestDx**

The distance to move the scroll box horizontally, specified in points.

**RequestDy**

The distance to move the scroll box vertically, specified in points.

**ActualDx**

The distance the scroll box moved horizontally, measured in points.

**ActualDy**

The distance the scroll box moved vertically, measured in points.

Table 15.6 ActionX and ActionY constants and values for the Scroll event

**Constant** | **Value** | **Scroll Box Movement**
---|---|---
fmScrollActionNoChange | 0 | There was no change or movement.
fmScrollActionLineUp | 1 | The user moved the scroll box a short way upward on a vertical scroll bar (equivalent to pressing the ↑ key) or a short way to the left on a horizontal scroll bar (equivalent to pressing the ← key).
fmScrollActionLineDown | 2 | The user moved the scroll box a short way downward on a vertical scroll bar (equivalent to pressing the ↓ key) or a short way to the right on a horizontal scroll bar (equivalent to pressing the → key).
fmScrollActionPageUp | 3 | The user moved the scroll box up one page on a vertical scroll bar (equivalent to pressing the Page Up key) or one page to the left on a horizontal scroll bar (also equivalent to pressing the Page Up key).
fmScrollActionPageDown | 4 | The user moved the scroll box down one page on a vertical scroll bar (equivalent to pressing the Page Down key) or one page to the right on a horizontal scroll bar (also equivalent to pressing the Page Down key).
fmScrollActionBegin | 5 | The user moved the scroll box to the top of a vertical scroll bar or to the left end of a horizontal scroll bar.
fmScrollActionEnd | 6 | The user moved the scroll box to the bottom of a vertical scroll bar or to the right end of a horizontal scroll bar.
fmScrollActionPropertyChange | 8 | The user moved the scroll box, changing the value of either the ScrollTop property or the ScrollLeft property.
+fmScrollActionControlRequest | 9 | The scroll action was requested by a control in the container in question. +fmScrollActionFocusRequest | 10 | The user moved the focus to a different control. This movement scrolls the user form so that the selected control is fully displayed in the available area. + +### Zoom Event + +Changing the Zoom property is like using a magnifying glass. The form's controls all grow larger if the Zoom value is greater than 100, and they grow smaller if the value is less than 100. However, the form itself doesn't change size. To change the size of the form, you must adjust its Height and Width properties. + +The Zoom _event_ fires when the Zoom property of the object changes at runtime. The Zoom property can be changed either automatically through code or by the user's manipulating—dragging a scroll bar's thumb, for example—a control that changes the property because you've written code that responds this way. + +The Zoom property uses this syntax for the control and the UserForm object: + + Private Sub object_Zoom(Percent As Integer) + +Here, _object_ is a Frame control or a UserForm object. Percent is an Integer argument used to specify the percentage (from 10 percent to 400 percent) the user form is to be zoomed to. By default, user forms and controls are displayed at 100 percent zoom—full size. + +The Zoom property uses this syntax for the MultiPage control: + + Private Sub multipage_Zoom(ByVal Index As Long, Percent As Integer) + +Index is the index (name or number) of the Page object in the MultiPage control with which the Zoom event procedure is associated. + +Zooming a user form zooms all the controls that are on it. For example, say a user form named frmEventsDemo includes a combo box named cmbZoom that offers a selection of zoom percentages. When the user selects an item in the combo box, the Change event for cmbZoom applies the combo box's Value property to the Zoom property of the user form, zooming it to the percentage selected. Zooming the user form triggers the Zoom event, whose procedure in this example sets the Width and Height of the user form to new values suited to the new zoom percentage: + + Private Sub cmbZoom_Change() + 'change the size of the controls: + frmEventsDemo.Zoom = cmbZoom.Value + End Sub + Private Sub UserForm_Zoom(Percent As Integer) + ' change the size of the form itself: + frmEventsDemo.Width = 300 * cmbZoom.Value / 100 + frmEventsDemo.Height = 350 * cmbZoom.Value / 100 + End Sub + +### Layout Event + +A Layout event is triggered when the size of the frame, MultiPage control, or user form is changed, either by the user or programmatically (automatically by an autosized control's becoming resized). + +By default, the Layout event automatically calculates the new position for any control that has been moved and repaints the screen accordingly. However, you can also use the Layout event for your own purposes if you need to. + +The syntax for the Layout event with a Frame control or a UserForm object is as follows: + + Private Sub object_Layout() + +Here, _object_ is a Frame control or a UserForm object. + +The syntax for using the Layout event with a MultiPage control is as follows: + + Private Sub multipage_Layout(index As Long) + +Here, _multipage_ is a MultiPage control and index is the Page object in the MultiPage control. 
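For instance, you might use the Layout event to keep a particular control pinned in place whenever the layout changes. The following is a minimal sketch rather than a definitive implementation; the command-button name cmdClose is an assumption, and the Static flag guards against the handler re-entering itself if repositioning the button triggers another layout pass:

    Private Sub UserForm_Layout()
        Static blnBusy As Boolean
        If blnBusy Then Exit Sub    'don't re-enter if moving the button fires Layout again
        blnBusy = True
        'keep cmdClose anchored in the lower-right corner of the form
        cmdClose.Left = Me.InsideWidth - cmdClose.Width - 6
        cmdClose.Top = Me.InsideHeight - cmdClose.Height - 6
        blnBusy = False
    End Sub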
* * *

**VBA Automatically Saves Height and Width Properties**

When a control is resized, VBA automatically stores its previous height and width in the OldHeight and OldWidth properties, while the Height and Width properties take on the new height and width values. This allows you to restore a control to its previous size by retrieving the OldHeight and OldWidth properties and assigning them to the Height and Width properties.

* * *

### AddControl Event

The AddControl event is triggered when a control is added programmatically to a Frame control, a MultiPage control, or the user form at runtime; it isn't triggered when you add a control manually at design time. The event isn't triggered when the user form is initialized unless the Initialize event adds a control to the user form.

The syntax for the AddControl event varies depending on the object or control. The syntax for the UserForm object and the Frame control is as follows:

    Private Sub object_AddControl(ByVal Control As MSForms.Control)

Here, _object_ is a UserForm object or Frame control, and Control is the control that's being added.

The syntax for the MultiPage control is as follows:

    Private Sub multipage_AddControl(ByVal Index As Long, ByVal Control As MSForms.Control)

Here, Index is the index number or name of the Page object that will receive the control.

For example, the following cmdAddControl_Click procedure adds three option buttons (opt1, opt2, and opt3, respectively) to the frame fraOptions and sets properties for the first option button. (A comment indicates where the code would go on to set properties for the second and third option buttons.) The fraOptions_AddControl event procedure displays a message box giving the number of controls the frame now contains. Because the cmdAddControl_Click procedure adds three controls, the AddControl event fires three times, and the fraOptions_AddControl procedure runs three times:

    Private Sub cmdAddControl_Click()
        Dim opt1 As OptionButton
        Dim opt2 As OptionButton
        Dim opt3 As OptionButton
        Set opt1 = fraOptions.Controls.Add("Forms.OptionButton.1")
        Set opt2 = fraOptions.Controls.Add("Forms.OptionButton.1")
        Set opt3 = fraOptions.Controls.Add("Forms.OptionButton.1")
        With opt1
            .Left = 10
            .Top = 10
            .Name = "optDomestic"
            .Caption = "Domestic"
            .AutoSize = True
            .Accelerator = "D"
        End With
        'set properties for opt2 and opt3 here
    End Sub

    Private Sub fraOptions_AddControl(ByVal Control As MSForms.Control)
        MsgBox "The frame now contains " & _
            fraOptions.Controls.Count & " controls."
    End Sub

### RemoveControl Event

The RemoveControl event fires when a control is deleted from a Frame control, a MultiPage control, or a user form, either programmatically or manually at runtime. (To remove a control manually, the user would typically use a control built into the user form for that purpose. There has to be some programming here—users can't simply delete controls all by themselves.)

The syntax for the RemoveControl event is as follows for all controls but the MultiPage control:

    Private Sub object_RemoveControl(ByVal Control As MSForms.Control)

Here, _object_ is a valid object, and Control is a valid control.

The syntax for the RemoveControl event is as follows for the MultiPage control:

    Private Sub multipage_RemoveControl(ByVal Index As Long, ByVal Control As MSForms.Control)

Here, _multipage_ is a valid MultiPage object.
For a MultiPage control, Index specifies the Page object in the MultiPage control that contains the control to be deleted. + +## Events That Apply to Many or Most Controls + +This section discusses the events that apply to many or most controls. Some of these events apply to the UserForm object as well. These events are Click; Change; Enter and Exit; BeforeUpdate and AfterUpdate; KeyDown, KeyUp, and KeyPress; MouseDown, MouseUp, and MouseMove; BeforeDragOver; BeforeDropOrPaste; DblClick; and Error. + +### Click Event + +The most common event of all, the Click event services the CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, TabStrip, and ToggleButton controls. It is not available to the TextBox, ScrollBar, or SpinButton controls, but it _is_ a member of the UserForm object. + +A Click event occurs when the user clicks a control with the left mouse button or when the user selects a value for a control that has more than one possible value. For most controls, this means that each time the user clicks the control, the event fires. But there are a few exceptions: + + * Clicking a disabled control fires the Click event of the user form (as if the user were clicking the user form through the control). + * The Click event of an OptionButton control fires when the user clicks the option button to select it. If the option button is already selected, clicking it has no effect. (On the other hand, the Click event of a CheckBox control fires each time the user clicks the check box—either to select it or to clear it.) + * The Click event of a ListBox control or ComboBox control fires when the user clicks to select an item from the list (not when the user clicks on the drop-down arrow or in the undropped portion of the combo box). If the user clicks an already-selected item, the Click event doesn't fire again. + * The Click event of a ToggleButton control occurs whenever the toggle button is clicked and when its Value property is changed. This means that it isn't a good idea to use the Click event of the ToggleButton control to toggle its Value. + * The Click event of a selected CommandButton control fires when you press the spacebar. + * The Click event of the default command button (the button with its Default property set to True) fires when the user presses Enter with no other command button selected. + * The Click event of the command button with its Cancel property set to True fires when the user presses Esc. The Click event for a control with an accelerator key set also fires when the user presses the accelerator key. + +For all controls except the TabStrip control and the MultiPage control, the Click event needs no arguments, as follows: + + Private Sub object_Click() + +For a TabStrip control or a MultiPage control, your code must react to the Index argument, a required Long (data type) argument that VBA passes to indicate the affected tab or page of the control: + + Private Sub object_Click(ByVal Index As Long) + +Here, _object_ is a valid MultiPage control or TabStrip control. + +* * * + +Sequence of Events: What Happens When the User Clicks (and Clicks Again) + +The order in which events trigger can sometimes be important to the programmer. If you don't understand the order in which events take place, you can become baffled and start using events in ways that trigger each other, or conflict with each other. 
When the user clicks a command button, the Enter event for this button occurs before its Click event if the click transfers the focus to the command button. When the Enter event for the command button fires, it usually prevents the Click event from firing.

When the user clicks a control, the first event triggered is the MouseDown event, which fires when the user presses the mouse button. Then the MouseUp event fires when the user releases the mouse button. A Click event occurs after a MouseUp event. If the user clicks again within the double-click timeframe set in Windows, the DblClick event fires, followed by another MouseUp event.

* * *

### Change Event

The Change event applies to the CheckBox, ComboBox, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls. This event fires when the Value property of a control changes. This change can occur either through an action of the user's (such as typing text into a text box, selecting an option button, selecting or clearing a check box, clicking a toggle button, or changing the page displayed on a MultiPage control) or through an action taken programmatically at runtime.

Bear in mind that when the Change event is fired by an action of the user's, that action may also trigger a Click event. (Even when this happens, Change is regarded as a better way of determining the new Value of the control than Click—though for many purposes Click will work satisfactorily as well.) Changing the Value property of a control manually at design time doesn't fire a Change event.

The syntax for the Change event is as follows:

    Private Sub object_Change()

The Change event is useful for updating other controls after the user changes a control. For example, if the user enters the name for a new report into a text box (here, txtReportName), you could use the Change event to automatically insert into another text box (here called txtFileName) the name of the file in which to save the report:

    Private Sub txtReportName_Change()
        txtFileName.Text = txtReportName.Text & ".txt"
    End Sub

### Enter and Exit Events

The Enter and Exit events apply to the CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls.

The Enter event fires when the focus is moved from one control on a user form to another control. The event fires just before the second control receives the focus.

Like the Enter event, the Exit event fires when the focus is moved from one control on a user form to another control. However, the Exit event fires just before the first control loses the focus.

The syntax for the Enter event is as follows:

    Private Sub object_Enter()

The syntax for the Exit event is a little more complex:

    Private Sub object_Exit(ByVal Cancel As MSForms.ReturnBoolean)

Here, Cancel is a required argument specifying event status. The default setting is False, which specifies that the control involved should handle the event and that the focus will pass to the next control; a setting of True specifies that the application handle the event, which keeps the focus on the current control.

By using the Enter and Exit events, you can track the user's progress through the controls on a user form.

The Exit event is useful for checking to see if the user has made an appropriate selection in the control or has entered a suitable value.
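For example, you could check the user's entry in the control and, if you find it inappropriate, display a message box alerting the user to the problem and then return the focus to the control (by setting Cancel to True) so that the user can try again. Here's a minimal sketch of that pattern; the text box name txtAge and the validation rule are assumptions for illustration:

    Private Sub txtAge_Exit(ByVal Cancel As MSForms.ReturnBoolean)
        'reject anything that isn't a number from 1 through 120
        If Not IsNumeric(txtAge.Text) Then
            MsgBox "Please enter a numeric age.", vbExclamation
            Cancel = True    'keep the focus on txtAge
        ElseIf Val(txtAge.Text) < 1 Or Val(txtAge.Text) > 120 Then
            MsgBox "Please enter an age from 1 through 120.", vbExclamation
            Cancel = True
        End If
    End Sub

Setting Cancel to True keeps the focus on the text box, so the user can't move to another control until the entry passes the check.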
* * *

Other Ways to Trap User Input

Other events that you might use for checking the contents of a control after the user has visited it include AfterUpdate and LostFocus. Similarly, you might use the BeforeUpdate and GotFocus events instead of the Enter event. A significant difference between Enter and GotFocus and between Exit and LostFocus is that GotFocus and LostFocus fire when the user form receives or loses the focus, respectively, but Enter and Exit don't fire.

* * *

### BeforeUpdate Event

The BeforeUpdate event applies to the CheckBox, ComboBox, ListBox, OptionButton, ScrollBar, SpinButton, TextBox, and ToggleButton controls. This event occurs as the value or data in the specified control is changed; you can use the event to evaluate the change and decide whether to implement it.

The syntax for the BeforeUpdate event is as follows:

    Private Sub object_BeforeUpdate(ByVal Cancel As MSForms.ReturnBoolean)

Here, _object_ is a valid object, and Cancel is a required argument indicating the status of the event. The default setting of False makes the control handle the event; True prevents the update from being executed and makes the application handle the event.

Here's the sequence in which events fire as you move focus to a control, update it, and move on:

1. The Enter event for the control fires when you move the focus to the control.

2. The BeforeUpdate event for the control fires after you've entered the information for the update (for example, after you've pressed a key in a text box) but before the update is executed. By setting Cancel to True, you can prevent the update from taking place. (If you don't set Cancel to True, the update occurs, and the AfterUpdate event can't prevent it from occurring.)

3. The AfterUpdate event for the control fires after you've entered the information in the control and the update has been executed. If you set the Cancel argument for BeforeUpdate to True, the AfterUpdate event doesn't fire.

4. The Exit event for the control fires when you move from this control to another control. (After the Exit event fires for the control you've left, the Enter event fires for the control to which you have moved the focus.)

### AfterUpdate Event

The AfterUpdate event applies to the CheckBox, ComboBox, ListBox, OptionButton, ScrollBar, SpinButton, TextBox, and ToggleButton controls. This event fires after the user changes information in a control and after that update has been executed.

The syntax for the AfterUpdate event is the same for all the controls and objects it applies to:

    Private Sub object_AfterUpdate()

### KeyDown and KeyUp Events

The KeyDown and KeyUp events apply to the CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls and to the UserForm object. These events are not available to the Image and Label controls.

The KeyDown event fires when the user presses a key on the keyboard. The KeyUp event fires when the user releases the key. The KeyDown and KeyUp events also occur when a key is sent to the user form or control programmatically by using the SendKeys statement.
These events don't occur when the user presses Enter when the user form contains a CommandButton control with its Default property set to True, nor when the user presses Esc when the user form contains a CommandButton control with its Cancel property set to True.

When the keystroke moves the focus to another control, the KeyDown event fires for the original control, while the KeyPress and KeyUp events fire for the control to which the focus is moved.

The KeyPress event fires after the KeyDown event and before the KeyUp event.

The syntax for the KeyDown event is as follows:

    Private Sub object_KeyDown(ByVal KeyCode As MSForms.ReturnInteger, ByVal Shift As Integer)

The syntax for the KeyUp event is as follows:

    Private Sub object_KeyUp(ByVal KeyCode As MSForms.ReturnInteger, ByVal Shift As Integer)

Here, _object_ is an object name and is required. KeyCode is a required Integer argument specifying the key code of the key pressed. For example, the key code for the letter _t_ is 84. The key code isn't an ANSI value—it's a special number that identifies the key on the keyboard.

Shift is a required argument specifying whether the Shift, Ctrl, or Alt key was pressed. Use the constants or values shown in Table 15.7.

Table 15.7 Shift constants and values

**Constant** | **Value** | **Description**
---|---|---
fmShiftMask | 1 | Shift key pressed
fmCtrlMask | 2 | Ctrl key pressed
fmAltMask | 4 | Alt key pressed

### KeyPress Event

The KeyPress event is a member of the CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls. It also is a member of the UserForm object. The Label control has no KeyPress event.

The KeyPress event fires when the user presses a printable character, Ctrl plus an alphabetic character, Ctrl plus a special character (symbols), the Esc key, or the Backspace key while the control or object in question has the focus. Pressing the Tab key, the Enter key, or an arrow key doesn't cause the KeyPress event to fire, nor does a keystroke that moves the focus to another control from the current control.

Technically, only ANSI keys fire the KeyPress event. The Delete key isn't an ANSI key, so pressing the Delete key to delete, say, text in a text box doesn't fire the KeyPress event. But deleting the same text in the same text box using the Backspace key does, because Backspace is an ANSI key.

The KeyPress event fires after the KeyDown event and before the KeyUp event. It also fires when you use SendKeys to send keystrokes to a user form programmatically.

The syntax for the KeyPress event is as follows:

    Private Sub object_KeyPress(ByVal KeyAscii As MSForms.ReturnInteger)

Here, _object_ is a required argument specifying a valid object, and KeyAscii is a required Integer argument specifying an ANSI key code. To get the ANSI key code, use the Asc function. For example, Asc("t") returns the ANSI key code for the letter _t_ (the code is 116).

By default, the KeyPress event processes the code for the key pressed—in humble terms, what you press is what you get. For example, if you press the _t_ key, you get a _t_; if you press the Delete key, you get a Delete action; and so on. By using a KeyPress event procedure, you can perform checks such as filtering out all nonnumeric keys when the user must enter a numeric value.
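As a minimal sketch of that kind of filter (the text box name txtQuantity is an assumption for illustration), the following procedure discards any keystroke that isn't a digit:

    Private Sub txtQuantity_KeyPress(ByVal KeyAscii As MSForms.ReturnInteger)
        'allow only the digits 0 through 9
        If KeyAscii < Asc("0") Or KeyAscii > Asc("9") Then
            KeyAscii = 0    'setting the code to 0 discards the keystroke
        End If
    End Sub

Note that the filter doesn't affect keys that don't fire KeyPress, such as Delete or the arrow keys, so the user can still edit the entry normally.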
+ +### MouseDown Event and MouseUp Event + +The MouseDown and MouseUp events apply to the CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls and to the UserForm object. The MouseDown event fires when the user presses a button on the mouse, and a MouseUp event occurs when the user releases that button. A Click event fires after a MouseUp event occurs. + +The syntax for the MouseDown and MouseUp events is as follows for all controls except for MultiPage and TabStrip: + + Private Sub object_MouseDown(ByVal Button As Integer, ByVal Shift As Integer, + ByVal X As Single, ByVal Y As Single) + + Private Sub object_MouseUp(ByVal Button As Integer, ByVal Shift As Integer, + ByVal X As Single, ByVal Y As Single) + +The syntax for the MouseDown and MouseUp events with the MultiPage and TabStrip controls adds an Index argument to specify the index of the page or the tab involved: + + Private Sub object_MouseUp(ByVal Index As Long, ByVal Button As Integer, ByVal + Shift As Integer, ByVal X As Single, ByVal Y As Single) + Private Sub object_MouseDown(ByVal Index As Long, ByVal Button As Integer, ByVal + Shift As Integer, ByVal X As Single, ByVal Y As Single) + +Here, _object_ is a valid object for the statement. + +Index returns –1 if the user clicks outside the page or tab area of the control but still within the control (for example, to the right of the rightmost tab in a top-tab tab strip). + +Button is a required Integer argument specifying the mouse button that triggered the event. Table 15.8 lists the possible values for Button. + +Table 15.8 Button values and constants + +**Constant** | **Value** | **Description** +---|---|--- +fmButtonLeft | 1 | Left (primary) +fmButtonRight | 2 | Right (non-primary) +fmButtonMiddle | 4 | Middle + +Shift is a required argument specifying whether the Shift, Ctrl, or Alt key was pressed. Table 15.9 lists the values for Shift. + +Table 15.9 Shift values + +**Value** | **Key or Keys Pressed** +---|--- +1 | Shift +2 | Ctrl +3 | Shift+Ctrl +4 | Alt +5 | Alt+Shift +6 | Alt+Ctrl +7 | Alt+Shift+Ctrl + +You can also detect a single key by using the key masks listed in Table 15.7. + +X is a required Single argument specifying the horizontal position in points from the left edge of the user form, frame, or page. Y is a required Single argument specifying the vertical position in points from the top edge of the user form, frame, or page. + +### MouseMove Event + +The MouseMove event is available to the CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, TabStrip, TextBox, and ToggleButton controls and to the UserForm object. This event fires when the user moves the mouse pointer over the control or object in question. + +The syntax for the MouseMove event is different for the MultiPage control and the TabStrip control than for the other controls and for the UserForm object. The syntax for the other controls is as follows: + + Private Sub object_MouseMove(ByVal Button As Integer, ByVal Shift As Integer, + ByVal X As Single, ByVal Y As Single) + +The syntax for the MultiPage control and the TabStrip control is as follows: + + Private Sub object_MouseMove(ByVal Index As Long, ByVal Button As Integer, + ByVal Shift As Integer, ByVal X As Single, ByVal Y As Single) + +Here, _object_ is a required argument specifying a valid object. 
+ +For the MultiPage and TabStrip controls, Index is a required argument that returns the index of the Page object in the MultiPage control or the Tab object in the TabStrip control associated with the event procedure. + +Button is a required Integer argument that returns which mouse button (if any) the user is pressing. Table 15.10 lists the values for Button. + +Table 15.10 Button values + +**Value** | **Button Pressed** +---|--- +0 | No button +1 | Left +2 | Right +3 | Left and right +4 | Middle +5 | Left and middle +6 | Middle and right +7 | Left, middle, and right + +Shift is a required Integer argument that returns a value indicating whether the user is pressing the Shift, Alt, and/or Ctrl keys. Refer back to Table 15.9 for the list of Shift values. + +X is a required Single argument that returns a value specifying the horizontal position in points from the left edge of the user form, frame, or page. Y is a required Single argument specifying the vertical position in points from the top edge of the user form, frame, or page. + +As with the MouseDown and MouseUp events, you can also detect a single key by using the key masks listed in Table 15.7. + +Like most windows in the Windows operating system, user forms largely experience life as a nonstop sequence of mouse events. MouseMove events monitor where the mouse pointer is on the screen and which control has captured it. MouseMove events fire even if you use the keyboard to move a user form from under the mouse pointer because the mouse pointer ends up in a different place in relation to the user form even though it hasn't moved in the conventional sense. + +One use for the MouseMove event is to display appropriate text or an image for a control at which the user is pointing. For example, suppose a user form provides a list of available products, with each product's title appearing in a label. When the user positions the mouse pointer over a title in the label, you could use the MouseMove event to load a picture of the product into an Image control and a short description into another label. + +* * * + +MouseMove Events May Not Trigger between Close Controls + +The user form traps MouseMove events when the mouse pointer isn't over any control. However, if the user moves the mouse pointer quickly from one control to another very close to it, the user form may fail to trap the movement over the short intervening space. + +* * * + +### BeforeDragOver Event + +The BeforeDragOver event applies to the UserForm object itself and to the following controls: CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton. A BeforeDragOver event is triggered when the user is performing a drag-and-drop operation. + +The syntax for the BeforeDragOver event depends on the object or control in question. The basic syntax for the UserForm object and all controls except the Frame, TabStrip, and MultiPage is as follows, where _object_ is a valid UserForm or control: + + Private Sub object_BeforeDragOver(ByVal Cancel As MSForms.ReturnBoolean, ByVal + Control As MSForms.Control, ByVal Data As MSForms.DataObject, ByVal X As Single, + ByVal Y As Single, ByVal State As MSForms.fmDragState, ByVal Effect As MSForms. 
    ReturnEffect, ByVal Shift As Integer)

The syntax for the BeforeDragOver event with the Frame control is as follows, where _frame_ is a valid Frame control:

    Private Sub frame_BeforeDragOver(ByVal Cancel As MSForms.ReturnBoolean, ByVal Control As MSForms.Control, ByVal Data As MSForms.DataObject, ByVal X As Single, ByVal Y As Single, ByVal State As MSForms.fmDragState, ByVal Effect As MSForms.ReturnEffect, ByVal Shift As Integer)

The syntax for the BeforeDragOver event with the MultiPage control is as follows, where _multipage_ is a valid MultiPage control:

    Private Sub multipage_BeforeDragOver(ByVal Index As Long, ByVal Cancel As MSForms.ReturnBoolean, ByVal Control As MSForms.Control, ByVal Data As MSForms.DataObject, ByVal X As Single, ByVal Y As Single, ByVal State As MSForms.fmDragState, ByVal Effect As MSForms.ReturnEffect, ByVal Shift As Integer)

The syntax for the BeforeDragOver event with the TabStrip control is as follows, where _tabstrip_ is a valid TabStrip control:

    Private Sub tabstrip_BeforeDragOver(ByVal Index As Long, ByVal Cancel As MSForms.ReturnBoolean, ByVal Data As MSForms.DataObject, ByVal X As Single, ByVal Y As Single, ByVal DragState As MSForms.fmDragState, ByVal Effect As MSForms.ReturnEffect, ByVal Shift As Integer)

These are the different parts of the statements:

  * Index is the index of the Page object in a MultiPage control (or the Tab object in a TabStrip control) that is affected by the drag-and-drop.
  * Cancel is a required argument giving the status of the BeforeDragOver event. The default setting is False, which makes the control handle the event. A setting of True makes the application handle the event.
  * Control is a required argument specifying the control that is being dragged over.
  * Data is a required argument specifying the data being dragged.
  * X is a required argument specifying the horizontal distance in points from the left edge of the control. Y is a required argument specifying the vertical distance in points from the top of the control.
  * DragState is a required argument specifying where the mouse pointer is in relation to a target (a location at which the data can be dropped). Table 15.11 lists the constants and values for DragState.
  * Effect is a required argument specifying the operations the source of the drop is to support, as listed in Table 15.12.
  * Shift is a required argument specifying whether the Shift, Ctrl, or Alt key is held down during the drag-and-drop operation, as listed in Table 15.7.

Table 15.11 DragState constants and values

**Constant** | **Value** | **Position of Mouse Pointer**
---|---|---
fmDragStateEnter | 0 | Within range of a target
fmDragStateLeave | 1 | Outside the range of a target
fmDragStateOver | 2 | At a new position, but remains within range of the same target

Table 15.12 Effect constants and values

**Constant** | **Value** | **Drop Effect**
---|---|---
fmDropEffectNone | 0 | Doesn't copy or move the source to the target
fmDropEffectCopy | 1 | Copies the source to the target
fmDropEffectMove | 2 | Moves the source to the target
fmDropEffectCopyOrMove | 3 | Copies or moves the source to the target

You use the BeforeDragOver event to control drag-and-drop actions that the user performs. Use the DragState argument to make sure that the mouse pointer is within range of a target.
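For instance, the following minimal sketch (the form-level behavior shown is an assumption for illustration) follows the UserForm syntax shown above to refuse any drop on the form's own surface: it takes over handling by setting Cancel to True and reports fmDropEffectNone so that the pointer shows the data can't be dropped there:

    Private Sub UserForm_BeforeDragOver(ByVal Cancel As MSForms.ReturnBoolean, _
            ByVal Control As MSForms.Control, ByVal Data As MSForms.DataObject, _
            ByVal X As Single, ByVal Y As Single, _
            ByVal State As MSForms.fmDragState, _
            ByVal Effect As MSForms.ReturnEffect, ByVal Shift As Integer)
        Cancel = True                 'the application (our code) handles the event
        Effect = fmDropEffectNone     'signal that nothing can be dropped here
    End Sub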
+ +### BeforeDropOrPaste Event + +The BeforeDropOrPaste event applies to the CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls and to the UserForm object. + +A BeforeDropOrPaste event fires just before the user drops or pastes data onto an object. + +The syntax for the BeforeDropOrPaste event is different for the MultiPage and TabStrip controls than for the UserForm object and for the other controls. The basic syntax is as follows: + + Private Sub object_BeforeDropOrPaste(ByVal Cancel As MSForms.ReturnBoolean, ByVal + Control As MSForms.Control, ByVal Action As MSForms.fmAction, ByVal Data As + MSForms.DataObject, ByVal X As Single, ByVal Y As Single, ByVal Effect As + MSForms.ReturnEffect, ByVal Shift As Integer) + +The syntax for the MultiPage control is as follows, where _multipage_ is a valid MultiPage control: + + Private Sub multipage_BeforeDropOrPaste(ByVal Index As Long, ByVal Cancel As + MSForms.ReturnBoolean, ByVal Control As MSForms.Control, ByVal Action As MSForms. + fmAction, ByVal Data As MSForms.DataObject, ByVal X As Single, ByVal Y As + Single, ByVal Effect As MSForms.ReturnEffect, ByVal Shift As Integer) + +The syntax for the TabStrip control is as follows, where _tabstrip_ is a valid TabStrip control: + + Private Sub tabstrip_BeforeDropOrPaste(ByVal Index As Long, ByVal Cancel As + MSForms.ReturnBoolean, ByVal Action As MSForms.fmAction, ByVal Data As MSForms. + DataObject, ByVal X As Single, ByVal Y As Single, ByVal Effect As MSForms. + ReturnEffect, ByVal Shift As Integer) + +Here are the parts of the syntax: + + * _object_ is a required object specifying a valid object. + * For the MultiPage control, Index is a required argument specifying the Page object involved. + * Cancel is a required argument giving the status of the event. The default setting of False makes the control handle the event; True makes the application handle the event. + * Control is a required argument specifying the target control. + * Action is a required argument specifying the result of the drag-and-drop operation. Table 15.13 shows the constants and values for Action. + * Data is a required argument specifying the data (contained in a DataObject) being dragged and dropped. + * X is a required argument specifying the horizontal distance in points from the left edge of the control for the drop. Y is a required argument specifying the vertical distance in points from the top of the control. + * Effect is a required argument specifying whether the drag-and-drop operation copies the data or moves it, as listed in Table 15.12. + * Shift is a required argument specifying whether the user has pressed the Shift, Ctrl, and/or Alt keys, as listed in Table 15.7. + +Table 15.13 Action constants and values + +**Constant** | **Value** | **Action Taken** +---|---|--- +fmActionPaste | 2 | Pastes the object into the target. +fmActionDragDrop | 3 | The user has dragged the object from its source and dropped it on the target. + +The BeforeDropOrPaste event fires when a data object is transferred to a MultiPage or TabStrip control and just before the drop or paste operation occurs on other controls. + +### DblClick Event + +The DblClick event works with the CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, TabStrip, TextBox, and ToggleButton controls, as well as the UserForm object. 
A DblClick event occurs when the user double-clicks a control or object with the primary mouse button. The double-click must be fast enough to register as a double-click in Windows (this speed is controlled by the setting on the Buttons tab in the Mouse Properties dialog box in Control Panel) and occurs after the MouseDown event, the MouseUp event, and the Click event (for controls that support the Click event).

The DblClick event has a different syntax for the MultiPage and TabStrip controls than for the other controls or for the user form.

For the MultiPage and TabStrip controls, the syntax is as follows:

    Private Sub object_DblClick(ByVal Index As Long, ByVal Cancel As MSForms.ReturnBoolean)

The syntax for the DblClick event for other controls is as follows:

    Private Sub object_DblClick(ByVal Cancel As MSForms.ReturnBoolean)

Here, _object_ is a required argument specifying a valid object. For the MultiPage control and the TabStrip control, Index is a required argument specifying the Page object within a MultiPage control or the Tab object within a TabStrip control to be associated with the event procedure.

Cancel is a required argument specifying the status of the event. The default setting of False causes the control to handle the event; True causes the application to handle the event instead and causes the control to ignore the second click.

In controls that support both the Click event and the DblClick event, the Click event occurs before the DblClick event. If you take an interface action (such as displaying a message box) with the Click event procedure, it blocks the DblClick event procedure from running. In the following example, the DblClick event procedure doesn't run:

    Private Sub CommandButton1_Click()
        MsgBox "Click event"
    End Sub

    Private Sub CommandButton1_DblClick _
            (ByVal Cancel As MSForms.ReturnBoolean)
        MsgBox "Double-click event"
    End Sub

However, you can execute non-interface statements in the Click event procedure without blocking the DblClick event procedure. The following example declares a private String variable named strMessage in the declarations portion of the code sheet for the user form. The Click event procedure for the CommandButton1 command button assigns text to strMessage. The DblClick event procedure appends more text to strMessage and then displays a message box containing strMessage so that you can see that both events have fired. Don't step into this code by pressing F8 in the VBA Editor—instead, press F5 to run it, or it won't work:

    Private strMessage As String

    Private Sub CommandButton1_Click()
        strMessage = "Click event" & vbCr
    End Sub

    Private Sub CommandButton1_DblClick _
            (ByVal Cancel As MSForms.ReturnBoolean)
        strMessage = strMessage & "Double-click event"
        MsgBox strMessage
    End Sub

For most controls you won't want to use both a Click event procedure and a DblClick event procedure—you'll choose one or the other as appropriate to the control's purpose.

### Error Event

The Error event applies to the CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls. It also applies to the UserForm object. The Error event fires when a control encounters an error and is unable to return information about the error to the program that called the control. We will explore error handling in depth in Chapter 17, "Debugging Your Code and Handling Errors."
The syntax for the Error event for the UserForm object and for all controls except the MultiPage control is as follows:

    Private Sub object_Error(ByVal Number As Integer, ByVal Description As MSForms.ReturnString, ByVal SCode As Long, ByVal Source As String, ByVal HelpFile As String, ByVal HelpContext As Long, ByVal CancelDisplay As MSForms.ReturnBoolean)

The syntax for the Error event for the MultiPage control is as follows, where _multipage_ is a valid MultiPage control:

    Private Sub multipage_Error(ByVal Index As Long, ByVal Number As Integer, ByVal Description As MSForms.ReturnString, ByVal SCode As Long, ByVal Source As String, ByVal HelpFile As String, ByVal HelpContext As Long, ByVal CancelDisplay As MSForms.ReturnBoolean)

These are the components of the syntax:

  * _object_ is the name of a valid object.
  * For a MultiPage control, Index is the index of the Page object in the MultiPage control associated with the event.
  * Number is a required argument that returns the value used by the control to identify the error.
  * Description is a required String argument describing the error.
  * SCode is a required argument giving the OLE status code for the error.
  * Source is a required String argument containing the string identifying the control involved.
  * HelpFile is a required String argument containing the full path to the Help file that contains the Description.
  * HelpContext is a required Long argument containing the context ID for the Description within the Help file.
  * CancelDisplay is a required Boolean argument that controls whether VBA displays the error message in a message box.

## Events That Apply Only to a Few Controls

This section discusses the three events that apply only to one or two controls. The first of the three is the DropButtonClick event, which applies only to the ComboBox and TextBox controls; the second and third are the SpinUp and SpinDown events, which apply only to the SpinButton control.

### DropButtonClick Event

The DropButtonClick event fires when the user displays or hides a drop-down list on a ComboBox by clicking the drop-down button or by pressing the F4 key when the ComboBox has the focus (is selected). DropButtonClick also fires when the user presses the F4 key with a TextBox control selected, though this manifestation of the event is arcane enough to be singularly useless. It also fires when the DropDown method is executed in VBA to display the drop-down list, and it fires again when the DropDown method is executed again to hide the drop-down list.

The syntax for the DropButtonClick event is as follows:

    Private Sub object_DropButtonClick()

Here, _object_ is a valid ComboBox or TextBox control.

One use for the DropButtonClick event is to add items to a ComboBox control rather than adding them at load time via the Initialize event. By adding these items only on demand (I'm assuming the user might not use the ComboBox control at all or might type information into its text-box area), you can cut down on load time for the user form. You can also load the ComboBox with data relevant to the other choices the user has made in the dialog box, allowing for more targeted information than you could have provided by loading the ComboBox with the Initialize event.

### SpinDown and SpinUp Events

The SpinDown and SpinUp events apply only to the SpinButton control.
SpinDown and SpinUp control what happens when the user clicks the arrow buttons of a SpinButton control: the down-arrow and up-arrow buttons of a vertical SpinButton, or the right-arrow and left-arrow buttons of a horizontal SpinButton. The SpinDown event fires when the user clicks the down-arrow or right-arrow button, and the SpinUp event fires when the user clicks the up-arrow or left-arrow button.

The syntax for the SpinUp event and the SpinDown event is as follows:

    Private Sub spinbutton_SpinDown()
    Private Sub spinbutton_SpinUp()

Here, _spinbutton_ is a SpinButton control.

By default, the SpinDown event decreases and the SpinUp event increases the Value property of the SpinButton by the SmallChange increment.

# The Bottom Line

**Understand what a complex dialog box is.**

Simple dialog boxes tend to be static, but complex dialog boxes are dynamic—they change during execution in response to clicks or other interaction from the user.

Master It

Describe two types of dynamic behavior typical of complex dialog boxes.

**Reveal and hide parts of a dialog box.**

Dialog boxes need not display everything at once. Word's Find And Replace dialog box illustrates how useful it can be to display an abbreviated dialog box containing the most common tasks and expand the box to reveal less-popular options if the user needs access to them.

Master It

Name the two most common techniques you can use to display additional options in a dialog box.

**Create multipage dialog boxes.**

VBA includes the MultiPage control, which enables you to create multipage dialog boxes. Word's Font dialog box is an example of one. You can access any page (one at a time) by clicking its tab at the top of the page.

Master It

How does the TabStrip control differ from the MultiPage control? What are the typical uses for each?

**Create modeless dialog boxes.**

A _modeless_ dialog box can be left visible onscreen while the user continues to work in an application. For example, the Find And Replace dialog box in Access, Word, and Excel is modeless, as is the Replace dialog box in PowerPoint. A _modal_ dialog box, by contrast, must be closed by users before they can continue to interact with the application.

Master It

How do you make a user form modeless?

**Work with form events.**

Events are actions that happen while a program is executing. Many events are supported by the UserForm object and the controls you use on it. By using events, you can monitor what the user does and take action accordingly or even prevent the user from doing something that doesn't seem like a good idea.

Master It

Name two of the three most useful events available in VBA programming.

Part 5

Building Modular Code and Using Classes

  * **Chapter 16: Building Modular Code and Using Classes**
  * **Chapter 17: Debugging Your Code and Handling Errors**
  * **Chapter 18: Building Well-Behaved Code**
  * **Chapter 19: Securing Your Code with VBA's Security Features**

Chapter 16

Building Modular Code and Using Classes

This chapter shows you how to start building modular code—code broken up into individual components rather than all built together into a monolithic mass. You'll also see how to create _reusable code_ that you can use in future procedures.

The second part of this chapter discusses how you can build and use your own classes in VBA to implement custom objects, store information in them, and return information from them.
+ +In this chapter you will learn to do the following: + + * Arrange your code in modules + * Call a procedure + * Pass information from one procedure to another + * Understand what classes are and what they're for + * Create an object class + +# Creating Modular Code + +The code that you've created so far in this book has been effective—it _worked_ —but much of it has been less concise, organized, or elegant than it might be. The following sections show you how to refine your code. + +* * * + +What Is Elegance in Code? + +_Elegance_ in computer programming means not only that your code is bug-free and impeccably put together and that your user interface is well designed, but also that the code contains nothing unnecessary—it has been stripped down to the minimum required to achieve the desired effect. + +* * * + +## What Is Modular Code? + +_Modular code_ is code composed of different procedures that you can use in combination. The name doesn't specifically come from the fact that you store your VBA code in modules. + +For example, suppose you're working in Word. You can take a monolithic approach and create a single giant procedure that does a lot of things: creates a document based on the user's choice of template, inserts text and formats it, saves it in a particular folder under a name of the user's choice, prints it to a specific printer, and then closes it. Whew! + +Or...you can take the more practical _modular_ approach and subdivide this lengthy series of tasks into several separate procedures—one for each task. You can then create a kind of master procedure that runs each of these individual task procedures. In this way you can achieve the same results as executing the large, monolithic procedure. But subdivided code is easier to read, test, and even sometimes reuse. Think of it as using multiple small subs rather than a single large sub. + +You can also later create new master procedures that reuse these individual task procedures in a different way. + +## Advantages of Using Modular Code + +Modular code has several advantages over code that lumps everything together in one long sub or function. For one thing, it's often easier to write modular code because you create a number of short procedures, each of which performs a specific task. You stay focused on the single task at hand. + +You can usually debug these procedures relatively easily too, because their shorter length makes it simpler to identify, locate, and eliminate bugs. + +The procedures will also be more readable because they're less complex and you can more easily follow what they do. + +Modular code is also more efficient, for four reasons: + + * By breaking your code into procedures, you can repeat their tasks at different points in a sequence of procedures without needing to repeat the lines of code. Having less code should make your procedures run faster. + * By reusing whole procedures, you can reduce the amount of code you have to write. And by writing less code, you give yourself less chance to write new errors into your program. + * If you need to change an item in the code, you can make a single change in the appropriate procedure instead of having to make changes at a number of locations in a long procedure (and perhaps missing some of them). This change then also applies to any procedures that call the procedure. + * You can call individual procedures from other procedures without having to assimilate them into the other procedures. 
Just think how tedious it would be if you had to create each of VBA's many built-in functions from scratch instead of being able to invoke them at will. You can do much the same with functions you create—reuse them rather than reinvent the wheel.

## How to Approach Creating Modular Code

The usefulness of modular coding will vary from person to person, from project to project, and from procedure to procedure. For example, if you record a macro to perform a simple, one-time task on a number of presentations, there's no need to worry about breaking it down into its components and formalizing them as procedures. Just go ahead and use a single procedure.

However, if you sit down to plan a large procedure that's going to automate the creation of your company's budget-estimate spreadsheets, you can benefit greatly from dividing the code into a set of several procedures. This automation job is complex and requires a lot of code, and it's also a program that must be reused every time there's a new budget proposal.

You can go about creating modular code in two main ways:

  * Record (if the application you're using supports the VBA Macro Recorder) or write a procedure as usual and then examine it and break it into modules as necessary. This is a great way to start creating modular code, but it's usually less efficient: You'll end up spending a lot of time retrofitting your original, large procedure as you break it into smaller, separate procedures.
  * List the different actions that your project requires, then code each action (or set of actions) as a separate procedure. This method requires a bit more planning but usually proves more efficient in the long run.

## Arranging Your Code in Modules

Once you've created a set of procedures, you can move them to a new module within the same project, or even to a different project. By grouping your procedures in modules, you can easily distribute the procedures to your colleagues without including any they don't need. In addition, you can remove from your immediate working environment any modules of code that you don't need.

* * *

Give Descriptive Names to Your Modules

Give your modules descriptive names so that you can instantly identify them in the VBA Editor Project Explorer and other module-management tools. Avoid leaving modules named the default Module1, Module2, and so on.

* * *

## Calling a Procedure

When one of your procedures needs to use another procedure you wrote, it _calls_ it (by name) in the same way that you learned in Chapter 9, "Using Built-in Functions," to call a built-in function like MsgBox.

To call a procedure in the same project, either enter the name of the procedure to be called as a statement or use a Call statement with the name of the procedure.

The syntax for the Call statement is the same for procedures as for functions:

    [Call] _name_ [ _argumentlist_ ]

Here, _name_ is a required argument giving the name of the procedure to call, and _argumentlist_ is an optional comma-delimited list of the variables, arrays, or expressions to pass to the procedure. You use an argument list only for procedures that require arguments.

Calling involves two procedures, the caller and the called.
For example, the following CreateReceiptLetter procedure (the caller) calls the procedure FormatDocument (the called):

    Sub CreateReceiptLetter()
        'other actions here
        **Call FormatDocument**
        'other actions here
    End Sub

Most programmers omit the Call keyword, using just the name of the procedure. This next code does the same thing as the previous code example:

    Sub CreateReceiptLetter()
        'other actions here
        **FormatDocument**
        'other actions here
    End Sub

However, as with calling built-in functions, some programmers believe that using the Call keyword can make it clearer that your code is calling a procedure, and it enables you to search more easily for your calls. (When debugging, you can see which procedures are calling others by choosing the Call Stack option on the Editor's View menu. This feature is available only in Break mode, however, not during design time.)

In the following example, a procedure named Caller calls a procedure named Called, which takes the String argument strFeedMe. Note that when you use Call, you need to enclose the argument list in parentheses:

    Sub Caller()
        Call Called("Hello")
    End Sub

    Sub Called(ByVal strFeedMe As String)
        MsgBox strFeedMe
    End Sub

Again, you can omit the Call keyword and, if you wish, the parentheses, and yet achieve the same result:

    Sub Caller()
        Called "Hello"
    End Sub

As well as calling a procedure in the same project, you can call a procedure in another open project in the same host application (but usually not in another application). Typically, the syntax used to call a procedure in another project is as follows, although it can vary by application and version:

    Project.Module.Procedure

To call a procedure in another project, you need to add a reference to that project in the VBA Editor's References dialog box. Choose Tools ⇒ References, select the project (click the Browse button if you need to browse to it), and then click the OK button. Once this reference is in place, you can call the procedure.

* * *

Circular References Are Not Allowed

You can't add to the current project a reference to a project that itself contains a reference to the current project. If you attempt a circular reference like that, when you add the reference and close the References dialog box, the VBA Editor displays a message box with the warning "Cyclic reference of projects not allowed" and the Editor refuses to insert the reference. (It does close the References dialog box, though.)

* * *

Let's turn our attention to another benefit of modular code: You can refine your code and make it run faster by making logical improvements and visual improvements.

## Making Logical Improvements to Your Code

Breaking a large procedure into several smaller procedures can improve the logic of your code by forcing you to consider each set of actions the procedure takes as _modular_, which means they're separate from other sets of actions. And you can also improve the logic of your code in other ways: by using explicit variable declarations, by stripping out unnecessary statements to simplify recorded code, and by using With statements to eliminate repetitive object references. The following sections describe ways to improve the quality of your code.

### Declaring Variables Explicitly Instead of Implicitly

This has been mentioned before, but it's important.
Instead of declaring variables implicitly, declare all your variables explicitly:

    Dim strName As String
    strName = "Lola Montez"

Use that approach rather than the implicit declaration approach, which skips declaring the variable and merely assigns a value to it (which implicitly creates it):

    strName = "Lola Montez"

Explicit declaration allows VBA to allocate only as much memory as that variable type needs. What's more, by specifying the data type of a variable, you relieve VBA of the necessity to waste time figuring out the data type each time the variable appears in your code. Better still, you avoid the risk of unintentionally storing the wrong type of data in the variable. Because the variable is explicitly typed, VBA displays an error message rather than storing the data and changing the variable type.

Table 16.1 shows the details on the amounts of memory that the different types of variables require.

Table 16.1 Memory consumed by the different types of variables

**Variable** | **Memory Needed (Bytes)**
---|---
Boolean | 2
Byte | 1
Currency | 8
Date | 8
Decimal | 14
Double | 8
Integer | 2
Long | 4
Object | 4
Single | 4
String | Variable-length strings: 10 bytes plus the storage required for the string, which can be up to about two billion characters; fixed-length strings: the number of bytes required to store the string, which can be from 1 to about 64,000 characters
Variant | Variants that contain numbers: 16 bytes; variants that contain characters: 22 bytes plus the storage required for the characters

How much memory you save by specifying data types, and how much difference choosing variable types makes to your procedures, depends on the type of work you're doing. For example, if you store a million characters in a variable, the 12 bytes you save by declaring it as a String rather than leaving it a Variant make little difference.

But if you use many variables on a computer with limited memory, specifying the appropriate data types for your variables may save enough memory to enable your procedure to run where it otherwise wouldn't have been able to, or at least enable it to run faster. Of course, hardware is continually improving—and memory is hardware. Now that RAM is cheap and plentiful, conserving memory is not much of an issue for programmers.

A second reason for declaring your variables explicitly rather than implicitly is to make your code easier to read and to debug. And a third reason is that you get some runtime range-checking for free. If you _know_ something will be less than 32,768 and therefore declare it as the Integer data type (rather than the Long type), you'll automatically get a helpful overflow error if a Long-size value creeps into it somehow at runtime.

* * *

**Simplify Recorded Code**

Recall that the Macro Recorder (available only in Word and Excel) offers an excellent way to get started writing code for a project. Just turn on the recorder and carry out the actions you want your code to accomplish. The recorder can write code for many tasks. It can't create conditional branches, loops, and a few other code features, but it nevertheless can do quite a bit.

The Macro Recorder provides a great way to kick-start creating code by letting you identify quickly the built-in objects the procedure will need to work with and the methods and properties you'll need to use with them.
But as you've seen, one drawback of the Macro Recorder is that it tends to record a lot of code that you don't actually need in your procedures. It records the _state_ of a context—the status of _all_ the options in the current context. And you're probably interested in only one or two options.

It's like taking a photo. The camera records _everything_ that you point it at. But often you don't want to see everything, just a particular object. You took a picture of the school play. The photo contains all the kids on the stage, but you're only really interested in little Darla's lovely smile and her costume. So you use a graphics program to crop out (cut away) everything but Darla.

Here's an example of "cropping" code: When you record a procedure that changes one setting in a dialog box (such as switching to italic in the Font dialog box in Word), the Macro Recorder nonetheless records _all_ the other settings, not only on that page of the dialog box but also on all its other pages (Character Spacing and so on). Just in case you wanted them. But you don't. You're interested only in the italic feature.

Once you've finished recording the procedure, you'll often want to open it to make minor adjustments; to add loops, decisions, or UI items (message boxes, input boxes, or user forms); or even to lift parts of the code for use in other procedures. When you do this, first examine the code the Macro Recorder has recorded, and where possible, strip out the statements unrelated to your purpose. Leave only the recorded pieces of code that you need. Make the code focus on what you're actually doing—the task you're carrying out. Later, you'll thank yourself if you have to examine or reuse this code: You'll be able to see easily what the code is doing: not superscript, not boldface, nor any of the other settings; just italic.

Take this Word example. Compare the Applying_Arial_Font procedure that follows with the Stripped_Down_Procedure_Applying_Arial_Font procedure that comes after it:

    Sub Applying_Arial_Font()
    '
    ' Applying_Arial_Font Macro
    ' Applies the Arial font to the selected text
    '
        With Selection.Font
            **.Name = "Arial"**
            .Size = 13
            .Bold = False
            .Italic = False
            .Underline = wdUnderlineNone
            .UnderlineColor = wdColorAutomatic
            .StrikeThrough = False
            .DoubleStrikeThrough = False
            .Outline = False
            .Emboss = False
            .Shadow = False
            .Hidden = False
            .SmallCaps = False
            .AllCaps = False
            .Color = wdColorAutomatic
            .Engrave = False
            .Superscript = False
            .Subscript = False
            .Spacing = 0
            .Scaling = 100
            .Position = 0
            .Kerning = 0
            .Animation = wdAnimationNone
        End With
    End Sub

    Sub Stripped_Down_Procedure_Applying_Arial_Font()
        **Selection.Font.Name = "Arial"**
    End Sub

As you can see, the Stripped_Down_Procedure_Applying_Arial_Font code has the same effect as the recorded procedure, but it contains 3 lines instead of the recorded procedure's 31.

* * *

### Using With Statements to Simplify Your Code

When you're performing multiple actions with an object, you can often use With statements to avoid repeating the object reference for each action. This simplifies your code. It becomes easier to read. And it may make it run marginally faster.
For example, the following statements contain multiple references to the first Paragraph object—Paragraphs(1)—in the ActiveDocument object in Word:

    ActiveDocument.Paragraphs(1).Range.Font.Bold = True
    ActiveDocument.Paragraphs(1).Range.Font.Name = "Times New Roman"
    ActiveDocument.Paragraphs(1).LineSpacingRule = wdLineSpaceSingle
    ActiveDocument.Paragraphs(1).Borders(1).LineStyle = wdLineStyleDouble
    ActiveDocument.Paragraphs(1).Borders(1).ColorIndex = wdBlue

You can remove this redundancy by employing a With structure that references the Paragraphs(1) object in the ActiveDocument object, reducing the number of object references involved:

    With ActiveDocument.Paragraphs(1)
        .Range.Font.Bold = True
        .Range.Font.Name = "Times New Roman"
        .LineSpacingRule = wdLineSpaceSingle
        .Borders(1).LineStyle = wdLineStyleDouble
        .Borders(1).ColorIndex = wdBlue
    End With

When you need to work with multiple child objects contained within a single parent object, you can either use separate With statements or pick the lowest common denominator of the objects you want to work with and use an outer With statement along with nested With statements for the child objects.

If you wish, you can further reduce the number of object references in the previous code example by using nested With statements for the Font object in the Range object and for the Borders(1) object, like this:

    With ActiveDocument.Paragraphs(1)
        With .Range.Font
            .Bold = True
            .Name = "Times New Roman"
        End With
        .LineSpacingRule = wdLineSpaceSingle
        With .Borders(1)
            .LineStyle = wdLineStyleDouble
            .ColorIndex = wdBlue
        End With
    End With

### Don't Use With Statements Pointlessly

With statements are great for reducing repetitive object references and making your code easier to read, but don't use them just because you can. If you have only one statement within a With statement, as in the following example (which again uses Word), you're probably wasting your time typing the extra code to set up the With structure:

    With ActiveDocument.Sections(1).Headers(wdHeaderFooterPrimary) _
        .Range.Words(1)
        .Bold = True
    End With

Likewise, don't nest With statements unless you need to—it gets confusing, like this bizarre example:

    With ActiveDocument
        With .Sections(1)
            With .Headers(wdHeaderFooterPrimary)
                With .Range
                    With .Words(1)
                        With .Font
                            .Italic = True
                            .Bold = False
                            .Color = wdColorBlack
                        End With
                    End With
                End With
            End With
        End With
    End With

This code is better when written like this:

    With ActiveDocument.Sections(1).Headers(wdHeaderFooterPrimary).Range. _
        Words(1).Font
        .Italic = True
        .Bold = False
        .Color = wdColorBlack
    End With

### Optimizing Your Select Case Statements

When you use a Select Case statement, arrange the Case statements so that the most likely ones appear first. This saves VBA some work and time—VBA goes down through the list of Case statements until it finds a match, so the earlier in the list it scores a match, the quicker the execution of the statement.
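For example, if most of the documents your code processes are letters, test for letters first. Here's a minimal sketch of this ordering; the document types and the handling are invented for illustration:

    Sub ProcessDocument(ByVal strDocType As String)
        Select Case strDocType
            Case "Letter"     'the most common type, so test it first
                'process letters here
            Case "Invoice"    'less common
                'process invoices here
            Case Else         'rare or unexpected types
                'handle everything else here
        End Select
    End Sub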
### Don't Check Things Senselessly

If you need to implement a setting (especially a Boolean one) every time a particular procedure runs, there's no point in checking the current value.

For example, suppose you wanted to make sure the EnableAutoRecover property (a Boolean property that sets or returns whether the AutoRecover feature is on for the current workbook) of the ActiveWorkbook object in Excel is set to True. You could check the current value of EnableAutoRecover and, if it is False, set it to True like this:

    If ActiveWorkbook.EnableAutoRecover = False Then _
        ActiveWorkbook.EnableAutoRecover = True

But that wastes code. Instead, simply set the property to True:

    ActiveWorkbook.EnableAutoRecover = True

### Removing Unused Elements from Your Code

To improve the efficiency of your code, try to remove all unused elements from it. When creating a complex project with many interrelated procedures, it's easy to end up with some procedures that are almost or entirely useless. Perhaps you were trying out various approaches and sketched in a couple of procedures that ended up never being used.

You'll find it easier to remove superfluous procedures if you've commented your code comprehensively while creating it, so you can be sure that what you're removing is unused rather than used. If you're in doubt as to which procedure is calling which, display the Call Stack dialog box (see Figure 16.1); choose View ⇒ Call Stack or press Ctrl+L to see what's happening. Recall that the Call Stack dialog box is available in Break mode (while you're single-stepping through a procedure, or the Editor has halted execution at a breakpoint, and so on). If one procedure has called another one during execution, they will both be listed.

Figure 16.1 The Call Stack dialog box lets you see which procedure has called which.

Figure 16.1 reveals that the procedure named Identify_Current_User called the procedure named ToggleItal, that ToggleItal then called GetClipboardText, and that GetClipboardText, in turn, called DocumentOpen. Execution is currently halted (is in Break mode) within the DocumentOpen procedure.

Alternatively, try one of these techniques:

 * Set a breakpoint at the beginning of a suspect procedure so that you'll be alerted when it's called.
 * Display message boxes at decisive junctures in your code so you can see what's happening: Is the procedure ever called?
 * Use a Debug.Print statement at an appropriate point (again, perhaps the beginning of a procedure) to temporarily log information in the Immediate window. (A short sketch of this technique appears at the end of this section.)

Before you remove an apparently dead procedure from your code, make sure not only that it's unused in the way the code is currently being run, but also that it wouldn't be used if the code were run under different circumstances. If you think that the procedure might still be used, try moving it to a backup project from which you can easily restore it rather than deleting it altogether.

Once you've removed any unused procedures, examine the variables in the procedures. Even if you're using the Option Explicit declaration and declaring every variable explicitly, check that you haven't declared variables that end up not being used. For simple projects, you'll be able to catch the unused variables by using the Locals window to see which of them never get assigned a value. For more complex projects, you may want to try some of the available third-party tools that help you remove unneeded elements from your code.

If in doubt, just use the Editor's Find feature (Ctrl+F) to see whether the variable name appears only once: where the variable is declared.

Removing unused procedures and variables isn't crucial. They do no real harm; they're just debris. But they do clutter up your code, potentially making it harder to understand and modify if you come back to it later for maintenance or reuse.
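Here is a minimal sketch of the Debug.Print technique mentioned in the list above (the procedure name is invented). Each time the procedure actually runs, a line appears in the Immediate window, so you can tell at a glance whether it is ever called:

    Sub SuspectProcedure()
        Debug.Print "SuspectProcedure was called at " & Now
        'rest of the procedure's statements here
    End Sub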
* * *

Back Up Your Modules, Forms, and Class Modules

Before removing an entire module, use the File ⇒ Export File command to export a copy of the module to a .BAS file in a safe storage location in case the module contains anything you'll subsequently discover to be of value. Similarly, export your user forms to .FRM files and your classes to .CLS files.

* * *

## Making Visual Improvements to Your Code

Another way to improve your code is to format it so it's as easy as possible to read, maintain, and modify.

### Indenting the Different Levels of Code

As you've seen in the examples so far in this book, you can make code much easier to follow by indenting some lines of code with tabs or spaces to show their logical relation to each other or to visually illustrate subordination and structures such as loops.

You can click the Indent and Outdent buttons on the Editor's Edit toolbar or press Tab and Shift+Tab to quickly indent or unindent a selected block of code, with the relative indentation of the lines within the block remaining the same.

* * *

Labels Can't Be Indented, But That's a Good Thing

You can't indent a label—a word ending with a colon (:) and used as the target of a GoTo statement. If you try to indent a label, the VBA Editor won't let you. The Editor removes all spaces to the left of the label as soon as you press Enter or otherwise move the insertion point off the line containing the label. A label is a target and _should_ be on the far left of its code line so you can easily see it.

* * *

### Using Line-Continuation Characters to Break Long Lines

Use the line-continuation character (a space followed by an underscore) to break long lines of code into two or more shorter lines. Breaking lines makes long statements fit within the Code window on an average-size monitor at a readable point size and enables you to break the code into more logical segments.

### Using the Concatenation Character to Break Long Strings

You can't use the line-continuation character to break strings, however. If you want to break a long string, you must divide it into shorter strings and then use the concatenation character (&) to join the parts again. You can then place the parts on separate lines by following each & character with the line-continuation character. For example, consider a long string such as this:

    strMessageText = "The macro has finished running. Please check your presentation to ensure that all blank slides have been removed."

Instead, you could divide the string into shorter pieces and then rejoin them like this:

    strMessageText = "The macro has finished running. " & **_**
        "Please check your presentation to ensure that " & **_**
        "all blank slides have been removed."

* * *

**For Legacy Reasons, You Can Employ the + Character for Concatenation**

Alternatively, you can use the addition character (+) to concatenate one string with another, but not to concatenate a string and a numeric variable (do that, and VBA tries to _add_ them mathematically instead of concatenating them). However, your code is easier to read if you just stick with the concatenation character & when concatenating strings. Leave the + character for math.

* * *
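A quick sketch of the pitfall described in the sidebar (the values are invented for illustration):

    Sub ConcatVersusAdd()
        MsgBox "5" & 5     'displays 55: the values are concatenated
        MsgBox "5" + 5     'displays 10: VBA adds the values mathematically
        'MsgBox "five" + 5 'would raise runtime error 13, "Type mismatch"
    End Sub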
### Using Blank Lines to Break Up Your Code

To make your code more readable, use blank lines to separate statements into logical groups. For example, you might segregate all the variable declarations in a procedure as shown in the following example so that they stand out more clearly:

    Sub Create_Rejection_Letter()

        Dim strApplicantFirst As String, strApplicantInitial As String, _
            strApplicantLast As String, strApplicantTitle As String
        Dim strJobTitle As String
        Dim dteDateApplied As Date, dteDateInterviewed As Date
        Dim blnExperience As Boolean

        strApplicantFirst = "Shirley"
        strApplicantInitial = "P"
        strApplicantLast = "McKorley"

### Using Variables to Simplify Complex Syntax

You can use variables to simplify and shorten complex syntax. For example, you could display a message box by using an awkwardly long statement such as this one:

    If MsgBox("The document contains no text." & vbCr & vbCr _
        & "Click the Yes button to continue formatting the document." & _
        " Click the No button to cancel the procedure.", _
        vbYesNo + vbQuestion, _
        "Error Selecting Document: Cancel Procedure?") = vbYes Then

Alternatively, you could use one String variable for building the message and another String variable for the title:

    Dim strMsg As String
    Dim strTBar As String
    strMsg = "The document contains no text." & vbCr & vbCr
    strMsg = _
        strMsg & "Click the Yes button to continue formatting the document. "
    strMsg = strMsg & "Click the No button to cancel the procedure."
    strTBar = "Error Selecting Document: Cancel Procedure?"
    If MsgBox(strMsg, vbYesNo + vbQuestion, strTBar) = vbYes Then

At first sight, this code looks more complex than the straightforward message-box statement, mostly because of the explicit variable declarations that increase the length of the code segment. But in the long run, this approach is much easier to read and modify.

In the previous example, you could also replace the vbYesNo + vbQuestion part of the MsgBox statement with a variable (preferably a Long rather than a Variant). But doing so makes the code harder to read and is seldom worthwhile.

### Passing Information from One Procedure to Another Using Arguments

Often when you call another procedure, you'll need to pass information to it from the calling procedure. And you sometimes go the other way: When the called procedure has finished executing, it needs to pass information back to the caller.

The best way to pass information from a caller procedure to a called procedure is by using arguments. You declare the arguments a procedure takes in its declaration line; the arguments appear in the parentheses after the procedure's name. A procedure can take either a single argument (as the first of the following declarations does) or multiple arguments separated by commas (as the second does):

    Sub PassOneArgument(MyArg)
    Sub PassTwoArguments(FirstArg, SecondArg)

As with functions (discussed in Chapter 9), you can pass an argument either _by reference_ or _by value_. When a procedure passes an argument to another procedure by reference, the recipient procedure gets access to the memory location where the original variable is stored and can change the original variable. By contrast, when a procedure passes an argument to another procedure by value, the recipient procedure gets only a copy of the information in the variable and can't change the information in the original variable.

Passing an argument by reference is useful when you want to manipulate the variable in the recipient procedure and then return the variable to the procedure from which it originated. Passing an argument by value is useful when you want to use the information stored in the variable in the recipient procedure and at the same time make sure the original information in the variable doesn't change.
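The difference is easy to see in a small sketch (the procedure and variable names here are invented for illustration). The change to the ByRef parameter survives the call; the change to the ByVal parameter doesn't:

    Sub ShowTheDifference()
        Dim lngFirst As Long, lngSecond As Long
        lngFirst = 1
        lngSecond = 1
        Tweak lngFirst, lngSecond
        MsgBox lngFirst & " " & lngSecond    'displays "2 1"
    End Sub

    Sub Tweak(ByRef lngByRef As Long, ByVal lngByVal As Long)
        lngByRef = lngByRef + 1    'changes the caller's variable
        lngByVal = lngByVal + 1    'changes only a local copy
    End Sub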
By reference is the default way to pass an argument, but you can also use the ByRef keyword to state explicitly that you want to pass an argument by reference. Both of the following statements pass the argument MyArg by reference:

    Sub PassByReference(MyArg)
    Sub PassByReference(ByRef MyArg)

To pass an argument by value, you must use the ByVal keyword. The following statement passes the ValArg argument by value:

    Sub PassByValue(ByVal ValArg)

In practice, however, you'll rarely, if ever, need to employ ByVal. Arguments are nearly universally passed by reference, the default.

If necessary, you can pass some arguments for a procedure by reference and others by value. The following statement passes the MyArg argument by reference and the ValArg argument by value:

    Sub PassBoth(ByRef MyArg, ByVal ValArg)

You can explicitly declare the data type of arguments you pass in order to take up less memory and ensure that your procedures are passing the type of information you intend them to. But when passing an argument by reference, you need to make sure that the data type of the argument you're passing matches the data type expected by the called procedure. For example, if you declare a String in the caller procedure and try to pass it as an argument when the called procedure is expecting a Variant, VBA reports a "ByRef argument type mismatch" error.

To declare the data type of an argument, include a data-type declaration in the argument list. The following statement declares MyArg as a String and ValArg as a Variant:

    Sub PassBoth(MyArg As String, ValArg As Variant)

You can specify an optional argument by using the Optional keyword. Place the Optional keyword before the ByRef or ByVal keyword if you need to use ByRef or ByVal:

    Sub PassBoth(ByRef MyArg As String, ByVal ValArg As Variant, _
        Optional ByVal MyOptArg As Variant)
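If you declare an optional argument as a Variant, the called procedure can use the IsMissing function to find out whether the caller actually supplied it. Here's a minimal sketch (the procedure names are invented); note that IsMissing works only with Variant arguments:

    Sub TestGreeting()
        ShowGreeting "Hello"                'displays "Hello"
        ShowGreeting "Hello", "Dr. Smith"   'displays "Hello, Dr. Smith"
    End Sub

    Sub ShowGreeting(ByVal strGreeting As String, Optional varName As Variant)
        If IsMissing(varName) Then
            MsgBox strGreeting
        Else
            MsgBox strGreeting & ", " & varName
        End If
    End Sub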
Listing 16.1 shows a segment of a procedure that uses arguments to pass information from one procedure to another.

**Listing 16.1**: Passing arguments from one procedure to another

     1. Sub GetCustomerInfo()
     2.     Dim strCustName As String, strCustCity As String, _
                strCustPhone As String
     3.     'Get strCustName, strCustCity, strCustPhone from a database
     4.     CreateCustomer strCustName, strCustCity, strCustPhone
     5. End Sub
     6.
     7. Sub CreateCustomer(ByRef strCName As String, _
            ByRef strCCity As String, ByVal strCPhone As String)
     8.     Dim strCustomer As String
     9.     strCustomer = strCName & vbTab & strCCity _
                & vbTab & strCPhone
    10.     'take action with strCustomer string here
    11. End Sub

Listing 16.1 contains two minimalist procedures—GetCustomerInfo and CreateCustomer—that show how to use arguments to pass information between procedures:

 * The first procedure, GetCustomerInfo, explicitly declares three String variables in line 2: strCustName, strCustCity, and strCustPhone.
 * Line 3 contains a comment indicating that you would write additional code here to obtain the data and assign information to the variables.
 * Line 4 calls the CreateCustomer procedure and passes to it the variables strCustName, strCustCity, and strCustPhone as arguments. Because this statement doesn't use the Call keyword, the arguments aren't enclosed in parentheses.
 * Execution then switches to line 7, which starts the CreateCustomer procedure by declaring the three String arguments it uses: strCName and strCCity are to be passed by reference, and strCPhone is to be passed by value.
 * Line 8 declares the String variable strCustomer. Line 9 then assigns to strCustomer the information in strCName, a tab, the information in strCCity, another tab, and the information in strCPhone.
 * Line 10 contains a comment indicating where the procedure would take action with the strCustomer string (for example, dumping it into some kind of primitive database), and line 11 ends the procedure.

### Passing Information Back from a Called Procedure

Just a reminder: Functions, not subs, are used to pass information _back_ to a caller. Both functions and subs are procedures, but functions are specifically designed to send information back to a caller.

This code example calls a function that adds state tax to a purchase price, then passes back the resulting total cost:

     1. Sub FindTotalCost()
     2.
     3.     Dim OriginalCost, TotalCost ' declare two variant types
     4.     OriginalCost = 155 'this sweater is expensive
     5.
     6.     TotalCost = AddTax(OriginalCost) 'call the AddTax function
     7.     MsgBox TotalCost 'show the final cost including 7% tax
     8.
     9.
    10. End Sub
    11.
    12. Function AddTax(SubTotal)
    13.
    14.     AddTax = SubTotal * 1.07 'do the math and assign the result
    15.         'to the function name so it gets passed back
    16.
    17. End Function

Data is passed from the caller to the called in line 6. Data is passed back from the called to the caller by assigning a value to the name of the function in line 14.

### Passing Information from One Procedure to Another Using Private or Public Variables

Another way to pass information from one procedure to another is to use either private variables or public variables. You can use private variables if the procedures that need to share information are located in the same module. If the procedures are located in different modules, you'll need to use public variables to pass the information.

* * *

Avoid Using Global Variables to Pass Data

Using private or public variables to pass information from one procedure to another is widely considered poor programming practice. Doing so makes it harder to track the flow of information between procedures, especially when several procedures are involved. However, you may sometimes find this way of passing information helpful—or you may be required to work with someone else's code that uses this approach.

* * *

Listing 16.2 contains an example of passing information by using private variables.

**Listing 16.2**: Passing data using a private variable

     1. Private strPassMe As String
     2.
     3. Sub PassingInfo()
     4.     strPassMe = "Hello."
     5.     PassingInfoBack
     6.     MsgBox strPassMe
     7. End Sub
     8.
     9. Sub PassingInfoBack()
    10.     strPassMe = strPassMe & " How are you?"
    11. End Sub

Listing 16.2 begins by declaring the private String variable strPassMe at the beginning of the code sheet for the module. strPassMe is then available to all the procedures in the module.

 * The PassingInfo procedure (lines 3 to 7) assigns the text Hello. (with the period) to strPassMe in line 4 and then calls the PassingInfoBack procedure in line 5.
 * Execution then shifts to line 9, which starts the PassingInfoBack procedure.
 * Line 10 adds How are you? with a leading space to the strPassMe String variable.
 * Line 11 ends the PassingInfoBack procedure, at which point execution returns to the PassingInfo procedure at line 6, which displays a message box containing the strPassMe string (now _Hello. How are you?_).
 * Line 7 ends the procedure.
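If PassingInfo and PassingInfoBack lived in different modules, you could declare the shared variable Public instead of Private. A minimal sketch of the idea, assuming two ordinary code modules (the module names are invented for illustration):

    ' In the declarations section of Module1
    Public strPassMe As String

    ' In Module2: this procedure can read and change strPassMe
    Sub PassingInfoBack()
        strPassMe = strPassMe & " How are you?"
    End Sub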
# Creating and Using Classes

A _class_ is the formal definition of an object—typically, a custom object. By defining classes, you can build your own custom objects. A class is essentially a template for an object: Once you've defined the class in your code, VBA will then create objects based on it when the code executes.

The relationship between class and object is sometimes described as similar to a cookie cutter and a cookie or a blueprint and the houses based on that blueprint. The former is a description; the latter is the description brought to life.

Another way to think of the distinction between class and object is to recall the distinction between design time and runtime. You create a _class_ during design time by writing code that describes the _object_ (or multiple objects). The class will come into being during runtime when the class code executes.

The phrase _come into being_ is more formally expressed as follows: an object is _instantiated_. (An _instance_—the object—of the class comes into existence during runtime.) Got it?

## What Can You Do with Class Modules?

Programming means telling the computer how to process some information.

Information to be processed can be stored in various places. For example, you can store it in a database that your code accesses. Or you can type it into your code, such as storing the information _Donald_ in a string variable:

    MyString = "Donald"

The second half of information processing is executing code that manipulates the information (the data). You've been doing this throughout this book. Here we process some data by computing the length of a string:

    MsgBox Len(MyString)

One interesting thing about objects is that they can not only process information; they can also _contain_ it. They can hide their data (properties) or their processing (methods) from outside programming. This hiding is called _encapsulation_.

You can use objects to store information, to process information, and to make information selectively accessible (hide it or not, as the programmer specifies) to the various other objects in an application.

Consider what is to me the most successful application of object-oriented programming (or OOP): the controls you can put on a user form, such as the TextBoxes or Labels that we explored in Chapters 14 and 15—"Creating Simple Custom Dialog Boxes" and "Creating Complex Forms."

A Label is an object that has a set of _properties_, which can be visualized as both data and processing capabilities. When you assign the value 33, say, to a Label's Left property, the Label automatically moves to that location on the form.

_You did no programming to make this move happen_. You merely passed a desired position to the object, and the object's internal capability to move itself took over and carried out the necessary tasks to make this happen. This is encapsulation: The programming that moves a Label is hidden from the outside world. The object has its own capabilities. And it contains its own internal data as well (the kind of line that frames a Label, its default width, the color of its background, and so on).
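For instance, with a Label named lblStatus and a CommandButton named cmdMove on a user form (the control names here are invented), a single assignment is all your code needs:

    Private Sub cmdMove_Click()
        lblStatus.Left = 33    'the Label repositions itself; no further code needed
    End Sub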
When OOP is applied to more abstract concepts such as translating procedural programming (subs and functions) into OOP (objects), the results are mixed. Some programmers swear by OOP; others demur. OOP has become quite popular in many professional programming circles, but even after decades of implementation, OOP still causes controversy. For small jobs like simple macros, OOP is clearly overkill. For large projects, you might like the organizational and security features of OOP. And, if you intend to go into professional programming, you must understand how to use it.

## A Brief Overview of Classes

To create a class in VBA, you insert a _class module_ in a project (Insert ⇒ Class Module) and give the class the name by which you'll access it. You then use the Code window to create the code (constant and variable declarations, subroutines, and functions) that defines the properties and methods that the class will have. When you've finished, the class contains all the information that the custom object needs to perform its tasks and store data.

A major distinction between a class module and a regular code module is that you don't directly execute code in a class module. Instead, in a regular code module you declare an object variable of the class's type. You then use this variable to access the class's members (its properties and methods) in your regular code.

The concept of classes can be difficult to grasp, so the following sections present a simple example of a class that relates to something physical—the book you're holding. The example describes a class named Book that contains the salient information about a book. During runtime, after creating the Book object, the example's code adds this book's information to the Book object.

Entire books endeavor to explain OOP and its uses. But I'll give you a taste of it here. The following example class works in any VBA host application.

## Planning Your Class

Before you start creating a class, decide the following:

 * A class describes an object, so... what does this object _do_?
 * What information does the class need to contain for the object to do what it's supposed to do? You use variables and properties to store this information. You use _variables_ to store information used privately, internally inside the object, and _properties_ to make available pieces of that information that need to be accessed from outside the object. You can create both read-only and read/write properties.
 * What capabilities should this object have? Things a class can do, its behaviors, are called its _methods_. You create subroutines and functions to implement the class's methods—subroutines for the methods that return no value after doing their job and functions for the methods that do return a value after executing.

Objects based on our Book class will contain information about a book project. Note that I said _objects_, plural. A single class can create as many objects during runtime as the programmer wishes, just as a single cookie cutter can stamp out multiple cookies. Or a single blueprint can be used to build many townhouses.

If you're a librarian programmer, you might use the Book class to generate thousands of Book objects.

The class we'll construct will need properties for storing information such as the title, author, and price and will need a method that displays all this book information.

## Creating a Class Module

The first step in creating your class is to insert a class module into your project.
You create a class module in much the same way you create a regular module.

In the Project Explorer, right-click the target project or one of the items it contains and choose Insert ⇒ Class Module from the context menu. Alternatively, choose Insert ⇒ Class Module from the Editor's menu bar, or click the Insert button on the Standard toolbar and choose Class Module from the drop-down list.

The VBA Editor creates a new class module named Class _n_ (where _n_ is the next-higher consecutive number not yet employed to name a class module) and opens a Code window for it. If the project doesn't already contain a Class Modules folder, VBA adds one, and it appears in the Project Explorer.

If you have the Require Variable Declarations option selected (on the Editor page of the Tools ⇒ Options dialog box in the VBA Editor), the VBA Editor automatically places an Option Explicit statement in the declarations area at the top of the code sheet for the class, just as it does for an ordinary module.

If you don't have the Require Variable Declarations option selected, it's still a good idea to type in the Option Explicit statement anyway to force yourself to declare variables explicitly.

## Naming the Class

Now change the name of the class to something more descriptive than Class _n_. Press F4 to display the Properties window (if it's not already displayed) and enter the new name in the (Name) text box. Make the name descriptive, because you'll be using it in your code and you'll want its purpose to be easily grasped. We can name our example class Book. Press Enter or click elsewhere in the Visual Basic Editor window to make the change take effect.

## Setting the Instancing Property

The Instancing property determines whether a class module is visible (can be instantiated—brought into existence) from an outside project.

Recall that an outside project must first reference the project that the class module is in before any access to another project's objects is even possible. Referencing is accomplished by Tools ⇒ References in the Editor.

The default setting, 1 – Private, prevents other projects from seeing the class module and from working with instances (objects) of that class. In other words, the object is encapsulated, hidden.

The other setting is 2 – PublicNonCreatable, and it allows an outside project to see the class. The outside project, even with a reference, however, still can't create instances (create objects) from the class by itself. _The instantiation must take place in the project that hosts the class_.

So, for one project to access an object in another, three conditions must be met:

 * The Instancing property in the project containing the object must be set to PublicNonCreatable.
 * The project containing the object must have instantiated that object.
 * The outside project must have established a reference to the project containing the object.

To permit an outside project access to instances of a class (objects), set the Instancing property to 2 – PublicNonCreatable. Otherwise, leave the default setting of 1 – Private intact. With the default Private setting, only the project that has the class can access objects instantiated from that class.

## Declaring Variables and Constants for the Class

After setting the Instancing property, you should declare the variables and constants that the class will need for its internal operations.
These declarations work just like the declarations you've seen so far in the book, except that you'll probably want to use a naming convention to indicate that the variables and constants belong to a class rather than to a procedure. We'll use the prefix Book on the constants and variables to make it easy for the programmer to see that they're part of the Book class.

The Book class uses the declarations shown in the following code snippet to declare one constant (BookName) and five variables (BookTitle, BookAuthor, BookPages, BookPrice, and BookPublicationDate) of assorted data types:

    Const BookName = "Book Project"
    Dim BookTitle As String
    Dim BookAuthor As String
    Dim BookPages As Integer
    Dim BookPrice As Currency
    Dim BookPublicationDate As Date

## Adding Properties to the Class

Now add the properties to the class. Table 16.2 lists the properties that the Book class uses.

Table 16.2 Properties of the Book class

**Property** | **Description**
---|---
Title | A read/write String property that sets or returns the formal title of the book
Author | A read/write String property that sets or returns the author's name
Pages | A read/write Integer property that sets or returns the page count of the book
Price | A read/write Currency property that sets or returns the price of the book
PublicationDate | A read/write Date property that sets or returns the publication date of the book

You can create properties for a class in either of two ways. The first way is less formal than the second but provides you with less control over the properties.

### Creating a Property by Using a Public Variable

One way to create a property in your code is to declare a Public variable in the class module. Doing this creates a read/write property with the name of the variable. For example, the following statement (when typed into a class module) creates a read/write Boolean property named HardCover:

    Public HardCover As Boolean

Using a Public variable like this is a quick way to create a property, but it's a bit limited: It must be read/write. You can't choose to make the property read-only (or write-only). What's more, you can't execute any other code when the program's code sets or returns the value of the property.

After declaring a Public variable, your code can then set and return the property's value in the usual way. For example, say we've created the Boolean property HardCover in an instance named MastVBA of the Book class. The following statements set (store, write data in) the property and then display a message box returning (reading the value from) the property:

    MastVBA.HardCover = False
    MsgBox MastVBA.HardCover

Something special is illustrated here. The name of the _class_ is Book, but notice that the name of an object instantiated from this class is MastVBA. Objects—there can be many derived from a given class—each have their own individual name. Instantiated objects should not have the same name as the class from which they spring.

### Creating a Property by Using Property Procedures

The second, more formal and flexible way to create a property is to use property procedures. There are three types of property procedures—Property Let, Property Get, and Property Set:

 * A Property Let procedure assigns a value to a property. It _writes_.
 * A Property Get procedure returns the value from a property. It _reads_.
 * A Property Set procedure creates a reference to an object.
(This is similar to how you create an object variable in ordinary, non-object procedures.)

You typically use these procedures in pairs: a Property Let procedure paired with a Property Get procedure, which creates a read/write value property, or a Property Set procedure paired with a Property Get procedure for an object property. If you use a Property Get procedure on its own, that property will be read-only.

#### _Assigning a Value to a Property with a_ Property Let _Procedure_

To permit outside code to assign a value to an object's property, you use a Property Let procedure. The syntax for a Property Let procedure is as follows:

    Property Let _name_ ([ _arglist_ ,] _value_ )
        [ _statements_ ]
    End Property

These are the components of the syntax:

 * The Property keyword starts the procedure, and the End Property keywords end the procedure.
 * _name_ is a required argument specifying the name of the property procedure being created. If you also create a paired Property Get procedure for this property, use the same name as the Property Let procedure.
 * _arglist_ is a required argument listing the arguments that are passed to the procedure. An argument list is required here because a Let procedure is designed to assign a value or values to this property, so the outside code must provide at least one value. If _arglist_ contains multiple arguments, you separate them with commas.

For example, the following Property Let procedure creates the String property Title, taking the argument NewTitle and passing its value to the variable BookTitle:

    Property Let Title(NewTitle As String)
        BookTitle = NewTitle
    End Property

If you don't add a Property Get procedure for this Title data, the property named Title will be write-only. Write-only properties aren't widely useful, so the next step is to write code that reads the value in the property. Then it becomes a read/write property.

#### _Returning a Value from a Property with a_ Property Get _Procedure_

To return a value from a property, you use a Property Get procedure. The syntax for a Property Get procedure is as follows:

    Property Get _name_ [( _arglist_ )] [As _type_ ]
        [ _statements_ ]
    End Property

The components of the syntax are the same as for the Property Let procedure, except for two things:

 * First, Property Get adds the optional _type_ argument, which specifies the data type for the property.
 * Second, for Property Get, the _arglist_ argument is optional. You _can_ have arguments for Property Get procedures, but you won't usually need to. If you do use arguments, their names and data types must match those in the corresponding Property Let procedure.

For example, the following Property Get procedure creates the String property Title, assigning to it the contents of the BookTitle variable:

    Property Get Title() As String
        Title = BookTitle
    End Property

If this Property Get procedure existed alone (without being paired with a corresponding Property Let procedure), it would be a read-only property. Use Property Get alone if you don't want to allow outside code to modify this property in any way.

However, because we've paired it with the Property Let Title procedure shown in the previous section, you now have a read/write property.

#### _Assigning an Object to a Property with a_ Property Set _Procedure_

Instead of assigning a value to a property, you can assign an object to it. To do so, you use a Property Set procedure rather than a Property Let procedure.
The syntax for a Property Set procedure is as follows:

    Property Set _name_ ([ _arglist_ ,] _reference_ )
        [ _statements_ ]
    End Property

The components of the syntax are the same as for the Property Let procedure, except that Property Set uses the _reference_ argument rather than the _value_ argument. _reference_ is a required argument specifying the object to reference.

For example, the following Property Set procedure creates the object property Where that references a range (assuming the class declares a module-level bookRange object variable). Note the Set keyword, which is required when assigning an object reference:

    Property Set Where(rngR As Range)
        Set bookRange = rngR
    End Property

* * *

Both Set and Let Can Be Used with Object Variables

For an object variable, you can use both a Property Set procedure and a Property Let procedure, but in most cases it makes more sense to use only a Property Set procedure.

* * *

### The Properties for the Book Class

Listing 16.3 shows the full listing of properties for the Book class.

**Listing 16.3**: All the properties of the Book class

     1. Option Explicit
     2.
     3. Const BookName = "VBA Book Project"
     4. Dim BookTitle As String
     5. Dim BookAuthor As String
     6. Dim BookPages As Integer
     7. Dim BookPrice As Currency
     8. Dim BookPublicationDate As Date
     9.
    10. Public Property Let Title(strT As String)
    11.     BookTitle = strT
    12. End Property
    13.
    14. Public Property Get Title() As String
    15.     Title = BookTitle
    16. End Property
    17.
    18. Public Property Let Author(strA As String)
    19.     BookAuthor = strA
    20. End Property
    21.
    22. Public Property Get Author() As String
    23.     Author = BookAuthor
    24. End Property
    25.
    26. Public Property Let Pages(intPages As Integer)
    27.     BookPages = intPages
    28. End Property
    29.
    30. Public Property Get Pages() As Integer
    31.     Pages = BookPages
    32. End Property
    33.
    34. Public Property Let Price(curP As Currency)
    35.     BookPrice = curP
    36. End Property
    37.
    38. Public Property Get Price() As Currency
    39.     Price = BookPrice
    40. End Property
    41.
    42. Public Property Let PublicationDate(dtePD As Date)
    43.     BookPublicationDate = dtePD
    44. End Property
    45.
    46. Public Property Get PublicationDate() As Date
    47.     PublicationDate = BookPublicationDate
    48. End Property

In Listing 16.3, each property for the Book class is declared as Public so that it is publicly accessible.

The code illustrates how you should organize your paired procedures by putting each Property Let procedure next to the corresponding Property Get procedure: The Property Let Title procedure in lines 10 through 12 is matched by the Property Get Title procedure in lines 14 through 16, and so on for the Author, Pages, Price, and PublicationDate property procedures.

Pairing the procedures makes it easy to read the code to make sure each procedure that should have a counterpart does have one, and to make sure the arguments match.

## Adding Methods to a Class

Now that we've created properties as places to store data in our object, it's time to add functionality that will process that data. It's time to add the class's methods by adding subroutines and functions as necessary. As you'll see at the end of this chapter, the VBA Editor will display a list of the members—properties and methods—of an object you create, just as it does for objects built into VBA.

Subroutines and functions you create within a class are like the subroutines and functions you use in ordinary, non-object code modules.

Our example Book class uses only one method, ShowInfo, which displays a message box showing the properties of the book.
Listing 16.4 displays the ShowInfo procedure.

**Listing 16.4**: The ShowInfo method of the Book class

     1. Sub ShowInfo()
     2.     Dim strM As String
     3.     strM = "Title:" & vbTab & BookTitle & vbCr
     4.     strM = strM & "Author:" & vbTab & BookAuthor & vbCr
     5.     strM = strM & "Pages:" & vbTab & BookPages & vbCr
     6.     strM = strM & "Price:" & vbTab & "$" & BookPrice & vbCr
     7.     strM = strM & "Date:" & vbTab & Me.PublicationDate & vbCr
     8.     MsgBox strM, vbOKOnly + vbInformation, BookName _
     9.         & " Information"
    10. End Sub

The ShowInfo procedure builds a string containing the information from the class and then displays the string in a message box. Here's what happens:

 * Line 2 declares the String variable strM, which the procedure uses to store the information for the prompt argument in the message box.
 * Line 3 adds to strM the text Title:, a tab, the contents of the BookTitle variable (which contains the title of the book in the object), and a carriage return.
 * Line 4 builds on strM, adding the author information. Likewise, line 5 adds the information on the page count, and line 6 adds the price information (including a dollar sign for completeness).
 * Line 7 also builds on strM, adding the date information. However, instead of using the class's internal variable (BookPublicationDate) to return the stored date, it calls the PublicationDate property of the object (which is identified by the Me keyword). This is by way of an example—returning BookPublicationDate works fine too. But there is a difference in how the information is retrieved: instead of simply reading the variable, VBA runs the Property Get PublicationDate procedure to return the information.
 * Lines 8 and 9 display an OK-style message box containing the string strM. The message-box title is set to BookName (the constant that contains the text VBA Book Project) followed by the word Information, and the message box uses an Information icon.

## Using Your Class

Recall that you can't execute class code directly. You can't put your insertion point inside the ShowInfo procedure and press F5 to run the code or F8 to step through the code.

A class is a description of an object not yet in existence. Again, think blueprint for a house.

So, before you can execute or test a class, you must create an instance of the class. You can't test the plumbing in a house by just looking at the blueprints before the house has been built. In other words, you must create an object based on the class template, then test the object.

To instantiate an object, you write code in an ordinary, non-object code module (like the modules we've been using throughout this book so far, such as the Module1 or NewMacros module).

To use the class you created, you create a new instance of the object by using the New keyword. The New keyword can be employed in either a Dim statement or a Set statement. For example, the following statement creates a new object variable based on the Book class:

    Dim myBook As New Book

The following statements declare an Object variable named bookAnotherBook and then assign to it a new instance of the Book object:

    Dim bookAnotherBook As Object
    Set bookAnotherBook = New Book

You can then access the properties and methods of the Book object as you would any other VBA object's properties and methods (note the syntax: objectVariableName.Property).
For example, the following statement sets the Price property of the bookAnotherBook object:

    bookAnotherBook.Price = 54.99

Listing 16.5 contains a short procedure called Class_Test that shows the Book class in action. Type this procedure into an ordinary code module (not a class module). And be sure the module you type this into is in the _same project_ as the Book class module you created earlier.

**Listing 16.5**: Testing the Book class

     1. Sub Class_Test()
     2.
     3.     Dim myBook As New Book
     4.
     5.     myBook.Title = "Mastering VBA for Microsoft Office 2013"
     6.     myBook.Price = 49.99
     7.     myBook.Author = "Richard Mansfield"
     8.     myBook.Pages = 880
     9.     myBook.PublicationDate = #8/17/2013#
    10.
    11.     myBook.ShowInfo
    12.
    13. End Sub

The listing shows an example of how to use a class in your programming. Here's what happens:

 * Line 1 begins the Class_Test procedure, and line 13 ends it.
 * Line 2 is a spacer. Line 3 declares a new object variable named myBook of the Book class. Line 4 is another spacer.
 * Lines 5 through 9 set the five properties of the myBook object—Title, Price, Author, Pages, and PublicationDate—as you'd set the properties for any other object. Note that the object name (the object variable name) is separated by a period from the properties and methods of that object.
 * Line 10 is a spacer. Line 11 invokes the ShowInfo method of the myBook object—again, as you'd invoke a method for any other object.

You can now test your object by clicking inside this procedure to put the blinking insertion cursor there, then pressing F5 (run) or F8 (single-stepping). Try single-stepping to see how the instantiation takes place and how the inner workings of the object add information to the properties and carry out the ShowInfo method.

Here's another quick experiment. Notice that the VBA Editor's Auto List Members feature works with objects you create, as well as objects built into VBA itself, such as Excel's Workbooks object. Remember that if in Excel's VBA Editor Code window you type Workbooks. (you must type the period), a list drops down showing you all the members—the properties and methods—of the Workbooks object. To then add one of these members to your code, just click it or use the down-arrow key to select it, then press Enter.

Similarly, when you are programming with the myBook object, typing **myBook** followed by a period drops that object's members list down, as shown in Figure 16.2.

Figure 16.2 VBA's helpful Auto List Members feature shows the properties and methods of your objects.

# The Bottom Line

**Arrange your code in modules.**

Rather than use a single lengthy, complex procedure that accomplishes many tasks at once, programmers usually subdivide their code into smaller, self-contained procedures, each dedicated to a single, discrete task.

Master It

Shorter, self-contained, single-task procedures offer the programmer several advantages. Name three.

**Call a procedure.**

You execute a procedure by calling it from within your programming code.

Master It

How do you call a procedure?

**Pass information from one procedure to another.**

Sometimes a procedure requires that you pass it some information. For example, a procedure that searches text and makes some style changes to it will require that you pass the text you want modified.

Sometimes a procedure passes back information to the procedure that called it.
For example, it might pass back a message describing whether the actions taken in the procedure were (or were not) accomplished successfully.

Master It

What kind of procedure can pass back information to the caller?

**Understand what classes are and what they're for.**

Contemporary computer programs employ classes for various reasons—to help organize large programs, to make code more easily reusable, to provide certain kinds of security, or as a superior substitute for public variables. But beginners sometimes have a hard time wrapping their minds around the concept, particularly the relationship between classes and objects.

Master It

What is the difference between a class and an object?

Choose the correct answer (only one answer is correct):

**1.** A class is like a cookie and an object is like a cookie cutter.

**2.** A class is like a programmer and an object is like a module.

**3.** A class is like a blueprint and an object is like a house built from that blueprint.

**Create a class.**

The VBA Editor employs a special kind of module for containing classes.

Master It

How do you create a class module in the VBA Editor?

Chapter 17

Debugging Your Code and Handling Errors

In this chapter, you'll learn some of the things that can go wrong in your VBA code and what you can do about them. You'll examine the types of errors that can occur, from simple typos to infinite loops to errors that occur only once in a while (intermittent bugs are usually the hardest to locate).

The chapter starts by explaining the basics of debugging. Then you'll work with the tools that VBA offers for debugging VBA code and use them to get the bugs out of some examples. The end of the chapter discusses the various ways to have your program itself respond to errors that happen during runtime.

In this chapter you will learn to do the following:

 * Understand the basic principles of debugging
 * Recognize the four different types of errors you'll create
 * Employ VBA's debugging tools
 * Deal with runtime errors

# Principles of Debugging

A _bug_ is an error in hardware or software that causes a program to execute other than as intended. _Debugging_ means removing the bugs from hardware or software.

* * *

Where Did the Term _Bug_ Come From?

There are various explanations of the etymology of the word _bug_ as used in computer programming, ranging from apocryphal stories of moths being found in the circuit boards of malfunctioning computers to musings that the word came from the mythological _bugbear_, an unwelcome beast. But in fact, the term _bug_ has been used to mean something troublesome for centuries. For more information, see the "bug" entry in the _Free On-line Dictionary of Computing_.

* * *

Your goal when debugging should be to remove all bugs from your code. Your order of business will probably go something like this:

1. First, test your code to see whether it works as it should. Put it through its paces. Test it by running the procedure once or twice using suitable files or other appropriate data. Try all the options the macro makes available to the user. Even if it seems to work, continue testing for a reasonable period with various data from various sample documents before unleashing the procedure on the world (or your colleagues).

2. If your code doesn't work as you expected it to, you'll need to debug it.
That means following the techniques described in this chapter to locate the bugs and then remove them. Once you've removed all the bugs that you can find, retest the code as described in the first step. This is important, because sometimes the act of debugging itself introduces new bugs.

3. When testing your code, try to anticipate unusual, perhaps exotic ways that users might employ your code. For example, you might write a sophisticated procedure for manipulating a Word document on the (perfectly reasonable) assumption that the document will be open when the user starts the procedure running. You can test it on sample documents until you're blue in the face and it'll work fine every time. But if a user tries to run the procedure without first opening a document, it crashes.

And don't make fun of this user. It might seem sensible to users that the procedure _should_ be launched before a file is loaded. Users might expect the procedure to display an input box asking them which document they want to manipulate. And more important, users also expect that you will anticipate and handle unexpected errors without crashing your programming. There are ways to _trap_ unanticipated user behavior or other runtime errors and respond to them gracefully. What does your program do if the user attempts to save a file to a disk that's full, for example? Just crash and thereby lose all the information they've spent time typing in?

4. When you're ready to distribute your procedure, you may want to write instructions for its use. In these instructions, you may also need to document any bugs that you can't squash or circumstances under which the procedure shouldn't be run. But it's better to build instructions, responses to unanticipated problems, and other kinds of _error trapping_ into the macro itself. Try to make your code bulletproof.

Debugging a procedure tends to be idiosyncratic. There's no magic wand that you can wave over your code to banish bugs (although the VBA Editor does its best to help you eliminate certain types of errors from your code as you create it). Moreover, such simple things as forgetting to initialize a variable can wreak havoc on your code.

You'll probably develop your own approach to debugging, partly because your programming will inevitably be written in your own style. But when debugging, it helps to focus on understanding what the code is supposed to do. You then correlate this with your observations of what the code actually does. When you reconcile the two, you'll probably have worked out how to debug the procedure.

Also, the longer and more complex your code, the higher the probability that it will contain bugs. Certain kinds of bugs occur because of interactions among the parts of a project. And obviously the larger the project, the more parts with potential side effects. So keep your code as simple as possible by breaking it into separate procedures and modules, as discussed in Chapter 16, "Building Modular Code and Using Classes." Small code sections, with distinct, small tasks to accomplish, are almost always easier to debug than large lumps of code that try to do several things all at once. Remember that most debugging is a matter of locating _where_ in your code the problem occurs. If you're testing a small module of code with a very easily specified objective, locating a bug is that much easier.
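To make this concrete, here's a minimal sketch of the modular approach (the procedure and variable names are hypothetical, not from any listing in this book). Each routine does one small job, so you can test each in isolation—from the Immediate window, for instance—before wiring them together:

    Sub Process_Report()
        Dim strRaw As String
        Dim strClean As String
        strRaw = "  draft   report  "
        'delegate the single task of tidying whitespace to a helper
        strClean = Normalize_Spaces(strRaw)
        MsgBox "[" & strClean & "]"
    End Sub

    Function Normalize_Spaces(ByVal strText As String) As String
        'single task: collapse runs of spaces, then trim the ends
        Do While InStr(strText, "  ") > 0
            strText = Replace(strText, "  ", " ")
        Loop
        Normalize_Spaces = Trim(strText)
    End Function

If Normalize_Spaces misbehaves, you can single-step through it alone—or call it directly from the Immediate window with test data—without touching the rest of the program.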
# The Different Types of Errors

You'll encounter four basic kinds of errors in your programming:

 * Language errors
 * Compile errors
 * Runtime errors
 * Program logic errors

The following sections look at these kinds of errors in turn and discuss how to prevent them. After that, you'll examine the tools VBA provides for debugging.

## Language Errors

The first type of error is a _language error_ (also known as a _syntax error_). When you mistype a word in the Code window, omit a vital piece of punctuation (and in programming, all punctuation is vital), scramble a statement, or leave off the end of a construction, that's a language error. If you've worked your way through the book to this point, you've probably already made dozens of language errors as part of the learning process and through simple typos.

VBA helps you eliminate many language errors as you create them, as you'll see later in this chapter. Those language errors that the VBA Editor doesn't catch as you type them in usually show up as _compile errors_ during runtime testing, so the next section shows you examples of both language errors and compile errors.

## Compile Errors

_Compile errors_ occur when VBA can't compile a statement correctly—that is, when VBA can't turn a statement that you've entered into viable code.

For example, if your programming tells VBA to use a certain property for an object that doesn't have that property, a compile error results. Compilation is the act of turning your source code (the programming you type into the Editor) into the lower-level commands understandable by the computer. For example, when you press F5 to execute your program, VBA starts off by compiling your programming. If it finds a problem during compilation, it displays an error message.

The good news is that the VBA Editor detects many language errors and some compile errors as soon as you move the insertion point from the offending line. You don't even have to press F5 in many cases. For example, try typing the following statement in the Code window and pressing Enter to create a new line (or pressing ↑ or ↓ to move to another line, or clicking the mouse in another line in the macro):

    If X > Y

The VBA Editor displays the compile error "Expected: Then or GoTo" (see Figure 17.1) to tell you that the statement is missing a vital element: it should say If X > Y Then or If X > Y GoTo. (If you don't see the error message, there are two possibilities: Either you have turned off the Auto Syntax Check option [Tools ⇒ Options] or you didn't actually type it in by hand and press Enter.)

Figure 17.1 The VBA Editor helps debug your code by identifying many compile errors as it checks the statements you enter.

Every time you enter a line of code, the Editor examines that line for completeness and accuracy. In this example, VBA knows that when the code contains an If command, there must be a subsequent Then or GoTo command. And so the Editor rejects the line and informs you what the problem is.

This vigilance on the part of the VBA Editor prevents you from running into this type of error deep in the execution of your code.

* * *

**Decide for Yourself If You Like the Auto Syntax Check Feature**

This chapter assumes that you're keeping VBA's Auto Syntax Check feature and other features switched on. If you have Auto Syntax Check turned off (Tools ⇒ Options ⇒ Editor tab), you won't see the error message displayed in Figure 17.1.
Instead, the only warning you get about that incomplete line of code is that the VBA Editor turns the line red. Code turned red is the Editor's way of telling you that it's choking on your inadequate programming. You can either try to fix the error right then or keep on coding—putting off the debugging process until you've sketched in more code in the procedure.

Some developers choose to turn off Auto Syntax Checking because they don't want to be nagged as they type in their code—error-message interruptions about mere typos can interfere with their focus on the larger goals of the program they're writing. For other programmers, though, working without automatic, immediate syntax checking proves a cure worse than the disease.

Ultimately, whether you use the Auto Syntax Check feature is a matter of personal taste. For example, some people like to be told _right away_ if they make a spelling error in a Word document; others consider spelling errors rather tedious issues best left for later during an editing phase. They write, focusing on the main points they're trying to make, then at some later time they turn on the spell checker and fix any typos and punctuation blunders. You find a similar choice when you work at most any task. Consider woodworking: Should you hang each tool back on the wall in its appropriate place when you finish using it, or is it better to just let the saws and screwdrivers pile up around you, putting them away all at once after the coat rack is finished?

* * *

The VBA Editor notices blunders like the previous If X > Y problem easily enough, but you can also make language errors that the VBA Editor _cannot_ identify when you move the insertion point from the line in which the blunder resides. Instead, VBA identifies these errors as compile errors later when you press F5 and it compiles the code. For example, if you enter the following statement in the Code window when working with Word, the VBA Editor doesn't detect anything wrong. But when you run the procedure by pressing F5, VBA compiles the code, discovers the error, and objects to it (see Figure 17.2):

    ActiveDocument.SaveAs **FileMame** :="My File.docm"

Figure 17.2 Other errors appear only when you try to run the code.

This error is a straightforward typo—FileMame instead of FileName—but VBA won't see this particular kind of problem until it runs the code and fails to find any FileName property.

The VBA Editor sometimes indirectly helps you to notice errors of this kind while you're writing code. Say you're trying to enter a Documents.Close statement in Word and mistype Documents as Docments. In this case, the VBA Editor doesn't display the Properties/Methods list (Auto List Members) as it normally does if you have this feature turned on. You haven't entered a valid object, so VBA has no members list to display.

Not seeing the Properties/Methods list should alert you that something is wrong. If you continue anyway and enter the Docments.Close statement, the VBA Editor doesn't spot the mistake—it shows up as a "Run-time error 424: Object required" message (if you don't have Option Explicit on) when you try to run the procedure. (If you do have Option Explicit on, you get a "Variable not defined" compile error instead.)

The Editor gives you yet another clue that Docments.Close is an error. When you press Enter to leave this line of code, you see this:

    docments.Close

Does anything here look odd to you?
VBA will automatically capitalize valid object names. But docments is not capitalized.

Another kind of problem is caused if you specify a property or method for an object to which that property or method doesn't apply. In this situation, VBA displays a compile error. For example, say you forget that the proper method here is Add and you enter Documents.Create instead. VBA highlights the offending word and gives the compile error "Method or data member not found" (see Figure 17.3), which tells you there's no Create method for the Documents collection. This message is displayed only during runtime, not design time (design time means when you're typing in code lines).

Figure 17.3 The "Method or data member not found" error tells you that you've used a method or property that isn't available for the object in question.

## Runtime Errors

The third type of error is the _runtime error_, which occurs while code is executing. You will cause a runtime error if you write code that forces VBA to try to perform an impossible operation, such as opening a document that doesn't exist, closing a file when no file is open, or performing something mathematically impossible, such as dividing by zero.

The diction, punctuation, and syntax of your code are error-free, but you're asking VBA to do something that can't be done. An unhandled runtime error results in a crash that manifests itself as a Microsoft Visual Basic dialog box displaying a runtime error number, such as the one shown in Figure 17.4.

Figure 17.4 An unhandled runtime error causes VBA to display a message box such as this one.

As an example of an impossible operation, consider the archetypal division by zero. The following statements give a "Run-time error '11': Division by zero" message:

    Dim x As Integer
    x = 1 / 0

You're unlikely to enter anything as obviously wrong as this in your code (you're not _nuts_). A line of code like this will inevitably produce a division-by-zero error because the divisor is zero. But it's easy to enter a valid equation, such as MonthlyPay = Salary/Months, and forget to assign any value to Months (if a numeric variable is empty, it counts as a zero value) or to produce a zero value for Months by addition or some other math. Or the user can type zero into a dialog box, and your code later tries to use that entry as a divisor. And so on.

One way to check for runtime errors is to track the values of your variables by using the Watch window (discussed later in this chapter). To avoid possible user-input errors, have your code check users' input after they close a dialog box. You can, for example, display a message explaining that zero isn't an acceptable input for their age, then display the dialog box again, expecting valid input this time around.

## Program Logic Errors

The fourth type of error is the _program logic error_, which is characterized by valid code that nonetheless produces incorrect results. With program logic errors, the code is technically fine. VBA is able to compile and run it without noticing any errors—but you get a different result than you intended.

Program logic errors range in scope from the relatively obvious (such as performing manipulations on the wrong workbook in Excel because your code doesn't check which window is active) to the subtle (such as extending a range to the wrong character or cell). In the first example, the procedure is likely to run perfectly, but the resulting workbook will bear little resemblance to what you were trying to accomplish.
In the second example, you might get a result that is almost correct—or the error might cause you to get perfect results sometimes and slightly wrong results at other times.

Program logic errors tend to be the hardest errors to fix. To nail them down, you need to trace the execution of your code and pinpoint where things start to go wrong. To do that, you almost always need to employ the debugging tools discussed in the next section.

A friend of mine wrote a very nice program to format and print forms. But while he was testing it, he noticed that after working fine about five times, it suddenly sent only one-third of the form to the printer during a trial run. He couldn't get it to repeat this behavior. So he surrounded the code with a loop and let it run continuously (dumping the sample form repeatedly into a log file rather than wasting paper printing it over and over). He discovered that the error occurred only once every 256 times the program ran. He never did locate the bug, but when he gave the program to other people, he just told them that it worked "almost always."

* * *

When Errors Aren't Your Fault

There are two other types of errors that you may run into—even though perhaps you shouldn't. The first type is where Microsoft has documented a VBA item differently than it actually works. This shouldn't happen, but because of the complexity of VBA, it does. If you find that your code absolutely won't work even though it follows the Microsoft documentation to the letter, consider the possibility that the documentation may be incorrect. Search the Web using the VBA keywords involved to find out whether others have encountered this problem and to learn how they've worked around it. The second type of error, a distant relation of the first type, is where one version of VBA behaves differently than another version. For example, you might create a procedure that works perfectly in Word 2010, but you have to change it to make it work with Word 2013. In an ideal world, this shouldn't happen—but as you know, this world is far from ideal. These two errors are blessedly quite rare. For one thing, VBA has been extensively used for decades, so it's a very mature language with few surprises.

* * *

# VBA's Debugging Tools

VBA provides a solid assortment of debugging tools to help you remove the bugs from your procedures. The main windows you'll employ for debugging are the Immediate window, the Locals window, and the Watch window. You can access these tools in various ways, one of which is by using the Debug toolbar (shown in Figure 17.5). Four of the buttons—Design Mode, Run Sub/UserForm, Break, and Reset—are shared with the Standard toolbar. You'll learn about most of the others later in this chapter.

Figure 17.5 The Debug toolbar provides 13 commands for debugging your procedures.

* * *

Heisenbugs, Bohr Bugs, and Other Uncatchable Critters

The more complex and lengthy your code, the more likely you are to create bugs that are exceptionally difficult to catch. Usually, with determination and ingenuity, you can track down even the tougher bugs located in a single procedure. But bugs that depend on several unforeseen and improbable circumstances occurring simultaneously can be tough to isolate.

For example, an error that occurs in a procedure when the user makes a certain choice in a dialog box is relatively easy to catch. But if the error occurs only when the user has made two particular choices in the dialog box, it's much harder to locate.
And if the error is contingent on a particular combination of three choices the user has made in the dialog box, or if it depends on an element in the particular file from which the procedure is getting its data, you'll likely have a much harder time pinpointing it. + +Programmer folklore defines various kinds of rare bugs by assigning them names derived from such disciplines as philosophy and quantum physics. For instance, a _heisenbug_ is defined as "a bug that disappears or alters its behavior when one attempts to probe or isolate it." Heisenbugs are frustrating, as are Bohr bugs and mandelbugs (search online for details if you're curious). But the worst kind of bug is the _schroedinbug_ , which is a design or implementation bug that remains quiescent until someone reads the code and notices that it shouldn't work, whereupon it stops working until the code is made logically consistent. + +These bugs are, of course, ridiculous—until you start to discover bit rot at work on your code and have to explain the problem to your superiors. + +* * * + +## Break Mode + +Break mode is a vital tool for debugging your procedures because it lets you watch your code execute step by step—line by line—in the Code window (by repeatedly pressing F8). This technique is called _single-stepping_. + +For example, if an If...Then...ElseIf...Else statement appears to be executing incorrectly, you can step through it in Break mode and watch exactly which statements are executing, and which are being skipped, to produce the bad result. + +These are the easiest ways to enter Break mode: + + * Click to place the blinking insertion cursor in the procedure you want to run in the Code window and press the F8 key (or click the Step Into button on the Debug toolbar, or choose Debug ⇒ Step Into) to start stepping through it. Repeatedly press F8 to step down through the code. + * Set one or more breakpoints in the procedure to cause VBA to halt execution and enter Break mode when it reaches one of the marked lines. A breakpoint allows you to stop execution at a particular point in your code. The easiest way to set a breakpoint is to click beside the line where you want to stop. You click in the gray margin-indicator bar to the left of the Code window. (You could also right-click in the line of code and choose Toggle ⇒ Breakpoint from the context menu.) You can set any number of breakpoints. They're especially useful when you need to track down a bug that you suspect is located in a particular procedure because a breakpoint allows you to run the parts of a procedure that have no problems at full speed and then stop the procedure where you think there might be problems. From there, you can step through the suspicious statements and watch closely how they execute. + +You can also enter Break mode in a couple of other ways: + + * Interrupt your code by pressing Ctrl+Break and then click the Debug button in the resulting dialog box (see Figure 17.6). Normally, the only reason to enter Break mode this way is if your code gets stuck in an endless loop (which you'll typically recognize when the code appears to be doing nothing for a long time or repeating itself when you think it shouldn't be). VBA highlights the statement that was executing when you pressed Ctrl+Break, but (depending on your timing) it's unlikely to be the statement that's causing the problem in your code—it'll just be one of the statements in the offending loop. You'll then need to step through the loop to identify the aberrant statement. 
Figure 17.6 You can enter Break mode by pressing Ctrl+Break and then clicking the Debug button in this dialog box.

 * Click the Debug button in a runtime-error dialog box such as the one shown in Figure 17.7. In the Code window, VBA highlights the statement that caused the error. (You can also click the Help button in the runtime-error dialog box to get an explanation of the error before clicking the Debug button.)

Figure 17.7 Entering Break mode from a runtime error dialog box like this one takes you straight to the offending statement in your code. The problem code will be highlighted in yellow.

* * *

Access's _SingleStep_ Method

In addition to hosting a full version of VBA, Access includes a unique, legacy macro-design tool called the Macro Builder. This book doesn't spend much time with the Builder feature because Access's VBA offers much more capability and flexibility than its Builder. However, if you want to experiment with the Macro Builder, in Access click the Ribbon's Create tab, then click the Macro icon on the far right. One interesting command (added to the Builder in Access 2007) is the SingleStep method of the DoCmd object. This operates somewhat like a breakpoint, dropping you into Break mode during execution and displaying Access's specialized Macro Single Step dialog box. You can insert DoCmd.SingleStep into a VBA macro as well. VBA recognizes it as a legitimate line of code. However, VBA just ignores this statement during runtime. Only macros created in the Access Builder will respond to this SingleStep method.

* * *

## The Step Over and Step Out Commands

In Chapter 3, "Editing Recorded Macros," you learned how to step through a procedure by repeatedly pressing the F8 key to issue the Step Into command, going down the lines one at a time. (You can also issue this command by clicking the Step Into button on the Debug toolbar or choosing Debug ⇒ Step Into, but F8 is ever so much more efficient.)

Stepping into lets you see exactly what each statement in your code does, but you'll often find that you need to get past sections of code that you're sure are working fine so that you can step through a section that seems suspicious. This situation is particularly true of loop structures, which can have you going round and round—a real time-waster if you know the bug you're tracking down isn't within the loop.

Break mode offers three features to speed up stepping through your code: the Step Over command, the Step Out command, and the Run To Cursor command. The Step Over and Step Out commands aren't available until you enter Break mode (for example, by using the Step Into command).

The Step Over command (which you can trigger by pressing Shift+F8, clicking the Step Over button on the Debug toolbar, or choosing Debug ⇒ Step Over) executes the whole Sub or function called from the current procedure instead of stepping through the called procedure statement by statement as the Step Into command would do. (It "steps over" that procedure or function.) Use the Step Over command when you're debugging a procedure that calls another procedure or function that you know to be error-free and that you don't need to test step by step.

The Step Out command (which you can issue by pressing Ctrl+Shift+F8, clicking the Step Out button on the Debug toolbar, or choosing Debug ⇒ Step Out) runs the rest of the current procedure at full speed.
Use the Step Out command to quickly execute the rest of a procedure once you've gotten through the part that you needed to watch step by step. + +The Run To Cursor command (which you can issue by pressing Ctrl+F8 or choosing Debug ⇒ Run To Cursor) runs the code at full speed until it reaches the statement where the blinking cursor currently is in the Code window, whereupon it enters Break mode. Click to position the cursor in the appropriate statement before invoking this command. + +## The Locals Window + +The Locals window provides a quick readout of the values and types of all variables or expressions in the currently active procedure. It displays a collapsible tree view (see Figure 17.8). + +Figure 17.8 Use the Locals window to see at a glance all the expressions in the active procedure. + +An expression is a combination of keywords, operators, variables, and/or constants. Variables are one kind of expression; but more complex expressions involve more than a single variable: x > y, for example, is an expression stating that x is greater than y. This expression might be True or False, depending on what's happening during runtime. + +The Expression column displays the name of each expression, listed under the name of the procedure in which it appears. The Value column displays the current value of the expression (including Empty if the expression is empty, or Null or Nothing as appropriate). And the Type column displays the data type of the expression, with Variants listed as "Variant" along with their assigned data type (for example, "Variant/String" for a Variant assigned the String data type). + +To display the Locals window, click the Locals Window button on the Debug toolbar or choose View ⇒ Locals Window. To hide the Locals window, click its close button. + +From the Locals window, you can also click the button marked with an ellipsis (...) to display the Call Stack dialog box, discussed later in this chapter. This button is available only in Break mode. + +* * * + +How to Float and Dock Windows + +Remember that you can make panes (interior windows such as the Locals window) float by either dragging them or double-clicking their title bar. Restore them to their default docking location by double-clicking their title bar a second time. + +* * * + +## The Watch Window + +The Watch window (identified as Watches in Figure 17.9) is a separate window that you use to track the values of variables and expressions as your code executes. To display the Watch window, click the Watch Window button on the Debug toolbar or choose View ⇒ Watch Window in the VBA Editor. To hide the Watch window again, click its close button (clicking the Watch Window button or choosing View ⇒ Watch Window again doesn't hide it). + +Figure 17.9 Use the Watch window to track the values of variables and expressions in your code. + +The Watch window displays _watch expressions_ —expressions in your code that you specify ahead of time. You want to view a dynamic display of the values in these variables or expressions. + +Watch-expression information can help you to pinpoint where an unexpected value for a variable or an expression occurs as your code executes. The Watch window lists the names of the watched expressions or variables in the Expression column, their values in the Value column, their type (Integer, Byte, String, Long, and so on) in the Type column, and their context (the module and procedure in which they're operating) in the Context column. 
So to track the value of a given variable, you need only look at the Watch window at any given point while in Break mode.

If a variable or expression listed in the Watch window hasn't been initialized, the Watch window displays "< Out of Context >" in the Value column and "Empty" (for a variable other than a Variant) or "Variant/Empty" (for a Variant) in the Type column.

The VBA Editor updates all watch expressions in the Watch window whenever you enter Break mode and whenever you execute a statement in the Immediate window. So if you step through a procedure in the Code window by pressing the F8 key (which keeps you in Break mode), you can watch the value in a variable, or of an expression, as each statement executes. This is a great way to pinpoint where an error or an unexpected value occurs—and is much easier than moving the mouse over each variable or expression in question to check its value by using the Auto Data Tips feature.

Here's a typical debugging scenario. Let's say your code is producing a preposterous result, such as asserting that your annual salary is $2,200,000. As usual with most debugging, you're trying to figure out _where_ in your code this sudden and massive gain in income is being calculated. Observe the Watch window while single-stepping through your code to see in which line of code the variable MySalary goes from 50,000 to 2,200,000. Now you're right there close to where the bug is and you can examine the preceding lines of code very carefully to see what's impacting the MySalary variable.

Because watch expressions slow down execution of your code, the VBA Editor doesn't save them with the code—you need to redo them for each editing session. However, the Editor _does_ store watch expressions during the current editing session, so you can move from procedure to procedure without losing your watch expressions.

### Setting Watch Expressions

Sometimes referred to as _conditional breakpoints_, watch expressions give you considerable flexibility when debugging. You can ask the VBA Editor to halt execution on almost any kind of situation you can think up: break on any line that causes a variable to exceed a certain value, go below zero, change to a shorter string length, and so on. In other words, you specify a condition, an expression such as MySalary > 50000, and the VBA Editor automatically halts execution and displays the line where your salary increases beyond the expected 50,000. As you can imagine, the conditional breakpoint is one of the best tools a debugger has.

To set a watch expression, add it to the list in the Watch window by following these steps:

**1.** Select the variable or expression in your code, right-click it, and choose Add Watch from the context menu to display the Add Watch dialog box (see Figure 17.10). The variable or expression that you right-clicked appears in the Expression text box.

Figure 17.10 In the Add Watch dialog box, specify the watch expression you want to add.

You can also select the variable or expression you're interested in and choose Debug ⇒ Add Watch to display the Add Watch dialog box. If you choose Debug ⇒ Add Watch _without_ selecting the variable or expression, you must type it in the Expression text box, which is a waste of time.

**2.** If necessary, adjust the settings in the Context group box. The Procedure drop-down list is set to the current procedure, and the Module drop-down list is set to the current module.
+ +**3.** In the Watch Type group box, adjust the option-button setting if necessary: + + * The default setting—Watch Expression—adds the variable or expression in the Expression text box to the list in the Watch window. However, conditional breakpoints are more useful if you do more than merely observe the status of variables or expressions. The following two list items describe the true benefit of these breakpoints. + * Break When Value Is True causes VBA to enter Break mode whenever the value of the variable or expression changes to True. + * Break When Value Changes causes VBA to enter Break mode whenever the value of the watch expression changes. Use this setting when dealing either with a watch expression whose value you don't expect to change but that appears to be changing (such as MySalary in the previous example) or with a watch expression whose every change you need to observe. + +**4.** Click the OK button to add the watch expression to the Watch window. + +* * * + +Use These Two Important Conditional Break Techniques + +The Break When Value Is True option button allows you to run your code without stepping through each statement that doesn't change the value of the watch expression to True. This allows you to specify that Break mode should be entered, for example, when your variable exceeds a certain value (such as X > 10000) or equals another variable (such as x = y). Employing this kind of conditional break can be extremely helpful when tracking down elusive bugs. + +The Break When Value Changes option button allows you to run your code and stop at each location where the value changes in the code. + +* * * + +You can also drag a variable or an expression from the Code window to the Watch window; doing so sets a default watch expression in the current context. To set Break When Value Is True or Break When Value Changes, edit the watch expression after dragging it to the Watch window. + +### Editing Watch Expressions + +To edit a watch expression, right-click it in the Watch window and choose Edit Watch from the context menu, or select it in the Watch window and choose Debug ⇒ Edit Watch. Either action will display the Edit Watch dialog box with the watch expression selected in the Expression box, as shown in Figure 17.11. Change the context or watch type for the watch expression by using the settings in the Context group box and the Watch Type group box, and then click the OK button to apply your changes. + +Figure 17.11 You can edit your watch expressions in the Edit Watch dialog box. + +### Deleting Watch Expressions + +To delete a watch expression, right-click it in the Watch window and choose Delete Watch from the context menu. You can also delete the current watch expression by clicking the Delete button in the Edit Watch dialog box. + +### Using the Quick Watch Feature + +For those times when you don't need to create a watch expression for an expression or a variable, when you merely want to observe the value, you can use the Quick Watch feature, which displays the Quick Watch dialog box (see Figure 17.12) containing the context and value of the selected expression. + +Figure 17.12 Use the Quick Watch dialog box to get quick information on a variable or expression for which you don't want to set a watch expression in the Watch window. + +To use Quick Watch, while in Break mode select the expression or variable in the Code window and then click the Quick Watch button on the Debug toolbar, choose Debug ⇒ Quick Watch, or press Shift+F9. 
(If you're already working in the Quick Watch dialog box, you can click the Add button to add the expression to the Watch window.) + +## The Immediate Window + +One use for the Immediate window is as a virtual scratchpad. In the Immediate window you enter lines of code that you want to test quickly, without having to enter them in a procedure and then test the entire procedure. A second major use of the Immediate window is to display information to help you check the values of variables while a procedure is executing. + +In the first case, you type code into the Immediate window, then press Enter to see the results immediately (get it?). In the second case, you insert in your code Debug.Print statements that display information in the Immediate window, where you can easily view it. We'll explore both of these techniques in the following sections. + +To display the Immediate window, click the Immediate Window button on the Debug toolbar, choose View ⇒ Immediate Window, or press Ctrl+G. To hide the Immediate window again, click its close button. (Clicking the Immediate Window button, choosing View ⇒ Immediate Window, or pressing Ctrl+G when the Immediate window is displayed does not hide the Immediate window.) + +You can execute code in the Immediate window in both Break mode and Design mode. + +### What You Can't Do in the Immediate Window + +There are a number of restrictions on the code you can use in the Immediate window: + + * You can't use declarative statements (such as Dim, Private, Public, Option Explicit, Static, or Type) or control-flow statements (such as GoTo, Sub, or Function). These statements cause VBA to return an "Invalid in Immediate Pane" error. + * You can't use multiline statements (such as block If statements or block For... Next statements) because there's no logical connection between statements on different lines in the Immediate window: Each line is treated in isolation. + * You can't place breakpoints in the Immediate window. + +### Entering Code in the Immediate Window + +The Immediate window supports a number of standard Windows editing keystrokes and key combinations, such as Ctrl+X (Cut), Ctrl+C (Copy), Ctrl+V (Paste), Ctrl+Home (move the insertion point to the start of the window), Ctrl+End (move the insertion point to the end of the window), Delete (delete the current selection), and Shift+F10 (display the context menu). + +The Immediate window also supports the following VBA Editor keystrokes and key combinations: + + * F5 continues running a procedure. + * Alt+F5 runs the error-handler code for the current procedure. + * F8 single-steps through code (executing one statement at a time). + * Shift+F8 procedure-steps through code (executing one procedure at a time). + * Alt+F8 steps into the error handler for the current procedure. + * F2 displays the Object Browser. + +Finally, the Immediate window has a couple of commands of its own: + + * Pressing Enter runs the current line of code. + * Pressing Ctrl+Enter inserts a carriage return. + +### Printing Information to the Immediate Window + +As well as entering statements in the Immediate window for quick testing, you can use this window for a different debugging technique. To include in your procedures statements that print information to the Immediate window, use the Print method of the Debug object. Printing like this allows you to create a log during execution, a log you can later examine for errors or strange behavior. You don't single-step or display message boxes containing the value of a variable. 
Instead, you print data for later study.

The syntax for the Print method is as follows:

    Debug.Print [ _outputlist_ ]

_outputlist_ is an optional argument specifying the expression or expressions to print. You'll almost always want to include _outputlist_—if you don't, the Print method prints a blank line, which is of little use. Construct your _outputlist_ using the following syntax:

    [Spc( _n_ ) | Tab( _n_ )] _expression_

Here, Spc( _n_ ) inserts space characters and Tab( _n_ ) inserts tab characters, with _n_ being the number of spaces or tabs to insert. Both are optional arguments, and for simple output, you'll seldom need to use them.

_expression_ is an optional argument specifying the numeric expression or String expression to print:

 * To specify multiple expressions, separate them with either a space or a semicolon.
 * A Boolean value prints as either True or False (as appropriate).
 * If _outputlist_ is Empty, Print doesn't print anything. If _outputlist_ is Null, Print prints Null.
 * If _outputlist_ is an error, Print prints it as Error _errorcode_, where _errorcode_ is the code specifying the error.

As an example, you could log the contents of the String variables (expressions) CustName, Address1, Address2, City, State, and Zip to the Immediate window in an address format by using the following statements:

    Debug.Print CustName
    Debug.Print Address1 & "," & Address2
    Debug.Print City & "," & State & " " & Zip

As another example, the following procedure prints the names and paths of all open workbooks in Excel to the Immediate window:

    Sub See_All_Workbook_Names()
        Dim oBook As Workbook
        For Each oBook In Workbooks
            Debug.Print oBook.FullName
        Next
    End Sub

In practice, Debug.Print is used by many programmers as a quick, efficient alternative to debugging with the Watch window, message boxes, or breakpoints. You need to see whether something is going wrong with a variable (its value is wrong, but where does it go wrong?). So you insert some Debug.Print statements to display the variable's value while executing a procedure. Then you can see whether the value is wrong in that location or somewhere else in the code.

If your program contains multiple procedures, you might also want to print the name of the procedure. This example identifies both the procedure and the variable name within the Debug.Print statement:

    Debug.Print "In the Sub Add_Tax the variable intLocal is: " & intLocal

This results in the following line in the Immediate window:

    In the Sub Add_Tax the variable intLocal is: 7

## The Call Stack Dialog Box

When working in Break mode, you can summon the Call Stack dialog box (see Figure 16.1 in Chapter 16) to display a list of the active _procedure calls_—the chain of procedures that has led execution to the current procedure. It shows the history of your code's execution path.

When you begin running a procedure, that procedure is added to the call-stack list in the Call Stack dialog box. If that procedure then calls another procedure, the name of the second procedure is added to the call-stack list, but only while the second procedure is executing; when it finishes, its name is removed from the list. By using the Call Stack dialog box in Break mode, you can find out what procedures are being called by another procedure; this can help you establish which parts of your code you need to check for errors.

To display the Call Stack dialog box, click the Call Stack button on the Debug toolbar, press Ctrl+L, or select View ⇒ Call Stack.
To display one of the procedures listed in the Call Stack dialog box, select it in the Project.Module.Function list box and click the Show button.

# Dealing with Infinite Loops

You'll probably find it easy to tell when a procedure gets stuck in an infinite loop: You'll notice that the procedure simply doesn't stop executing. If you open the Windows Task Manager, it will report that your application has "stopped responding." To interrupt an infinite loop, press Ctrl+Break. The VBA Editor then displays a Code Execution Has Been Interrupted dialog box. Infinite loops are also known as _endless loops_.

There are several ways to get stuck in infinite loops, such as using GoTo statements without If conditions or Do loops without While or Until constraints. These are easy enough to avoid, but even if you do, it's still possible for infinite loops to occur in your code because of conditions you haven't been able to anticipate.

The best way to approach detecting and eliminating an infinite loop is to use breakpoints or a watch expression to pinpoint where the procedure enters the infinite loop. Once you've reached it, use the Step Into command to step into the procedure. Then use the Watch window or the Locals window to observe the variables and expressions in the loop, which should indicate when something is going wrong and causing the loop to be endless.

If your code contains a loop that should execute only a set number of times but you suspect it's running endlessly, you can insert a counter variable in the loop in an If...Then structure that triggers either an Exit For statement or an Exit Do statement to exit the loop if it runs more than a certain number of times.

# Dealing with Runtime Errors

Despite the help that VBA provides by checking for language errors and compile errors, runtime errors remain an unpleasant fact of life. Sooner or later, you will get runtime errors in your code, but you don't have to take them lying down. Just add _error handlers_, pieces of code that trap errors, analyze them, and take action if they match given error codes.

An error handler is a preventative measure, allowing your code to manage problems gracefully rather than crashing in front of a user's alarmed or bemused face.

## When Should You Write an Error Handler?

Consider writing an error handler in the following circumstances:

 * When a runtime error can cause your code to fail disastrously. For a procedure that tweaks a couple of objects on a slide in PowerPoint, you're unlikely to need an error handler. By contrast, for a procedure that creates, deletes, or moves files, you'll probably want an error handler (a sketch of such a handler follows this list).
 * When your program accesses peripherals or objects outside the application itself—the status of which is unpredictable during design time. In this situation, you can identify particular errors that are likely to occur and that can be trapped. For example, when the user tries to open a file, certain well-known errors can occur—perhaps the file doesn't exist, or is currently in use by another computer, or is on a network drive, floppy drive, CD-ROM drive, or removable drive that isn't available at the time. You'll also run into errors if the user tries to use a printer or other remote device (say, a scanner or a digital camera) that's not present, not connected, turned off, or not configured correctly. Similarly, any procedure that deals with a particular object in a document (for example, a chart in Excel) will run into trouble if that object is not available.
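Here is a minimal sketch of the kind of handler the first bullet point calls for; the paths are hypothetical, and the On Error and Exit Sub statements it relies on are explained in detail in the sections that follow:

    Sub Archive_Draft()
        On Error GoTo FileTrap
        'moving a file can fail for many reasons: the file may not
        'exist, may be open elsewhere, or may be on an unavailable drive
        Name "C:\Reports\Draft.docm" As "C:\Archive\Draft.docm"
        Exit Sub
    FileTrap:
        MsgBox "The file could not be moved." & vbCr & _
            "VBA reports: " & Err.Description, _
            vbOKOnly + vbExclamation, "Archive Error"
    End Sub

Rather than checking every possible precondition before the Name statement, the procedure simply traps whatever error results—the approach the following sidebar recommends.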
* * *

Consider Trapping Errors Rather than Anticipating Them

In some instances, you may find it simpler to trap a resulting error from a procedure than to anticipate and try to forestall the many and various conditions that might lead to the generation of the error. For example, instead of checking to make sure a file exists before you try to open or manipulate the file, just trap any kind of error that results if the file isn't detected.

* * *

## Trapping an Error

_Trapping_ an error means catching it in your code during runtime so that you can write programming that handles the error.

VBA's On Error statement takes effect when a runtime error occurs, allowing you to write code that responds to the error.

Usually, you'll want to prevent an error from stopping your VBA code, but you can also anticipate particular errors and use them to determine a suitable course of action to follow from the point at which they occur.

To trap an error, you use the On Error statement. The usual syntax for On Error is as follows:

    On Error GoTo _line_

Here, _line_ is a label specifying the line to which execution is to branch when a runtime error occurs. For example, to branch to the label named ErrorHandler, you could use a structure like this:

    Sub ErrorDemo()
        **On Error GoTo ErrorHandler**
        'ordinary code statements here

        Exit Sub
    **ErrorHandler:**
        'error-handling statements here
    End Sub

The label you use to identify the error handler can be named with any valid label name—you don't have to call it ErrorHandler or anything similar. Some people find that a descriptive label (perhaps one that identifies the type or types of error expected, such as HandleErrorNoFileOpen) is clearer in the long run than a generic name; others prefer to go with a generic name such as HandleErr.

Usually, you'll want to place the error trap early, near the top of a procedure, so that it's active and ready to trap errors for all the lines of code below it throughout the whole procedure. If necessary, you can place several different error traps in a procedure by entering multiple On Error statements where they're needed—but only one can be enabled at a time. (_Enabled_ means that an error trap has been switched on by an On Error statement. When an error occurs and execution branches to the error handler, that error handler is _active_.)

Inserting multiple error handlers in a procedure can be useful when you're dealing with statements that can cause different types of errors that may need to be trapped. In the following example, the first On Error statement directs execution to ErrorHandler1, and the second On Error statement directs execution to ErrorHandler2 (note the Exit Sub statement at the end of the first handler, which keeps execution from falling through into the second handler):

    Sub ErrorDemo2()
        On Error GoTo ErrorHandler1
        'statements here
        On Error GoTo ErrorHandler2
        'statements here
        Exit Sub
    ErrorHandler1:
        'statements for first error handler here
        Exit Sub
    ErrorHandler2:
        'statements for second error handler here
    End Sub

Each error handler is limited to the procedure in which it appears, so you can create different error handlers for different procedures and have each enabled in turn as the procedures run.

Because the error handler appears as code in the procedure, you need to make sure that it doesn't run when no error has occurred. You can do this by using either an Exit Sub statement in the line just above the error-handler statement (this ends execution of the procedure) or a GoTo statement that directs execution to a label beyond the error-handling code.
The Exit Sub statement is better if you choose to place your error handler at the end of its procedure, which is standard practice and usually makes sense. The GoTo statement may prove easier to use if you choose to place your error handler elsewhere in the procedure. + +For a function, use an Exit Function statement rather than an Exit Sub statement. For a property in a class module, use an Exit Property statement. + +The following example uses an Exit Sub statement to cause execution to end before the error handler if no error occurs: + + Sub ErrorDemo3() + On Error GoTo ErrorHandler + 'statements that might cause an error + **Exit Sub** + ErrorHandler: + 'statements that handle the error + End Sub + +This next example uses a GoTo statement to skip the error handler—which is placed within the code of the procedure—unless an error occurs. When execution reaches the GoTo SkipErrorHandler statement, it branches to the SkipErrorHandler label, thus bypassing the code in the error handler: + + Sub ErrorDemo4() + On Error GoTo ErrorHandler + 'statements that might cause an error + **GoTo SkipErrorHandler** + ErrorHandler: + 'statements that handle the error + SkipErrorHandler: + 'statements + End Sub + +You read earlier in this book that some people don't like GoTo statements for uses such as the second example here. Given that this GoTo statement makes the flow of the procedure a little harder to follow, you may be inclined to agree with them in this case. (The use of GoTo in the On Error statement itself is, however, unavoidable.) + +## Disabling an Error Trap + +Recall that an error trap works only for the procedure in which it appears, and VBA disables it when the code in the procedure has finished executing. You can also disable an error trap before the end of a procedure in which it appears if you wish by using the following statement: + + On Error GoTo 0 + +Why would you do this? You might want to disable an error trap while testing a procedure to enable yourself to pinpoint errors that occur after a certain point while at the same time retaining error trapping for the first part of the procedure. + +## Resuming after an Error + +You use the Resume statement to resume execution of a procedure after trapping an error or handling an error with an error-handling routine. The Resume statement takes three forms: Resume, Resume Next, and Resume _line_. + +### Using a _Resume_ Statement + +The Resume statement causes execution to resume with the line that caused the error. Use Resume with an error-handling routine that detects and fixes the problem that caused the offending statement to fail. For example, look at the error handler in Listing 17.1, which runs when VBA is unable to apply a specified style in Word. + +**Listing 17.1**: Trapping a style error + + 1. Sub StyleError() + 2. + 3. On Error GoTo Handler + 4. + 5. Selection.Style = "Executive Summary" + 6. + 7. 'the rest of the procedure happens here + 8. + 9. 'exit the procedure once execution gets this far + 10. Exit Sub + 11. + 12. Handler: + 13. + 14. If Err = 5834 Then + 15. ActiveDocument.Styles.Add _ + Name:="Executive Summary", Type:=wdStyleTypeParagraph + 16. **Resume** + 17. End If + 18. + 19. End Sub + +Here's how the StyleError procedure in Listing 17.1 works: + + * Line 1 starts the procedure, and line 19 ends it. Lines 2, 4, 6, 8, 11, 13, and 18 are spacers. + * Line 3 uses an On Error statement to enable the imaginatively named error handler, which is identified by the Handler label in line 12. 
+ * Line 5 applies the style named Executive Summary to the current selection. If this operation succeeds, execution will continue at line 7, which in this example contains only a comment indicating that this is where the rest of the procedure would take place. + * Line 9 is a comment introducing line 10, which holds the Exit Sub statement to end execution of the procedure before the error handler. + * If the Selection.Style statement in line 5 causes an error, execution branches to the Handler label in line 12, and the error handler is activated. Line 14 compares the error value to 5834, the error that occurs if the specified style doesn't exist. If it matches, line 15 then adds the missing style to the document, and the Resume statement in line 16 causes execution to resume where the error occurred, on line 5. Because the specified style is now available, the Selection.Style statement runs without an error. + +* * * + +How to Find VBA Error Numbers and Their Explanations + +To find error numbers, here are three approaches: + + * Go to this Web page: + +. + + * Search the VBA Help system for _trappable errors_. + * Deliberately cause the error yourself and note the number and description in the resulting error-message dialog box that VBA displays. + +* * * + +### Using a _Resume Next_ Statement + +Resume Next causes execution to resume with the next statement after the statement that caused the error. You can use Resume Next in either of the following circumstances: + + * With an error-handling routine that ignores the error and allows execution to continue without executing the offending statement + * As a straightforward On Error Resume Next statement that causes execution to continue at the next statement after the statement that caused an error, without using an error handler to fix the error + +As an example of the first circumstance, if the style specified in the previous example isn't available, you can use a Resume Next statement to skip applying it: + + Sub StyleError2() + On Error GoTo Handler + + Selection.Style = "Executive Summary" + + 'the rest of the procedure happens here + + 'exit the procedure once execution gets this far + Exit Sub + + Handler: + **Resume Next** + + End Sub + +The descriptions of Resume and Resume Next apply if the error occurred in the procedure that contains the error handler. But if the error occurred in a different procedure from the procedure that contains the error handler, Resume causes execution to resume with the last statement that transferred execution (called) out of the procedure where the handler is located; Resume Next causes execution to resume with the statement _after_ the last statement to call out of the procedure that contains the error handler. + +### Using a _Resume Line_ Statement + +Resume _line_ causes execution to resume at the specified line. Use a label to indicate the line, which must be in the same procedure as the error handler. + +For example, if a procedure tried to open a particular file, you could create a simple error handler that uses a Resume _line_ statement, as shown in Listing 17.2. This procedure works with Word. To make it work with other applications, substitute the appropriate error numbers in line 15. + +**Listing 17.2**: Resuming execution at a specified line + + 1. Sub Handle_Error_Opening_File() + 2. + 3. Dim strFName As String + 4. + 5. StartHere: + 6. + 7. On Error GoTo ErrorHandler + 8. strFName = InputBox("Enter the name of the file to open.", _ + "Open File") + 9. If strFName = "" Then End + 10. 
Documents.Open strFName
 11. Exit Sub
 12.
 13. ErrorHandler:
 14.
 15. If Err = 5174 Or Err = 5273 Then MsgBox _
     "The file " & strFName & " does not exist." & vbCr & _
     "Please enter the name again.", _
     vbOKOnly + vbCritical, "File Error"
 16. **Resume StartHere**
 17.
 18. End Sub

Here's how Listing 17.2 works:

 * Line 1 starts the procedure, and line 18 ends it.
 * Line 2 is a spacer. Line 3 declares the String variable strFName. Line 4 is another spacer.
 * Line 5 contains the StartHere label, to which execution will return from the Resume statement in line 16. Line 6 is a spacer.
 * Line 7 uses an On Error statement to enable the error handler ErrorHandler.
 * Line 8 displays an input box prompting users for the name of the file they want to open and stores the name in the variable strFName. Line 9 checks strFName against an empty string and ends execution if it matches. Line 10 then tries to open the file.
 * If the file exists and can be opened, execution passes to line 11, where an Exit Sub statement exits the procedure, ending its execution. Otherwise, an error is generated, and execution branches to the ErrorHandler label in line 13, where the error handler becomes active.
 * Line 14 is a spacer. Line 15 then compares the value of the error to 5174 (the error that occurs if VBA can't find the file) and to 5273 (the error that occurs if the document name or path isn't valid in Word). If either of these comparisons matches, line 15 displays a message box advising users of the error and prompting them to enter the correct filename.
 * The Resume statement in line 16 then returns execution to the StartHere label in line 5. Line 17 is a spacer.

* * *

Try Inserting a Counter Variable to Deal with Repetitious User Errors

For some procedures, you may want to build in a counter mechanism to prevent users from repeating the same error endlessly because they don't grasp what's wrong. By incrementing a counter variable each time the error handler is invoked and checking the resulting number, you can choose to take a different action after a number of unsuccessful attempts to execute a particular action.

* * *

You can't use a Resume statement anywhere other than in an error-handling routine (or an On Error Resume Next statement). If you do, VBA reports an error.

## Getting a Description of an Error

To see the description of the current error, return the Description property of the Err object:

    MsgBox Err.Description

In general, operating-system and programming-language error messages tend to be terse, cryptic, and of less help to the end user than to the people who built the OS or language. Think twice before displaying one of these error messages to an end user. The error message shown in Figure 17.7 says "Run-time error '5941': The requested member of the collection does not exist." As you can imagine, most users would be baffled by this message; some would panic.

Usually, it's more effective, not to mention kinder, to write and display a more verbose error message of your own devising. It should explain in ordinary English what the problem is—and, preferably, what (if anything) the user can do to solve it.

## Raising Your Own Errors

As part of your testing, you may want to deliberately simulate errors so that you can see how well your error handler handles them. (Programming lingo sometimes substitutes the word _raise_ for _cause_ or _trigger_. Nobody knows why.)
+
+ To trigger an error deliberately, use the Raise method of the Err object, specifying only the _number_ argument. _number_ is a Long argument giving the number of the error that you want to cause. For example, the following statement "raises" error 5121:
+
+ Err.Raise 5121
+
+ # Suppressing Alerts
+
+ Many of the procedures you build will use message boxes or dialog boxes to allow the user to choose options for the procedure. In some applications—such as Word, Excel, PowerPoint, and Access—you can use the DisplayAlerts property of the Application object to suppress the display of message boxes and errors while a procedure is running:
+
+ * In Word, DisplayAlerts can be set to wdAlertsNone (0) to suppress alerts and message boxes, wdAlertsMessageBox (-2) to suppress alerts but display message boxes, or wdAlertsAll (-1, the default) to display all alerts and message boxes. DisplayAlerts is a sticky setting: after you set it to wdAlertsNone (or False) or to wdAlertsMessageBox, you need to set it explicitly back to wdAlertsAll (or True) when you want to see alerts and message boxes again. VBA resets the default value only when you restart Word.
+ * In Excel, DisplayAlerts is a read/write Boolean property that can be set to True to display alerts and False to suppress them. The setting sticks until you change it or restart Excel, at which point VBA resets it to True.
+ * In PowerPoint, DisplayAlerts is a read/write property that can be set to ppAlertsAll to display all alerts and ppAlertsNone to suppress all alerts. The setting sticks until you change it or until you restart PowerPoint, at which point VBA resets it to ppAlertsNone.
+ * In Access, you use the pervasive DoCmd object's SetWarnings method, like this:
+
+ DoCmd.SetWarnings False
+
+ # Handling User Interrupts in Word, Excel, and Project
+
+ Errors may seem quite enough of a problem, but you also need to decide what will happen if a user tries to interrupt your code by pressing Ctrl+Break during execution. Some VBA hosts, including Word and Excel, offer you three options:
+
+ * You can allow a user interrupt to stop your code. This is the easy way to proceed (and, as the default condition, needs no effort on your part), but in complex procedures, it may cause problems. For example, the user may have spent five minutes typing in data, only to lose it because the data wasn't saved due to the early termination of the program.
+ * You can prevent user interrupts by disabling user input while the procedure is running. This is simple to do, but you run the risk of creating unstoppable code if a procedure enters an endless loop. The user would have to power down the machine or, at least, invoke Task Manager and kill your task. Any unsaved work in the procedure or even the host application will be lost. The user might have been typing for _hours_ without saving their work. Losing this much...it can send some people _right over the edge_.
+ * As a compromise between the first two options, you can allow user interrupts during certain parts of a procedure and prevent user interrupts during more critical parts of a procedure.
+
+ ## Disabling User Input While a Procedure Is Running
+
+ To disable user input while a procedure is executing, disable the Ctrl+Break key combination by setting the EnableCancelKey property of the Application object to wdCancelDisabled (in Word) or xlDisabled (in Excel):
+
+ Application.EnableCancelKey = wdCancelDisabled 'Word
+ Application.EnableCancelKey = xlDisabled 'Excel
+
+ VBA automatically enables user input again when the procedure stops executing. You can also reenable user input during a procedure by setting the EnableCancelKey property to wdCancelInterrupt (in Word) or xlInterrupt (in Excel):
+
+ Application.EnableCancelKey = wdCancelInterrupt 'Word
+ Application.EnableCancelKey = xlInterrupt 'Excel
+
+ Excel offers a third setting, xlErrorHandler, that traps the Ctrl+Break keystroke as error 18. You can deal with this error as you would any other error. Here's a quick example:
+
+ Sub CancelKey_Example()
+ Dim i As Long
+ On Error GoTo EH
+ Application.EnableCancelKey = xlErrorHandler
+ For i = 1 To 100000000 ' time-consuming loop
+ Application.StatusBar = i
+ Next i
+ EH:
+ If Err.Number = 18 Then
+ If MsgBox("Do you want to stop the procedure?" _
+ & vbCr & vbCr & "If not, stop pressing Ctrl+Break!", _
+ vbYesNo + vbCritical, "User Interrupt Detected") = vbYes Then End
+ End If
+ End Sub
+
+ ## Disabling User Input While Part of a Procedure Is Running
+
+ You may want to temporarily disable user input while a procedure is executing a sensitive task that must not be interrupted. Then when the task is complete, you can reenable user input because it's safe for the user to stop the procedure again.
+
+ For example, say you have a procedure in which a section of code moves a number of files from one folder to another. You want to prevent the code that executes the move operations from being interrupted: if the user stopped the procedure in mid-task, it might leave some files still in the source folder and some in the destination folder.
+
+ Here's an example using Word:
+
+ 'interruptible actions up to this point
+ Application.EnableCancelKey = **wdCancelDisabled**
+ For i = 1 To LastFile
+ SourceFile = Source & "\Section" & i
+ DestFile = Destination & "\Section" & i
+ Name SourceFile As DestFile
+ Next i
+ Application.EnableCancelKey = **wdCancelInterrupt**
+ 'interruptible actions after this point
+
+ # Documenting Your Code
+
+ Some musicians can read a symphonic score and more or less "hear" the music. Likewise, some programmers can read raw code and visualize what it does. But most programmers need comments to help them understand what code is doing, particularly if they wrote the code months before or if it was written by another programmer.
+
+ Many programmers also find it easier to debug their procedures by documenting their code. The best way to document your code is to add comments to it, either as you create the code or after you've finished creating it: This procedure does this. It expects this data as input and provides this as its output. This line does this. And so on.
+
+ Some experts advise that you document your code as you create it in any procedure in which you're exploring your way and trying different methods to reach your goal. Add comments to explain what action each group of statements is trying to achieve.
Once you've gotten the procedure to work, go through the code and delete the statements you didn't use. Use the comments to identify which sections are now useless and which are still worthwhile, leaving only the comments that are relevant to how the remaining code functions.
+
+ Also consider adding comments when you're modifying an existing procedure so that you don't lose track of your changes. Once you have the procedure working to your liking, remove any unnecessary comments and reword any verbose or unclear comments.
+
+ Other experts suggest documenting your code when you've finished writing it. This allows you to enter only the comment lines that you want to be there permanently. This is the way to go when you're fairly sure of the direction of your code from the start and the procedure needs only a few pointers to make its code clear once it's complete.
+
+ To document your code, use comments prefaced by either the single quote (') or the Rem keyword (short for _remark_ ).
+
+ * * *
+
+ Use Block-Commenting as a Debugging Tool
+
+ Remember that commenting can also be employed as a debugging technique–when you want to see how code runs with some lines deactivated. In other words, does the bug disappear when the commented-out lines are not executed? If so, the bug is probably located somewhere in those lines of code. You can "comment out" a group of lines, a whole line, or part of a line: anything to the right of an apostrophe or the Rem keyword is commented out. See the section in Chapter 3 titled "Commenting Out Lines" for details on this tactic.
+
+ * * *
+
+ Few programmers use Rem anymore. When you're trying to comment out only a part of a line, the apostrophe is usually the better choice anyway. If you do choose to use the Rem keyword, you'll need to add a colon before it to make it work consistently (some statements accept a Rem without a colon at their end; others generate a compile error):
+
+ Rem This is a comment line.
+ Documents.Add: Rem create a document based on Normal.dotm
+
+ Generally, apostrophe-commented remarks are separated by a few spaces or tabs from any statement the line contains (as in the second line here). This makes the code and comments easier to read than comments using Rem:
+
+ 'This is a comment line
+ Documents.Add 'create a document based on Normal.dotm
+
+ It's tempting to think that you don't need to document your code because you'll be able to recall what it does. But once you've written a lot of code, you probably won't be able to remember. Coming back to a procedure six months after writing it, you'll find it as unfamiliar as if someone else had written it. And if you've become a VBA whiz, you may even find it hard to visualize the clumsy techniques you were using at that time.
+
+ Most programmers have a distinct aversion to documenting their code; for some, the dislike of documenting is almost pathological. You can see why: When you're writing the code, documenting what each line does slows you down and distracts you from your larger purpose. And documenting after the code is finished and tested is tedious work. Besides, anyone who's competent should be able to read the code and see what it does...shouldn't they?
+
+ Maybe so, but consider this: It's likely that you won't always be the person working with your code—at times, others may work with it too, and they'll appreciate all the help they can get in understanding its purposes and behaviors.
Likewise, the code on which you work won't always be your own—you may at times have to debug code that others have written, and in this case, _you'll_ be the one grateful for comments.
+
+ # The Bottom Line
+
+ **Understand the basic principles of debugging.**
+
+ A major aspect of programming is testing your code. Debugging can be enjoyable if you think of it as a puzzle that you can solve. But whether or not you enjoy it, debugging is essential if you want to preserve a reputation as a professional.
+
+ Master It
+
+ When testing your code, try to imagine ways that the code could fail. Describe a situation that can produce unanticipated results.
+
+ **Recognize the four different types of errors you'll create.**
+
+ Experts have concluded that there are four primary categories of error in programs.
+
+ Master It
+
+ Name two of the four basic types of programming errors.
+
+ **Employ VBA's debugging tools.**
+
+ The VBA Editor and VBA include a generous assortment of debugging tools to help you track down and remove bugs from your procedures. The main windows you'll employ for debugging are the Immediate window, the Locals window, and the Watch window.
+
+ Master It
+
+ The Watch window is especially useful because you can set watch expressions (also known as conditional breakpoints). Describe this debugging tactic.
+
+ **Deal with runtime errors.**
+
+ You can trap some runtime errors (errors that show up while a procedure is executing) while debugging your code. But others show up only while your user is interacting with your program—and you're probably not there to help them. There is a way, though, to soften the blow and, in some cases, even fix a problem by adding error handlers to your programs.
+
+ Master It
+
+ Error handlers are special statements and sections of code that detect and then manage runtime errors. What VBA statement detects a runtime error?
+ Chapter 18
+
+ Building Well-Behaved Code
+
+ This chapter concentrates on the principles of good behavior. Once you've built a procedure that's useful and that works consistently as intended, you'll probably want to distribute it to as many of your coworkers as might use it or even to a wider audience on the Internet. Before you distribute it, though, you should make sure that the procedure is as civilized as possible in its interaction with users and with the settings they may have chosen on their computers. It's all too easy to distribute an apparently solid, useful procedure that runs roughshod over the user's preferences or one that fails unexpectedly under certain circumstances. In this chapter, you'll look at how to avoid such problems and how to construct your procedures so that the user will have no problem interacting with them.
+
+ The specifics of good program behavior vary from application to application, and you will need to adapt these principles to the application with which you're working. This chapter gives some examples.
+
+ In this chapter you will learn to do the following:
+
+ * Understand the characteristics of well-behaved procedures
+ * Retain and restore the user environment
+ * Let the user know what's happening
+ * Check that the procedure is running under suitable conditions
+ * Clean up after a procedure
+
+ # What Is a Well-Behaved Procedure?
+
+ A well-behaved procedure leaves no trace of its actions beyond those that the user expected it to perform.
This means the following:
+
+ * Making no detectable changes to the user environment or, if the procedure does need to make changes (for example, in order to do its job), restoring the previous settings
+ * Presenting the user with relevant choices for the procedure and relevant information once the procedure has finished running
+ * Showing or telling the user what is happening while the procedure is running
+ * Making sure (if possible) that conditions are appropriate for the procedure to run successfully—before the procedure takes any actions
+ * Anticipating or trapping errors wherever possible so that the procedure doesn't crash or, if it does crash under exceptional circumstances, so that it does so as gracefully as possible, minimizing damage to the user's work
+ * Leaving users in the optimal position to continue their work after the procedure finishes executing
+ * Deleting any scratch documents, folders, or other detritus that the procedure created in order to perform its duties but that are no longer needed
+
+ You can probably think of a couple of examples in which applications you use don't exactly do these things. For example, do you use Word? Then you're probably familiar with the less-than-inspiring behavior of the Page Up and Page Down feature. While working in a document, press the Page Down key three times, then press the Page Up key three times. Your blinking insertion point should be back in the exact location where it was before you paged down, then back up, right? Unfortunately, the insertion point doesn't always (let's be honest, it will _rarely_ ) return to the exact point in the document where it started, as it should.
+
+ So if you page through your document to look at some paragraph but then try to return to where you were last, you always need to check that the insertion point is in the right place before you start typing—otherwise, the characters are very likely to land in the wrong place. Word was first released in October 1983, _so Microsoft has had time to fix this_. It would be simple for Word to note the insertion point before the paging, but why that's never done remains a mystery. I'll show you how to do this in your macros in the section titled "Leaving the User in the Best Position to Continue Working" later in this chapter.
+
+ Such weaknesses in commercial applications' interfaces provoke two main reactions among developers. First, if users are accustomed to such niggles as having to reposition the selection or change the view when they shouldn't need to, they're unlikely to get too annoyed with having to perform similar actions after running one of our procedures. This is particularly true if your macro saves them plenty of time and effort, for which they should be grateful rather than picky. Besides, they most likely didn't pay for your macro, did they?
+
+ The second reaction is an impressive (and sometimes overzealous) determination on the part of macro programmers to restore the user environment absolutely perfectly even if major software corporations seem incapable of producing software that does so.
+
+ The first approach tends to be more economical in its code and the second more inventive. To get your work done and retain your sanity, you'll probably want to steer a course between the two extremes.
+
+ # Retaining or Restoring the User Environment
+
+ In many cases, your procedures will run without even needing to change the user environment—but if not, restore it as closely as possible to its previous state.
What this means depends on the host application, but here are some examples of environment changes in Word, Excel, and PowerPoint:
+
+ * In Word: Changing the revision-marking (Track Changes) setting so that you can change the text without the changes being marked as revisions.
+ * In Word or PowerPoint: Changing the view to a different view so that you can perform certain operations that cannot be performed in the original view.
+ * In Excel: Creating a temporary worksheet on which you can manipulate data secure in the knowledge that you don't need to check whether any ranges are already occupied by user data.
+ * In any application that lets you manipulate its Find and Replace feature: Using the Find and Replace feature to identify and/or modify parts of a document, then restoring users' last search (and replace, if necessary) so that they can perform it again seamlessly. The problem here is that most applications have "sticky" Find and Replace settings to allow the user to perform the same search or replacement operation again quickly without reentering the parameters. If you've replaced users' search and replacement parameters, they'll get a rude shock the next time they try to search or replace. This is particularly true if you've turned on some esoteric feature like Match Case. The next time the user tries to search for _florida_ , they will find no matches, even if the document is about Miami and is jam-packed with the word _Florida_. Why? Because your macro left the Match Case filter turned on, and the user didn't capitalize _Florida_ when initiating the search. Fail.
+
+ You'll want to save information about the user's environment so that you can restore it at the end of the procedure. If your procedure will mess around with the Match Case property of Word's Find and Replace feature, at the start of the procedure save the property's current value in a private variable, public variable, or custom object, as appropriate.
+
+ Then at the end of your macro, fetch the saved value and restore it to the property you temporarily modified. Here's an example:
+
+ Dim CaseStatus As Boolean 'match case is either on or off
+
+ CaseStatus = Selection.Find.MatchCase 'save the user's setting
+
+ Selection.Find.MatchCase = True 'our macro needs to be case-sensitive
+
+ ' execute statements in the macro
+
+ Selection.Find.MatchCase = CaseStatus 'restore the user's preference
+
+ # Leaving the User in the Best Position to Continue Working
+
+ After your procedure finishes running, users need to be in the best possible position to continue their work. What exactly this best possible position entails depends on the situation, but here are three simple suggestions:
+
+ * Usually, you'll want to leave users viewing the same document they were working on when they started running your macro. There are some obvious exceptions to this, such as when the procedure creates a new file for the user and the user is expecting to work in that file, but the general principle applies in most situations.
+ * If a file is essentially untouched (at least from the user's point of view) by your macro, the blinking insertion cursor (selection) should probably be placed back where it was when the user started running the procedure. To restore the selection, you may want to define a range at the start of your procedure and then move the selection back to it at the end of the procedure. In some applications, you could also use a bookmark or a named range—but if you do, be sure to remove it afterward.
Remember, leave no debris behind.
+ * Listing 18.1 is an example macro that you can try out. It saves a Word document's current blinking insertion-cursor location in a bookmark. Next it moves the cursor down a few lines and shows you a message box so you can see the new location of the cursor. Finally, it restores the cursor to its original location:
+
+ **Listing 18.1**: Restoring the cursor
+
+ 1. Sub SaveAndRestoreCursor()
+ 2.
+ 3. 'save the current cursor location in a bookmark
+ 4. ActiveDocument.Bookmarks.Add Name:="OriginalInsertionPoint", _
+ Range:=Selection.Range
+ 5.
+ 6. 'move down eight lines
+ 7. Selection.MoveDown Unit:=wdLine, Count:=8
+ 8.
+ 9. MsgBox "Moved to here (look for the insertion line; it's moved down 8 lines from where it was.)"
+ 10.
+ 11. 'fetch the saved bookmark and go to it
+ 12. Selection.GoTo what:=wdGoToBookmark, Name:="OriginalInsertionPoint"
+ 13.
+ 14. MsgBox "Now the insertion line has been restored to where it was when this macro started."
+ 15.
+ 16. 'remove the bookmark to leave no debris behind
+ 17.
+ 18. ActiveDocument.Bookmarks("OriginalInsertionPoint").Delete
+ 19. End Sub
+
+ * Notice in line 18 that we delete our bookmark when we've finished using it. Don't leave rubbish behind.
+ * If the procedure has created a new object in the file, and the user will be expecting to work with it, you may want to have that object selected at the end of the procedure.
+
+ # Keeping the User Informed during the Procedure
+
+ A key component of a well-behaved procedure is keeping the user adequately informed throughout the process. In a macro that performs a basic if tedious task, adequate information may require only a clear description in the macro's Description field, to assure users that they're choosing the right procedure from the Macros dialog box.
+
+ With a more complex procedure, adequate information will probably have to be more extensive: You may need to display a starting message box or dialog box, show information on the status bar during the procedure, display an ending message box, or create a log file of information so that the user has a record of what took place during execution of the procedure.
+
+ You must first decide whether to disable user input during the procedure. In Word and Excel, you can disable user input to protect sensitive sections of your procedures by setting the EnableCancelKey property of the Application object (as discussed in "Disabling User Input While a Procedure Is Running" in Chapter 17, "Debugging Your Code and Handling Errors"). When you do so, it's a good idea to indicate to the user at the beginning of the procedure that input will be disabled and explain why. Otherwise, a user may react to a procedure that seems not to be executing in the same way they would respond to an application that had hung—by trying to close the application forcibly via Task Manager. To keep the user informed about other aspects of the procedure, you have several options, which are discussed in the following sections. But first, the sidebar "Disabling Screen Updating" examines how you can _hide_ information from the user (and the reasons for doing so) by disabling screen updating in Word and Excel.
+
+ * * *
+
+ **Disabling Screen Updating**
+
+ Access, Word, and Excel let you disable screen updating—that is, stop the redrawing of the information in the document area.
The other parts of the application window—the title bar, command bars, status bar, scroll bars, and so on—continue to update, but these items are usually relatively static compared to the document area and so don't take much updating. Still, if the user resizes the application window or the document window, they will see these other parts of the application window change, even with screen updating disabled.
+
+ There are two advantages to disabling screen updating while your procedure is running:
+
+ * You can speed up the execution of your procedures somewhat. This improvement was quite noticeable in the early days of personal computing, and it is still perceptible with underpowered computers that have slow graphics cards. Most computers built since 2000 or so have relatively capable graphics cards, so turning off screen updating makes little visible difference. Any speed improvement from disabling screen updating applies especially to procedures that cause a lot of changes to the onscreen display. For example, suppose a procedure in Word strips a certain type of information out of the current document, pastes it into a new document, creates a table out of it, and applies assorted formatting to the table. The computer will expend a fair amount of effort updating what's appearing on the monitor. This is wasted effort if the user isn't hanging on every operation, so you might as well turn off screen updating.
+ * You can hide from users any parts of the procedure that you don't want them to see. This sounds totalitarian, but it's usually more like a cross between benevolent dictatorship and public television: People shouldn't see certain things that might really upset them, and there's a lot that most people don't _really_ need to know about. It's the same when you write programs: If users don't know about the operations that a procedure will routinely perform to achieve certain effects, they may be surprised or dismayed by what they see onscreen. For example, in a procedure that moves an open file, you might want to hide from the user the fact that the procedure closes the open file, moves it, and then reopens the file from its new location. By disabling screen updating, you can achieve this.
+
+ The major disadvantage to disabling screen updating is that doing so prevents users from seeing information that might be useful to them. In the worst case, users may assume from the lack of activity onscreen that either the procedure has entered an endless loop or the computer has hung, and so they may try to stop the procedure by pressing Ctrl+Break or Ctrl+Alt+Delete to use Task Manager to close the application. (Task Manager typically lists the host application as "Not responding" for much of the time VBA code is running, which doesn't help.)
+
+ To forestall such disruptions, warn users in advance that the procedure will disable screen updating. For instance, you might mention the fact in a message box at the beginning of the procedure, or you might display a dialog box that allows the user to choose whether to disable screen updating and have the procedure run faster or to leave screen updating on and have the procedure run at its normal speed and provide a performance possibly worth watching.
+
+ If you don't display a message box or dialog box at the beginning of a procedure, you may want to display information on the status bar to tell the user what's going on during the procedure.
Word and Excel update the status bar and the title bar of the application even if screen updating is turned off—provided the status bar and the title bar are visible. To display information on the status bar, assign a suitable string to the StatusBar property of the Application object:
+
+ Application.StatusBar = _
+ "Word is creating 308 new documents for you to edit. Please wait..."
+
+ Alternatively, you can disable screen updating for parts of a procedure and turn it back on, or refresh it, for other parts. Consider a procedure that creates and formats a number of documents from an existing document. If you turn off screen updating at the beginning of the procedure and then refresh it once each document has been created and formatted, the user will see each document in turn (which conveys the progress the procedure is making) without seeing the details of the formatting. What's more, the procedure will run faster than if the screen were showing all of the formatting taking place.
+
+ To turn off screen updating, set the ScreenUpdating property of the Application object to False:
+
+ Application.ScreenUpdating = False
+
+ To turn screen updating back on, set ScreenUpdating to True again:
+
+ Application.ScreenUpdating = True
+
+ In Access, use the Echo method of the DoCmd object to turn screen updating on or off:
+
+ DoCmd.Echo True 'turns updating on
+ DoCmd.Echo False 'turns updating off
+
+ In Word, to refresh the screen with the current contents of the video memory buffer, use the ScreenRefresh method of the Application object:
+
+ Application.ScreenRefresh
+
+ * * *
+
+ ## Manipulating the Cursor
+
+ Word and Excel permit you to manipulate the cursor (the mouse pointer). You may need to do this because VBA automatically displays the busy cursor (an hourglass in Windows XP, a rotating ring in Windows versions since then) while a VBA procedure is running and then restores the normal cursor when it has finished. Sometimes, however, you may need or want to specify the cursor's appearance in your code.
+
+ * * *
+
+ Stick with the Familiar Cursor Cues
+
+ After using computers for even a few months, users tend to develop almost Pavlovian reactions to the cursor, with the busy cursor signifying (in ascending order) a momentary breather (or a slow computer), a chance to grab a cup of coffee or chat with a colleague, or the onset of panic that the computer has hung before they've saved the last three hours of work. You usually won't want to mess with these reactions. So it's a mistake to display an I-beam insertion cursor or "normal" arrow cursor when the system is in fact busy—or to display the busy cursor after the procedure has in fact finished running.
+
+ * * *
+
+ ### Manipulating the Cursor in Word
+
+ Word implements the cursor via the System object. To manipulate the cursor, you set the Cursor property. This is a read/write Long property that can be set to the following values: wdCursorIBeam (1) for an I-beam cursor, wdCursorNormal (2) for a normal cursor, wdCursorNorthwestArrow (3) for an arrow pointing up and to the left, and wdCursorWait (0) for the busy cursor. The exact appearance of the cursor depends on the cursor scheme the user has selected.
+
+ For example, the following statement displays a busy cursor:
+
+ System.Cursor = wdCursorWait
+
+ Note that a user can customize the cursors by clicking the Mouse icon in Control Panel to open the Mouse Properties dialog box, then selecting the Pointers tab.
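+
+ Because the Cursor property is read/write, you can save its current value before changing it and restore the saved value afterward. Here's a minimal sketch of that pattern in Word; the loop is just a hypothetical stand-in for whatever time-consuming work your procedure performs:
+
+ Sub ShowBusyCursorDuringWork()
+
+ Dim lngSavedCursor As Long
+ Dim i As Long
+
+ lngSavedCursor = System.Cursor 'save the current cursor
+ System.Cursor = wdCursorWait 'show the busy cursor
+
+ For i = 1 To 100000 'stand-in for the procedure's real work
+ DoEvents
+ Next i
+
+ System.Cursor = lngSavedCursor 'restore the saved cursor
+ End Sub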
+
+ ### Manipulating the Cursor in Excel
+
+ Excel lets you manipulate the cursor through the Cursor property of the Application object. Cursor is a read/write Long property that can be set to the following values: xlIBeam (3) for an I-beam cursor, xlDefault (-4143) for a default cursor, xlNorthwestArrow (1) for the arrow pointing up and to the left, and xlWait (2) for the busy cursor.
+
+ For example, the following statement displays the busy cursor:
+
+ Application.Cursor = xlWait
+
+ When you explicitly set the Cursor property of the Application object in Excel, remember to reset it to something appropriate before your code stops executing. Otherwise, the cursor stays as you left it.
+
+ ## Displaying Information at the Beginning of a Procedure
+
+ At the beginning of many procedures, you'll probably want to display a message box or a dialog box. For this purpose, you'll typically use a Yes/No or OK/Cancel message-box style. The message box tells users what the procedure will do and gives them the chance to cancel the procedure without running it any further.
+
+ Alternatively, a dialog box can present options for the procedure (for example, mutually exclusive options via option buttons or nonexclusive options via check boxes), allowing users to enter information (via text boxes, list boxes, or combo boxes) and of course letting them cancel the procedure if they've launched it by accident. If you have time to create a Help file to accompany the procedures and user forms you create, you might add a Help button to each message box or dialog box, linking it to the relevant topic in the Help file.
+
+ You can also use a message box or dialog box to warn the user that the procedure is going to disable user interrupts for part or all of its duration.
+
+ ## Communicating with the User via a Message Box or Dialog Box at the End of a Procedure
+
+ With some procedures, you'll find it useful to collect information on what the procedure is doing so that you can display that information to the user in a message box or dialog box after the procedure has finished its work. As you saw in Chapter 13, "Getting User Input with Message Boxes and Input Boxes," message boxes are easier to use but are severely limited in their capabilities for laying out text—you're limited to the effects you can achieve with spaces, tabs, carriage returns, and bullets. With dialog boxes, however, you can lay out text however you need to (by using labels or text boxes) and even include images if necessary.
+
+ The easiest way to collect information while running a procedure is to build one or more strings containing the information you want to display. For an example of this, look back to the sidebar titled "Control a For...Next Loop with User Input via a Dialog Box" in Chapter 12, "Using Loops to Repeat Actions," in which a cmdOK_Click procedure collects information while creating a series of folders and then at the end displays a message box telling the user what the procedure has accomplished.
+
+ ## Creating a Log File
+
+ If you need to collect a lot of information during the course of running a procedure and either present it to the user once the procedure has finished or just make it available for reference if needed, consider using a log file rather than a message box or dialog box. Log files are useful for lengthy procedures that manipulate critical data: by writing information periodically to a log file (and by saving it frequently), you create a record of what the procedure achieves in case it crashes.
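+
+ Listing 18.2, later in this section, builds its log in a Word document; if all you need is a plain-text log, VBA's built-in file I/O statements are enough. Here's a minimal sketch of appending a time-stamped entry to a text file (the log path and the entry text are hypothetical examples):
+
+ Sub WriteLogEntry()
+
+ Dim intFile As Integer
+
+ intFile = FreeFile 'get an unused file number
+ Open "c:\temp\procedure log.txt" For Append As #intFile
+ Print #intFile, Format(Now, "yyyy-mm-dd hh:nn:ss") _
+ & vbTab & "Chicago" & vbTab & "OK"
+ Close #intFile 'flush and save the entry
+ End Sub
+
+ Opening the file For Append creates it if it doesn't yet exist, and closing the file after each entry ensures the log survives a crash later in the procedure.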
+
+ * * *
+
+ Make a Log File Useful for Both Average and Sophisticated Users
+
+ If you want a log file to be useful for ordinary users as well as to the technically inclined, make its entries readable and helpful while including any technical information required for advanced troubleshooting. For example, a message such as _The data files for the "Madrid" office (madrid060430.xlsm) and the "Taos" office (taos060430.xlsm) were not found in the expected location, \\server2\data\dayfiles\, so the information could not be included_ is usually more widely helpful than a cryptic _Error code 44E: Required Data Missing._
+
+ * * *
+
+ Say you wrote a procedure for Word that collects information from a variety of sources each day and writes it into a report. You might want to keep a log file that tracks whether information from each source was successfully transferred and at what time. Listing 18.2 provides an example of such a procedure. At the end of the procedure, you could leave the log file open so that the user could check whether the procedure was successful in creating the report or leave the summary file open so that the user could read the report itself.
+
+ **Listing 18.2**: Creating a log file
+
+ 1. Sub Create_Log_File()
+ 2.
+ 3. Dim strDate As String
+ 4. Dim strPath As String
+ 5. Dim strCity(10) As String
+ 6. Dim strLogText As String
+ 7. Dim strLogName As String
+ 8. Dim strSummary As String
+ 9. Dim strFile As String
+ 10. Dim i As Integer
+ 11.
+ 12. On Error GoTo Crash
+ 13.
+ 14. strCity(1) = "Chicago"
+ 15. strCity(2) = "Toronto"
+ 16. strCity(3) = "New York"
+ 17. strCity(4) = "London"
+ 18. strCity(5) = "Lyons"
+ 19. strCity(6) = "Antwerp"
+ 20. strCity(7) = "Copenhagen"
+ 21. strCity(8) = "Krakow"
+ 22. strCity(9) = "Pinsk"
+ 23. strCity(10) = "Belgrade"
+ 24.
+ 25. strDate = Month(Date) & "-" & Day(Date) & "-" _
+ & Year(Date)
+ 26. strPath = "f:\Daily Data\"
+ 27. strLogName = strPath & "Reports\Log for " _
+ & strDate & ".docm"
+ 28. strSummary = strPath & "Reports\Summary for " _
+ & strDate & ".docm"
+ 29. Documents.Add
+ 30. ActiveDocument.SaveAs strSummary
+ 31.
+ 32. For i = 1 To 10
+ 33. strFile = strPath & strCity(i) & " " & strDate & ".docm"
+ 34. If Dir(strFile) <> "" Then
+ 35. Documents.Open strFile
+ 36. Documents(strFile).Paragraphs(1).Range.Copy
+ 37. Documents(strFile).Close _
+ 38. SaveChanges:=wdDoNotSaveChanges
+ 39. With Documents(strSummary)
+ 40. Selection.EndKey Unit:=wdStory
+ 41. Selection.Paste
+ 42. .Save
+ 43. End With
+ 44. strLogText = strLogText & strCity(i) _
+ & vbTab & "OK" & vbCr
+ 45. Else
+ 46. strLogText = strLogText & strCity(i) _
+ & vbTab & "No file" & vbCr
+ 47. End If
+ 48. Next i
+ 49.
+ 50. Crash:
+ 51.
+ 52. Documents.Add
+ 53. Selection.TypeText strLogText
+ 54. ActiveDocument.SaveAs strLogName
+ 55. Documents(strLogName).Close
+ 56. Documents(strSummary).Close
+ 57.
+ 58. End Sub
+
+ The procedure in Listing 18.2 creates a new document that contains a summary, opens a number of files in turn, copies the first paragraph out of each and pastes it into the summary document, and then closes the file. As it does this, the procedure maintains a string of log information from which it creates a log file at the end of the procedure or, if an error occurs, during the procedure. Here's what happens in the code:
+
+ * Lines 3 through 9 declare six String variables—strDate, strPath, strLogText, strLogName, strSummary, and strFile—and one String array, strCity, containing 10 items.
(The procedure uses an Option Base 1 statement that doesn't appear in the listing, so strCity(10) produces 10 items in the array rather than 11.) Line 10 declares the Integer variable i, which the procedure will use as a counter. + * Line 11 is a spacer. Line 12 uses an On Error GoTo statement to start error handling and direct execution to the label Crash: in the event of an error. Line 13 is a spacer. + * Lines 14 through 23 assign the names of the company's 10 offices to the strCity array. Line 24 is a spacer. + * Line 25 assigns to strDate a string created by concatenating the month, the day, and the year for the current date (with a hyphen between each part) by using the Month, Day, and Year functions, respectively. For example, January 21, 2007, will produce a date string of 1-21-2007. (The reason for creating a string like this is that Windows can't handle slashes in filenames—slashes are reserved for indicating folders.) + * Line 26 sets strPath to the f:\Daily Data\ folder. Line 27 then builds a filename for the log file in the \Reports\ subfolder, and line 28 creates a filename for the summary file, also in the \Reports\ subfolder. + * Line 29 creates a new document based on Normal.dotm, and line 30 saves this document under the name stored in the strSummary variable. Line 31 is a spacer. + * Line 32 begins a For... Next loop that runs from i = 1 to i = 10. Line 33 assigns to the String variable strFile the filename for the first of the cities stored in the strCity array: strPath & strCity(i) & " " & strDate & ".docm". + * Line 34 then begins an If statement that checks whether Dir(strFile) returns an empty string. If not, line 35 opens the document specified by strFile, line 36 copies its first paragraph, and line 37 closes it without saving changes. The procedure doesn't make any changes to the document, but if the document contains any dynamic "hot fields" (such as date fields or links that automatically update themselves when the document is opened), it may have become dirty (modified). Including the SaveChanges argument ensures that users don't get an unexpected message box prompting them to save a document they know they haven't changed. (An alternative would be to set the Saved property of the document to True and then close it without using the SaveChanges argument.) + * Lines 39 through 43 contain a With statement that works with the Document object specified by strSummary. Line 40 uses the EndKey method with the Unit argument wdStory to move the selection to the end of the document. Line 41 pastes in the material copied from the document just opened, and line 42 saves the document. Line 43 ends the With statement. + * Line 44 adds to strLogText the contents of strCity(i), a tab, the text OK, and a carriage return, which will produce a simple tabbed list of the cities and the status of their reports. + * If the condition posed in line 34 isn't met, execution branches to the Else statement in line 45, and line 46 adds to strLogText the contents of strCity(i), a tab, No file, and a carriage return. Line 47 ends the If statement, and line 48 ends the For... Next loop, returning execution to line 32. + * Line 49 is a spacer. Line 50 contains the Crash: label and marks the start of the error handler. Unlike in many procedures, you don't want to stop execution before entering the error handler—as it happens, you want to execute these statements (to create the log file) even if an error occurs. Line 51 is a spacer. 
+ * Line 52 creates a new document based on the default template; line 53 types the contents of strLogText into the new document; and line 54 saves it under the name strLogName. Line 55 closes this new document (alternatively, you could leave the document open so that the user could view it). Line 56 closes the summary document (which has remained open since it was created; again, you might want to leave this open so that the user might view it or offer the user the option of keeping it open). Line 57 is a spacer, and line 58 ends the procedure.
+
+ # Making Sure a Procedure Is Running under Suitable Conditions
+
+ Another important consideration when creating a well-behaved procedure is to check that it's running under suitable conditions. This ideal is nearly impossible to achieve under all circumstances, but you should take some basic steps, such as the following:
+
+ * Make sure a file is open in a procedure that needs a file to be open—otherwise, you'll get an error every time. For example, in Excel, you might check the Count property of the Workbooks collection to make sure at least one workbook is open:
+
+ If Workbooks.Count = 0 Then _
+ MsgBox "This procedure will not run without a " _
+ & "workbook open. Open one, then run the procedure again.", _
+ vbOKOnly + vbExclamation, _
+ "No Workbook Is Open"
+
+ * Check that the procedure is referencing an appropriate item, if the procedure has definable requirements. For example, in an Excel procedure that applies intricate formatting to a chart the user has selected, make sure the user has, in fact, actually selected a chart. Trying to manipulate another object with chart-related commands is likely to cause an error or at least unwanted side effects.
+ * Make sure a file contains the element required by the procedure. (If it doesn't, an error will likely result.) Alternatively, trap the error that will result from the element's absence.
+
+ # Cleaning Up after a Procedure
+
+ Like your children or housemates, your procedures should learn to clean up after themselves. Cleaning up involves the following:
+
+ * Undoing any changes that the procedure had to make
+ * Closing any files that no longer need to be open
+ * Removing any scratch files or folders that the procedure has created to achieve its effects
+
+ ## Undoing Changes the Procedure Has Made
+
+ In some cases, you'll need to make changes to a document in order to run a procedure successfully. Here are a couple of examples:
+
+ * In Word, you might need to apply some formatting to half of a table but not to the rest of it. In this case, it may be easier to split the table into two tables so that you can select columns in the relevant part and format or change them without affecting the columns in the other half of the original table. If you do this, you'll want to join the tables together again afterward by removing the break you've inserted between the original table's two halves. The easiest way to do this is to bookmark the break that you insert. You can then go back to the bookmark and delete it and the break at the same time. Alternatively, you could use a Set statement to define a range for the break and then return to the range and remove the break.
+ * In Excel, you may need to define named ranges in a workbook so that you can easily reference them from the code. (Usually, you'll do better to work with Range objects directly in VBA, which won't leave unwanted named ranges in the workbook.) Delete these named ranges when you've finished with them, as in the sketch that follows this list.
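+
+ Here's a minimal sketch of that create-use-delete pattern in Excel. The name kTempData, the worksheet Sheet1, and the formatting step are all hypothetical placeholders for whatever your procedure actually needs:
+
+ Sub UseTemporaryNamedRange()
+
+ 'create a scratch named range for the procedure to work with
+ ThisWorkbook.Names.Add Name:="kTempData", _
+ RefersTo:="=Sheet1!$A$1:$C$20"
+
+ 'work with the named range
+ Range("kTempData").Font.Bold = True
+
+ 'clean up: delete the name so no debris is left in the workbook
+ ThisWorkbook.Names("kTempData").Delete
+ End Sub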
+
+ ## Removing Scratch Files and Folders
+
+ During a complex procedure, you may need to create scratch files in which to temporarily store or manipulate data, or scratch folders in which to store temporary files.
+
+ For example, if you need to perform complex formatting on a few paragraphs of a long document in Word, you may find it easier to copy and paste those paragraphs into a new blank document and manipulate them there than to continue working in the original document and risk unintentionally affecting other paragraphs as well. Likewise, in PowerPoint, you might need to create a new presentation that you could use for temporary or backup storage of intricate objects.
+
+ Creating scratch files, while often necessary for the safe and successful operation of a procedure, can be intrusive. You're cluttering up the user's hard drive with information that's probably of no use to that user. Creating scratch folders in which to save the scratch files is even worse. Always go the extra distance to clean up any temporary items that you've stored on the user's hard drive. If you're thinking that commercial applications don't always do this, not even Microsoft's applications, you're right. But that doesn't mean you should follow their example.
+
+ If your procedure is going to remove any scratch files it creates, you may be tempted to conceal their creation and subsequent deletion from the user. This usually isn't a good idea—in most cases, the best thing is to warn the user that the procedure will create scratch files. You might even let the user specify or create a suitable folder for the scratch files or present the user with a list that logs the files created and whether they were successfully deleted. Doing so will allow users to easily delete any scratch files left on their computer if your procedure goes wrong or is interrupted during execution.
+
+ Another approach is to use the API (application programming interface) functions GetTempPath and GetTempFileName to find out the location of the computer's temporary folder and a temporary filename that you can use. (How to make an API call is illustrated in Chapter 30, "Accessing One Application from Another Application," in the sidebar titled "Using the Sleep Function to Avoid Problems with Shell's Asynchrony.") But even if you use the default temporary folder, you should delete any files that you create in it when your procedure is finished. Again, a disappointing number of commercial software developers fail to do this.
+
+ ### Using Your Own Scratch Folder
+
+ You can use the MkDir command to create a folder. For example, the following statement creates a folder named Scratch Folder on the C: drive:
+
+ **MkDir** "c:\Scratch Folder"
+
+ Before creating a folder, use the Dir command to check to see that the name isn't already in use. (If a folder with that name already exists, an error results.) Here's how:
+
+ Dim s As String
+ s = "c:\TempDir"
+
+ If Len( **Dir** (s, vbDirectory)) = 0 Then
+ MkDir s
+ End If
+
+ For temporary storage, you may want to use a folder name based on the date and time to lessen the chance that a folder with that name already exists. You could also use VBA's Rnd function to generate a random number to use as part of the folder name.
+
+ ### Deleting a Scratch Folder
+
+ You can use the RmDir statement to remove an empty folder. (Make sure that you've deleted all files in the folder first—otherwise RmDir will fail.)
For example, the following statement removes the scratch folder named Scratch Folder on the C: drive: + + RmDir "c:\Scratch Folder" + +# The Bottom Line + +**Understand the characteristics of well-behaved procedures.** + +Well-behaved procedures don't annoy or alarm the user either during or after their execution. + +Master It + +Name two ways programmers can write procedures that don't annoy users. + +**Retain and restore the user environment.** + +Users quite rightly don't appreciate it if your procedure leaves the state of their application's or operating system's environment modified. Find ways to restore the user environment before your procedure finishes execution. + +Master It + +Assume that you are writing a procedure that employs Word's Search and Replace feature. This feature retains its settings between uses so the user can repeatedly trigger the same search or replace actions. How can you temporarily store the status of the user's last search or replace so that you can restore this data after your procedure is finished executing? + +**Let the user know what's happening.** + +Particularly when a procedure is doing a lengthy "batch job" such as updating dozens of files, it's important to let the user know that the computer hasn't frozen. People need to be told that execution is continuing as expected even though nothing appears to be happening. + +Master It + +Describe a way to let the user know that a procedure isn't frozen—that activity is taking place during execution. + +**Check that the procedure is running under suitable conditions.** + +Another important element of creating a well-behaved procedure is to check that it's running under suitable conditions. This ideal is nearly impossible to achieve under all circumstances, but you should take some basic steps. + +Master It + +If a procedure accesses data from a file, name an error that could occur and thus should be trapped. + +**Clean up after a procedure.** + +A well-behaved procedure avoids leaving unneeded files or other temporary items behind. In other words, a procedure should clean up after itself. + +Master It + +Cleaning up involves three major tasks. Name one. +Chapter 19 + +Securing Your Code with VBA's Security Features + +This chapter discusses how to use the security tools that VBA provides for distributing and implementing macros and VBA code. VBA security falls into three categories: securing your applications against rogue VBA code; establishing that your VBA code isn't itself rogue so that it can be run; and securing your code against theft, alteration, or snooping. + +In this chapter you will learn to do the following: + + * Understand how VBA implements security + * Sign a macro project with a digital signature + * Get a digital certificate + * Choose the appropriate security level + * Lock your code + +# Understanding How VBA Implements Security + +Macros, dialog boxes, and user forms that you write are computer programs, albeit usually rather small ones. But because macros, like any other computer program, can access the user's hard drive and exploit other features of a computer, macros can do damage. + +Office and the operating systems Vista, Windows 7, and Windows 8 include a variety of security features designed to protect the user from malicious code—macro, virus, Trojan horse, bot, or whatever. But some security features are specific to Office documents and the macros, dialog boxes, and user forms they can contain. + +Scary but true: An evil macro can do its damage _automatically_. 
It's not necessary for the user to deliberately launch a macro from the Macros dialog box or from within the VBA Editor. Some procedures (with certain special names such as Open) automatically launch themselves. For example, if you name one of your procedures Document_Open, all the code within that Sub executes spontaneously when the user opens its host document:
+
+ Private Sub **Document_Open**()
+
+ This can be handy, of course. Perhaps you'll want to write some code in this procedure that automatically sets up your preferred zoom level or completes some other housekeeping task that you always perform when opening any document. But the fact that the user doesn't need to specifically choose to run this macro means that a virus can be put into this procedure. And whammo—your computer is infected.
+
+ Malicious code can enter a user's Office applications via three primary vehicles: macros, ActiveX controls, and add-ins. Microsoft provides users with various approaches to VBA security, including the following:
+
+ * Certain Office document file types that simply cannot contain any embedded macros at all. That's the difference between, for example, saving a file using the Word .docx option, which cannot contain macros, and the .docm file type, which can.
+ * Documents that are loaded from a trusted area on the hard drive.
+ * Trust Center settings the user can specify, such as completely preventing the execution of any ActiveX controls, macros, or add-ins without even notifying or querying the user. Alternatively, the user can be prompted for permission before potentially dangerous code is allowed to execute.
+ * A list of user-modifiable "trusted publishers"—companies whose documents are considered safe.
+ * The ability to digitally sign your own documents or templates, thereby making you a "trusted publisher."
+
+ Office 2007 introduced the concept of two types of documents. For the first time, the user could save documents that simply cannot contain any macros or other potentially malicious code. By default, any new Word document is of the .docx type, not the .docm (macro-enabled) type. In other words, a document must be deliberately created as a macro-enabled document. And because it also must have a .docm filename extension, everybody else (including Word when opening the document) knows that it contains possibly dangerous code. Administrators can use Group Policy to enforce rules concerning which file types are permitted. But the default .docx file type is free of potentially risky executables (files or procedures that can execute).
+
+ Other Office applications also have pairs of macro-disabled, macro-enabled file types. Excel has .xlsx and .xlsm files, and PowerPoint has .pptx and .pptm files.
+
+ Office includes various security tools and features that ordinary users, administrators, and IT professionals can employ to further safeguard Office applications from attack:
+
+ * An Office ActiveX kill bit that allows administrators to forbid certain ActiveX controls from executing.
+ * File-type blocking that can be implemented via Group Policy settings or individually via the Office Trust Center. The types of files that an application can access can be specifically controlled.
+ * A Trusted Documents feature that allows users to specify individual documents as reliable, thereby overriding whatever macro settings the user has enforced in the Trust Center.
+ * A scanning feature that searches for format exploits before a file can be opened by an Office application.
+ * A sandbox named Protected View. A sandbox isolates an executing program so it can't damage other programs, introduce viruses into the operating system, or store nasty surprises on the hard drive. Figure 19.1 shows the warning you get if you're about to open a document from a potentially dangerous source, and the Protected View options. This is similar to starting Windows in Safe Mode. In Protected View, executables are disabled. The protected document is in effect quarantined, so it theoretically can't do any harm to your computer or its contents. I say _theoretically_ because as we all know, no security is perfect. Note that in its description of Protected View in Figure 19.1, Microsoft carefully states that this mode will "help minimize harm"—no claim of invulnerability. All the Protected View options are turned on by default, so files you get from the Internet and Outlook attachments, for example, are automatically tossed into the sandbox when opened.
+
+ Figure 19.1 Suspect sources trigger this security warning when opened in Office applications.
+
+ Also, the user can deliberately choose to open a file in sandbox mode by selecting the file's name in an Office application's Open dialog box, clicking the Open drop-down box in the lower-right corner of the Open dialog, then choosing Open In Protected View.
+
+ Office also provides various under-the-hood features, including password security and encryption, to protect the privacy of user information.
+
+ Doubtless there are additional hardening tactics that Microsoft is not mentioning. After all, why tell the bad people everything that's being done to prevent their incursions?
+
+ * * *
+
+ **Real Security in an Insecure World**
+
+ All the virus-detection software, firewalls, digital signatures, and other security efforts in the world won't protect you or your colleagues if somebody on your network opens email attachments, downloads dodgy executables, or otherwise invites trouble into your environment.
+
+ Even if everybody is aware of the dangers and follows the best security practices, viruses and other troubles can _still_ get in. After all, antivirus applications are always playing catch-up. A new virus is released, and then the antivirus forces identify it and send out a new update.
+
+ On the plus side, currently it's pretty rare to find macros employed as a vehicle for spreading viruses. And, of course, if you're writing the VBA code yourself—as a reader of this book—you can certainly trust the source of _your_ macros. It's you!
+
+ Because threats are constant, and because it's ultimately impossible to guarantee that you will never get a virus (in spite of taking great pains to prevent them), you should ensure that you are taking additional precautions to at least mitigate damage.
+
+ Malicious software falls into two broad categories:
+
+ * Code that attempts to do damage by, for example, erasing files or slowing your computer down so much that it becomes painful to use. The goal here is to create a mess you have to clean up.
+ * Code that attempts to find out your secrets by, for example, stealing your identity to ruin your credit or to drain your bank account. The issue here is violation of your privacy, a different kind of mess you have to clean up.
+
+ If you're concerned about privacy, you should encrypt any documents containing sensitive information. Fortunately, with Office 2007 the formerly weak Office encryption scheme was replaced with a highly secure one.
And Microsoft continues to toughen built-in encryption schemes and has added integrity-checking technologies for encrypted files. PowerPoint, Word, and Excel all permit you to encrypt files and then decrypt them by providing a password. Click File on the Ribbon, then in the Info page click Protect Document and then Encrypt With Password. + +If you're worried about a virus attack, be sure to back up your documents (you should do this anyway, in case of a drive crash, fire, theft, or other havoc). These days, with three-terabyte external drives selling for around $100, it's practical to store your entire computer system (a "system image")—documents, programs, inbox email, everything—as an image on an external drive. That way, you wouldn't even have to reinstall applications in the event of a serious problem. You can use third-party backup systems. Or if you use Windows 7, you can use Windows's built-in backup system by choosing Start ⇒ Control Panel, then clicking Backup And Restore. + +If you use Windows 8 and just want to back up your data files, press Windows key+W and type **Save Backup** to use the new File History utility. If you want to use the traditional Windows backup, it's possible even in Windows 8. To invoke this utility, press Windows key+W and type **Windows 7 File Recovery**. From there you can create an image, a repair disk, or a traditional Windows-style backup. + +* * * + +To secure an application against rogue VBA code, you can use the Office Trust Center to choose the level of security that you want the application to use when running VBA code. Click the File tab, then choose Options. Click the Trust Center button in the left pane, and click the Trust Center Settings button. + +You can also specify which sources to trust and how much to trust them. A trusted source might be someone who works for the same company as you, or someone who has a digital certificate from a third party you trust, such as the VeriSign certification authority. Because you (in this example) trust VeriSign, you therefore trust the third party to whom VeriSign has issued a digital certificate. Office also has a trusted time-stamping feature with the digital signature technology. + +To establish that your own code is fine for the Office applications to trust, you can sign a document or template project that contains customizations or macro project items (code modules, class modules, or user forms) with a digital signature generated by a digital certificate that uniquely identifies you or your company. We'll look at this technique first because it sets the stage for specifying the level of security to use. + +You can also lock a macro project with a password so that nobody can open the code. This both prevents anyone from tinkering with your code and either stopping it from working or rendering it harmful, and protects your intellectual property: If nobody can see your code, they can't steal your ideas. The section "Locking Your Code" shows you how to do this. + +# Signing Your Macro Projects with Digital Signatures + +VBA provides a security mechanism for securing macro projects with digital signatures. The digital signatures provide a means of establishing the provenance of the projects, which should help you decide whether to trust the code. If you trust the source of the code to produce benevolent programming, you can open the project and run the code. 
If you suspect the source or the information of being malignant, you can either avoid opening the project or open the project with the code disabled. + +The same goes for other people: If others are concerned about your macros, you may need to sign your projects so that other people know where they come from and who created them. Once you've signed the projects, the code is available to any application that has specified you as a trusted source for macro projects. (This assumes users have chosen one of the Disable options in the Macro Settings dialog box. You'll see how to set the security level later, in the section "Specifying a Suitable Security Setting.") + +The following sections discuss what digital certificates are, what they mean in practical terms, how you obtain them, and how you use them to create digital signatures. + +* * * + +Trusting a Publisher Is Global for VBA-Enabled Applications + +VBA's security mechanism, and the list of certificates, is shared across the range of VBA-enabled applications on your computer. So if you designate a trusted publisher in one application, all the other applications that support VBA security will trust that source as well. For example, if you open a document that contains code in Word and choose to trust the source of the code, Excel and Outlook also gain that trust and open projects from that source without having to prompt you. + +* * * + +## What Is a Digital Certificate? + +A _digital certificate_ is an encrypted datum that uniquely identifies its holder. Rather like a driver's license, it provides a level of trust that you are who you say you are and that your code can be trusted. + +You use your digital certificate to create a digital signature for a project. This project can be a document project, a template project, or an add-in. The project doesn't have to contain macros, procedures, user forms, classes, or VBA code for you to sign it, although these contents are the usual reason for signing a project. + +A digital signature applies to a whole macro project, typically a document project or a template project. You can't apply a digital signature to just part of a project—say, just to one module of code or to one user form. Each macro project item in that macro project—each module, user form, class, and reference—is covered by the digital certificate. + +But digital signatures, while usually reliable, have sometimes been compromised. + +## Getting a Digital Certificate + +There are three types of digital certificates: those you create yourself ("self-signed"), those you get from your company or organization, and those you get from a commercial certification authority, or certificate authority (CA). + +A digital certificate you create yourself is the weakest form of identification and is of little use to people beyond you and those who use your machine, whereas a certificate from a commercial certification authority should be good enough for general use in the world. Self-signed code will generate a security warning if someone opens a file containing this code. Office applications will not allow this code to run on any but the machine on which the certificate was created. + +A certificate issued by your company falls in the middle range of trustworthiness: In many cases, the company will have obtained the certificate from a commercial certification authority, which means the commercial certification authority has established to its satisfaction that the company is trustworthy. 
Whom the company chooses to trust with the certificate is another matter and introduces another complicating link into the chain of trust. However, server software such as Windows Server includes independent certification-authority services that do not require a certificate from a commercial certification authority, so you should be careful which certificates you trust. See the section "Whose Certificate Is It, and What Does It Mean?" later in this chapter for a discussion of how to discern a certificate's provenance and meaning. + +### Creating a Digital Certificate of Your Own + +The quickest and easiest way of getting a digital certificate is to create one yourself. It's easy, but its usefulness is very limited. Remember that this kind of certification only works on the computer on which the certificate was created. + +To understand how digital certificates work, you'll probably want to create several of your own and practice with them on sample files. By designating some of your files as originating from trusted publishers and leaving others untrusted, you can get a clear idea of how digital certificates work without having to actually mess around with suspect code on your system. + +To open the Create Digital Certificate dialog box (see Figure 19.2), from the Desktop in Windows 8, press the Windows Key and type **digital certificate**. Press Enter when you see Digital Certificate for VBA projects. You'll see the form you can "sign," as shown in Figure 19.2. + +Figure 19.2 You can self-sign a certificate, but Office only permits such certification to be trusted within the computer where the certificate was created. + +If you're using Windows 7, choose Start ⇒ All Programs ⇒ Microsoft Office ⇒ Microsoft Office 2013 Tools ⇒ Digital Certificate For VBA Projects. + +Type the name for the certificate in the text box, and then click the OK button. The SelfCert application creates the certificate and installs it automatically. + +### Getting a Digital Certificate from Your Company + +Your second option is to get a digital certificate from a digital certificate server that your company has. The details of this procedure vary from company to company. The certificates the company provides via its digital certificate server are generated in the same fashion as the digital certificates distributed by the commercial certification authorities discussed in the next section. However, a company distributes the certificates from a pool that it has allocated, without needing to apply to the certification authority for each certificate as it's needed, or creates the certificates of its own accord without getting them from a certification authority. Clearly this isn't all that safe. A rogue employee can _pose_ as trustworthy, obtain a company certificate, and then run totally wild. Totally. + +### Getting a Digital Certificate from a Commercial Certification Authority + +Your third choice is to get a digital certificate from a commercial certification authority such as these: + + * VeriSign (www.verisign.com). This, the most famous code-signing company, is now owned by Symantec. + * Go Daddy (www.godaddy.com) is the new kid on the block. Offers bargain code certification and other security products. + * Thawte, Inc. (www.thawte.com, a VeriSign company). + * GeoTrust (www.geotrust.com, another VeriSign company). + * DigiCert (www.digicert.com). 
+ +VeriSign's computers handle four trillion lookups per day, but the company plans to spend $300 million over the next several years to increase that capacity to four quadrillion. + +Several types of certificate are available, depending on what you want to do. If you're creating and distributing software, you'll probably want to consider one of the certificates targeted at developers. + +The procedure for proving your identity varies depending on the CA and the type of certificate you want. Generally speaking, the greater the degree of trust that the certificate is intended to inspire, the more proof you'll need to supply. For example, you can get a basic certificate on the strength of nothing more than a verifiable email address, but this type of certificate is unlikely to make people trust you. Other certificate types require you to appear in person before a registration authority with full documentation (such as a passport, driver's license, or other identity documents). Such certificates obviously inspire more trust. + +### Installing a Digital Certificate + +Once you have a digital certificate, you need to install it so that Windows and the applications that will use it know where it's located. + +To install a digital certificate, follow these steps (you must be logged on as Administrator to view the Certificates dialog box): + +1. In Windows 8, from the Desktop, press the Windows Key and type **certmgr.msc**. + +* * * + +Self-Certifications Are Automatically Registered + +The Office SelfCert program automatically registers the certificates it creates on the computer on which it creates them. If you created a digital certificate for yourself, you shouldn't need to install it on the same computer. If you want to practice installing it, you'll need to use a different computer. + +* * * + +In Windows 7, click the Start button. A Search Programs And Files field opens just above the Start button. In the Search Programs And Files field, type **certmgr.msc**. + +**2.** When certmgr.msc appears in the Programs list, click it. You'll possibly be asked if you want to give yourself permission to take this step. Unless you are not you, go ahead and grant the permission by clicking the Continue button. (From this point on, Windows 7 will take a different path and display different dialogs than those shown here.) + +You now see the Certificates dialog box shown in Figure 19.3. + +Figure 19.3 Windows provides the Certificates dialog box to manage digital certificates. + +As you can see in Figure 19.3, I, identifying myself as an entity named _TotallyTrustworthy_ , granted code-signing certification to myself, also TotallyTrustworthy, as described earlier in this chapter in the section "Creating a Digital Certificate of Your Own." + +**3.** Click the Trusted Publishers folder in the left pane of the Certificates dialog box. + +**4.** Choose Action ⇒ All Tasks ⇒ Import from the Certificates dialog box's menu. The Certificate Import Wizard opens, as shown in Figure 19.4. + +Figure 19.4 Windows includes the Certificate Import Wizard to manage digital certificates. + +**5.** Click the Next button in the wizard to locate the file you want to import. You can search your hard drive for filenames ending in .cer or .crt. + +**6.** Click Next to display the Certificate Store page of the wizard, shown in Figure 19.5. + +Figure 19.5 On the Certificate Store page of the Certificate Import Wizard, choose the certificate store in which to store the certificate you're importing. + +7. 
Choose how to store the certificate: + + * To have Windows store each certificate automatically in the default certificate store for the certificate's type, select the Automatically Select The Certificate Store Based On The Type Of Certificate option button. + * To control where Windows stores the certificates, select the Place All Certificates In The Following Store option button. To specify the store, click the Browse button to display the Select Certificate Store dialog box, shown in Figure 19.6. Choose the certificate store (for example, Personal) and click the OK button. To specify a particular location within a certificate store, select the Show Physical Stores check box, and then click the plus (+) sign next to the store in question to display its subfolders. Select the folder you want, and then click the OK button. + +Figure 19.6 Use the Select Certificate Store dialog box to specify the certificate store in which you want to keep the certificate. The screen on the left shows the categories of stores; the screen on the right shows the physical stores. + +8. Click the Next button to finish setting up the import procedure. The Completing The Certificate Import Wizard dialog box is displayed to confirm the choices you've made. + +9. Review your choices, and then click the Finish button. The Certificate Import Wizard imports the certificate and then confirms that the operation was successful. + +Now that you've imported the certificate, it appears in the Certificates dialog box on the appropriate page. + +### Exporting a Digital Certificate + +You may need to export a certificate for backup so that you can keep it safely on removable media away from your computer or so that you can install it on another computer. For security, you should not store the digital certificate on your hard drive after you install it, because storing it there is an unnecessary security risk. + +To export a certificate, right-click it in the Certificates dialog box, then choose All Tasks ⇒ Export. Windows starts the Certificate Export Wizard, which walks you through the process of exporting the certificate. If you choose to export the private key with the certificate, be sure to protect it with a password. + +### Removing a Digital Certificate + +To remove a digital certificate from Windows's digital certificate store, follow these steps: + +1. Display the Certificates dialog box (follow steps 1 and 2 in the section earlier in this chapter on installing a certificate). + +2. Click the folder in the left pane that contains the digital certificate in question, and then select the certificate you want to remove. + +3. Click the red X icon, or choose Action ⇒ Delete. Windows displays a dialog box warning you of the consequences of deleting the digital certificate and asking you to confirm the deletion. Figure 19.7 shows the warning you get when removing a certification authority certificate (top) or a personal certificate (bottom). Click the Yes button to delete the certificate. + +Figure 19.7 Two of the warnings the Certificate Manager displays when you're about to remove a digital certificate + +### Signing a Macro Project with a Digital Signature + +Once you've completed a macro project and have it ready for distribution, you sign it with a digital signature so that applications that use a high level of security can use it. + +To sign a macro project digitally, follow these steps: + +1. In the VBA Editor, navigate to the document or template project that contains the macro project you want to sign. + +2. 
Select the project in the Project Explorer.

3. Choose Tools ⇒ Digital Signature to display the Digital Signature dialog box (see Figure 19.8).

If the Digital Signature dialog box lists the certificate you want in the Sign As area, simply click the OK button to use that certificate.

Figure 19.8 Use the Digital Signature dialog box to specify the digital signature for a macro project.

4. Click the Choose button. If you have more than one certificate, you'll see a Select Certificate dialog box. (If you have only one certificate, you'll see the Windows Security dialog box where you can confirm your choice, as shown in Figure 19.9. You should then skip to step 7.)

Figure 19.9 Use this Windows Security dialog box to confirm your choice of certificate with which to sign the macro project.

5. Click the certificate you want to use for the macro project.

6. Click the OK button to apply the selected certificate and close the Select Certificate dialog box.

7. Click the OK button to close the Digital Signature dialog box.

8. Click the Save button on the Standard toolbar, press Ctrl+S, or choose File ⇒ Save to save the document or template project with the digital signature applied to it.

### Removing a Digital Signature from a Macro Project

To remove a digital signature from a macro project, follow these steps:

1. In the VBA Editor, navigate to the document or template project that contains the macro project.

2. Select the project in the Project Explorer.

3. Choose Tools ⇒ Digital Signature to display the Digital Signature dialog box.

4. Click the Remove button. Both the Certificate Name readout in the area labeled The VBA Project Is Currently Signed As and the Certificate Name in the Sign As area of the Digital Signature dialog box will display [No Certificate] to indicate that the project no longer has a digital certificate assigned to it.

5. Click the OK button to close the Digital Signature dialog box.

You can always reapply the digital signature to the project whenever you wish, as described earlier in this chapter.

### Whose Certificate Is It, and What Does It Mean?

When you receive a digitally signed project, you'll probably want to find out just who has signed it and just what type of digital certificate they used. To view the details of a digital certificate, follow these steps:

1. In the VBA Editor, navigate to the document or template project that contains the macro project.

2. Select the project in the Project Explorer.

3. Choose Tools ⇒ Digital Signature to display the Digital Signature dialog box.

4. For an official (VeriSign or other) certification, click the Details button to see information about the source.

If you want to view the details of one of your own, dodgy, _self-signed_ certificates, click the Choose button in the Digital Signature dialog box, and click the Click Here To View Certificate Properties link to display the Certificate Details dialog box shown in Figure 19.10.

Figure 19.10 Use the Certificate Details dialog box to examine the properties of a certificate.

By examining Figure 19.10 close up, you'll see the Official Certificate icon with the Gold Seal and the Blue Ribbon (they inspire trust), but there is, alas, also a _Red X_ symbol! Chilling. This X means that the project in question _cannot be trusted whatsoever_.
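* * *

Checking for a Signature from VBA

Incidentally, you can also find out from code whether a document's macro project has been signed. Here's a minimal sketch using the Document object's read-only VBASigned property; note that it reports only that a signature is present, not whether the signer should be trusted:

    Sub CheckSignature()
        ' VBASigned returns True if the document's VBA project
        ' carries a digital signature.
        If ActiveDocument.VBASigned Then
            MsgBox "The VBA project in this document is digitally signed."
        Else
            MsgBox "The VBA project in this document is not signed."
        End If
    End Sub

* * *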
The Certificate Details dialog box has three pages:

 * The General page displays basic information about the certificate: for what purpose the certificate is intended, to whom it's issued, by whom it's issued, and the period for which it's valid.
 * The Details page of the Certificate Details dialog box, shown in Figure 19.11, contains specifics about the certificate. Click one of the fields in the list box to display its value in the text box below.
 * The Certification Path page of the Certificate Details dialog box shows the path by which the certificate has been issued from the issuing authority to the current holder. To check one of the links in the chain, select it in the Certification Path list box and click the View Certificate button (if it's available). You'll see the Certificate Details dialog box for the certificate in question. You can then follow the certification path for that certificate if you choose or click the OK button to dismiss the second (or subsequent) Certificate Details dialog box and return to the previous one.

Figure 19.11 The Details page of the Certificate Details dialog box contains a host of details about the certificate.

# Choosing a Suitable Level of Security

To use VBA macros safely, you or a user of your code must open the Office Trust Center and choose a suitable level of security—high enough to avoid the threats posed by malicious or incompetent code but low enough that it doesn't prevent you from running useful, safe code.

## Understanding the Security Threats Posed by VBA

The VBA macro language is formidable. It can accomplish sophisticated and valuable tasks. But its capabilities also pose a threat when misused. Using relatively simple VBA commands, you can create files, delete files, manipulate existing data, and even control other applications.

Also, code developed with the best of intentions can damage a computer when run under unsuitable circumstances. For example, a procedure might delete valuable data or delete critical files, making the computer crash. Such unintentional damage happens frequently enough, but what tends to make the headlines is damage caused intentionally by malicious code in macro viruses and other malicious software (or _malware_).

A _macro virus_ is simply a computer virus written in a macro language such as VBA.

## Protecting against Macro Viruses

Protecting your computer (and computers connected to it in a network) against macro viruses requires three main steps:

1. Install and run antivirus software, such as Malwarebytes (www.malwarebytes.org/) on your computer. And use the Windows Defender that's built into Windows. Update the antivirus software frequently and regularly with the latest virus definitions. (Most antivirus software offers automatic updating.)

2. Configure suitable security settings in the applications you use, especially in those applications that host VBA or other programming languages or scripting languages. For example, configure VBA security settings as described in the next section.

3. Be careful when opening any file that might contain code or an email attachment. Most modern applications warn you when there might be a problem with a file. Many macro viruses attempt to outwit such warnings by _social engineering_—conning the user—rather than by sophisticated programming.

For example, a macro virus may transmit itself as an email attachment to all the addresses in a friend's email application.
The message and attachment suggest that the contents of the attachment are interesting or amusing—for example, jokes or compromising pictures. Because the file comes from a friend, someone known and trusted, and because the contents seem compelling, many users will open the file and ignore any security warnings. The action of simply opening the file can cause the code within the file to execute. Similarly, simply opening a Word .docm file can execute a macro. And by then it could be too late. Creepy code robots could be multiplying exponentially throughout your system.

## Specifying a Suitable Security Setting

First, set a suitable level of security for your purposes. To open the Options dialog box in Access, Word, Excel, or PowerPoint, click the File tab, then choose Options. Click the Trust Center button in the left pane. Then click the Trust Center Settings button, and click Macro Settings (see Figure 19.12).

Figure 19.12 On the Macro Settings page of the Trust Center dialog box, choose the level of security you want to use when running macros.

The various macro security settings are self-explanatory. However, if you are working in documents that you've created yourself and saved as the .docm type, and you've written your own macros, you can temporarily choose the Enable All Macros option. At least while you're practicing with the examples in this book, you can trust your own documents. However, if you are opening macro-enabled document files (.docm or the other files from PowerPoint or Excel with an m appended to the filename extension), you should specify a less risky setting in your Trust Center macro settings.

There's an easier way to deal with this problem, though. You can alternatively (and more safely) employ one of the disable options shown in Figure 19.12, but while doing development work with VBA (such as experimenting with the code in this book), just ensure that you save your .docm documents in one of the trusted locations. You can see the list of trusted locations by clicking the Trusted Locations button shown in the left pane in Figure 19.12.

If you choose the Disable All Macros Except Digitally Signed Macros option, any unsigned macros in your documents won't work. They are blocked from executing. However, you can get them to work again by simply moving the document files to a trusted location.

## Additional Trust Center Settings

Microsoft is currently encouraging (by the pricing structure if nothing else) its Office customers to move from a one-purchase, disk-based Office installation to a downloaded, pay-yearly subscription model called Office 365.

What's more, there are seven versions of Office 365, each with its own variations on security features, such as whether or not it supports Group Policy settings. To see the variations, visit this page:

 

Notice also a security feature listed in the left pane in Figure 19.12 that is new in Office 2013: Trusted App Catalogs.

Open the Trusted Application Catalogs page in the Trust Center dialog box and you'll see the options illustrated in Figure 19.13.

Figure 19.13 On this page of the Trust Center dialog box, choose whether you want to trust app catalogs.

* * *

The New Office Apps

What is an Office app? Microsoft describes the new apps like this: "An app for Office is a region inside an Office application that contains a web page that can interact with the document to augment content and provide new interactive content types and functionality.
apps [sic] for Office can be obtained by users from the new Office marketplace or from a private catalog in the form of stand-alone apps or subcomponents of a document template solution, or a SharePoint application." In other words, an online image-search tool or grammar checker could be embedded in Word as a command-bar pane, like Word's own built-in Navigation or Thesaurus command bars. + +At the time of this writing, the kinks have not yet been ironed out of all apps for Office, but if you're interested, you can try some free apps that are available in the Office Store. Click the Insert tab on the Ribbon, then click Apps For Office. Click See All. Click the _Find more apps at the Office Store_ link. Your browser opens showing various apps you can add to whatever Office application you happen to be working in currently. + +The apps for Office technology is, to be polite, still being refined at this time. For a sad example, try adding the Merriam-Webster dictionary to Word. Word 2013 has no built-in dictionary, presumably to encourage you to use an app instead. But this dictionary may not be the best. It doesn't have an entry for _normally_ , for example. Worse, if you look up _normal_ , the first definition given is _perpendicular_. A superior dictionary app for Office is the Bing dictionary, found here: + +http://office.microsoft.com/en-us/store/results.aspx?vtags=Reference&av=zwd150 + +Alas, the person who wrote the interface for the Bing dictionary seems to think that the adjective _lookup_ is interchangeable with the verb _look up_. But don't blame Bing. On the plus side, the Bing dictionary does have a good definition of the word _naturally_. + +A key feature of the new Office apps is that they cannot be written in VBA. You must use "web technologies like HTML5, XML, CSS3, JavaScript, and REST APIs" instead. + +* * * + +When you trust a catalog of Office apps, you're telling Office that it can stop notifying you or otherwise blocking executable content (such as macros or ActiveX controls) from this source. Thus you can override on a case-by-case basis the macro and other security settings that have been specified (see Figure 19.12 and Figure 19.13). + +### File Block Settings + +The File Block Settings page, shown in Figure 19.14, gives you the ability to block individual file types from opening or to open them in Protected View. Here you can also specify which types of files can be saved. Notice at the bottom of this page that you specify what choosing the Open option means: + + * Do Not Open Selected File Types means documents are totally blocked. + * Open Selected File Types In Protected View means you can open documents in the sandbox for reading only. + * Open Selected File Types In Protected View And Allow Editing means you can open documents in the sandbox for editing. + +Figure 19.14 File Block Settings specify what types of documents you want blocked or sandboxed. + +If you want to delve more deeply into Office 2013 security features, take a look at these web pages and the links therein: + + + + + +* * * + +Can Even a Simple .txt File Harbor a Virus? + +You might wonder why the _Plain Text Files_ option is included in the File Block Settings page shown in Figure 19.14. It would seem that a simple Notepad .txt file couldn't contain any dangerous executable code (any more than a stop sign could fire a bullet at you). After all, text is just words, right? + +Nope. Even opening simple .txt files can install a virus. How? The bad guys use trick filename extensions. 
Even though it says .txt, it might only be masquerading as a text file. Executable files (programs or viruses) usually have a .exe filename extension, but by default Windows _hides_ filename extensions. So you see Word, not Word.exe in Windows Explorer. Also, Windows files can be named with multiple extensions. So, you can have a dangerous file named OpenMe.txt.exe, but thanks to Windows's default extension-hiding, the filename that you actually see in this case is OpenMe.txt. You go ahead and double-click it thinking it will open in Notepad like most .txt files. Your hard drive explodes. Well, maybe not a detonation, but all your files could be wiped or there could be some other nasty virus surprise. OpenMe.txt was merely _posing_ as a .txt file, and inside was a monster. + +* * * + +# Locking Your Code + +To prevent anyone from viewing the contents of a macro project, you can lock it with a password. You'll usually want to do this before distributing a project to your colleagues. If your workplace is particularly volatile, you might even want to lock projects while they are merely under development on your own desktop. The argument against locking a project on which you're still actively working is that the lock adds a step to accessing the modules and forms in the project—but if you need the security, it's well worth the small amount of effort involved. + +Follow these steps to lock a document or template project: + +1. Press Alt+F11 to display the VBA Editor. + +2. In the Project Explorer, right-click the project that you want to lock, and choose Project Properties from the context menu to display the Project Properties dialog box. Alternatively, select the project in the Project Explorer and choose Tools ⇒ Project Properties. + +3. Click the Protection tab to display the Protection page (see Figure 19.15). + +Figure 19.15 Use the Protection page of the Project Properties dialog box to lock the project. + +4. Select the Lock Project For Viewing check box in the Lock Project group box. + +5. In the Password To View Project Properties group box, type a password in the Password text box and the same password in the Confirm Password text box. Setting a password is compulsory: You can't lock a project without specifying a password. Without a password, how could you unlock it? + +6. Click the OK button to apply the locking to the project. The VBA Editor closes the Project Properties dialog box but leaves the contents of the project open for you to view and work with. + +7. Switch back to the application, save your work, and close the application. + +Once you've done that, the project is locked and can't be viewed or edited without the password. When you choose to edit a procedure in the project from the application or try to expand the project in the Project Explorer in the VBA Editor, the Project Password dialog box appears, as shown in Figure 19.16 (unless you have macros disabled in the Trust Center settings). + +Figure 19.16 When you open a locked project, you need to enter the password for the project in this Project Password dialog box. + +Type the password in the Password text box and click the OK button to display the contents of the project. (If you enter the wrong password, the application or the VBA Editor displays a Project Locked message box followed by the Project Password dialog box for you to try again.) 
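* * *

Checking Whether a Project Is Locked from VBA

The VBE object model doesn't expose a supported way to apply or remove the lock itself from code, but you can at least detect it. The following is a minimal sketch (not one of the locking steps above) that reads the Protection property of a document's VBProject object; it assumes you have selected Trust Access To The VBA Project Object Model in the Trust Center, because otherwise the VBProject property raises an error:

    Sub IsProjectLocked()
        ' Protection returns 1 (vbext_pp_locked in the VBA
        ' Extensibility library) for a locked project and
        ' 0 (vbext_pp_none) for an unlocked one.
        If ActiveDocument.VBProject.Protection = 1 Then
            MsgBox "This project is locked for viewing."
        Else
            MsgBox "This project is not locked."
        End If
    End Sub

* * *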
+ +To unlock a project, open it in the VBA Editor (supplying the password), display the VBA Project Properties dialog box (by right-clicking the project's name in the Project Explorer, then choosing the Project Properties option from the context menu), clear the Lock Project For Viewing check box on the Protection page, and click the OK button. Save the file that contains the project. + +# The Bottom Line + +**Understand how VBA implements security.** + +Microsoft takes a multipronged approach to protecting users from malicious VBA code embedded in documents and capable of launching itself when the user simply opens the document. + +Master It + +Name two ways that users are protected from malicious VBA code. + +**Sign a macro project with a digital signature.** + +You can add a digital signature to your projects by creating your own certification, getting it from your company, or getting it from certification authorities such as VeriSign. + +Master It + +Describe the limitations of certifying a VBA macro project for yourself—without obtaining a certificate from your company or a commercial certification authority. + +**Get a digital certificate.** + +Commercial certification authorities provide the greatest level of security, but their certification is also more difficult to attain than self-certification or certification from your company. + +Master It + +Name some of the ways you may be required to prove your identity when obtaining a digital signature from a commercial certification authority. + +**Choose the appropriate security level.** + +When choosing the right security level to use VBA macros safely, you or a user of your code must achieve a balance. The security level must be set high enough to avoid malicious or incompetent code but low enough that it doesn't prevent you from running useful, safe code. + +Master It + +To set a suitable level of security for your purposes, open the Trust Center in Access, Word, Excel, or PowerPoint. You'll see four settings. Which one of the following five settings is _not_ available: + + * Disable All Macros Without Notification + * Disable All Macros With Notification + * Disable All Macros Except Digitally Signed Macros + * Enable All Macros With Notification + * Enable All Macros + +**Lock your code.** + +You can protect your source code in the VBA Editor from others. You add a password to a project (projects are in boldface in the Project Explorer), and others can't open your VBA procedures for reading or modifying. + +Master It + +What is the one drawback to locking your code? 
+Part 6 + +Programming the Office Applications + + * **Chapter 20: Understanding the Word Object Model and Key Objects** + * **Chapter 21: Working with Widely Used Objects in Word** + * **Chapter 22: Understanding the Excel Object Model and Key Objects** + * **Chapter 23: Working with Widely Used Objects in Excel** + * **Chapter 24: Understanding the PowerPoint Object Model and Key Objects** + * **Chapter 25: Working with Shapes and Running Slide Shows** + * **Chapter 26: Understanding the Outlook Object Model and Key Objects** + * **Chapter 27: Working with Events in Outlook** + * **Chapter 28: Understanding the Access Object Model and Key Objects** + * **Chapter 29: Manipulating the Data in an Access Database via VBA** + * **Chapter 30: Accessing One Application from Another Application** + * **Chapter 31: Programming the Office 2013 Ribbon** + +Chapter 20 + +Understanding the Word Object Model and Key Objects + +In this chapter you'll become familiar with the Word object model and the architecture underlying Word. You'll see how to perform common tasks with the most frequently useful Word objects. These objects include the Documents collection and the Document object, the Selection object, Range objects, and the Options object. + +In this chapter you will learn to do the following: + + * Understand the Word object model + * Understand Word's creatable objects + * Work with the Documents collection and the Document object + * Work with the Selection object + * Create and use ranges + * Manipulate options + +# Examining the Word Object Model + +You don't need to understand how the entire Word object model fits together in order to work with VBA in Word, but most people find having a general idea of the components and structure of the object model helpful. Some VBA programming involves managing objects, and for this the Help system's code examples are often invaluable. To see Word's object model reference, follow these steps: + +1. Launch or activate Word, and then press Alt+F11 to launch or activate the VBA Editor. + +2. Choose Help ⇒ Microsoft Visual Basic For Applications Help. (Pressing F1 is not an alternative, alas; it currently takes you to an entirely different page.) You should now see a web page similar to the one shown in Figure 20.1 (this figure shows a part of the web page). If you don't see this web page, type this URL into your browser's address field: + + + +Figure 20.1 A Word Help website (partial view) + +3. Click the _Welcome to the Word 2013 developer reference_ link on the left side of the web page (see the pointing hand icon in Figure 20.1). + +You now see the page shown in Figure 20.2. + +Figure 20.2 Drilling down in the Word Help site (partial view) + +4. Now click the link named _Object model reference_ ( _Word 2013 developer reference_ ), as shown in Figure 20.2 with the pointing hand cursor. + +You now see the Object Model Reference, shown partially in Figure 20.3. + +Figure 20.3 The entries in the Word Object Model Reference will help you write your own VBA code. + +* * * + +Help When Migrating Legacy Code from Earlier Office Projects + +If you've inherited VBA code written in earlier versions of Office, those procedures might contain objects, methods, and properties that have been changed in Office 2013. Though modifications to object models are generally few, some incompatibilities can crop up and "break" the code so it won't run correctly. 
Fortunately, you can download a free utility, the Office Code Compatibility Inspector, that will flag objects and their members that have changed. It does a text comparison of the Office 2013 object model against VBA code written in earlier versions of Office. You can download the Compatibility Inspector from this web page:

www.microsoft.com/en-us/download/details.aspx?id=15001

* * *

* * *

A Shortcut: Understanding Creatable Objects

Like most VBA-enabled applications, Word has a number of _creatable objects_. This merely means that you don't have to employ the full qualification. In other words, you don't need to mention the Application object in your code. For example, the Documents collection object is creatable, so you can omit its parent, the Application object, when using the collection in code, like this:

    Dim x As Integer
    x = Documents.Count
    MsgBox x

The Application object is simply understood, for the same reason that you don't have to add Planet Earth when addressing an envelope. The post office assumes that Mother Earth is the parent—the context—of all addresses.

However, you can, if you wish, use the longer ("fully qualified") version:

    x = Application.Documents.Count

Both versions have the same effect.

The following are typically the most useful of these creatable objects:

 * The ActiveDocument object returns a Document object that represents the active document.
 * The ActiveWindow object returns a Window object that represents the active window.
 * The Documents collection contains the Document objects, each of which represents an open document.
 * The Options object represents Word options and document options, including most of the options that appear in the Options dialog box.
 * The Selection object represents the selection in the active document. Selection represents the selection (containing text or other objects) or collapsed selection (containing nothing—merely the blinking insertion point) in the document.
 * The Windows collection contains the Window objects that represent all open windows.

* * *

The following sections show you how to work with some of the most useful Word objects, starting with the Documents collection and the Document object. You'll see how to use the ActiveWindow object and the Windows collection in the next chapter.

# Working with the Documents Collection and the Document Object

In many of your Word procedures, you'll likely work with documents: creating new documents, saving documents, opening existing documents, closing documents, and printing documents. To do so, you work with the Documents collection, which contains a Document object for each open document in Word.

## Creating a Document

To create a new file, use the Add method of the Documents collection. The syntax is as follows:

    _expression_.Add Template, NewTemplate, DocumentType, Visible

Here, _expression_ is a required expression that returns a Documents collection. Typically, you'll want to use the Documents collection itself (**Documents**.Add).

Template is an optional Variant argument that specifies the template on which to base the new document. If you omit Template, Word uses the Normal template (this process is the same as if you'd clicked the File tab on the Ribbon, then clicked the New button to open a blank document). So you need to specify a Template argument only when you want to base the new document on a template other than the default Normal.dotm.
NewTemplate is an optional Variant argument that you can set to True to create a template file (.dotx) rather than a document. NewTemplate is set to False by default, so you can safely omit this argument unless you're creating a template.

DocumentType is an optional Variant argument that you can use to specify the type of document to create: wdNewBlankDocument (the default), wdNewEmailMessage, wdNewFrameset (for a frameset), or wdNewWebPage.

Visible is an optional Variant argument that you can set to False to have the document created in a window that isn't visible. The default setting is True, making the document window visible.

There are two ways to create a document:

**Creating a document based on Normal.dotm**

The following statement creates a new document based on the **Normal.dotm** global template:

    Documents.Add

**Creating a document based on a template**

The following statement creates a new document based on the template named **Company Report.dotm** stored in the network folder designated **\\\server\public\templates** :

    Documents.Add Template:="\\server\public\templates\Company Report.dotm"

## Creating a Template

The following statements declare a new object variable of the Document class named myTemplate, create a new template based on the template named Fancy.dotx, and assign it to myTemplate:

    Dim myTemplate As Document

    Set myTemplate = Documents.Add(Template:="c:\program files (x86)\Microsoft Office\Office14\1033\Quickstyles\fancy.dotx", _
        NewTemplate:=True, Visible:=True)

In this example, the file path (c:\program files\ _and so on_ ) to the template is specified because this template is not in one of the default template folders. The result is a new .dotx file, based on the Fancy.dotx template.

* * *

Changing the Default File Locations

Word has two templates folders: the user templates folder and the workgroup templates folder. You can change the locations of these folders by clicking the File tab on the Ribbon, then clicking the Options button to open the Word Options dialog box. Then click the Advanced button in the left pane. Scroll all the way down in the General Options section, and click the File Locations button you see at the bottom. Then click to select the default folder you want to change and click the Modify button.

* * *

### Saving a Document

Just as when a user is saving a newly created document via the keyboard and mouse, when executing VBA code you must specify a filename and path the first time you save a new document. After that, you can save it under the same name or specify a different name or format. This is the difference between the Save and Save As options.

### Saving a File for the First Time or as a Different File

To save a file for the first time, or to save a file under a different name or in a different format, use the SaveAs2 method. The syntax is as follows:

    _expression_.SaveAs2(FileName, FileFormat, LockComments, Password, AddToRecentFiles, WritePassword, ReadOnlyRecommended, EmbedTrueTypeFonts, SaveNativePictureFormat, SaveFormsData, SaveAsAOCELetter, Encoding, InsertLineBreaks, AllowSubstitutions, LineEnding, AddBiDiMarks, CompatibilityMode)

With Word 2010, the traditional SaveAs command was replaced by the SaveAs2 command, which is identical except for the addition of a CompatibilityMode argument. Documents can be saved five different ways with respect to their compatibility with earlier versions of Word.
Based on how you set the CompatibilityMode argument, Word saves your document like this:

 * 0 is the default if you don't specify any of the other CompatibilityMode options in this list. The document will be saved using whatever compatibility mode is currently used by this document.
 * wdCurrent is a compatibility mode equivalent to the latest version of Microsoft Word.
 * wdWord2003 is a mode that's compatible with Word 2003. Any features new in Word 2013 are disabled.
 * wdWord2007 is essentially the same as 2003 mode, but features compatible with the 2007 version of Word are enabled.
 * wdWord2010 is the mode where the Word 2010 features are enabled.

The traditional SaveAs command will still work, but the Editor has a tendency to automatically replace it with SaveAs2. Spooky, true, but no real harm done.

In the syntax, _expression_ is an expression that returns a Document object. For example, you might use the ActiveDocument object or an object in the Documents collection.

FileName is an optional Variant argument that specifies the name for the document. If you omit FileName, VBA uses the current folder and the default filename of Doc _n_.docx (or .docm) for a document and Dot _n_.dotx (or .dotm) for a template, where _n_ is the next available number (for example, Doc5.docx for a macro-free document or Dot2.dotm for a macro-enabled template).

* * *

Avoid Accidentally Overwriting a File

When writing code that saves a document, you should first check whether a document with this name and location already exists. If you don't check, VBA overwrites an existing file without warning, potentially causing data loss.

* * *

FileFormat is an optional Variant argument that specifies the format in which to save the document. Table 20.1 lists the wdSaveFormat constants for specifying commonly used formats.
Table 20.1 WdSaveFormat constants

**Constant** | **Saves Document As**
---|---
wdFormatDocument | A Word document
wdFormatDocument97 | The Word version 97 document format
wdFormatDocumentDefault | The Word document default (the docx file type)
wdFormatDOSText | A DOS text file (the pre-Windows OS)
wdFormatDOSTextLineBreaks | A DOS text file with carriage returns
wdFormatEncodedText | A text file with encoding
wdFormatFilteredHTML | A filtered HTML file (Word 2003 and XP only)
wdFormatFlatXML | An unindexed XML document
wdFormatFlatXMLMacroEnabled | An unindexed XML document with macro capability
wdFormatFlatXMLTemplate | An unindexed XML template
wdFormatFlatXMLTemplateMacroEnabled | An unindexed XML template with macro capability
wdFormatHTML | An HTML file
wdFormatOpenDocumentText | An XML file format developed by Sun Microsystems
wdFormatPDF | Adobe's Portable Document Format
wdFormatRTF | A Rich Text format file
wdFormatStrictOpenXMLDocument | An XML document standard promoted for several years by Microsoft
wdFormatTemplate | A Word template
wdFormatTemplate97 | The Word version 97 template format
wdFormatText | A text file (plain ASCII)
wdFormatTextLineBreaks | A text file with carriage returns
wdFormatUnicodeText | A text file with Unicode characters
wdFormatWebArchive | A web archive file
wdFormatXML | An XML file (Word 2003 only)
wdFormatXMLDocument | XML document format
wdFormatXMLDocumentMacroEnabled | XML document format with macros enabled
wdFormatXMLTemplate | XML template format
wdFormatXMLTemplateMacroEnabled | XML template format with macros enabled
wdFormatXPS | XPS format

* * *

A Quick Way to See Objects and Their Constants

If you're writing code and you want to quickly see a list of constants, such as the WdSaveFormat constants shown in Table 20.1, just press F2 to open the Object Browser in the Editor. Then type **wdsaveformat** in the Object Browser's search field and press Enter. You'll see the complete list of constants as shown in the illustration.

* * *

As an example of how to use one of these constants, the following statement saves the active document as a filtered HTML file under the name Example.html in the current folder:

    ActiveDocument.SaveAs2 FileName:="Example.html", _
        FileFormat:=wdFormatFilteredHTML

After you run this example code, use Windows Explorer to locate this new Example.html file and click on it. It will open in Internet Explorer as if it were a web page, because it's stored using the HTML format (if Internet Explorer is the default application in which your machine opens .html files). Or take a look at it in Notepad if you want to see the full horror of HTML markup.

* * *

Save Documents Using File Converters

In addition to the wdSaveFormat constants described in Table 20.1, you can save documents in other formats for which you have file converters installed by specifying the appropriate value for the SaveFormat property of the FileConverter object. For example:

    ActiveDocument.SaveAs2 FileFormat:=FileConverters(15).SaveFormat

See the FileConverters property entry in the VBA Help file for more information.

* * *

AddToRecentFiles is an optional Variant argument that you can set to True to have Word add the document to the list of recently used files displayed when you click the File tab on the Ribbon and then click Recent.
(Often, when experimenting with documents in procedures, you'll want to avoid listing them on the Most Recently Used list, leaving the user's previous list of recent files undisturbed.) + +To protect the document as you save it, you can use four different protection features: + + * LockComments is an optional Variant argument that you can set to True to lock the document so that reviewers can enter comments but can't change the text of the document. + * Password is an optional Variant argument that you can use to set a password required before opening the document. + * WritePassword is an optional Variant argument that you can use to set a password required before saving changes to the document. + * ReadOnlyRecommended is an optional Variant argument that you can set to True to have Word recommend that the user open the document as read-only. + +Finally, there are the following optional arguments you'll use infrequently, if ever: + + * EmbedTrueTypeFonts is an optional Variant argument that you can set to True to save TrueType fonts with the document. (This is a good idea only if you're distributing the document to someone you know doesn't have the TrueType fonts installed to view the document correctly.) + * SaveNativePictureFormat is an optional Variant argument that you can set to True to have graphics imported from another platform saved as Windows graphics. + * SaveFormsData is an optional Variant argument that you can set to True to save the data entered in a form as a data record (as opposed to saving the whole form, including its static text). + * SaveAsAOCELetter is an optional Variant argument that you can set to True to save the document as an AOCE (Apple Open Collaboration Environment) letter (a mailing format for routing documents). + * Encoding is an optional Variant argument for using a different code page than the system code page. For example, you might need to save a document using a Cyrillic code page. + * InsertLineBreaks is an optional Variant argument that you can set to True when saving a document as a text file to make Word insert a line break at the end of each line of text. + * AllowSubstitutions is an optional Variant argument that you can set to True when saving a document as a text file to make Word substitute some symbol characters with similar text. For example, Word substitutes (TM) for a trademark symbol (™). + * LineEnding is an optional Variant argument that you can use when saving a document as a text file to control how Word marks line breaks and paragraph breaks. + * AddBiDiMarks is an optional Variant argument that you can set to True to make Word add control characters to the file to maintain bidirectional layout. + +Usually, when saving a file for the first time, you'll need to specify only its name and path; if you want to save it in a format other than a Word document, specify that too. The following statement saves the active document under the name Beehives.docx in the folder \\\server\Products\Field\: + + ActiveDocument.SaveAs2 _ + "\\server\Products\Field\Beehives.docx" + +### Saving a Document That Has Already Been Saved + +After a document has been first saved, you can save it in the future under the same name by using the Save method. For a Document object, the Save method takes no arguments (all the document's current formats are saved unchanged). 
For example, the following statement saves the document named Guns01.docx:

    Documents("Guns01.docx").Save

### Saving All Open Documents

To save all open documents, use the Save method with the Documents collection. The syntax is as follows:

    _expression_.Save(NoPrompt, OriginalFormat)

Here, _expression_ is an expression that returns a Documents collection. Often, you'll use the Documents collection itself.

NoPrompt is an optional Variant argument that you can set to True to make Word save all open documents containing unsaved changes and any attached templates containing unsaved changes without prompting the user. The default setting is False, which causes Word to prompt the user whether to save each document and template. Even if you set NoPrompt to True, Word will prompt the user to save changes to Normal.dotm if the Prompt Before Saving Normal Template check box is selected in the Save section of the Advanced tab of the Options dialog box.

OriginalFormat is an optional Variant argument that you can set to wdOriginalDocumentFormat to save the documents in their original formats, wdWordDocument to force each document to be saved as a Word document, or wdPromptUser to prompt the user about which format to use.

For example, the following statement saves all open documents and templates without prompting the user:

    Documents.Save NoPrompt:=True

### Checking Whether a Document Contains Unsaved Changes

To find out whether a document contains unsaved changes, check its Saved property. Saved is a read/write Boolean property that returns False if the document contains unsaved changes and True if it does not. A new document contains no unsaved changes, even though it has never been saved.

* * *

The Dangers of Cloud Storage and How to Send Files up into the Cloud

With mobility now the main trend in personal computing, people increasingly expect their files to be available anywhere, not just on their hard drive at home or in the office. They also want them accessible to various devices: the Surface tablet/Ultrabook, the phone, the laptop, whatever.

So, to make files within reach everywhere and on whatever kind of computer, data is being moved to the cloud. Never mind that if you read their EULAs you discover that cloud storage providers nearly universally refuse to guarantee either the safety or security of your data. It could be lost in a fire; it could be captured by snoops. To protect yourself, it is a wise precaution to keep your own backup copies in your own house or office and also encrypt sensitive information. The cloud is useful, but dicey. Who _are_ these people storing your data? And where, exactly, are their servers located?

Nonetheless, if you want to know how to save Word files to the cloud on Microsoft's SkyDrive, it's pretty straightforward. Just save a document to your SkyDrive folder. This example saves the current document to SkyDrive (change my name, _Richard_, to your name in the file path in this example code):

    ActiveDocument.SaveAs ("C:\Users\ _Richard_ \SkyDrive\CloudTest")

Similarly, to save to Dropbox:

    ActiveDocument.SaveAs2 ("C:\Users\ _Richard_ \Dropbox\CloudTest")

* * *

## Opening a Document

To open a document, use the Open method with the appropriate Document object.
+
+* * *
+
+The Dangers of Cloud Storage and How to Send Files up into the Cloud
+
+With mobility now the main trend in personal computing, people increasingly expect their files to be available anywhere, not just on their hard drive at home or in the office. They also want them accessible to various devices: the Surface tablet/Ultrabook, the phone, the laptop, whatever.
+
+So, to make files within reach everywhere and on whatever kind of computer, data is being moved to the cloud. Never mind that if you read their EULAs you discover that cloud storage providers nearly universally refuse to guarantee either the safety or security of your data. It could be lost in a fire; it could be captured by snoops. To protect yourself, it is a wise precaution to keep your own backup copies in your own house or office and also encrypt sensitive information. The cloud is useful, but dicey. Who _are_ these people storing your data? And where, exactly, are their servers located?
+
+Nonetheless, if you want to know how to save Word files to the cloud on Microsoft's SkyDrive, it's pretty straightforward. Just save a document to your SkyDrive folder. This example saves the current document to SkyDrive (change my name, _Richard_, to your name in the file path in this example code):
+
+    ActiveDocument.SaveAs2 ("C:\Users\Richard\SkyDrive\CloudTest")
+
+Similarly, to save to Dropbox:
+
+    ActiveDocument.SaveAs2 ("C:\Users\Richard\Dropbox\CloudTest")
+
+* * *
+
+## Opening a Document
+
+To open a document, use the Open method with the appropriate Document object. The syntax for the Open method is as follows:
+
+    _expression_.Open FileName, ConfirmConversions, ReadOnly,
+        AddToRecentFiles, PasswordDocument, PasswordTemplate,
+        Revert, WritePasswordDocument, WritePasswordTemplate,
+        Format, Encoding, Visible,
+        OpenAndRepair, DocumentDirection, NoEncodingDialog, XMLTransform
+
+The arguments are as follows:
+
+ * _expression_ is a required expression that returns a Documents collection. Usually, you'll want to use the Documents collection itself.
+ * FileName is a required Variant argument specifying the name (and path, if necessary) of the document to open.
+ * ConfirmConversions is an optional Variant argument that you can set to True to have Word display the Convert File dialog box if the file is in a format other than Word.
+ * ReadOnly is an optional Variant argument that you can set to True to open the document as read-only.
+ * AddToRecentFiles is an optional Variant argument that you can set to True to have Word add the filename to the list of recently used files at the foot of the File menu.
+ * PasswordDocument is an optional Variant argument that you can use to set a password for opening the document.
+ * PasswordTemplate is an optional Variant argument that you can use to set a password for opening the template.
+ * Revert is an optional Variant argument that specifies what Word should do if the FileName supplied matches a file that's already open. By default (that is, if you don't include the Revert argument), Revert is set to False, which means that Word activates the open instance of the document and doesn't open the saved instance. You can set Revert to True to have Word open the saved instance of the document and discard any changes to the open instance.
+ * WritePasswordDocument is an optional Variant argument that indicates the password for saving changes to the document.
+ * WritePasswordTemplate is an optional Variant argument that specifies the password for saving changes to the template.
+ * Format is an optional Variant argument that you can use to specify the file converter with which to open the document. Table 20.2 lists the WdOpenFormat constants you can use to specify the file converter.
+ * Encoding is an optional Variant argument specifying the document encoding (the code page or the character set) for Word to use when opening the document.
+ * Visible is an optional Variant argument that you can set to False to have Word open the document in a window that isn't visible. (The default setting is True, specifying a visible window.)
+ * OpenAndRepair is an optional Variant that, when True, repairs the document to prevent corruption.
+ * DocumentDirection is an optional WdDocumentDirection argument indicating the horizontal flow of text in the document. The default is wdLeftToRight.
+ * NoEncodingDialog is an optional Variant that defaults to False. But if it's set to True, the Encoding dialog box is not displayed when Word cannot recognize text encoding.
+ * XMLTransform is mysterious. The only explanation I could find is in MSDN, and it merely says, "Specifies a transform to use." So your guess is as good as mine about what this option accomplishes.
+
+Table 20.2 WdOpenFormat constants for opening a document
+
+**Constant** | **Effect**
+---|---
+wdOpenFormatAllWord | Word opens the document in any recognized Word format as a Word document.
+wdOpenFormatAllWordTemplates | Word opens the document in any recognized Word format as a Word template.
+wdOpenFormatAuto | Word chooses a converter automatically. This is the default setting.
+wdOpenFormatDocument | Word opens the document as a Word document.
+wdOpenFormatDocument97 | Microsoft Word 97 document format.
+wdOpenFormatEncodedText | Word opens the document as a text file with encoding.
+wdOpenFormatOpenDocumentText | Word opens the document in an XML file format developed by Sun Microsystems.
+wdOpenFormatRTF | Word opens the document as a Rich Text Format file.
+wdOpenFormatTemplate | Word opens the document as a template.
+wdOpenFormatTemplate97 | Word 97 template format.
+wdOpenFormatText | Word opens the document as a text file.
+wdOpenFormatUnicodeText | Word opens the document as a Unicode text file.
+wdOpenFormatWebPages | Word opens the document as a web page.
+wdOpenFormatXML | Word opens the document in XML format.
+wdOpenFormatXMLDocument | XML document format.
+wdOpenFormatXMLDocumentMacroEnabled | XML document format with macros enabled.
+wdOpenFormatXMLDocumentMacroEnabledSerialized | Word opens an XML document with macros enabled by reconstructing the original document from a one-dimensional stream of bits.
+wdOpenFormatXMLDocumentSerialized | Word opens an XML document by reconstructing the original document structure from a one-dimensional stream of bits.
+wdOpenFormatXMLTemplate | XML template format.
+wdOpenFormatXMLTemplateMacroEnabled | XML template format with macros enabled.
+wdOpenFormatXMLTemplateMacroEnabledSerialized | Word opens an XML template with macros enabled by reconstructing the original document from a one-dimensional stream of bits.
+wdOpenFormatXMLTemplateSerialized | Word opens an XML template by reconstructing the original document from a one-dimensional stream of bits.
+
+The following statement opens the document Times.docx found in the C:\My Documents\ folder:
+
+    Documents.Open "C:\My Documents\Times.docx"
+
+The following statement opens the file notes.docm in the folder C:\temp as read-only and adds it to the list of most recently used files (the list you see when you click the File tab on the Ribbon, then click Recent):
+
+    Documents.Open "C:\temp\notes.docm", ReadOnly:=True, _
+        AddToRecentFiles:=True
+
+* * *
+
+How to Look Up Office 2013 Members in MSDN
+
+Recall that Microsoft's MSDN online help system can sometimes be difficult to search because it is so huge; it's perhaps _too_ complete. Among other issues, MSDN includes enumerations (lists of properties and methods, or constants, for example) for older versions of Office applications, such as 2007 or 2010, as well as those for the current version, 2013.
+
+Although these lists usually don't change much between versions, they _can_ change. To preserve compatibility—so you don't have to rewrite your macros every time a new version of Office comes out—few enumerations ever _lose_ members. But new capabilities are added. Word 2013, for example, adds wdFormatStrictOpenXMLDocument to the enumeration list for wdSaveFormat shown in Table 20.1.
+
+To search MSDN for the latest enumeration, type something like this in the MSDN online Search Office With Bing field: **wdDefaultFilePath office 2013**. Then in the list of hits displayed by Bing, choose the enumeration. Note that _wd_ specifies Word.
+
+* * *
+
+## Closing a Document
+
+To close a document, use the Close method with the appropriate Document object. The syntax is as follows:
+
+    _expression_.Close(SaveChanges, OriginalFormat, RouteDocument)
+
+Here, _expression_ is a required expression that returns a Document object or a Documents collection.
+Typically you use the ActiveDocument object or, to close all documents, the Documents collection object.
+
+SaveChanges is an optional Variant argument you can use to specify how to handle unsaved changes. Use wdDoNotSaveChanges to discard changes, wdPromptToSaveChanges to have Word prompt the user to save changes, or wdSaveChanges to save changes without prompting.
+
+OriginalFormat is an optional Variant argument you can use to specify the save format for the document. Use wdOriginalDocumentFormat to have Word use the original document format, wdPromptUser to have Word prompt the user to choose a format, or wdWordDocument to use the Word document format.
+
+RouteDocument is an optional Variant argument that you can set to True to route a document that has a routing slip attached.
+
+For example, the following statement closes the active document without saving changes:
+
+    ActiveDocument.Close SaveChanges:=wdDoNotSaveChanges
+
+The following statement closes all open documents (but not the Word application itself) and saves changes automatically:
+
+    Documents.Close SaveChanges:=wdSaveChanges
+
+## Changing a Document's Template
+
+To change the template attached to a document, set the AttachedTemplate property of the Document object you want to affect to the path and name of the appropriate template. For example, the following statement attaches the template named SalesMarket02.dotm to the active document. In this example, the template is assumed to be stored in one of the Word templates folders, so the path need not be specified:
+
+    ActiveDocument.AttachedTemplate = "SalesMarket02.dotm"
+
+## Printing a Document
+
+To print a document, use the PrintOut method for the appropriate Document object. The syntax for the PrintOut method is as follows:
+
+    _expression_.PrintOut(Background, Append, Range, OutputFileName,
+        From, To, Item, Copies, Pages, PageType, PrintToFile, Collate,
+        ActivePrinterMacGX, ManualDuplexPrint, PrintZoomColumn,
+        PrintZoomRow, PrintZoomPaperWidth, PrintZoomPaperHeight)
+
+These are the components of the PrintOut method:
+
+ * _expression_ is a required expression specifying an Application, Document, or Window object. Usually, you'll print a Document object such as ActiveDocument.
+ * Background is an optional Variant argument that you can set to True to have Word print the document in the background, allowing the procedure to continue running.
+ * Append is an optional Variant argument that you can set to True when printing to a file, to append the document to the specified print file rather than overwrite that file.
+ * Range is an optional Variant argument specifying the selection or range of pages to print: wdPrintAllDocument (0, the default), wdPrintCurrentPage (2), wdPrintFromTo (3; use the From and To arguments to specify the pages), wdPrintRangeOfPages (4), or wdPrintSelection (1).
+ * OutputFileName is an optional Variant argument used to specify the name for the output file when printing to file.
+ * From is an optional Variant argument used to specify the starting page number when printing a range of pages.
+ * To is an optional Variant argument used to specify the ending page number when printing a range of pages.
+ * Item is an optional Variant argument used to specify the item to print: wdPrintAutoTextEntries (4), wdPrintComments (2), wdPrintDocumentContent (0, the default), wdPrintKeyAssignments (5, shortcut key assignments for the document or its template), wdPrintProperties (1), or wdPrintStyles (3).
+ * Copies is an optional Variant argument used to specify the number of copies to print. (If you omit Copies, Word prints one copy.) + * Pages is an optional Variant argument used to specify the pages to print—for example, 1, 11-21, 31. + * PageType is an optional Variant argument used to specify whether to print all pages (wdPrintAllPages, 0, the default), odd pages (wdPrintOddPagesOnly, 1), or even pages (wdPrintEvenPagesOnly, 2). + * PrintToFile is an optional Variant argument that you can set to True to direct the output of the print operation to a file. + * Collate is an optional Variant argument used when printing multiple copies of a document to specify whether to collate the pages (True) or not (False). + * ActivePrinterMacGX is an optional Variant argument used on the Macintosh to specify the printer if QuickDraw GX is installed. + * ManualDuplexPrint is an optional Variant argument that you set to True for two-sided printing on a printer that doesn't have duplex capabilities. When ManualDuplexPrint is True, you can use the PrintOddPagesInAscendingOrder property or the PrintEvenPagesInAscendingOrder property of the Options object to print odd or even pages in ascending order to create a manual duplex effect (reloading the odd-page-printed paper into the printer the other way up to print the even pages). The ManualDuplexPrint argument is available only in some languages. + * PrintZoomColumn and PrintZoomRow are optional Variant arguments that you use to specify the number of pages to print on a page horizontally (PrintZoomColumn) and vertically (PrintZoomRow). Each property can be 1, 2, or 4. + * PrintZoomPaperWidth is an optional Variant argument that you can use to specify the width (measured in twips) to which to scale printed pages. + * PrintZoomPaperHeight is an optional Variant argument that you can use to specify the height (measured in twips) to which to scale printed pages. + +For example, the following statement prints three collated copies of the active document in the background: + + ActiveDocument.PrintOut Background:=True, Copies:=3, Collate:=True + +The following statement prints pages 2 through 5 of the active document: + + ActiveDocument.PrintOut Range:=wdPrintFromTo, From:=2, To:=5 + +The following statement prints the active document at two virtual pages per sheet of paper: + + ActiveDocument.PrintOut PrintZoomColumn:=2, PrintZoomRow:=1 + +## Working with the ActiveDocument Object + +The ActiveDocument object returns a Document object that represents the current document you're working with—in other words, whichever document has the focus in the Word window. The ActiveDocument object behaves like a Document object, but watch out for two possible problems when working with it. + +First, you may have problems locating information about the ActiveDocument object in the Help system. It's actually a _property_ of the Application object, so its status as an actual object is somewhat iffy. Object taxonomy is an evolving clerical system and, as you see, remains incomplete. + +To find the ActiveDocument object in the Help system, MSDN system, or VBA Editor Object Browser, you need to first locate the Application object, then look at its properties (or members). Just remember, ActiveDocument is found only _under_ the Application object. It's a clerical error. 
It's as if you were looking for _California_ in a geography book's index, but the index is wacky because you find _most_ states listed under their own names (Hawaii is under _H_, for example), but for some reason, California is not listed under C. You're puzzled. It's a big, important state. Then you stumble upon the solution: in this bizarre index, _California_ is found only under the entry for _United States_.
+
+The second oddity about the ActiveDocument "property" is that it can be evanescent. The first problem is that if there's no document open in Word, there's no ActiveDocument object, and any code that tries to work with the ActiveDocument object returns an error. When writing code that invokes the ActiveDocument object, remember to check the Count property of the Documents collection to make sure there's a document open (Count will be at least 1) before attempting to use ActiveDocument in your code. Here's an example that tests to see if there is an open document:
+
+    If Documents.Count = 0 Then
+        If MsgBox("No document is open." & vbCr & vbCr & _
+            "Do you want to create a new blank document?", _
+            vbYesNo + vbExclamation, "No Document Is Open") = vbYes Then
+            Documents.Add
+        Else
+            End
+        End If
+    End If
+
+A second problem relating to this evanescence is that a different document may be active than your code assumes is active. This problem tends to occur when a procedure starts with the active document and then creates a new document to work in; this new document becomes the active document, and from this point on, confusion may result.
+
+If you know the _name_ of the document that should be active, you can check to see if the name of the active document matches it, to verify that you'll be working with the right document.
+
+If there's any doubt about which document you're working with, declare a Document object variable and employ that object variable in your code rather than the ActiveDocument object.
+
+For example, the following statements declare a Document object and assign the ActiveDocument object to it so that subsequent code can work with the Document object:
+
+    Dim myDocument As Document
+    Set myDocument = ActiveDocument
+    With myDocument
+        'actions here
+    End With
+
+Or if you know the name of the document you want to work with:
+
+    Dim myDocument As Document
+    Set myDocument = ActiveDocument
+    If myDocument.Name = "CorrectFile.docx" Then
+        'actions here
+    End If
+
+# Working with the Selection Object
+
+Up to now in this chapter we've worked with programming that affects an entire document. To write code that works with only part of a document (a word, paragraph, or whatever), you can access these zones in three ways:
+
+ * By using the Selection object
+ * By directly accessing the object that you want to affect
+ * By defining a range that encompasses the object
+
+Using the Selection object is analogous to working interactively with Word and is effective with procedures that require the user to select an object or position the insertion point to denote what content in the document the procedure should access.
+
+Using the Selection object is also effective when you're learning to use VBA with Word, because many actions that you record using the Macro Recorder use the Selection object.
+
+The Selection object represents the current selection in the active document in Word. The selection can be very small (collapsed to the blinking cursor insertion point), in which case nothing is selected.
+Or a Selection object can contain one or more objects—one or more characters, one or more words, one or more paragraphs, a graphic, a table, the entire document. Or the selection can be a combination of these objects. Whatever's selected.
+
+Even if the selection is collapsed to an insertion point, you can use it to refer to objects outside the selection. For example, Selection.Paragraphs(1).Range.Words(10).Text returns the 10th word in the paragraph in which the insertion point is positioned (or, if a paragraph or multiple paragraphs are selected, the 10th word in the first paragraph).
+
+## Checking the Type of Selection
+
+Word recognizes nine different kinds of selections. When you're working in the active document, you'll often need to check what kind of selection is active so that you know whether you're dealing with no selection (just the insertion point), a block of ordinary text, or a special type of text like a table or a graphic.
+
+Depending on the current selection, you may not be able to take certain actions in your procedure, and you may not _want_ to take other actions. You can't, for example, insert a table row into an ordinary text paragraph.
+
+Table 20.3 lists the types of selections that Word differentiates.
+
+Table 20.3 Selection types in Word
+
+**Constant** | **Value** | **Meaning**
+---|---|---
+wdNoSelection | 0 | There's no selection. (This state seems impossible to achieve. You'd think it'd be when no document is open, but then Selection statements return runtime error 91. Stay tuned...)
+wdSelectionIP | 1 | The selection is collapsed to a plain insertion point—nothing is selected. But the insertion cursor is blinking as usual.
+wdSelectionNormal | 2 | A "normal" selection, such as a selected word or sentence.
+wdSelectionFrame | 3 | A frame is selected.
+wdSelectionColumn | 4 | A column or part of a column (two or more cells in a column or one cell in each of two or more columns) is selected.
+wdSelectionRow | 5 | A full row in a table is selected.
+wdSelectionBlock | 6 | A block is selected (a vertical part of one or more paragraphs, selected by holding down the Alt key and dragging with the mouse or by using column-extend mode).
+wdSelectionInlineShape | 7 | An inline shape or graphic (a shape or graphic that's in the text layer rather than floating over it) is selected.
+wdSelectionShape | 8 | A Shape object is selected. (A text box counts as a Shape object.)
+
+To find out what type of selection you currently have, look at the Type property of the Selection object. The following statements check that the current selection is merely an insertion point before inserting a text literal. The text will not be inserted if the user has dragged to select, for example, some characters, a word, or a paragraph:
+
+    If Selection.Type = wdSelectionIP Then
+        Selection.TypeText "This is inserted."
+    End If
+
+## Checking the Story Type of the Selection
+
+Beyond the type of selection, you'll sometimes need to find out which "story" the selection is in—the main text story, the comments story, the primary header story, and so on. Microsoft uses the word _story_ instead of _zone_, _type_, or other terms to mean a distinct type of content.
+
+Checking the story can help you avoid problems, such as trying to perform in a header or footer actions that Word supports only in a main text story.
+
+The story is the zone of the document within which the current selection is located. So, most of the time the story is the _main text story_ (wdMainTextStory).
+That's the document and the items within it. But alternative "stories" are things like footnotes, frames, headers, and footers—as you can see in Table 20.4, which lists the wdStoryType constants and the stories to which they correspond.
+
+You may notice another whimsical, enigmatic feature of Table 20.4: it starts its enumeration values at 1. Compare that to Table 20.3, which starts at 0. Inconsistencies like this make programming more challenging.
+
+Table 20.4 Word story types
+
+**Constant** | **Value** | **Meaning**
+---|---|---
+wdMainTextStory | 1 | Main (body) text of the document
+wdCommentsStory | 4 | Comments section
+wdEndnotesStory | 3 | Endnotes section
+wdFootnotesStory | 2 | Footnotes section
+wdTextFrameStory | 5 | Text in frames
+wdPrimaryFooterStory | 9 | Main footer
+wdEvenPagesFooterStory | 8 | Even-page footer
+wdFirstPageFooterStory | 11 | First-page footer
+wdPrimaryHeaderStory | 7 | Main header
+wdEvenPagesHeaderStory | 6 | Even-page header
+wdFirstPageHeaderStory | 10 | First-page header
+wdFootnoteSeparatorStory | 12 | Footnote separator
+wdFootnoteContinuationSeparatorStory | 13 | Footnote continuation separator
+wdFootnoteContinuationNoticeStory | 14 | Footnote continuation notice
+wdEndnoteSeparatorStory | 15 | Endnote separator
+wdEndnoteContinuationSeparatorStory | 16 | Endnote continuation separator
+wdEndnoteContinuationNoticeStory | 17 | Endnote continuation notice
+
+Here's a code example that displays a message box if the selection isn't in the main text of a document:
+
+    If Selection.StoryType <> wdMainTextStory Then
+        MsgBox "This range is not in the main text."
+    End If
+
+## Getting Other Information about the Current Selection
+
+To work effectively with a selection, you'll often need to know what it contains and where it's positioned. To find out, use the Information property to learn the details you need. Table 20.5 lists examples of useful information available in the Information property.
+
+Here's an example showing how to use the Information property:
+
+    If Selection.Information(wdCapsLock) = True Then
+        MsgBox "The caps lock is ON."
+    End If
+
+Sharp-eyed readers will notice a capricious inconsistency in this code. In the other code examples in this section, no parentheses were used around a constant, and the operator (= or <> or whatever) is placed between the property and the constant, as shown in this example:
+
+    Selection.Type = wdSelectionIP
+
+But with the Information property, you _do_ use parentheses, and you move the operator to the right of the constant:
+
+    Selection.Information(wdCapsLock) =
+
+This syntax and punctuation irregularity is yet _another_ of those exceptions to the rule. You should therefore remember that if the usual syntax produces an error message from the Editor, try the other (parenthetical) version.
+
+To see the complete list of all members, open the Object Browser (F2) and scroll down in the Classes list until you see WdInformation. Double-click it, and its members will be listed in the Members of "WdInformation" list on the right.
+
+Table 20.5 Information available in the Information property
+
+**Constant** | **Returns This Information**
+---|---
+**Environment Information** |
+wdCapsLock | True if Caps Lock is on.
+wdNumLock | True if Num Lock is on.
+wdOverType | True if Overtype mode is on. (You can turn Overtype mode on and off by changing the Overtype property.)
+wdRevisionMarking | True if Track Changes is on.
+wdSelectionMode | A value that specifies the current selection mode: 0 indicates a normal selection, 1 indicates an extended selection (Extend mode is on), and 2 indicates a column selection.
+wdZoomPercentage | The current zoom percentage.
+**Selection and Insertion Point Information** |
+wdActiveEndAdjustedPageNumber | The number of the page containing the active end of the selection or range. This number reflects any change you make to the starting page number; wdActiveEndPageNumber, the alternative, doesn't.
+wdActiveEndPageNumber | The number of the page containing the active end of the selection or range.
+wdActiveEndSectionNumber | The number of the section containing the active end of the selection or range.
+wdFirstCharacterColumnNumber | The character position of the first character in the selection or range. If the selection or range is collapsed to an insertion point, this constant returns the character number immediately to the right of the insertion point. (Note that this "column" is relative to the currently active left margin and doesn't have to be inside a table.)
+wdFirstCharacterLineNumber | In Print Layout view and Print Preview, this constant returns the line number of the first character in the selection. In nonlayout views (e.g., Normal view), it returns -1.
+wdFrameIsSelected | True if the selection or range is a whole frame or text box.
+wdHeaderFooterType | A value that specifies the type of header or footer containing the selection or range: -1 indicates that the selection or range isn't in a header or footer; 0 indicates an even page header; 1 indicates an odd page header in a document that has odd and even headers and the only header in a document that doesn't have odd and even headers; 2 indicates an even page footer; 3 indicates an odd page footer in a document that has odd and even footers and the only footer in a document that doesn't have odd and even footers; 4 indicates a first-page header; and 5 indicates a first-page footer.
+wdHorizontalPositionRelativeToPage | The horizontal position of the selection or range—the distance from the left edge of the selection or range to the left edge of the page, measured in twips.
+wdHorizontalPositionRelativeToTextBoundary | The horizontal position of the selection or range—the distance from the left edge of the selection or range to the text boundary enclosing it, measured in twips.
+wdInCommentPane | True if the selection or range is in a comment pane.
+wdInEndnote | True if the selection or range is in an endnote (defined as appearing in the endnote pane in Normal view or in the endnote area in Print Layout view).
+wdInFootnote | True if the selection or range is in a footnote (defined as appearing in the footnote pane in Normal view or in the footnote area in Print Layout view).
+wdInFootnoteEndnotePane | True if the selection or range is in a footnote or endnote.
+wdInHeaderFooter | True if the selection or range is in a header or footer (defined as appearing in the header or footer pane in Normal view or in the header or footer area in Print Layout view).
+wdInMasterDocument | True if the selection or range is in a master document (a document containing at least one subdocument).
+wdInWordMail | A value that specifies the WordMail location of the selection or range: 0 indicates that the selection or range isn't in a WordMail message; 1 indicates that it's in a WordMail message you're sending; 2 indicates that it's in a WordMail you've received.
+wdNumberOfPagesInDocument | The number of pages in the document in which the selection or range appears. +wdReferenceOfType | A value that specifies where the selection is in relation to a footnote reference, endnote reference, or comment reference. -1 indicates the selection or range includes a reference. 0 indicates the selection or range isn't before a reference. 1 indicates the selection or range is before a footnote reference, 2 that it's before an endnote reference, and 3 that it's before a comment reference. +wdVerticalPositionRelativeToPage | The vertical position of the selection or range—the distance from the top edge of the selection to the top edge of the page, measured in twips. +wdVerticalPositionRelativeToTextBoundary | The vertical position of the selection or range—the distance from the top edge of the selection to the text boundary enclosing it, measured in twips. +**Table Information** | +wdWithInTable | True if the selection is in a table. +wdStartOfRangeColumnNumber | The number of the table column containing the beginning of the selection or range. +wdEndOfRangeColumnNumber | The number of the table column containing the end of the selection or range. +wdStartOfRangeRowNumber | The number of the table row containing the beginning of the selection or range. +wdEndOfRangeRowNumber | The number of the table row containing the end of the selection or range. +wdAtEndOfRowMarker | True if the selection or range is at the end-of-row marker in a table (not the end-of-cell marker). +wdMaximumNumberOfColumns | The largest number of table columns in any row in the selection or range. +wdMaximumNumberOfRows | The largest number of table rows in the table in the selection or range. +**Macintosh** | +wdInClipboard | Used with Microsoft Office Macintosh Edition + +## Inserting Text at, after, or before the Selection + +You can insert text at the selection by using the TypeText method of the Selection object, insert text before the selection by using the InsertBefore method, or insert text after the selection by using the InsertAfter method. + +The TypeText method merely inserts a text string into the document if the selection is collapsed (merely the blinking insertion cursor with nothing actually selected). But if something _is_ selected, such as a word or phrase, that selection is _replaced_ by the string when you execute the TypeText method. However, the InsertBefore and InsertAfter methods do not replace a selection. They merely insert the new string. + +The syntax is as follows: + + Selection.TypeText _string_ + Selection.InsertAfter _string_ + Selection.InsertBefore _string_ + +Here, _string_ is a required String expression containing the text you want to insert in double quotation marks, as in this example: + + Selection.TypeText "Please come to the meeting next Friday at 9:00 A.M." + Selection.InsertBefore "Dr. " + Selection.InsertAfter vbCr & Address + +When you use the InsertAfter or the InsertBefore method, VBA extends the selection to include the text you inserted. (You can see selected text, cells, or other items in a document because Word changes the background from the default white to the document frame color.) When you use the TypeText method, the result is a collapsed selection—whether you are replacing a selection or a collapsed selection. (Recall that a collapsed selection means nothing is selected—merely the blinking insertion point.) 
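+
+To see these behaviors side by side, here's a minimal sketch (the literal strings are just placeholders) that assumes the user currently has a word selected in the active document:
+
+    Selection.InsertBefore "Dear "   'the selection now also covers "Dear "
+    Selection.InsertAfter ","        'the selection now also covers ","
+    'TypeText replaces the whole (extended) selection and
+    'leaves behind a collapsed insertion point
+    Selection.TypeText "Ms. Smith,"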
+
+* * *
+
+A Selected Paragraph Includes the Ending Paragraph Mark
+
+When you have a whole paragraph selected, the selection includes the paragraph mark at the end of the paragraph. So any text you add to the end of the selection appears at the beginning of the next paragraph rather than at the end of the selected paragraph.
+
+* * *
+
+## Inserting a Paragraph in a Selection
+
+You can insert paragraphs:
+
+ * To insert a paragraph at the current selection, use the InsertParagraph method.
+ * To insert a paragraph before the current selection, use the InsertParagraphBefore method.
+ * To insert a paragraph after the current selection, use the InsertParagraphAfter method.
+
+You can also have VBA type a paragraph by using the Selection.TypeParagraph command.
+
+## Applying a Style
+
+To apply a style to a paragraph, set the Style property of the Paragraph object:
+
+    Selection.Style = "Heading 3"
+
+View the styles in the current document by pressing Ctrl+Shift+S, or click the Home tab on the Ribbon.
+
+Similarly, you can apply a character style to the current selection or (as in the following example) to a specific range of words or characters. This example changes the fifth word in the second paragraph of the current document to boldface:
+
+    ActiveDocument.Paragraphs(2).Range.Words(5).Style = "Bold"
+
+Note that a character style must always be applied to a range rather than directly to a paragraph.
+
+## Extending a Selection
+
+To extend a selection programmatically (through programming rather than by the user), use the EndOf method for a Range or Selection object. The syntax for the EndOf method is as follows:
+
+    _expression_.EndOf(Unit, Extend)
+
+Here, _expression_ is a required expression that returns a Range or Selection object, such as an object in the Characters, Words, Sentences, or Paragraphs collection. Unit is an optional Variant specifying the unit of movement (see Table 20.6).
+
+Table 20.6 Units of movement for the EndOf method
+
+**Unit** | **Meaning**
+---|---
+wdCharacter | A character.
+wdWord | A word. (This is the default setting if you omit the argument.)
+wdSentence | A sentence.
+wdLine | A line. (This unit can be used only with Selection objects, not with ranges.)
+wdParagraph | A paragraph.
+wdSection | A section of a document.
+wdStory | The current story—for example, the document story or the header and footer story.
+wdCell | A cell in a table.
+wdColumn | A column in a table.
+wdRow | A row in a table.
+wdTable | A whole table.
+
+Extend is an optional Variant specifying whether to move or extend the selection or range. wdMove moves the selection or range and is the default setting; wdExtend extends the selection or range.
+
+For example, the following statement extends the current selection to the end of the paragraph:
+
+    Selection.EndOf Unit:=wdParagraph, Extend:=wdExtend
+
+The following statement moves the selection to the end of the paragraph:
+
+    Selection.EndOf Unit:=wdParagraph, Extend:=wdMove
+
+The following statement selects from the current selection to the end of the current Word story:
+
+    Selection.EndOf Unit:=wdStory, Extend:=wdExtend
+
+To select the whole active document, use ActiveDocument.Content.Select. This command has the same effect as pressing Ctrl+A when working interactively.
+
+## Collapsing a Selection
+
+When you've finished working with a selection larger than a blinking cursor insertion point, you often want to deselect it.
+In other words, you may want to force the selection into a collapsed state (just the blinking cursor) when your procedure ends. (If you don't do this and the user just starts typing, whatever is selected will be _replaced_ by the user's typing.)
+
+The easiest way to do so is to use the Collapse method of the Selection object to collapse the selection to its start or its end:
+
+    Selection.Collapse Direction:=wdCollapseStart
+    Selection.Collapse Direction:=wdCollapseEnd
+
+Alternatively, you can reduce the selection to just one point by setting the selection's end equal to its start (collapsing the selection to its start) or by setting the selection's start equal to its end (collapsing the selection to its end):
+
+    Selection.End = Selection.Start
+    Selection.Start = Selection.End
+
+# Creating and Using Ranges
+
+In Word, a _range_ is a contiguous area of a document with a defined starting point and ending point. For example, if you define a range that consists of the first two paragraphs in a specified document, the range's starting point is at the beginning of the first paragraph, and its ending point is at the end of the second paragraph (after the paragraph mark).
+
+Although similar to a selection, a range is more flexible. And it's important to note that a range is _named_ in your code, so you can refer to it by name at any time. There can be multiple ranges, but there can be only one selection at a time, and it has no name.
+
+The typical use of ranges in Word VBA is similar to how you use bookmarks when working interactively with Word: to mark a location in a document that you want to be able to access quickly or manipulate easily.
+
+Like a bookmark, a range can contain any amount of text in a document, from a single character to the entire contents of the document. A range can even have the same starting point and ending point, which gives it no contents and makes it, in effect, an invisible mark in the document that you can use to insert text. (This is similar to a collapsed selection.)
+
+Once you've created a range, you can refer to it, access its contents or insert new contents in it, or format it—all by using the methods and properties of the Range object.
+
+* * *
+
+How a Range Differs from a Bookmark
+
+The main difference between a range and a bookmark is that the lifetime of a range is limited to the VBA procedure that defines it. Once the procedure finishes executing, the range vanishes. By contrast, a bookmark persists. It is saved with the document or template that contains it and can be accessed at any time (whether or not a procedure is running).
+
+* * *
+
+## Defining a Named Range
+
+To create a Range object, you use a Set statement and either the Range method on a Document object or the Range property for an object—for example, the Selection object, the Paragraphs collection, or a Paragraph object. The syntax for using the Range method is as follows:
+
+    Set RangeName = Document.Range(Start, End)
+
+Here, RangeName is the name you are assigning to the range, and Start and End are optional arguments specifying the starting and ending points of the range.
+
+The syntax for using the Range property on an object is as follows:
+
+    Set RangeName = _object_.Range
+
+For example, the following statement uses the Range property of the Paragraphs collection to define a range named FirstPara that consists of the first paragraph of the active document.
+This statement doesn't use Start and End arguments because the starting point and ending point of the paragraph are clearly understood:
+
+    Set FirstPara = ActiveDocument.Paragraphs(1).Range
+
+The following statements change to uppercase the first three words at the start of a document:
+
+    Dim InitialCaps As Range
+    Set InitialCaps = ActiveDocument.Range _
+        (Start:=ActiveDocument.Words(1).Start, _
+        End:=ActiveDocument.Words(3).End)
+    InitialCaps.Case = wdUpperCase
+
+The first statement defines a Range object named InitialCaps. The second statement assigns InitialCaps to a range in the active document, from the beginning of the first word to the end of the third word. The third statement changes the case of the InitialCaps Range object to uppercase.
+
+Because InitialCaps is now defined as a Range object for the duration of the procedure that declares it, you can return to InitialCaps and manipulate it later in the procedure if you want to.
+
+## Redefining a Range
+
+To redefine a range to make it refer to another part of a document, use the SetRange method. The syntax is as follows:
+
+    _expression_.SetRange(Start, End)
+
+Here, _expression_ is a required expression that returns a Range or Selection object, and Start and End are optional arguments specifying the starting and ending points of the range.
+
+For example, the following statement redefines the range named InitialCaps so it now refers to the first two characters of the document:
+
+    InitialCaps.SetRange Start:=0, End:=2
+
+You can also redefine a range by reusing the Set method, creating the range again from scratch.
+
+## Using the Duplicate Property to Store or Copy Formatting
+
+You can use the Duplicate property to store or copy a range so that you can apply it to another range. For example, the following statements declare two ranges, Range1 and Range2; store a duplicate of the current selection's range in Range1; assign to Range2 the range of the first bookmark in the active document; and then apply the formatted contents of Range1 to Range2:
+
+    Dim Range1 As Range, Range2 As Range
+    Set Range1 = Selection.Range.Duplicate
+    Set Range2 = ActiveDocument.Bookmarks(1).Range
+    'apply Range1's formatted contents to Range2
+    Range2.FormattedText = Range1.FormattedText
+
+# Manipulating Options
+
+In your procedures, you'll often need to check the status of options in the Word application or in a particular document. In VBA, many of the options are controlled by the Options object, which has dozens of properties but no methods.
+
+Let's look now at four brief examples that show how to set options. Three of them use the Options object and one uses a property of the Document object. To see the full list of properties available for the Options object, look in the Help system.
+
+## Making Sure Hyperlinks Require Ctrl+Clicking
+
+Hyperlinks in Word documents have proved a mixed blessing—especially since Microsoft's changes to the way Word handles hyperlinks have left users unsure whether to just click or to Ctrl+click the hyperlink to follow it. You can set the CtrlClickHyperlinkToOpen property of the Options object to True to ensure that hyperlinks require Ctrl+clicking:
+
+    Options.CtrlClickHyperlinkToOpen = True
+
+Setting this option to False means you can trigger links by merely clicking them—no Ctrl key required.
+
+## Turning Off Overtype
+
+To make sure your procedures behave as expected, you may need to check that Word is using Insert mode rather than Overtype mode. (In Insert mode, Word inserts the characters you type at the insertion point, moving right any existing text to make room.
+In Overtype mode, each character you type replaces the character to the right of the insertion point.)
+
+Overtype mode is controlled by the Overtype property of the Options object. When Overtype is True, Overtype mode is on; when Overtype is False, Insert mode is on. The following statements store the user's current Overtype setting in a Boolean variable named blnOvertypeOn, set Overtype to False, perform the procedure's actions, and then restore the user's Overtype setting:
+
+    Dim blnOvertypeOn As Boolean
+    blnOvertypeOn = Options.Overtype
+    Options.Overtype = False
+    'write more code here to perform actions
+    Options.Overtype = blnOvertypeOn
+
+## Setting a Default File Path
+
+When configuring Word on a computer, you may need to make sure that its default file paths are set to the correct folders. You can do so by working with the DefaultFilePath property of the Options object. The syntax is as follows:
+
+    _expression_.DefaultFilePath(Path)
+
+Here, _expression_ is a required expression that returns an Options object. Often, it's easiest to use the Options object itself. Path is one of the self-explanatory enumerated constants shown in the following list:
+
+wdAutoRecoverPath | wdStyleGalleryPath
+---|---
+wdBorderArtPath | wdTempFilePath
+wdCurrentFolderPath | wdTextConvertersPath
+wdDocumentsPath | wdToolsPath
+wdGraphicsFiltersPath | wdTutorialPath
+wdPicturesPath | wdUserOptionsPath
+wdProgramPath | wdUserTemplatesPath
+wdProofingToolsPath | wdWorkgroupTemplatesPath
+wdStartupPath |
+
+For example, the following statements set the user templates path and the workgroup templates path:
+
+    Options.DefaultFilePath(wdUserTemplatesPath) = _
+        "c:\users\richard\appdata\roaming\microsoft\templates"
+    Options.DefaultFilePath(wdWorkgroupTemplatesPath) = _
+        "\\server\users\templates"
+
+## Turning Off Track Changes
+
+Before running a procedure that adds, deletes, or formats text, you may need to turn off the Track Changes feature so that the changes the procedure makes are not marked up in the text. If the user had Track Changes on, you should turn it back on at the end of the procedure so that changes the user makes are tracked again. Remember that it's usually a good practice when changing options to first store the user's current setting in a variable, carry out your procedure's task, and then restore the user's original setting.
+
+The following example saves the user's setting for the TrackRevisions option in the ActiveDocument object in a Boolean variable named blnTrackChangesOn, sets TrackRevisions to False, performs its actions, and then restores the user's TrackRevisions setting:
+
+    Dim blnTrackChangesOn As Boolean
+    blnTrackChangesOn = ActiveDocument.TrackRevisions
+    ActiveDocument.TrackRevisions = False
+    'write more code here to perform actions
+    ActiveDocument.TrackRevisions = blnTrackChangesOn
+
+## Accessing OneNote
+
+Earlier in this chapter you saw how to access the cloud: SkyDrive, Dropbox, and so on. It's uncomplicated. Dealing with OneNote is another matter because its contents are stored in the tricky XML format. _Tricky_ because reading and manipulating XML data is somewhat surreal. XML tries to be all things to all people, and the usual consequences of that effort ensue. XML comes in many, many versions; it mixes a "self-describing" metalanguage into its data; you'll find no standards for parsing (breaking the data down for reading and managing); and so on.
+
+When you write code to manage an ordinary text document, it's pretty simple because of VBA's string-manipulation features. Even managing a Word document is easy enough because VBA has so many built-in functions involving the Range, Selection, and other objects. Word effectively hides its internal formatting and other complexities, allowing you the option of handling the text simply as text.
+
+Not so with XML. As you'll see, just getting the metadata (information such as the name of a notebook) is heavy going.
+
+Why bother to explore this topic, then? Although VBA is not built into OneNote, VBA code in other Office 2013 applications can directly manipulate OneNote. And, because Microsoft is currently promoting OneNote, attempting to make it popular after all these years, I'm including this example demonstrating how to access it from the other Office applications.
+
+OneNote _is_ useful; it's actually quite rich in features and well integrated into the Windows and Office platforms. And now that versions of OneNote are available on everything from iOS to Android devices, perhaps Microsoft's dream will come true. So, if you should ever need to know how to contact OneNote from Word or some other Office application, read on.
+
+The following example fetches metadata from the user's OneNote. Before you try this code, choose Tools ⇒ References in the Editor and ensure that both Microsoft OneNote 15.0 Object Library and Microsoft XML, v6.0 are checked in the References dialog box.
+
+     1. Sub GetMetaData()
+     2.
+     3.    'If it's not currently running, OneNote will be launched
+     4.    Dim ONote As OneNote.Application
+     5.    Set ONote = New OneNote.Application
+     6.
+     7.    Dim strXML As String
+     8.
+     9.    ONote.GetHierarchy "", hsNotebooks, strXML, xs2010 'this fails if you use xs2013
+    10.
+    11.    MsgBox strXML
+    12. End Sub
+
+Lines 4 and 5 create an instance of OneNote and assign it to the ONote object variable. Next, we create a string variable in line 7 to hold the metadata. Line 9 uses the GetHierarchy method to fill strXML with the metadata; hsNotebooks represents the collection of notebooks in OneNote. The message box displays the results, as illustrated in Figure 20.4.
+
+Figure 20.4 Metadata fetched from OneNote
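+
+The Microsoft XML, v6.0 reference you just checked comes into play when you want to pull individual values out of strXML instead of just displaying it raw. What follows is a minimal sketch, not a definitive recipe: it assumes the hierarchy XML has the shape shown in Figure 20.4 (a root element whose child elements represent notebooks, each carrying a name attribute) and simply walks those children with MSXML:
+
+    Sub ListNotebookNames()
+        Dim ONote As OneNote.Application
+        Set ONote = New OneNote.Application
+
+        'Fetch the notebooks hierarchy, as in GetMetaData
+        Dim strXML As String
+        ONote.GetHierarchy "", hsNotebooks, strXML, xs2010
+
+        'Load the returned string into an XML parser
+        Dim xmlDoc As MSXML2.DOMDocument60
+        Set xmlDoc = New MSXML2.DOMDocument60
+        If Not xmlDoc.LoadXML(strXML) Then Exit Sub
+
+        'Walk the child elements and collect each name attribute
+        Dim node As MSXML2.IXMLDOMNode
+        Dim strNames As String
+        For Each node In xmlDoc.DocumentElement.ChildNodes
+            strNames = strNames & _
+                node.Attributes.getNamedItem("name").Text & vbCr
+        Next node
+
+        MsgBox strNames
+    End Sub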
+
+# The Bottom Line
+
+**Understand the Word object model.**
+
+Some people find viewing a schematic of the Word object model useful as a way of visualizing the interrelationships of the various objects and collections.
+
+Master It
+
+When you look at the Word Object Model Map, what is the highest object in the hierarchy—the object that contains all other objects?
+
+**Understand Word's creatable objects.**
+
+Word contains a set of creatable objects that VBA programmers will frequently employ in their code.
+
+Master It
+
+What is a creatable object?
+
+**Work with the Documents collection and the Document object.**
+
+The Documents collection represents all the currently open documents. Using VBA, you can manipulate this collection in a variety of ways.
+
+Master It
+
+Here is the syntax for creating a new document in the Documents collection:
+
+    Documents.Add Template, NewTemplate, DocumentType, Visible
+
+If you merely want to add a new, empty document (based on the default Normal.dotm template) to the documents currently open in Word, the code is quite simple. What is the code that you would write in VBA to accomplish this?
+
+**Work with the Selection object.**
+
+The Selection object represents the current selection in the active document in Word. A zone can be selected by the user by dragging the mouse or by using various key combinations (such as pressing Shift and an arrow key). A selection can include one or more objects—one or more characters, one or more words, one or more paragraphs, a graphic, a table, and so on. Or a combination of these objects.
+
+Master It
+
+One kind of selection is described as a _collapsed selection_. What is that?
+
+**Create and use ranges.**
+
+In Word, a _range_ is a named contiguous area of a document with a defined starting and ending point. The typical use of ranges in Word VBA is similar to how you use bookmarks when working interactively with Word: to mark a location in a document that you want to be able to access quickly or manipulate easily.
+
+Master It
+
+Although a range is similar to a bookmark, what is the significant difference between them?
+
+**Manipulate options.**
+
+Word contains many options that can be manipulated from within VBA.
+
+Master It
+
+In Word, one object controls many of the options. This object has dozens of properties but no methods. Name this object.
+
+Chapter 21
+
+Working with Widely Used Objects in Word
+
+In the previous chapter, you learned how to work with some of the main objects in the Word object model, such as Document objects, the Selection object, Range objects, and the Options object. This chapter shows you how to go further with VBA in Word by working with Find and Replace; with headers, footers, and page numbers; with sections, page setup, windows, and views; and with tables.
+
+In this chapter you will learn to do the following:
+
+ * Use Find and Replace via VBA
+ * Work with headers, footers, and page numbers
+ * Manage sections, page setup, windows, and views
+ * Manipulate tables
+
+# Using Find and Replace via VBA
+
+Word's Find and Replace tool can be very useful in your procedures. You can, for example, quickly adjust multiple styles throughout an entire document. Or you could automate the process of finalizing documents (spell-checking, revising corporate information, looking for out-of-date references, or whatever routinely needs to be done before publication).
+
+To access Word's Find and Replace features via VBA, you use the Find and Replacement objects. This section illustrates how to work with the Find object's Execute method, usually the best method to employ when working with Find. You usually specify the parameters for the Find operation as arguments in the Execute statement, but you can also specify them beforehand using properties if you prefer that approach.
+
+Table 21.1 describes the Find properties that are most useful for common search operations.
+
+Table 21.1 Properties of the Find object
+
+**Property** | **Meaning**
+---|---
+Font | Font formatting you're searching for (on either specified text or an empty string).
+Forward | A Boolean variable-type argument specifying whether to search forward (True) or backward (False) through the document.
+Found | A Boolean property that's True if the search finds a match and False if it doesn't.
+Highlight | A Long variable-type argument controlling whether highlighting is included in the formatting for the replacement text (True) or not (False).
+MatchAllWordForms | A Boolean property—True or False—corresponding to the Find All Word Forms check box.
+MatchCase | A Boolean property corresponding to the Match Case check box. If the user has this option deselected, be sure your code deselects it after you're finished with any case-sensitive searching in your procedure. See the sidebar "Practical Searching: Remember to Clear Formatting" later in this chapter.
+MatchSoundsLike | A Boolean property corresponding to the Sounds Like check box.
+MatchWholeWord | A Boolean property corresponding to the Find Whole Words Only check box.
+MatchWildcards | A Boolean property corresponding to the Use Wildcards check box.
+ParagraphFormat | Paragraph formatting you're searching for (on either specified text or an empty string).
+Replacement | Returns a Replacement object containing the criteria for a replace operation.
+Style | The style for the search text. Usually, you'll want to use the name of a style in the current template, but you can also use one of the built-in Word constant style names, such as wdStyleHeading1 (Heading 1 style).
+Text | The text you're searching for (what you'd enter in the Find What box in the Find And Replace dialog box). Use an empty string ("") to search only for formatting.
+Wrap | A Long property that governs whether a search that starts anywhere other than the beginning of a document (for a forward search) or the end of a document (for a backward search), or a search that takes place in a range, _wraps_ (continues) when it reaches the end or beginning of the document or the end or beginning of the selection.
+
+You use the Replacement object to specify the replace criteria in a replacement operation. The Replacement object has the following properties, which correspond to the properties of the Find object (but pertain to the replacement operation instead): Font, Highlight, ParagraphFormat, Style, and Text.
+
+## Understanding the Syntax for the _Execute_ Method
+
+The syntax for the Execute method is as follows:
+
+    expression.Execute(FindText, MatchCase, MatchWholeWord, MatchWildcards,
+        MatchSoundsLike, MatchAllWordForms, Forward, Wrap, Format,
+        ReplaceWith, Replace, MatchKashida, MatchDiacritics, MatchAlefHamza,
+        MatchControl, MatchPrefix, MatchSuffix, MatchPhrase, IgnoreSpace,
+        IgnorePunct)
+
+The final five arguments, starting with MatchPrefix, are not displayed in the Auto List Members tool in the Editor, but they can be used in code, as in, for example, IgnoreSpace:=True.
+
+The most commonly used arguments for this method are explained here:
+
+ * _expression_ is a required expression that returns a Find object. Usually, it's easiest to use the Find object itself.
+ * FindText is an optional Variant specifying the text for which to search. Although this argument is optional, you'll almost always want to specify it, even if you specify only an empty string ("") to allow you to search for formatting rather than text. (If you don't specify "" for FindText, you will inadvertently search for the previous text searched for, and the style you want to locate will never be found unless that text is also present.)
+ * You can search for special characters by using the same codes you use when working interactively (for example, ^p for a paragraph mark or ^t for a tab) and for wildcards by using the traditional Windows wildcards. For wildcards to work, you need to set MatchWildcards to True. You can search for a symbol by entering a caret and a zero, followed by its character code. For example, to search for a smart double closing quote, you'd specify **^0148** because its character code is 148.
+ * MatchCase is an optional Variant that you can set to True to make the search case sensitive.
+ * MatchWholeWord is an optional Variant that you can set to True to restrict the search to finding whole words rather than words contained in other words.
+ * MatchWildcards is an optional Variant that you can set to True to use wildcards in the search.
+ * MatchSoundsLike is an optional Variant that you can set to True to have Word find words that it thinks sound similar to the Find item specified.
+ * MatchAllWordForms is an optional Variant that you can set to True to have Word find all forms of the Find item specified (for example, different forms of the same verb or noun).
+ * Forward is an optional Variant that you can set to True to have Word search forward (from the beginning of the document toward the end) or False to have Word search backward.
+ * Wrap is an optional Variant that governs whether a search that begins anywhere other than the beginning of a document (for a forward search) or the end of a document (for a backward search), or that takes place in a range, _wraps_ (continues) when it reaches the end or beginning of the document. Word offers various options for Wrap, as detailed in Table 21.2.
+
+Table 21.2 Options for Wrap offered by Word
+
+**Constant** | **Value** | **Meaning**
+---|---|---
+wdFindAsk | 2 | Word searches the selection or range—or from the insertion point to the end or beginning of the document—and then displays a message box prompting the user to decide whether to search the rest of the document.
+wdFindContinue | 1 | Word continues to search after reaching the end or beginning of the search range or the end or beginning of the document.
+wdFindStop | 0 | Word stops the Find operation upon reaching the end or beginning of the search range or the end or beginning of the document.
+
+ * Format is an optional Variant that you can set to True to have the search operation find formatting as well as (or instead of) any Find text you've specified.
+ * ReplaceWith is an optional Variant specifying the replacement text. You can use an empty string for ReplaceWith to simply remove the FindText text; you can also use special characters for ReplaceWith as you can for the FindText argument. To use a graphic object, copy it to the Clipboard and then specify **^c** (which stands for the contents of the Clipboard).
+
+* * *
+
+How to Use Graphic Objects with ReplaceWith
+
+To use a graphic object as described in the bulleted item that explains the ReplaceWith argument, the graphic needs to be in the text layer (not floating over text). If the graphic is floating over text, ^c pastes in the previous text contents of the Clipboard.
+
+* * *
+
+ * Replace is an optional Variant that controls how many replacements the Find operation makes: one (wdReplaceOne), all (wdReplaceAll), or none (wdReplaceNone).
+ * MatchPrefix is an optional Variant that allows you to search for a string of characters at the start of words, but not if any other character(s) precede the string. Here's how it works: If you leave MatchPrefix and MatchWholeWord set to False and then search for **real**, you'll get hits on any word that contains that string, such as _real_, _surreal_, _realtime_, _boreal_, and so on. Any word with _real_ in it will be a hit. But set MatchWholeWord to True, and only the string itself, **real**, will produce a hit. Leave MatchWholeWord set to False and set MatchPrefix to True, and only words that begin with _real_ will hit, such as _real_ and _realtime_. Words like _surreal_ fail to qualify because they don't begin with the target string.
  * MatchSuffix is an optional Variant that works the same way as MatchPrefix, except MatchSuffix allows you to search for a string of characters at the end of a word but not if any other characters follow the string. Using the example in the previous bullet, with MatchSuffix set to True, you would get hits on surreal and boreal but not realtime.
  * MatchPhrase is an optional Variant that, when set to True, ignores any control characters (such as paragraph or tab characters) or white space (one or more space characters) between words. With MatchPhrase set to True, **This** followed by **phrase** on the next line becomes equivalent to **this phrase**.
  * IgnoreSpace is an optional Variant that ignores any white space between words but that does not ignore control characters.
  * IgnorePunct is an optional Variant that ignores all punctuation characters between words in a search phrase.
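To see how several of these arguments combine in practice, here's a minimal sketch (the search string and the messages are illustrative, not from this chapter's examples) that runs a case-sensitive, whole-word search and stops at the end of the document instead of wrapping:

    Dim fnd As Find
    Set fnd = ActiveDocument.Content.Find
    fnd.ClearFormatting    'clear leftover criteria; see the sidebar below
    If fnd.Execute(FindText:="real", MatchCase:=True, _
            MatchWholeWord:=True, Forward:=True, _
            Wrap:=wdFindStop) Then
        'Execute returns True when a match is found; the Find object's
        'parent range is redefined to cover the found text
        MsgBox "Found a match ending at position " & fnd.Parent.End
    Else
        MsgBox "No match found."
    End If

Because Find criteria persist, as the following sidebar explains, the ClearFormatting call at the start is cheap insurance.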
* * *

**Practical Searching: Remember to Clear Formatting**

One behavior in Word that can puzzle even experienced users and developers results from the fact that Find settings _persist_. For example, say that you search for a style such as _Heading 1_. All goes well, you find the headings, and you close the Find And Replace dialog box (or, if you're searching using VBA code, your macro finishes execution).

Then somewhat later you run another macro that searches or replaces—or you use the Find And Replace dialog box in Word to look for a word such as _program_. You know that the word _program_ appears many times in your document, but the Find utility displays a message stating that "The search item was not found." What's wrong?

The problem is that your original search for the heading style persists during your session with Word. Even switching to a different document during the current session will not clear the search criteria—including any style, font, or other special search criteria, such as MatchCase, that may have been employed.

In other words, you're now searching for the word _program_ but _also_ for the style _Heading 1_. So all the instances of _program_ in regular body text do not trigger hits. They are not in the specified style.

If you search for a style but fail to click the No Formatting button in the Find And Replace dialog box when you've finished, that style search remains active.

Likewise, when you use the Find object and the Replacement object in a procedure, you'll often need to use the ClearFormatting method, which clears any formatting specified under the Find What box or the Replace With box. Using the ClearFormatting method has the same effect as clicking the No Formatting button with the focus on the Find What box or the Replace With box. The following statements (here used within a With structure) clear formatting from the Find and Replacement objects, respectively:

    With ActiveDocument.Content.Find
        .ClearFormatting
        .Replacement.ClearFormatting
    End With

It's a good idea to get into the habit of using the ClearFormatting method at the end of any macro that searches for styles or other special formatting. And it doesn't hurt to use the ClearFormatting method at the _start_ of any macro that searches for anything, as well. Sometimes unnecessary? Potentially redundant? Sure. But it's good insurance against this common and puzzling bug.

A similar situation occurs when you employ the Execute method, as described earlier in this chapter. Remember that when using Execute, you should almost always specify the FindText argument—even if you specify only an empty string ("") to allow you to search for formatting. If you don't specify FindText, you run the risk of searching inadvertently for the string searched for previously.

* * *

## Putting Find and Replace to Work

The simplest way to use Find and Replace is to specify only as many parameters as you need in an Execute statement, leaving out any optional parameters that are irrelevant to your search. With long argument lists, it's always better to use the named-argument approach, like this:

    FindText:="National Velvet"

For example, to replace all pairs of paragraph marks in the active document with single paragraph marks (removing empty lines), you could search for **^p^p** and replace it with **^p** with the following statement:

    ActiveDocument.Content.Find.Execute FindText:="^p^p", _
        ReplaceWith:="^p", Replace:=wdReplaceAll

By running this statement in a loop, you could replace all extra paragraph marks in the document. You would have to employ a loop here because the wdReplaceAll constant specifies that the find-and-replace activity should go through the entire document only once.

It's necessary to loop because you might have multiple paragraph marks in clusters, such as four in a row: ^p^p^p^p. The first pass through the document would replace those four with two (^p^p), so you'd need to go through again to reduce these to the desired single ^p. In other words, in this case you must search and replace more than once. (A sketch of such a loop appears at the end of this section.)

You can also use a With statement to specify the properties for a Find and Replace operation. Listing 21.1 shows an example of this. The code changes all bold formatting in the open document named Example.docm to italic formatting.

**Listing 21.1**: Using With to specify properties

     1. With Documents("Example.docm").Content.Find
     2.     .ClearFormatting
     3.     .Font.Bold = True
     4.     With .Replacement
     5.         .ClearFormatting
     6.         .Font.Bold = False
     7.         .Font.Italic = True
     8.     End With
     9.     .Execute FindText:="", ReplaceWith:="", _
                Format:=True, Replace:=wdReplaceAll
    10. End With

  * Here, line 1 identifies the Document object (Example.docm in the Documents collection) with which to work and begins a With statement with its Find object.
  * Line 2 uses the ClearFormatting method to clear any formatting from the Find object, and
  * Line 3 then sets the Bold property of its Font object to True.
  * Lines 4 through 8 contain a nested With statement for the Replacement object.
  * Line 5 uses the ClearFormatting method to clear formatting from the Replacement object,
  * Line 6 sets its Bold property to False, and
  * Line 7 sets its Italic property to True.
  * Line 9 then uses the Execute method to execute the replacement operation. Both FindText and ReplaceWith here are specified as empty strings to cause Word to work with formatting only; Format is set to True to activate the formatting set in the Find and Replacement objects, and Replace is set to wdReplaceAll to replace all instances of the bold formatting with the italic formatting.
  * Line 10 ends the outer With statement.
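Before moving on, here's the paragraph-marks loop promised earlier in this section, sketched on the assumption that you want to keep making replace-all passes until no doubled paragraph marks remain. Execute returns True whenever it found (and here, replaced) something, so the loop ends after the first pass that makes no replacements:

    'Keep replacing ^p^p with ^p until a pass finds nothing to replace
    Do While ActiveDocument.Content.Find.Execute( _
            FindText:="^p^p", ReplaceWith:="^p", _
            Replace:=wdReplaceAll, Wrap:=wdFindStop)
    Loop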
# Working with Headers, Footers, and Page Numbers

The following sections show you how to work with headers and footers in Word documents. You'll also learn how to use VBA to manipulate page numbers, which are often included in headers and footers.

## Understanding How VBA Implements Headers and Footers

You can create several types of headers and footers in a Word document: the primary header and footer, unique first-page-only headers and footers, special headers and footers that appear only on the even pages—even different sets of headers and footers for each of the sections in a document if need be.

Every document automatically gets a primary header and a primary footer, even if you don't put anything in them. You can then create different first-page and even-page headers by changing the Page Setup options for the section. (Click the Page Layout tab on the Ribbon, then click the small arrow in the lower-right corner of the Page Setup zone. This opens the Page Setup dialog box; click the Layout tab. Note, however, that the primary header and footer features are accessed from the Insert tab on the Ribbon.)

VBA uses the following objects for headers and footers:

  * Both headers and footers are contained in HeaderFooter objects. You access headers through the Headers property and footers through the Footers property.
  * The HeadersFooters collection contains all the HeaderFooter objects in a given section of a document. Because each section of a document can have different headers and footers than the other sections have, you reach any given header or footer by going through the section.
  * To return the HeadersFooters collection, you use the Headers property or the Footers property of the appropriate Section object in the appropriate Document object. Alternatively, you can use the HeaderFooter property of the Selection object to return a single HeaderFooter object, but this approach tends to be more limited in its use.
  * The HeaderFooter object gives access to the Range object, the Shapes collection, and the PageNumbers collection.

## Getting to a Header or Footer

You access a header or footer through the appropriate _section_ within the document. For example, the following statement displays a message box containing the text in the first-page footer that's in the second section of the open document Transfer.docm:

    MsgBox Documents("Transfer.docm").Sections(2). _
        Footers(wdHeaderFooterFirstPage).Range.Text

The following statements declare the HeaderFooter object variable myHeader and assign to it the primary header in the first section in the active document:

    Dim myHeader As HeaderFooter
    Set myHeader = ActiveDocument.Sections(1).Headers _
        (wdHeaderFooterPrimary)

## Checking to See If a Header or Footer Exists

Recall that Word automatically creates a primary header and primary footer for each document, so these objects always exist. To find out whether other types of headers or footers exist, check the Exists property of the appropriate HeaderFooter object. The following statements check to see whether the even-pages header exists in each section in turn in the active document and, where it doesn't, create a generic even-pages header (containing the section number and the full name of the document) formatted with the style named Header (which exists by default in Word documents):

    Dim cSection As Section
    With ActiveDocument
        For Each cSection In .Sections
            If Not cSection.Headers(wdHeaderFooterEvenPages).Exists Then
                cSection.PageSetup.OddAndEvenPagesHeaderFooter = True
                cSection.Headers(wdHeaderFooterEvenPages).Range.Text _
                    = "Section " & cSection.Index & " of " & .FullName
                cSection.Headers(wdHeaderFooterEvenPages).Range.Style _
                    = "Header"
            End If
        Next cSection
    End With
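As a quick complement, here's a minimal sketch (the section number and messages are illustrative) that reads a first-page footer's text only when that footer actually exists:

    'Check Exists before touching a header or footer that may not be there
    With ActiveDocument.Sections(1).Footers(wdHeaderFooterFirstPage)
        If .Exists Then
            MsgBox .Range.Text
        Else
            MsgBox "Section 1 has no separate first-page footer."
        End If
    End With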
## Linking to the Header or Footer in the Previous Section

By default, Word links the header and footer in each section after the first to the header and footer in the previous section. To break the link, set the LinkToPrevious property of the header or footer to False; to create the link, set this property to True. The following statement unlinks the primary footer in the third section of the active document from the corresponding footer in the second section:

    ActiveDocument.Sections(3).Footers _
        (wdHeaderFooterPrimary).LinkToPrevious = False

## Creating a Different First-Page Header

To create a different header on the first page of a section, set the DifferentFirstPageHeaderFooter property of the PageSetup object for the section to True. The following statements check to see if the 10th section of the active document contains a first-page header and create one if it doesn't:

    With ActiveDocument.Sections(10)
        If .Headers(wdHeaderFooterFirstPage).Exists = False Then _
            .PageSetup.DifferentFirstPageHeaderFooter = True
    End With

## Creating Different Odd- and Even-Page Headers

To produce different headers for odd and even pages of your document (other than the first page), create an even-page header. The primary header by default appears on both odd and even pages until you create an even-page header, at which point the primary header becomes the odd-page header.

As with the first-page header, you work through the PageSetup object to create a different even-page header, setting the OddAndEvenPagesHeaderFooter property to True, as in the following statement:

    ActiveDocument.Sections(1).PageSetup.OddAndEvenPagesHeaderFooter = True

* * *

Use Nested Loops to Modify Headers and Footers

If you write procedures to format documents, you may need to check or change all the headers and footers in a document. The easiest way to do so is to use two For Each... Next loops, the outer loop working through each Section object in the Sections collection and the inner loop working through each HeaderFooter object in the HeadersFooters collection within that section. (A sketch of this pattern appears at the end of this page-numbers discussion.)

* * *

## Adding Page Numbers to Your Headers and Footers

A header or footer of a document often contains a page number: either a simple number in a straightforward format (1, 2, 3, and so on) or a more complex number denoting the chapter and page within it, separated by a separator character.

VBA implements page numbers through a PageNumbers collection that you return by using the PageNumbers property of the appropriate HeaderFooter object within the appropriate section of the document.

### Adding Page Numbers to One or More Sections of a Document

To add page numbers to a document, use the Add method with the PageNumbers collection for the appropriate section of the document.

The syntax for the Add method is as follows:

    _expression_.Add PageNumberAlignment, FirstPage

Here, _expression_ is a required expression that returns a PageNumbers collection. Usually, you'll use the PageNumbers collection itself.

PageNumberAlignment is an optional Variant argument specifying the alignment for the page numbers being added. Table 21.3 lists the constants and values you can use.
Table 21.3 PageNumberAlignment constants and values

**Constant** | **Value** | **Resulting Alignment**
---|---|---
wdAlignPageNumberLeft | 0 | Left
wdAlignPageNumberCenter | 1 | Centered
wdAlignPageNumberRight | 2 | Right (default)
wdAlignPageNumberInside | 3 | Inside margin (right on left-hand pages, left on right-hand pages)
wdAlignPageNumberOutside | 4 | Outside margin (left on left-hand pages, right on right-hand pages)

FirstPage is an optional Variant argument that you can set to False to make the header and footer on the first page suppress the page number. If you omit the FirstPage argument, the DifferentFirstPageHeaderFooter property of the PageSetup object controls whether the header and footer on the first page are the same as or different from those on the other pages in the section.

Both the PageNumberAlignment argument and the FirstPage argument are optional, but you'll usually want to specify at least the PageNumberAlignment argument.

The following subprocedure adds page numbers to all the headers in each section of a document by using two For Each... Next loops:

    Sub AddPageNumbersToAllHeadersAndSections()
        Dim cHeader As HeaderFooter, cSection As Section
        With Documents("Headers and Footers.docm")
            For Each cSection In .Sections
                For Each cHeader In cSection.Headers
                    cHeader.PageNumbers.Add _
                        PageNumberAlignment:=wdAlignPageNumberRight, _
                        FirstPage:=True
                Next cHeader
            Next cSection
        End With
    End Sub

### Removing Page Numbers from One or More Sections of a Document

To remove a page number from a page, specify the PageNumber object and use the Delete method. The following subprocedure removes each PageNumber object from the current section of the active document:

    Sub RemovePageNumbersFromCurrentSection()
        Dim ThisHeader As HeaderFooter
        Dim ThisPageNumber As PageNumber
        With Selection.Sections(1)
            For Each ThisHeader In .Headers
                For Each ThisPageNumber In ThisHeader.PageNumbers
                    ThisPageNumber.Delete
                Next ThisPageNumber
            Next ThisHeader
        End With
    End Sub

### Finding Out If a Section of a Document Has Page Numbers

The easiest way to find out if any given page number exists is to check the Count property of the PageNumbers collection for the appropriate section. For example, the following statement adds centered page numbers to the even-pages header in the current section if the header doesn't already have them:

    If Selection.Sections(1).Headers(wdHeaderFooterEvenPages) _
        .PageNumbers.Count = 0 Then Selection.Sections(1) _
        .Headers(wdHeaderFooterEvenPages).PageNumbers.Add _
        PageNumberAlignment:=wdAlignPageNumberCenter
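If you want a quick inventory rather than a single check, a small sketch like this reports the page-number count for each section's primary header to the Immediate window (the choice of the primary header is illustrative):

    'Report how many PageNumber objects each section's primary header holds
    Dim cSection As Section
    For Each cSection In ActiveDocument.Sections
        Debug.Print "Section " & cSection.Index & ": " & _
            cSection.Headers(wdHeaderFooterPrimary).PageNumbers.Count & _
            " page number(s)"
    Next cSection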
### Changing the Page Numbering for a Section

To change the page numbering for a section, you work with the StartingNumber property, using the RestartNumberingAtSection property, the IncludeChapterNumber property, and the ChapterPageSeparator property as necessary.

The StartingNumber property is a Long property that contains the starting page number for the section when the RestartNumberingAtSection property is set to True. When the RestartNumberingAtSection property is set to False, StartingNumber returns 0 (zero). The following statements set the page numbering for the primary header in the fourth section of the active document to start at 55 if it doesn't currently have a starting number assigned:

    With ActiveDocument.Sections(4).Headers(wdHeaderFooterPrimary)
        If .PageNumbers.StartingNumber = 0 Then
            .PageNumbers.RestartNumberingAtSection = True
            .PageNumbers.StartingNumber = 55
        End If
    End With

To add the chapter number to the page numbers, use heading numbering in your document. Set the IncludeChapterNumber property to True, and specify the separator to use (for example, wdSeparatorEnDash for an en dash):

    With ActiveDocument.Sections(4).Headers(wdHeaderFooterPrimary) _
        .PageNumbers
        .IncludeChapterNumber = True
        .ChapterPageSeparator = wdSeparatorEnDash
    End With

### Suppressing the Page Number for the First Page

To suppress the page number for the first page in a section, set the ShowFirstPageNumber property for the appropriate HeaderFooter object in the appropriate section to False:

    ActiveDocument.Sections(1).Footers(wdHeaderFooterPrimary).PageNumbers _
        .ShowFirstPageNumber = False

### Formatting Page Numbers

You can format page numbers in two ways: by setting the format in which they're displayed (for instance, as regular Arabic numbers or as lowercase Roman numerals) and by formatting the font in which that format is displayed.

To choose the format in which the page numbers are displayed, set the NumberStyle property of the PageNumbers collection in question. For example, the following statement formats the page numbers in the primary header in the fourth section of the active document as lowercase letters:

    ActiveDocument.Sections(4).Headers(wdHeaderFooterPrimary) _
        .PageNumbers.NumberStyle = wdPageNumberStyleLowercaseLetter

Once the page numbers are in the header or footer, you can format them in any of several ways. One easy way to set the font in which a given page number is formatted is to use the Select method to select the PageNumber object and then apply formatting to it as you would any other selection, as in the following statements:

    ActiveDocument.Sections(4).Headers(wdHeaderFooterPrimary) _
        .PageNumbers(1).Select
    With Selection.Font
        .Name = "Impact"
        .Size = 22
        .Bold = True
    End With

### Creating "Page X of Y"–Type Page Numbers

You can also implement page numbering by using Word's field codes in the header or footer. This technique is especially useful when you want to number the pages with an "X of Y" numbering scheme—"Page 168 of 192" and so on. The following statements select the primary header for the final section of the active document, apply center alignment, and enter the text and fields to produce this type of numbering:

    ActiveDocument.Sections(ActiveDocument.Sections.Count) _
        .Headers(wdHeaderFooterPrimary).Range.Select
    With Selection
        .Paragraphs(1).Alignment = wdAlignParagraphCenter
        .TypeText Text:="Page "
        .Fields.Add Range:=Selection.Range, Type:=wdFieldEmpty, Text:= _
            "PAGE ", PreserveFormatting:=True
        .TypeText Text:=" of "
        .Fields.Add Range:=Selection.Range, Type:=wdFieldEmpty, Text:= _
            "NUMPAGES ", PreserveFormatting:=True
    End With

If you insert a page number by using a field in this way, you can still access the page number by using the appropriate PageNumber object. (In this case, the PageNumber object consists of the PAGE field, not of the NUMPAGES field.)
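To close out this topic, here's a sketch of the nested-loop pattern recommended in the "Use Nested Loops to Modify Headers and Footers" sidebar earlier; the formatting applied inside the loops (a 9-point font) is purely illustrative:

    'Visit every header and footer in every section of the active document
    Dim cSection As Section, cHF As HeaderFooter
    For Each cSection In ActiveDocument.Sections
        For Each cHF In cSection.Headers
            If cHF.Exists Then cHF.Range.Font.Size = 9
        Next cHF
        For Each cHF In cSection.Footers
            If cHF.Exists Then cHF.Range.Font.Size = 9
        Next cHF
    Next cSection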
# Working with Sections, Page Setup, Windows, and Views

Each Word document contains at least one _section_ by default and can contain multiple sections as needed for its contents and layout. The section of the document controls the page layout, so different sections of a document can use different page layouts if necessary.

## Adding a Section to a Document

You can add a section to a document either by using the Add method with the Sections collection or by using the InsertBreak method with a Range or Selection object.

The Add method has the following syntax:

    _expression_.Add Range, Start

Here, _expression_ is a required expression that returns a Sections collection. Range is an optional Variant argument specifying the range at the beginning of which to insert the break. (If you omit Range, VBA inserts the break at the end of the document.) Start is an optional Variant argument used to specify the type of section break to insert:

  * wdSectionContinuous (0) for a continuous break
  * wdSectionEvenPage (3) for an even-page break
  * wdSectionOddPage (4) for an odd-page break
  * wdSectionNewColumn (1) for a new-column break
  * wdSectionNewPage (2, the default) for a new-page break

The following statements add a new-page section to the active document, placing it before the second paragraph:

    With ActiveDocument
        .Sections.Add _
            Range:=.Range(Start:=.Paragraphs(2).Range.Start, _
            End:=.Paragraphs(2).Range.Start), _
            Start:=wdSectionNewPage
    End With

The InsertBreak method takes the following syntax:

    _expression_.InsertBreak Type

Here, _expression_ is a required expression that returns a Selection or Range object. Type is an optional Variant argument specifying the type of section break to be inserted:

  * wdSectionBreakNextPage (2) for a new-page break
  * wdSectionBreakContinuous (3) for a continuous break
  * wdSectionBreakEvenPage (4) for an even-page break
  * wdSectionBreakOddPage (5) for an odd-page break
  * wdColumnBreak (8) for a new-column break

The following statement inserts a continuous section break before the second paragraph in the active document:

    ActiveDocument.Paragraphs(2).Range.InsertBreak _
        Type:=wdSectionBreakContinuous

## Changing the Page Setup

To change the page setup of a document or a section, you work with the PageSetup object of the appropriate Document object or Section object. For example, the following statements work with the PageSetup object of the document named Planning.docm, setting letter-size paper, portrait orientation, and mirror margins. The margin properties take measurements in points, so the inch values here are converted with the InchesToPoints function:

    With Documents("Planning.docm").PageSetup
        .PaperSize = wdPaperLetter
        .Orientation = wdOrientPortrait
        .TopMargin = InchesToPoints(1)
        .BottomMargin = InchesToPoints(1)
        .LeftMargin = InchesToPoints(1)
        .RightMargin = InchesToPoints(1.5)
        .MirrorMargins = True
    End With

## Opening a New Window Containing an Open Document

To open a new window containing an open document, use the Add method. Its syntax is straightforward:

    _expression_.Add _window_

Here, _expression_ is an expression that returns a Windows collection, and _window_ is an optional Variant argument specifying the window containing the document for which you want to open a new window. If you omit _window_, VBA opens a new window for the active document.

* * *

Understanding the Two Windows Collections

There are two Windows collections: one for the application and one for the windows displaying the document with which you're working.
The Windows collection for the Document object can be useful if you have multiple windows open for the same document (as you can do by clicking the Ribbon's View tab, then clicking the New Window button in the Window section of the Ribbon), but usually you'll want to use the Windows collection for the Application object. Windows is a creatable object, so you don't need to specify the Application object.

* * *

For example, the following statements open a new window for the first window open for the active document, assigning the window to the variable myWindow:

    Dim myWindow As Window
    Set myWindow = Windows.Add(Window:=ActiveDocument.Windows(1))

## Closing All Windows Except the First for a Document

Occasionally, it's useful to open one or more new windows for a document. If you do so, sooner or later you'll need to close all the secondary windows to give yourself more room to maneuver. The following statements close all windows except the first for the active document:

    Dim myWin As Window, myDoc As String
    myDoc = ActiveDocument.Name
    For Each myWin In Windows
        If myWin.Document.Name = myDoc Then _
            If myWin.WindowNumber <> 1 Then myWin.Close
    Next myWin

## Splitting a Window

To split a window into two parts horizontally, set its Split property to True. To specify the split percentage (which controls how far down the window, measuring vertically, the split is placed), set the SplitVertical property. The following statements split the active window 70 percent of the way down the window:

    With ActiveWindow
        .Split = True
        .SplitVertical = 70
    End With

To remove the split from the window, set the Split property to False:

    ActiveWindow.Split = False

* * *

Try Snapping Windows

Windows 7 and 8 have a nice feature that you might want to use instead of Word's internal split window. Drag a window by its title bar to the left side of the screen. Drag another window to the right side. They snap and automatically take up half the screen each.

* * *

## Displaying the Document Map for a Window

To display the Document Map for a window at the Document Map's previous width percentage (of the entire window), set the DocumentMap property to True:

    ActiveWindow.DocumentMap = True

To display the Document Map at a different width, or to change the width of the Document Map, set the DocumentMapPercentWidth property to a suitable percentage of the window's width:

    ActiveWindow.DocumentMapPercentWidth = 25

To hide the Document Map again, set the DocumentMap property to False or set the DocumentMapPercentWidth property to 0.

## Scrolling a Window

To scroll a window up, down, left, or right, use either the LargeScroll method or the SmallScroll method.

The LargeScroll method is analogous to clicking within the empty part of the scroll bar (not on the scroll box or on one of the scroll arrows at the ends of the bar); it scrolls the contents of the window by one entire "screen." The SmallScroll method is analogous to clicking a scroll arrow; it scrolls the contents of the window up or down by one line. If you're working with a horizontal scroll bar, the contents move left or right by a small scroll increment.

The syntax for the LargeScroll method is as follows:

    _expression_.LargeScroll( _Down, Up, ToRight, ToLeft_ )

The syntax for the SmallScroll method is almost identical:

    _expression_.SmallScroll( _Down, Up, ToRight, ToLeft_ )

Here, _expression_ is a required expression that returns a Window object.
Down, Up, ToRight, and ToLeft are optional Variant arguments that specify the number of screens (for LargeScroll) or lines or horizontal movement units (for SmallScroll) to scroll the contents of the window in the directions their names indicate.

The following statement scrolls the active window up two screens:

    ActiveWindow.LargeScroll Up:=2

## Arranging Windows

To arrange a number of windows, use the Arrange method. The syntax for the Arrange method is as follows:

    _expression_.Arrange ArrangeStyle

Here, _expression_ is an expression that returns a Windows collection, and ArrangeStyle is an optional Variant argument that specifies how to arrange the windows: as icons (wdIcons, 1) or tiled (wdTiled, 0). The default is wdTiled.

For example, the following statement tiles the open windows:

    Windows.Arrange ArrangeStyle:=wdTiled

## Positioning and Sizing a Window

To position a window on the monitor, set its Left and Top properties, as in this example:

    ActiveWindow.Left = 100
    ActiveWindow.Top = 200

To size a window, set its Height and Width properties:

    With ActiveWindow
        .Height = 300
        .Width = 400
    End With

To maximize, minimize, or "restore" a window, set its WindowState property to wdWindowStateMaximize, wdWindowStateMinimize, or wdWindowStateNormal, respectively. The following statements maximize the window containing the document named Example.docm if the window is minimized:

    With Documents("Example.docm").Windows(1)
        If .WindowState = wdWindowStateMinimize Then _
            .WindowState = wdWindowStateMaximize
    End With

## Making Sure an Item Is Displayed in the Window

After opening or arranging windows, you'll often need to make sure an item you want the user to see—a range, some text, a graphic or other shape, or a field—is displayed in the window. The easiest way to do so is to use the ScrollIntoView method of the Window object. This method moves the view but not the selection, so if you need the selection to move as well, you'll need to write additional code to move it there.

The ScrollIntoView method takes the following syntax:

    _expression_.ScrollIntoView(Obj, Start)

Here, _expression_ is a required expression that returns a Window object. Obj is a required argument specifying a Range or Shape object. Start is an optional Boolean argument that you can set to True (the default) to have the upper-left corner of the range or shape displayed, or False to have the lower-right corner displayed. Specify False for Start when you need to make sure the end of a range or shape that may be larger than the window is displayed.

The following statements position the selection at the end of the last paragraph in the first list in the active document, ready to add a new paragraph to the list:

    Dim rngFirstList As Range
    Set rngFirstList = ActiveDocument.Lists(1).Range
    ActiveDocument.Windows(1).ScrollIntoView Obj:=rngFirstList, _
        Start:=False
    rngFirstList.Select
    Selection.Collapse Direction:=wdCollapseEnd
    Selection.MoveLeft Unit:=wdCharacter, Count:=1, Extend:=wdMove

## Changing a Document's View

To change a document's view, set the Type property of the View object for the appropriate window to wdConflictView, wdMasterView, wdNormalView, wdOutlineView, wdPrintPreview, wdPrintView, wdReadingView, or wdWebView.
For example, the following statement changes the view for Sample.docm to Print Layout view:

    Documents("Sample.docm").Windows(1).View.Type = wdPrintView

## Switching to Read Mode

Read mode hides the Ribbon, any markup, and nearly everything else except the text itself. Panes, however, such as Navigation and Thesaurus, do remain visible. The text itself is usually displayed as two or three pages (depending on your zoom level) side by side as in a book. You cannot edit in this view. Here's how to switch to read mode:

    ActiveDocument.ActiveWindow.View.Type = wdReadingView

Read mode is thoughtfully designed to make the content as easy to read and remember as possible. For example, the zoom feature (lower right) adjusts the font size but repaginates (reflows) so you never have to struggle with moving a horizontal scroll bar to show hidden text. There _is_ a scroll bar, but it's never needed to display text that's out of view because of the zoom level. The scroll bar is strictly for global document navigation and serves as an indicator of your current position.

Read mode also gives you some control over column width. Most people find it easier to read shorter lines of text, so you can adjust line length in the View menu. The Esc key exits read mode.

## Zooming the View to Display Multiple Pages

To zoom Print Layout view or Print Preview to display multiple pages, set the PageColumns and PageRows properties of the appropriate View object. (Change the view first if necessary.) The following statement displays Sample.docm in Print Layout view with six pages displayed (three across by two deep):

    With Documents("Sample.docm").Windows(1).View
        .Type = wdPrintView
        With .Zoom
            .PageColumns = 3
            .PageRows = 2
        End With
    End With

# Working with Tables

Many people need to work with tables in their Word documents, either creating them from scratch or manipulating existing tables.

VBA uses a Table object to represent each individual table. The Table objects in a document are gathered together into the Tables collection. To work with tables, you use the Tables property to return the Tables collection for the Document, Range, or Selection object in question.

Here is a sample of the collections and objects that are members of the Tables collection and the Table object:

  * The Rows collection contains the rows in the table. Each row is represented by a Row object.
  * The Columns collection contains the columns in the table. Each column is represented by a Column object.
  * The Cell object provides access to a specified cell directly from the Table object. You can also reach the cells in the table by going through the row or column in which they reside.
  * The Range object provides access to ranges within the table.
  * The Borders collection contains all the borders for the table.
  * The Shading object contains all the shading for the table.

For complete lists of the members of the Table object and of the Tables collection in Word 2013, see the MSDN reference pages for those two types.
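Before manipulating tables, it can help to see the collection in action. Here's a quick sketch that walks the active document's Tables collection and prints each table's dimensions to the Immediate window:

    'Enumerate the Tables collection of the active document
    Dim myTable As Table, i As Long
    For Each myTable In ActiveDocument.Tables
        i = i + 1
        Debug.Print "Table " & i & ": " & myTable.Rows.Count & _
            " rows x " & myTable.Columns.Count & " columns"
    Next myTable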
## Creating a Table

To create a new table from scratch (rather than converting existing text to a table), use the Add method with the Tables collection. The Add method takes the following syntax for the Tables collection:

    _expression_.Add(Range, NumRows, NumColumns, DefaultTableBehavior, AutoFitBehavior)

The arguments are as follows:

  * _expression_ is a required expression that returns a Tables collection. Typically, you'll want to use the Tables collection for the appropriate document.
  * Range is a required argument supplying the range where you want to insert the table. If the range is a selection (rather than being a collapsed selection, or insertion point), the table replaces the range.
  * NumRows is a required Long argument specifying the number of rows the table is to have.
  * NumColumns is a required Long argument specifying the number of columns the table is to have.
  * DefaultTableBehavior is an optional Variant argument specifying whether the table autofits its columns to their contents or to the window when you change the contents or the window width. Use wdWord9TableBehavior to have the table autofit its columns or wdWord8TableBehavior (the default) to have the columns retain their width.
  * AutoFitBehavior is an optional Variant argument specifying the autofit behavior for the table. This argument applies only when DefaultTableBehavior is wdWord9TableBehavior. Use wdAutoFitContent to resize the columns to their contents, wdAutoFitWindow to resize the columns to the window width, or wdAutoFitFixed to use a fixed column width.

For example, the following statement inserts a new, blank, non-autofitting table containing 10 rows and 5 columns at the current position of the insertion point in the active document:

    ActiveDocument.Tables.Add Range:=Selection.Range, NumRows:=10, _
        NumColumns:=5, DefaultTableBehavior:=wdWord8TableBehavior

## Selecting a Table

To select a table, specify the Document, Range, or Selection object involved, and then identify the Table object and use the Select method. This method takes no arguments.

The following statement selects the first table in the active document:

    ActiveDocument.Tables(1).Select

The following statements declare the variable tempTable, select the first table in the document named Log.docm, and assign that table's Range object to tempTable:

    Dim tempTable As Range
    Documents("Log.docm").Tables(1).Select
    Set tempTable = Selection.Tables(1).Range

The following statement selects the second table in the range named tempRange:

    tempRange.Tables(2).Select

This statement selects the first table in the current selection:

    Selection.Tables(1).Select

## Converting Text to a Table

To convert ordinary text to a table (as opposed to inserting a new table from scratch), use the ConvertToTable method with an appropriate Range or Selection object. The ConvertToTable method takes the following syntax:

    _expression_.ConvertToTable(Separator, NumRows, NumColumns,
        InitialColumnWidth, Format, ApplyBorders, ApplyShading, ApplyFont,
        ApplyColor, ApplyHeadingRows, ApplyLastRow, ApplyFirstColumn,
        ApplyLastColumn, AutoFit, AutoFitBehavior, DefaultTableBehavior)

The arguments are as follows:

  * _expression_ is a required argument specifying an expression that returns a Range object or a Selection object.
  * Separator is an optional Variant argument specifying the separator character (also known as the _delimiter_ character) to use to mark where the column divisions were. You can use these values for Separator:
  * wdSeparateByCommas separates column information at commas.
  * wdSeparateByDefaultListSeparator separates column information at the currently specified Other list separator character (the character shown in the text box alongside the Other option button in the Convert Text To Table dialog box).
  * wdSeparateByParagraphs separates column information at the paragraph marks.
+ * wdSeparateByTabs (the default separator if you don't specify one) separates column information at tabs. + * Alternatively, you can specify a single separator character of your choice as a string or between double quotation marks. For example, enter **Separator:="|"** to use a vertical bar [|] as the separator. + * NumRows is an optional Variant argument specifying the number of rows the table should have. If you omit the NumRows argument, Word decides the number of rows in the table based on the number of columns specified and/or the number of the chosen separator characters it finds. + * NumColumns is an optional Variant argument specifying the number of columns the table should have. As with NumRows, if you omit the NumColumns argument, Word decides the number of columns in the table based on the number of rows specified and/or the number of the chosen separator characters it finds. + * InitialColumnWidth is an optional Variant argument that you can use to specify the initial width (in points) of each column in the table. If you omit the InitialColumnWidth argument, Word uses the full width of the page—from margin to margin—and allocates an equal width to each column, regardless of the relative widths of the contents of the columns. The InitialColumnWidth argument is useful primarily for restraining tables from using the full width of the page automatically. In many cases, autofitting the columns provides a better solution. + * Format is an optional Variant argument that you can use to specify one of Word's built-in autoformat styles for tables. To use the Format argument, specify the appropriate WdTableFormat constant (such as wdTableFormatElegant to specify the Elegant autoformat style). If you choose to apply a format, you can specify which properties of the autoformat style to apply to the table by using the following optional Variant arguments: + * Set ApplyBorders to True to apply the border formatting, or to False not to apply it. + * Set ApplyShading to True to apply the shading, or to False not to apply it. + * Set ApplyFont to True to apply the font formatting, or to False not to apply it. + * Set ApplyColor to True to apply the color formatting, or to False not to apply it. + * Set ApplyHeadingRows to True to apply any heading-row formatting, or to False not to apply it. + * Set ApplyLastRow to True to apply any last-row formatting, or to False not to apply it. + * Set ApplyFirstColumn to True to apply any first-column formatting, or to False not to apply it. + * Set ApplyLastColumn to True to apply any last-column formatting, or to False not to apply it. + * AutoFit is an optional Variant argument you can set to True to have Word adjust the column width to best fit whatever contents are in the cells. When autofitting, Word doesn't increase the overall width of the table—it either reduces or retains the table's width. + * AutoFitBehavior and DefaultTableBehavior are as described in the section "Creating a Table," earlier in the chapter. + +The following statement converts the current selection to a five-column table, separating the information at commas. 
It applies autofitting to the table based on cell content and sets the cells to resize automatically:

    Dim myTable As Table
    Set myTable = Selection.ConvertToTable(wdSeparateByCommas, _
        Selection.Paragraphs.Count, 5, , , , , , , , , , , True, _
        wdAutoFitContent, wdWord9TableBehavior)

## Ensuring That a Selection Is within a Table

Before running any procedure that is intended to manipulate a table, it's a good idea to make sure that the current selection actually is within a table. Use the wdWithInTable argument of the Information property for the selection. wdWithInTable is Boolean, returning True if the selection is in a table and False if it isn't. Here's an example:

    If Selection.Information(wdWithInTable) = True Then
        'take actions here
    End If

## Finding Out Where a Selection Is within a Table

In addition to establishing whether the selection is in a table, you can use the Information property to find out other information that can be useful when working with tables via a Range object or Selection object.

Once you've established that the selection is within a table (probably by using the wdWithInTable argument), check whether the selection is at an end-of-row marker rather than being in a cell. If the selection is at an end-of-row marker, certain actions fail. For example, attempting to select the current cell or column fails because the selection is outside any cell or column, but attempting to select the current row succeeds.

To check whether the selection is at the end-of-row marker, use the AtEndOfRowMarker argument for the Information property. The following statement moves the selection left one character (into the last cell in the same row) if the selection is at the end-of-row marker:

    If Selection.Information(wdAtEndOfRowMarker) = True Then _
        Selection.MoveLeft Unit:=wdCharacter, Count:=1

If the selection contains the end-of-row marker rather than being a collapsed selection (an insertion point) before the marker, the wdAtEndOfRowMarker argument returns False. To avoid a selected end-of-row marker causing problems in your procedures, collapse the selection if it isn't collapsed before checking whether it's at the end-of-row marker. The following statements do this, using a Range variable named curSel to restore the selection they collapse unless collapsing the selection leaves the selection at an end-of-row marker:

    Dim curSel As Range
    With Documents("Communications.docm")
        If Selection.Type <> wdSelectionIP Then
            Set curSel = Selection.Range
            Selection.Collapse Direction:=wdCollapseStart
        End If
        If Selection.Information(wdAtEndOfRowMarker) = True Then
            Selection.MoveLeft Unit:=wdCharacter, Count:=1, Extend:=wdMove
        Else
            If Not curSel Is Nothing Then curSel.Select
            Set curSel = Nothing
        End If
    End With

After establishing that the selection is safely in a table, you can retrieve six useful pieces of information about the table:

  * wdStartOfRangeColumnNumber returns the number of the column in which the beginning of the selection or range falls. The following statement selects the column in which the current selection begins:

    Selection.Tables(1).Columns(Selection.Information _
        (wdStartOfRangeColumnNumber)).Select

  * wdEndOfRangeColumnNumber returns the number of the column in which the end of the selection or range falls.
The following statements delete the column in which the range testRange ends if the range is more than one column wide: + + With testRange + If .Information(wdStartOfRangeColumnNumber) <> _ + .Information(wdEndOfRangeColumnNumber) Then _ + .Tables(1).Columns(.Information _ + (wdEndOfRangeColumnNumber)).Delete + End With + + * wdStartOfRangeRowNumber returns the number of the row in which the beginning of the selection or range falls. + * wdEndOfRangeRowNumber returns the number of the row in which the end of the selection or range falls. + * wdMaximumNumberOfColumns returns the highest number of columns in any row in the selection or range. + * wdMaximumNumberOfRows returns the highest number of rows in the specified selection or range in the table. + +## Sorting a Table + +To sort a table, identify the table and use the Sort method. Sort takes the following syntax with the Table object: + + _expression_.Sort(ExcludeHeader, FieldNumber, SortFieldType, SortOrder, + FieldNumber2, SortFieldType2, SortOrder2, FieldNumber3, + SortFieldType3, SortOrder3, CaseSensitive, BidiSort, IgnoreThe, + IgnoreKashida, IgnoreDiacritics, IgnoreHe, LanguageID) + +The arguments are as follows: + + * _expression_ is an expression that returns a Table object. + * ExcludeHeader is an optional Variant argument that you can set to True to exclude the first row in the table (which is often the table header row) from the sort, or to False to include the first row in the table. + * FieldNumber, FieldNumber2, and FieldNumber3 are optional Variant arguments specifying the first, second, and third fields by which to sort (respectively). Usually you'll want to specify at least FieldNumber; if you don't, Word performs an alphanumeric sort on the table. + * SortFieldType, SortFieldType2, and SortFieldType3 are optional Variant arguments specifying the type of sorting you want to use for FieldNumber, FieldNumber2, and FieldNumber3, respectively. For U.S. English, the options are alphanumeric sorting (wdSortFieldAlphanumeric, the default), numeric sorting (wdSortFieldNumeric), and date sorting (wdSortFieldDate). + * SortOrder, SortOrder2, and SortOrder3 are optional Variant arguments specifying the sorting order for FieldNumber, FieldNumber2, and FieldNumber3. Use wdSortOrderAscending to specify an ascending sort (the default) or wdSortOrderDescending to specify a descending sort. + * CaseSensitive is an optional Variant argument that you can set to True to specify case-sensitive sorting. The default setting is False. + * The next five arguments (BidiSort, IgnoreThe, IgnoreKashida, IgnoreDiacritics, and IgnoreHe) are for specialized sorting (such as right-to-left languages, Arabic, and Hebrew). + * LanguageID is an optional Variant argument that you can use to specify the language in which to sort. For example, to sort in Lithuanian, you could specify wdLithuanian for LanguageID. For sorting in your default language, you can omit this argument. + +## Adding a Column to a Table + +To add a column to a table, use the Add method with the Columns collection for the appropriate Table object. The Add method takes the following syntax for the Columns collection: + + _expression_.Add [BeforeColumn] + +Here, _expression_ is a required expression that returns a Columns collection, and BeforeColumn is an optional Variant argument specifying the column to the left of which you want to insert the new column. + +The following example uses the Count property to check the number of columns in the first table in the active document. 
If this table contains fewer than five columns, one or more columns are added to bring the number of columns up to five. Each new column is added before (to the left of) the existing last column in the table: + + With ActiveDocument.Tables(1) + .Select + If .Columns.Count < 5 Then + Do Until .Columns.Count = 5 + .Columns.Add BeforeColumn:=.Columns(.Columns.Count) + Loop + End If + End With + +## Deleting a Column from a Table + +To delete a column, identify it and use the Delete method. Delete takes no arguments. The following statement deletes the first column in the table referenced by the object variable myTable: + + myTable.Columns(1).Delete + +## Setting the Width of a Column + +You can set the width of a column by using the AutoFit method, by using the SetWidth method, or by specifying the Width property for the column. + +The AutoFit method resizes each column automatically to a width suitable to its contents. AutoFit takes no arguments. The following statement uses the AutoFit method to resize each column in the first table in the active document: + + ActiveDocument.Tables(1).Columns.AutoFit + +The SetWidth method allows you to set the width of one or more columns and specify how the other columns in the table should change as a result. The syntax for the SetWidth method is as follows: + + _expression_.SetWidth ColumnWidth, RulerStyle + +Here, _expression_ is an expression that returns the Columns collection or Column object whose width you want to set. ColumnWidth is a required Single argument specifying the width of the column or columns, measured in points. RulerStyle is a required Long argument that specifies how Word should adjust the width of the columns: + + * The default value, wdAdjustNone, sets all the specified columns to the specified width, moving other columns to the left or right as necessary. This argument is analogous to Shift+dragging a column border when working interactively. + * wdAdjustFirstColumn applies the specified width to the first specified column, adjusting only as many columns to the right of this column as necessary. For example, widening the first column in a table slightly causes Word to narrow the second column but leave the third and subsequent columns unchanged. Widening the first column significantly causes Word to narrow the second and third columns, leaving the fourth and subsequent columns unchanged. This argument is analogous to dragging a column border when working interactively. + * wdAdjustProportional applies the specified width to the first specified column, keeping the right edge of the table in its previous position and adjusting all nonspecified columns proportionally to accommodate the change. + * wdAdjustSameWidth applies the specified width to the first specified column, keeping the right edge of the table in its previous position and adjusting all the other columns to an identical width to accommodate the change. This argument is analogous to Ctrl+dragging a column border when working interactively. + +The following statement sets the width of the second column in the first table in the active document to 50 points, adjusting the columns to the right of the second column proportionally: + + ActiveDocument.Tables(1).Columns(2).SetWidth ColumnWidth:=50, _ + RulerStyle:=wdAdjustProportional + +The Width property lets you change the width of a column without worrying about the effect on the other columns. 
Specify the width you want in points, as in this example:

    ActiveDocument.Tables(1).Columns(4).Width = 100

## Selecting a Column

To select a column, use the Select method with the appropriate Column object. Select takes no arguments. The following statement selects the second column in the third table in the document named Originals.docm:

    Documents("Originals.docm").Tables(3).Columns(2).Select

## Adding a Row to a Table

To add a row, use the Add method with the Rows collection for the table. The Add method takes the following syntax for the Rows collection:

    _expression_.Add [BeforeRow]

Here, _expression_ is a required expression that returns a Rows object, and BeforeRow is an optional Variant argument specifying the row before which you want to add the new row. If you omit BeforeRow, VBA adds the new row after the last existing row in the table.

The following statement adds a new first row to the table referenced by the object variable myTable:

    myTable.Rows.Add BeforeRow:=1

You can also insert a row into a table at the current selection by using the InsertRowsBelow or InsertRowsAbove method, specifying how many rows to insert. In this example, one row is inserted below the current selection:

    Selection.InsertRowsBelow 1

## Deleting a Row from a Table

To delete a row, use the Delete method with the appropriate Row object. The Delete method takes no arguments. The following statement deletes the first row in the table referenced by the object variable myTable:

    myTable.Rows(1).Delete

## Setting the Height of One or More Rows

You can set the height of rows by letting Word set the row height automatically, by using the SetHeight method to specify an exact height or a minimum height, or by setting the Height property of the row or rows directly.

To have Word set the height of a row automatically, set the row's HeightRule property to wdRowHeightAuto. Word then adjusts the height of the row to accommodate the cell with the tallest contents. The following statement sets the HeightRule property for the second row in the fourth table in the active document to wdRowHeightAuto:

    ActiveDocument.Tables(4).Rows(2).HeightRule = wdRowHeightAuto

To specify an exact height or a minimum height for one or more rows, use the SetHeight method with the row or rows. The syntax for the SetHeight method is as follows:

    _expression_.SetHeight RowHeight, HeightRule

Here, _expression_ is an expression that returns a Row object or a Rows collection. RowHeight is a required Single argument specifying the height of the row or rows, measured in points. HeightRule is a required Variant argument specifying the rule for setting the row height: use wdRowHeightAtLeast to specify a minimum height or wdRowHeightExactly to specify an exact height. (The third setting for HeightRule is wdRowHeightAuto, which specifies automatic row height and which you won't want to use in this case.)

Instead of using the SetHeight method, you can set the Height property of the row or rows in question by specifying the height in points:

    Documents("Tables.docm").Tables(3).Rows(3).Height = 33
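And here's a SetHeight counterpart to the examples above, sketched with illustrative values; it gives every row of the first table in the active document an exact height of 20 points:

    'Set an exact height for all rows of the first table
    ActiveDocument.Tables(1).Rows.SetHeight _
        RowHeight:=20, HeightRule:=wdRowHeightExactly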
## Selecting a Row

To select a row, use the Select method for the appropriate Row object. The Select method takes no arguments. The following statements select the last row in the last table in the document named Tables.docm:

    With Documents("Tables.docm")
        .Tables(.Tables.Count).Rows.Last.Select
    End With

## Inserting a Cell

To insert a cell, use the Add method with the Cells collection. The Add method takes the following syntax for the Cells collection:

    _expression_.Add [BeforeCell]

Here, _expression_ is an expression that returns a Cells collection, and BeforeCell is an optional Variant argument that specifies the cell to the left of which the new cell should be inserted. (If you omit the BeforeCell argument, VBA adds a new row of cells to the end of the table if you're using the Cells collection of the Columns collection, or it adds a new cell to the first row in the table if you're using the Cells collection of the Rows collection.)

The following statement inserts a cell before the second cell in the first row of the first table in the document named Tables.docm:

    Documents("Tables.docm").Tables(1).Rows(1).Cells.Add _
        BeforeCell:=Documents("Tables.docm").Tables(1).Rows(1).Cells(2)

## Returning the Text in a Cell

To return the contents of a cell, use the Text property of the Range object for the cell. The following statement returns the text in the first cell in the second row of the third table in the active document and assigns it to the variable strCellText:

    strCellText = ActiveDocument.Tables(3).Rows(2).Cells(1).Range.Text

Because the Text property includes the end-of-cell marker (which takes up two characters), you'll usually want to strip off the last two characters when assigning the Text property to a string, like this:

    strCellText = ActiveDocument.Tables(3).Rows(2).Cells(1).Range.Text
    strCellText = Left(strCellText, Len(strCellText) - 2)

When using the Range object, you can work with any of the objects and collections it contains. For example, to work with the paragraphs in a cell, use the Paragraphs collection.

## Entering Text in a Cell

To enter text in a cell, assign the text to the Text property of the Range object for the cell. The following statements enter text in the first three cells in the first row of the current selection:

    With Selection.Tables(1).Rows(1)
        .Cells(1).Range.Text = "Sample text in first cell."
        .Cells(2).Range.Text = "Sample text in second cell."
        .Cells(3).Range.Text = "Sample text in third cell."
    End With

## Deleting Cells

To delete cells, use the Delete method with the appropriate Cell object or Cells collection. When you delete one or more cells, you must specify what happens to the rest of the table—whether the cells to the right of those you deleted move to the left or whether the cells below those you deleted move up.

The syntax for the Delete method for the Cells collection and the Cell object is as follows:

    _expression_.Delete [ShiftCells]

Here, _expression_ is an expression that returns a Cells collection or a Cell object. ShiftCells is an optional Variant argument that specifies how the cells below or to the right of the deleted cell or cells should move. Use these values:

  * wdDeleteCellsEntireColumn deletes the whole column in which the specified cell (or cells) is located.
  * wdDeleteCellsEntireRow deletes the whole row.
  * wdDeleteCellsShiftLeft moves cells across to the left to fill the gap.
  * wdDeleteCellsShiftUp moves cells up to fill the gap.
## Entering Text in a Cell

To enter text in a cell, assign the text to the Text property of the Range object for the cell. The following statements enter text in the first three cells in the first row of the current selection:

    With Selection.Tables(1).Rows(1)
        .Cells(1).Range.Text = "Sample text in first cell."
        .Cells(2).Range.Text = "Sample text in second cell."
        .Cells(3).Range.Text = "Sample text in third cell."
    End With

## Deleting Cells

To delete cells, use the Delete method with the appropriate Cell object or Cells collection. When you delete one or more cells, you must specify what happens to the rest of the table—whether the cells to the right of those you deleted move to the left or whether the cells below those you deleted move up.

The syntax for the Delete method for the Cells collection and the Cell object is as follows:

    _expression_.Delete [ShiftCells]

Here, _expression_ is an expression that returns a Cells collection or a Cell object. ShiftCells is an optional Variant argument that specifies how the cells below or to the right of the deleted cell or cells should move. Use these values:

 * wdDeleteCellsEntireColumn deletes the whole column in which the specified cell (or cells) is located.
 * wdDeleteCellsEntireRow deletes the whole row.
 * wdDeleteCellsShiftLeft moves cells across to the left to fill the gap.
 * wdDeleteCellsShiftUp moves cells up to fill the gap.

The following statement deletes the first cell in the first row of the first table in the active document and shifts the other cells in the first row to the left to fill the gap:

    ActiveDocument.Tables(1).Rows(1).Cells(1).Delete _
        ShiftCells:=wdDeleteCellsShiftLeft

For procedures that rely on the user to make a selection within a table, you may want to determine how many rows or columns are in the selection before deciding how to shift the cells. The following example checks the number of rows and columns in a selection. If the selection is only one cell, or if the selection is all in one row, the code deletes the cell or cells and moves the other cells in the row to the left. If the selection is multiple cells in one column, the code deletes the cells and moves the other cells in the column up. If the selection spans columns and rows, the code displays a message box asking the user to make a selection in only one row or only one column:

    With Selection
        If .Columns.Count > 1 And .Rows.Count > 1 Then
            MsgBox "Please select cells in only one row " _
                & "or only one column."
            End
        Else
            If .Cells.Count > 1 Then
                If .Columns.Count > 1 Then
                    .Cells.Delete ShiftCells:=wdDeleteCellsShiftLeft
                Else
                    .Cells.Delete ShiftCells:=wdDeleteCellsShiftUp
                End If
            Else
                .Cells.Delete ShiftCells:=wdDeleteCellsShiftLeft
            End If
        End If
    End With

## Selecting a Range of Cells

To select a range of cells within a table, declare a Range variable, assign to it the cells you want to select, and then select the range. The following example declares the Range variable myCells, assigns to it the first four cells in the first table in the active document, and then selects the range:

    Dim myCells As Range
    With ActiveDocument
        Set myCells = .Range(Start:=.Tables(1).Cell(1, 1).Range.Start, _
            End:=.Tables(1).Cell(1, 4).Range.End)
        myCells.Select
    End With

## Converting a Table or Rows to Text

To convert an entire table or a row or number of rows to text, specify the table, row, or rows and use the ConvertToText method. This is frequently useful when you're copying and pasting from web pages, which often contain tables when all you want is the text contents, not the table structure itself. Due to limitations of the HTML language used to describe web-page layout, HTML tables are sometimes even used for spacing and other reasons unrelated to displaying actual tabular data. These faux "tables" can look bizarre when pasted as text into Word or other body text. To see how to get rid of these annoying artifacts, see the example macro at the end of this section. It's a useful macro to add to your Normal project in Word's VBA Editor.

The ConvertToText method takes the following syntax:

    _expression_.ConvertToText(Separator, NestedTables)

Here, _expression_ is a required expression that returns a Table object, a Row object, or a Rows collection. Separator is an optional Variant argument specifying the separator character (also known as the _delimiter_ character) to use to mark where the column divisions were. The possible values are as follows:

 * wdSeparateByCommas separates column information by commas.
 * wdSeparateByDefaultListSeparator separates column information by the currently specified Other list-separator character (the character shown in the text box alongside the Other option button in the Convert Table To Text dialog box).
 * wdSeparateByParagraphs separates column information with paragraph marks.
 * wdSeparateByTabs (the default separator if you don't specify one) separates column information by tabs.
 * Alternatively, you can specify a separator character of your choice as a string enclosed in double quotation marks. For example, enter **Separator:="|"** to use a vertical bar [|] as the separator. (Although you can supply more than one separator character here, Word uses only the first character.)

The following statement converts the first table in the current selection to text using an asterisk (*) as the separator character:

    Selection.Tables(1).ConvertToText Separator:="*"

You can use the ConvertToText method with a Table object, a Row object, or a Rows collection. The following statement converts only the first row of the selected table to tab-delimited text:

    Selection.Tables(1).Rows(1).ConvertToText Separator:=wdSeparateByTabs

If you need to continue working with the contents of the table once you've converted it, assign the range that the ConvertToText method returns to a Range variable as you convert the table. You can then work with the Range object afterward to manipulate the information. For example, the following statements convert the first table in the document named Cleveland Report.docm to text separated by paragraphs, assign the converted information to the range exTable, and then copy the range, create a new document, and paste in the information:

    Dim exTable As Range
    Set exTable = Documents("Cleveland Report.docm").Tables(1). _
        ConvertToText(Separator:=wdSeparateByParagraphs)
    exTable.Copy
    Documents.Add
    Selection.Paste

Often when you copy and paste information from a web page, it's in a tabular format. If you paste such tables into Word, the result usually doesn't look right, is too bulky, and can be difficult to edit or format. In other words, you want to remove the web-page table definitions but leave the data in a usable format within Word.

The following macro does just that:

    Sub Untable()

        On Error Resume Next

        Selection.Rows.ConvertToText Separator:=wdSeparateByCommas, NestedTables:= _
            True
        Selection.MoveDown Unit:=wdLine, Count:=1

        If Err Then MsgBox "No table was detected, dude."

    End Sub

To use this macro, click somewhere within the text you've pasted from the Internet to put the insertion cursor in a table (or a suspected table; these often don't look like tables, merely like an area of bizarre formatting), then execute the macro. You may need to execute this macro more than once to completely eliminate all the tabular formatting debris left over from the original HTML. The macro tells you when all table structures have been destroyed, and not only that—it calls you "dude."

# The Bottom Line

**Use Find and Replace via VBA.**

Word's Find and Replace utilities are frequently valuable to the VBA programmer. You'll want to master them and also some subtleties associated with their use.

Master It

Sometimes when replacing, you need to go through a document more than once—using a loop structure. Why would you ever need to repeatedly search and replace the same document? Doesn't the Replace All setting in fact _replace all_?

**Work with headers, footers, and page numbers.**

All Word documents contain headers and footers, even if they are empty. In addition, you can insert various types of headers and footers.

Master It

Name two types of headers you can use in a Word document.
**Manage sections, page setup, windows, and views.**

Among the various ways you can view a document, you sometimes want to have the document automatically scroll to a particular table, graphic, or other target.

Master It

What method of the Window object can be used to easily accomplish this task?

**Manipulate tables.**

When you need to manage tables in Word documents, you can employ VBA to work with the Table object, which represents a single table. If a document contains more than one table, they are referenced through a collection of Table objects.

Master It

Name two important and useful objects within the Tables collection or the Table object.

Chapter 22

Understanding the Excel Object Model and Key Objects

This chapter shows you how to start working with the Excel object model, the architecture underlying Excel. It also shows you how to perform common actions with the most immediately useful Excel objects. These objects include the Workbooks collection and the Workbook object, the ActiveCell object, and Range objects. You'll also see how to set options in Excel.

In this chapter you will learn to do the following:

 * Work with workbooks
 * Work with worksheets
 * Work with the active cell or selection
 * Work with ranges
 * Set options

# Getting an Overview of the Excel Object Model

As with the other Office applications, it's not necessary (or even possible for most people) to understand how the entire Excel object model fits together in order to work with VBA in Excel, but most people find that knowing the main objects in the object model is helpful. And often the code examples in the Help system's object-model reference are invaluable, showing you how and where to employ objects in your own programming.

To see the Excel object-model reference, follow these steps:

1. Launch or activate Excel, and then press Alt+F11 to launch or activate the VBA Editor.

2. Move your cursor to a blank space in the code window (to avoid context-sensitive help).

3. Press F1 in the Editor to launch the web page for the VBA language reference for Office 2013.

4. In the Bing search field, type **excel 2013 object model** and press Enter.

5. Click the link _Object Model Reference_ (_Excel 2013 Developer Reference_). You'll now have access to the whole collection of syntax specifications, useful descriptions, and code examples, as shown in Figure 22.1.

Figure 22.1 The entries in the Excel object model reference will help you write your own VBA code.

* * *

Help When Migrating Legacy Code from Earlier Office Projects

If you've inherited VBA code written in earlier versions of Office, those procedures might contain objects, methods, and properties that have been changed in Office 2013. Though modifications to object models are generally few, some incompatibilities can crop up and "break" the code so it won't run correctly. Fortunately, you can download a free utility, the Office Code Compatibility Inspector, that will flag objects and their members that have changed. It does a text comparison of the Office 2013 object model against VBA code written in earlier versions of Office.
You can download the Compatibility Inspector from this web page:

www.microsoft.com/en-us/download/details.aspx?id=15001

* * *

# Understanding Excel's Creatable Objects

Excel _exposes_ (makes available for your use in code) various _creatable_ objects, meaning that you can employ most of the important objects in its object model without explicitly going through (mentioning) the Application object. For most programming purposes, these creatable objects are the most commonly used objects. Here's a list:

 * The Workbooks collection contains the Workbook objects that represent all the open workbooks. Within a workbook, the Sheets collection contains the Worksheet objects that represent the worksheets and the Chart objects that represent chart sheets. On a sheet, the Range object gives you access to ranges, which can be anything from an individual cell to a complete worksheet. Remember that, because the Workbooks collection is creatable, you need not write Application.Workbooks in your code. You can leave off the Application and merely write Workbooks, as shown in the example after this list.
 * The ActiveWorkbook object represents the currently active workbook.
 * The ActiveSheet object represents the active worksheet.
 * The Windows collection contains the Window objects that represent all the open windows.
 * The ActiveWindow object represents the active window. When using this object, be sure to check that the window it represents is the type of window you want to manipulate, because the object returns whatever window currently has the focus.
 * The ActiveCell object represents, you guessed it, the active cell. This object is especially valuable for simple procedures (for example, those that compute values or correct formatting) that work on a cell selected by the user.
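For instance, these two statements (a trivial sketch; the workbook name is an arbitrary example) do exactly the same thing:

    ' Fully qualified through the Application object
    Application.Workbooks("Data.xlsx").Activate
    ' Shorter form, relying on Workbooks being creatable
    Workbooks("Data.xlsx").Activate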
# Managing Workbooks

In many of your Excel procedures, you'll need to manipulate workbooks: creating new workbooks, saving them in various locations and formats, opening existing workbooks, and closing and printing workbooks. To accomplish these tasks, you employ the Workbooks collection, which contains a Workbook object for each open workbook in Excel.

## Creating a Workbook

To create a new workbook, use the Add method with the Workbooks collection. The syntax is as follows:

    Workbooks.Add(Template)

Here, Template is an optional Variant argument that specifies how to create the workbook. The following subsections discuss the available options.

### Creating a New Blank Workbook

To create a blank workbook (as if you'd clicked the File tab on the Ribbon, then clicked the New button), omit the Template argument:

    Workbooks.Add

The new workbook receives the number of sheets specified in the Excel Options dialog box (click the File tab on the Ribbon, then choose Options to display the When Creating New Workbooks section of the dialog box—you'll see a field where you can adjust the Include This Many Sheets option).

You can get or set this value in VBA by using the SheetsInNewWorkbook property of the Application object. For example, the following macro declares an Integer variable named mySiNW, stores the current SheetsInNewWorkbook property in it, sets the SheetsInNewWorkbook property to 12, creates a new workbook (with those 12 worksheets), and then restores the SheetsInNewWorkbook setting to its previous value:

    Sub MVBA_New_Workbook_with_12_Sheets()
        Dim mySiNW As Integer
        mySiNW = Application.SheetsInNewWorkbook
        Application.SheetsInNewWorkbook = 12
        Workbooks.Add
        Application.SheetsInNewWorkbook = mySiNW
    End Sub

### Creating a New Workbook Based on a Template

To create a workbook based on a template, specify the full path and name of the template file. For example, the following statement creates a new workbook based on the template Balance Sheet.xlt in the network folder \\server\template\excel:

    Workbooks.Add Template:="\\server\template\excel\Balance Sheet.xlt"

### Creating a New Workbook Based on an Existing Workbook

To create a workbook based on an existing workbook, specify the full name and path of the workbook file. For example, the following statement creates a new workbook based on the existing workbook named Personnel.xlsx in the C:\Business folder:

    Workbooks.Add Template:="C:\Business\Personnel.xlsx"

### Creating a Chart Workbook, a Macro Sheet, or a Worksheet

You can also create a workbook that contains a single chart, macro sheet, or worksheet by using the constants shown in Table 22.1 with the Template argument.

Table 22.1 Constants for creating a chart workbook, macro sheet, or worksheet

**Constant** | **Creates a Workbook Containing**
---|---
xlWBATChart | A chart sheet
xlWBATExcel4IntlMacroSheet | An international macro sheet
xlWBATExcel4MacroSheet | A macro sheet
xlWBATWorksheet | A worksheet

For example, the following statement creates a workbook containing a single chart sheet:

    Workbooks.Add Template:=xlWBATChart

## Saving a Workbook

The first time you save a workbook, you must specify the path and filename to use (this is the SaveAs option). After that, you can save the workbook under the same name or specify a different path, name, format, or all three (this is the Save option).

### Saving a Workbook for the First Time or as a Different File

To save a workbook for the first time, or to save a workbook using a different path, name, or format, use the SaveAs method. The syntax is as follows:

    _expression_.SaveAs(FileName, FileFormat, Password, WriteResPassword,
        ReadOnlyRecommended, CreateBackup, AccessMode, ConflictResolution,
        AddToMru, TextCodePage, TextVisualLayout, Local)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Workbook object.
 * FileName is an optional Variant argument that specifies the name for the workbook. If you omit FileName, VBA uses the current folder and the default filename of Book _n_.xlsx for a workbook, where _n_ is the next available number (for example, Book5.xlsx).

VBA uses the default file format, which is specified in the Options dialog box's Save page. (Click the File tab on the Ribbon, then click Options to display the Options dialog box, then click the Save button on the left. You'll see a Save Files In This Format drop-down list.)

You can get and set the default save format by using the DefaultSaveFormat property of the Application object.
For example, the following statement sets the default save format to xlNormal, the "Excel Workbook" format:

    Application.DefaultSaveFormat = xlNormal

 * FileFormat is an optional Variant argument that specifies the format in which to save the workbook. Table 22.2 lists the XlFileFormat constants for specifying commonly used formats.

* * *

Be Careful Not to Accidentally Overwrite a File

When saving a workbook to a folder, you should check whether a workbook with the same name already exists in the folder. If it does, and unless you prevent it, VBA overwrites it without warning, causing data loss. See "Using the Dir Function to Check Whether a File Exists" in Chapter 9, "Using Built-in Functions," for instructions on how to check whether a file with a particular filename already exists.

* * *

 * Password is an optional Variant argument that you can use to supply the password that is to be required to open the workbook (the "password to open"). Password is case sensitive. If the user can't provide the password, Excel won't open the workbook.
 * WriteResPassword is an optional Variant argument that you can use to supply the password that is required to open the workbook in a writable form (the "password to modify"). WriteResPassword is case sensitive. If the user can't provide the password, Excel will open the workbook as read-only.
 * ReadOnlyRecommended is an optional Variant argument that you can set to True to have Excel recommend that the user open the document as read-only. Such recommendations typically carry little force, and you'll do better to protect the workbook with a "password to modify."
 * CreateBackup is an optional Variant argument that you can set to True to make Excel automatically create a backup of the workbook. The default setting is False.
 * AccessMode is an optional argument that you can use to specify whether the workbook is shared or is in Exclusive mode. Specify xlExclusive for Exclusive mode, xlShared for Shared mode, and xlNoChange to leave the access mode unchanged (this is the default setting).
 * ConflictResolution is an optional argument that you can use to specify how to resolve any conflicting changes to the workbook. Use xlLocalSessionChanges to accept the changes in the current Excel session, xlOtherSessionChanges to accept the other user's or users' changes, and xlUserResolution to display the Resolve Conflicts dialog box so that the user can choose how to resolve the conflicts.
 * AddToMru is an optional Variant argument that you can set to True to add the workbook to the list of recently used files at the bottom of the File menu. The default setting is False.
 * TextCodePage and TextVisualLayout are optional Variant arguments used in international versions of Excel (not in U.S. English Excel).
 * Local is an optional Variant that controls whether the language used is that of Excel (True) or of VBA (False). (You'll seldom need to use Local.)
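To pull several of these arguments together, here's a hedged sketch (the path, filename, and password are invented for illustration) that saves the active workbook under a new name, sets a "password to modify," and recommends read-only opening:

    ActiveWorkbook.SaveAs FileName:="C:\Business\Draft Budget.xlsx", _
        WriteResPassword:="m0d1fy", ReadOnlyRecommended:=True, _
        AddToMru:=True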
Table 22.2 XlFileFormat constants for widely used formats

**Constant** | **Saves Document As**
---|---
xlNormal | A normal workbook
xlXMLSpreadsheet | An XML spreadsheet
xlWebArchive | A single-file web page
xlHtml | A web page
xlTemplate | A template
xlExcel9795 | An Excel workbook for Excel versions 95 and later

For example, the following statement saves the active workbook in the current folder under the name Salaries.xlsx and using the default save format:

    ActiveWorkbook.SaveAs FileName:="Salaries.xlsx"

The following statement saves the open workbook named Schedule.xlsx under the name Building Schedule.xlsx in the folder named \\server2\Public using the Microsoft Excel 97–2003 & 5.0/95 format (from Excel 2003):

    ActiveWorkbook.SaveAs Filename:="\\server2\Public\Building Schedule.xlsx", _
        FileFormat:=xlExcel9795

To see a complete list of all the Excel 2013 file formats, visit this web page:

### Saving a Workbook That Has Already Been Saved

Once a workbook has been saved, you can just save it again with the same name by using the Save method. For a Workbook object, the Save method takes no arguments. For example, the following statement saves the workbook named Data Book.xlsx:

    Workbooks("Data Book.xlsx").Save

### Saving All Open Workbooks

The Workbooks collection doesn't have a Save method, but you can save all open workbooks by using a loop such as that shown in the following subroutine:

    Sub Save_All_Workbooks()
        Dim myWorkbook As Workbook
        For Each myWorkbook In Workbooks
            myWorkbook.Save
        Next myWorkbook
    End Sub

Note that if any of the currently opened workbooks have not been previously saved, and if they include any macros, a security message will be displayed when this procedure executes. Users are told that they must agree to save the potentially dangerous executable content in a macro-enabled file format (.xlsm). However, if the file has already been saved with the .xlsm filename extension, no message is displayed. If you want to suppress such messages, you can insert the following code at the start of this procedure:

    Application.DisplayAlerts = False

However, be sure to set the DisplayAlerts property back to True as soon as you can in the code. This particular warning message is quite useful as a reminder to the user—so you likely won't want to suppress it.

## Accessing Cloud Storage

Having VBA access SkyDrive, Dropbox, or one of the other cloud storage systems is fairly easy. Just open or save a file from the SkyDrive or Dropbox folder.

The only thing to figure out is the file path, and it will look something like this: "C:\Users\_Richard_\SkyDrive\ExcelToCloudTest", with _Richard_ replaced by your name.

This example saves the current document to SkyDrive. Because this is a source of so many errors, I repeat: Change my name, Richard, to your name in the file path in this example code:

    ActiveWorkbook.SaveAs ("C:\Users\Richard\SkyDrive\ExcelCloudTest")

To save to Dropbox, it's pretty much the same:

    ActiveWorkbook.SaveAs ("C:\Users\Richard\DropBox\ExcelCloudTest")

## Opening a Workbook

To open a workbook, use the Open method with the Workbooks collection.
The syntax is as follows:

    _expression_.Open(FileName, UpdateLinks, ReadOnly, Format, Password,
        WriteResPassword, IgnoreReadOnlyRecommended, Origin, Delimiter,
        Editable, Notify, Converter, AddToMru, Local, CorruptLoad)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Workbooks collection. Often, you'll want to use the Workbooks collection itself.
 * FileName is a required String argument that supplies the path and name of the workbook to open.
 * UpdateLinks is an optional Variant that controls how Excel updates any links in the workbook. If you leave out this argument, the user is prompted to specify how to update the links. Table 22.3 shows the values and their effects. If Microsoft Excel is opening a file in the WKS, WK1, or WK3 format and the UpdateLinks argument is 2, Microsoft Excel generates charts from the graphs attached to the file. If the argument is 0, no charts are created.
 * ReadOnly is an optional Variant that you can set to True to open the workbook as read-only. The default is False.
 * Format is an optional Variant that you can use to specify the delimiter character when opening a text file. Use 1 for tabs, 2 for commas, 3 for spaces, 4 for semicolons, 5 for no delimiter character, and 6 for a delimiter you specify using the Delimiter argument.
 * Password is an optional Variant argument that you can use to provide the password required to open the workbook (the "password to open"). Password is case sensitive. If you omit Password and a password is required, Excel prompts the user for it.

* * *

Don't Include Passwords in Your Procedures

If possible, avoid placing passwords in your code, because it may be possible for other people to read them.

* * *

 * WriteResPassword is an optional Variant argument that you can use to provide the password required to open the workbook in a writable form (the "password to modify"). WriteResPassword is case sensitive. If you omit WriteResPassword and a password is required, Excel prompts the user for it.
 * IgnoreReadOnlyRecommended is an optional Variant argument that you can set to True to have Excel ignore a read-only recommendation on the workbook.
 * Origin is an optional Variant argument that you can use when opening a text file to specify the operating system used to encode it and thus how to treat carriage-return/line-feed characters and character encoding. Use xlWindows to indicate Windows, xlMacintosh to indicate Mac OS, or xlMSDOS to indicate DOS.
 * Delimiter is an optional Variant argument you can use with a Format value of 6 to specify one delimiter character to use when opening a text file.
 * Editable is an optional Variant argument that you can set to True when FileName specifies a template to open the template itself rather than start a workbook based on the template (False). Editable also applies to Excel 4.0 add-ins: True opens the add-in in a visible window, while False opens the add-in hidden. However, you can't employ this option with add-ins created in Excel 5.0 or later.
 * Notify is an optional Variant argument that you can set to True to have Excel add the workbook to the notification list when someone else has the workbook open for editing and VBA requests the workbook. Excel then notifies the user when the workbook becomes available. If you specify Notify:=False, opening the workbook fails if someone else has the workbook open.
 * Converter is an optional Variant argument that you can use to specify the first file converter to use when opening a file.
 * AddToMru is an optional Variant argument that you can set to True to add the workbook to the list of recently used files at the bottom of the File menu. The default setting is False.
 * Local is an optional Variant that controls whether the language used is that of Excel (True) or of VBA (False). (You'll seldom need to use Local.)
 * CorruptLoad is an optional Variant that you can use to control how Excel handles corruption it encounters when opening the workbook. Use xlNormalLoad to use normal behavior—first, opening the workbook as usual; second, repairing the file if there's a problem; and third, recovering the data from the workbook. Use xlRepairFile to go straight to the repair stage or xlExtractData to go straight to the recovery stage.

Table 22.3 Values for the UpdateLinks argument

**Value** | **Effect**
---|---
(If you omit this argument) | Excel prompts the user to decide how to update links.
1 | User specifies how links are to be updated.
2 | Links are never updated for this workbook when it's opened.
3 | Excel always updates links for this workbook when opening it.

For example, the following statement opens the workbook named Expenses.xlsx stored in the C:\Business folder without updating links:

    Workbooks.Open Filename:="C:\Business\Expenses.xlsx", UpdateLinks:=0

The following statement opens the workbook named Plan.xlsx stored in the D:\Planning folder, providing the password for opening the workbook:

    Workbooks.Open Filename:="D:\Planning\Plan.xlsx", Password:="s@cur1ng!"

The following statement opens the text file named Data13.txt in the folder z:\transfer using an exclamation point (!) as the delimiter character:

    Workbooks.Open _
        Filename:="z:\transfer\Data13.txt", Format:=6, Delimiter:="!"

## Closing a Workbook

To close a workbook, use the Close method with the appropriate Workbook object. The syntax is as follows:

    _expression_.Close(SaveChanges, Filename, RouteWorkbook)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Workbook object or the Workbooks collection.
 * SaveChanges is an optional Variant argument that lets you specify whether to save any unsaved changes in the workbook (True) or not (False). If you omit the SaveChanges argument, Excel prompts the user to save any workbook that contains unsaved changes.
 * Filename is an optional Variant that you can use to specify the filename under which to save the workbook if it contains changes. In most cases, it's best to use the SaveAs method to save the workbook under a different name before you use the Close method to close it.
 * RouteWorkbook is an optional Variant argument that you can set to True to route the workbook to the next recipient on its routing slip, or False to refrain from routing the workbook. If the workbook has no routing slip attached, RouteWorkbook has no effect.

For example, the following statement closes the active workbook without saving changes:

    ActiveWorkbook.Close SaveChanges:=False

### Closing All Open Workbooks

To close all open workbooks, use the Close method with the Workbooks collection:

    Workbooks.Close

The Close method takes no arguments. Excel prompts you to save any workbook that contains unsaved changes. If such prompts will be inconvenient in a procedure, use a loop with the Workbooks collection to close each open workbook individually, using the SaveChanges argument to control whether Excel saves or discards any unsaved changes, as in the sketch that follows.
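Here's a minimal sketch of such a loop. It counts backward because the Close method removes each workbook from the Workbooks collection as it runs, and counting forward could skip members as the collection shrinks:

    Sub Close_All_Workbooks()
        Dim i As Long
        ' Count down so closing a workbook doesn't disturb the remaining indexes
        For i = Workbooks.Count To 1 Step -1
            Workbooks(i).Close SaveChanges:=True
        Next i
    End Sub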
## Sharing a Workbook

To determine whether a workbook is shared, check its MultiUserEditing property. This is a read-only Boolean property.

To share a workbook, use the SaveAs method (discussed in "Saving a Workbook for the First Time or as a Different File," earlier in this chapter) to save the file using the xlShared value for the AccessMode argument.

For example, the following statements share the workbook named Brainstorming.xlsx if it is not already shared:

    With Workbooks("Brainstorming.xlsx")
        If .MultiUserEditing = False Then
            .SaveAs Filename:=.FullName, AccessMode:=xlShared
        End If
    End With

## Protecting a Workbook

To protect a workbook, use the Protect method with the appropriate Workbook object. The syntax is as follows:

    _expression_.Protect(Password, Structure, Windows)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Workbook object.
 * Password is an optional Variant argument that specifies the password for unprotecting the workbook. Password is case sensitive. You'll almost always want to supply Password—if you don't, anybody who can open your workbook can unprotect it.
 * Structure is an optional Variant argument that you can set to True to protect the workbook's structure (how the worksheets are positioned relative to each other) or leave at its default setting, False.
 * Windows is an optional Variant argument that you can set to True to protect the workbook windows or omit to leave the windows unprotected.

For example, the following statement protects the structure and windows of the active workbook with the password 0llsecurd:

    ActiveWorkbook.Protect Password:="0llsecurd", Structure:=True, Windows:=True

* * *

You Can Protect Workbooks against Both Writing (Editing) and Reading

In addition to protecting a workbook against modifications, you can protect it against being opened and viewed. See the sidebar "Setting Passwords and Read-Only Recommendations for a Workbook" later in this chapter for details.

* * *

## Working with the ActiveWorkbook Object

The ActiveWorkbook object returns a Workbook object that represents the active workbook (whichever workbook currently has the focus in the Excel window). The ActiveWorkbook object behaves like a Workbook object and is very useful in procedures that users execute (put another way, macros that users run) after opening the workbook that they want to manipulate.

If no workbook is open, there is no ActiveWorkbook object, so any code that tries to use the ActiveWorkbook object returns an error. Users can run macros when no workbook is open in Excel, so it's a good idea to verify that at least one workbook is open before trying to execute code that assumes there is an active workbook. One option is to check that the ActiveWorkbook object is not Nothing before running the code, as in the following example:

    If ActiveWorkbook Is Nothing Then
        MsgBox "Please open a workbook and click in it before running this macro." _
            & vbCr & vbCr & "This macro will now end.", _
            vbOKOnly + vbExclamation, "No Workbook Is Open"
        End
    End If

It's also a good idea to check that the workbook your code assumes is the active workbook actually _is_ the active workbook.
This problem can easily occur when a procedure starts with the active workbook and then creates a new workbook to work in; the new workbook becomes the active workbook, and from this point on, the code may start accessing the wrong workbook.

If there's any doubt about which workbook you're working with, declare a Workbook object variable and use that object variable in your code rather than the ActiveWorkbook object. For example, the following statements declare a Workbook object variable and assign the ActiveWorkbook object to it, so that subsequent code can work with the object variable:

    Dim myWorkbook As Workbook
    Set myWorkbook = ActiveWorkbook
    With myWorkbook
        'actions here
    End With

# Working with Worksheets

Most workbooks you need to manipulate via VBA will contain one or more worksheets. As a result, many procedures will need to work with worksheets—inserting them, deleting them, copying or moving them, or simply printing the appropriate range from them.

Each worksheet is represented by a Worksheet object. The Worksheet objects are contained within the Sheets collection.

## Inserting a Worksheet

To insert a worksheet into a workbook, use the Add method with the Sheets collection. The syntax is as follows:

    _expression_.Add(Before, After, Count, Type)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Sheets collection. Often, you'll want to use the Sheets collection itself.
 * Before is an optional Variant argument that specifies the sheet before which to add the new sheet. After is an optional Variant argument that specifies the sheet after which to add the new sheet. Typically, you'll want to specify either Before or After, but not both. You can also omit both arguments to make Excel insert the new sheet before the active worksheet.
 * Count is an optional Variant argument that specifies how many sheets to add. If you omit Count, VBA uses the default value, 1.
 * Type is an optional Variant that specifies the type of sheet to insert. The default is xlWorksheet, a standard worksheet. You can also insert a chart sheet (xlChart), an Excel 4 macro sheet (xlExcel4MacroSheet), or an Excel 4 international macro sheet (xlExcel4IntlMacroSheet).

For example, the following statements declare a Worksheet object variable named mySheet, insert a worksheet before the first sheet in the first open workbook and assign the new sheet to mySheet, and then set the Name property of mySheet to Summary (the Name property controls the text that appears on the worksheet's tab):

    Dim mySheet As Worksheet
    Set mySheet = Workbooks(1).Sheets.Add(Before:=Sheets(1))
    mySheet.Name = "Summary"

The following statements insert two chart sheets after the last worksheet in the active workbook. The chart sheets receive default names, such as Chart1 and Chart2:

    ActiveWorkbook.Sheets.Add _
        After:=Sheets(Sheets.Count), Count:=2, Type:=xlChart

## Deleting a Worksheet

To delete a worksheet, use the Delete method of the appropriate Worksheet object. The Delete method takes no arguments. For example, the following statement deletes the worksheet named Summary from the workbook referenced by the object variable myWorkbook:

    myWorkbook.Sheets("Summary").Delete

If you delete a worksheet, you lose any data stored on that worksheet, so Excel asks the user to confirm the deletion by default (see Figure 22.2).
If you need to avoid this user interaction—for example, in a procedure that adds a worksheet without the user's knowledge, uses it to manipulate data, and then deletes it—you can turn off alerts in Excel by setting the DisplayAlerts property of the Application object to False before deleting the worksheet and then turning alerts back on:

    Application.DisplayAlerts = False
    myWorkbook.Sheets("Summary").Delete
    Application.DisplayAlerts = True

Figure 22.2 When deleting a worksheet, you must either suppress alerts in Excel or have the user confirm the deletion in this dialog box.

## Copying or Moving a Worksheet

To copy a worksheet, use the Copy method of the appropriate Worksheet object. To move a worksheet, use the Move method. The syntax is as follows:

    _expression_.Copy(Before, After)
    _expression_.Move(Before, After)

Here, _expression_ is a required expression that returns a Worksheet object. Before is an optional Variant argument that specifies the sheet before which to place the copy or the moved sheet. After is an optional Variant argument that specifies the sheet after which to place it:

 * Typically, you'll want to specify either Before or After, but not both.
 * You can specify another workbook by name to copy or move the worksheet to another workbook.
 * You can also omit both arguments to make Excel create a new workbook containing the copied or moved sheet. The new workbook becomes the active workbook, so you can use the ActiveWorkbook object to start working with it or to assign it to an object variable.

For example, the following statement copies the worksheet named Costs - Materials in the workbook named Building Schedule.xlsx, placing the copy after the last of the current worksheets in the workbook:

    Workbooks("Building Schedule.xlsx").Sheets("Costs - Materials").Copy _
        After:=Sheets(Sheets.Count)

The following line of code moves the worksheet named Homes from the workbook named Planning.xlsx to the workbook named Building Schedule.xlsx, inserting the worksheet before the first existing worksheet in the workbook:

    Workbooks("Planning.xlsx").Sheets("Homes").Move _
        Before:=Workbooks("Building Schedule.xlsx").Sheets(1)

## Printing a Worksheet

To print a worksheet, use the PrintOut method with the appropriate Worksheet object.

* * *

The PrintOut Method Can Be Used with Several Objects

Various objects in addition to an individual worksheet have a PrintOut method, including the Worksheets collection, the Chart object and the Charts collection, the Workbook object, the Window object, and the Range object.

* * *

The syntax for the PrintOut method is as follows:

    _expression_.PrintOut(From, To, Copies, Preview, ActivePrinter,
        PrintToFile, Collate, PrToFileName, IgnorePrintAreas)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns the appropriate Worksheet object or other object to which the PrintOut method applies.
 * From is an optional Variant argument that specifies the number of the page at which to start printing. Omit From to start printing at the beginning of the object. Note that From and To refer to the pages in the printout, not to the overall number of pages that the object would take up.
 * To is an optional Variant argument that specifies the number of the page at which to stop printing. Omit the To argument to print to the end of the object.
 * Copies is an optional Variant argument that specifies the number of copies to print.
If you omit Copies, Excel prints one copy.

 * Preview is an optional Variant argument that you can set to True to display the object in Print Preview before printing it. Set Preview to False, or simply omit this argument, to print the object without previewing it. Use the PrintPreview method to display an object in Print Preview without printing it.
 * ActivePrinter is an optional Variant argument that you can use to specify the printer on which to print.
 * PrintToFile is an optional Variant argument that you can set to True to make Excel print to a print file rather than a printer. When printing to a file, you can use the PrToFileName property to specify the filename, or omit it and have Excel prompt the user for the filename.
 * Collate is an optional Variant argument that you can set to True to have Excel print multiple copies for collation rather than printing all the copies of one page, all the copies of the next, and so on.
 * PrToFileName is an optional Variant argument that you can use with PrintToFile:=True to specify the filename of the print file.
 * IgnorePrintAreas is an optional Variant argument. Set to False, this argument prints the entire specified print area; when it's True, the entire object is printed and any print area is ignored. A _print area_ can be defined in Excel and is useful as a way of printing only a specified range of cells. Once specified, the print area is retained by Excel until you either clear it or specify a new print area. You define a print area by selecting the cells you want to print, then clicking the Ribbon's Page Layout tab. Click the Print Area option in the Page Setup area of the Ribbon.

The following statement prints two copies of each page of the first worksheet in the active workbook, collating the pages:

    ActiveWorkbook.Sheets(1).PrintOut Copies:=2, Collate:=True

The following statement prints the first two pages of the worksheet named Summary in the workbook named Planning.xlsx to a file named Planning Summary.prn in the network folder \\server\to_print:

    Workbooks("Planning.xlsx").Sheets("Summary").PrintOut From:=1, To:=2, _
        PrintToFile:=True, _
        PrToFileName:="\\server\to_print\Planning Summary.prn"

## Protecting a Worksheet

To protect a worksheet, use the Protect method with the appropriate Worksheet object. The syntax is as follows:

    _expression_.Protect(Password, DrawingObjects, Contents, Scenarios,
        UserInterfaceOnly, AllowFormattingCells, AllowFormattingColumns,
        AllowFormattingRows, AllowInsertingColumns, AllowInsertingRows,
        AllowInsertingHyperlinks, AllowDeletingColumns, AllowDeletingRows,
        AllowSorting, AllowFiltering, AllowUsingPivotTables)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Worksheet object.
 * Password is an optional Variant argument that specifies the password for unprotecting the worksheet. Password is case sensitive. You'll almost always want to supply Password to prevent unauthorized people from unprotecting the workbook.
 * DrawingObjects is an optional Variant argument that you can set to True to protect shapes in the worksheet. The default setting is False.
 * Contents is an optional Variant argument that protects the locked cells when set to True, its default value. Set Contents to False to leave the locked cells unprotected.
 * Scenarios is an optional Variant argument that protects scenarios when set to True, its default value.
 * UserInterfaceOnly is an optional Variant argument that you can set to True to leave macros unprotected while protecting the user interface. The default value is False.
 * AllowFormattingCells, AllowFormattingColumns, and AllowFormattingRows are optional Variant arguments that you can set to True to allow the formatting of cells, columns, and rows, respectively. The default value for each argument is False.
 * AllowInsertingColumns, AllowInsertingRows, and AllowInsertingHyperlinks are optional Variant arguments that you can set to True to allow the user to insert columns, rows, and hyperlinks, respectively. The default value for each argument is False.
 * AllowDeletingColumns and AllowDeletingRows are optional Variant arguments that you can set to True to allow the user to delete columns or rows, respectively, where every cell in the column or row is unlocked. The default setting is False.
 * AllowSorting is an optional Variant argument that you can set to True to allow the user to sort unlocked cells on the protected worksheet. The default setting is False.
 * AllowFiltering is an optional Variant argument that you can set to True to allow the user to set filters or change filter criteria (but not enable or disable an autofilter) on a protected worksheet. The default setting is False.
 * AllowUsingPivotTables is an optional Variant argument that you can set to True to allow the user to work with pivot tables on the protected worksheet. The default value is False.

For example, the following statement protects the worksheet referenced by the object variable myWorksheet using the password no1gets1n:

    myWorksheet.Protect Password:="no1gets1n"

The following statement protects the myWorksheet worksheet with the same password but allows the formatting of cells and allows the sorting of unlocked cells:

    myWorksheet.Protect Password:="no1gets1n", AllowFormattingCells:=True, _
        AllowSorting:=True

## Working with the _ActiveSheet_ Object

The ActiveSheet object returns the active worksheet. If you specify a workbook, then the active worksheet in _that_ specified workbook is returned.

If no sheet is active, ActiveSheet returns Nothing. Before executing code that depends on there being an active sheet, it's a good idea to check, as in this example:

    If ActiveSheet Is Nothing Then End

# Working with the Active Cell or Selection

In a procedure that manipulates a selection that the user has made, you'll typically work with either the active cell or the selection. The active cell is always a single cell, but the selection can encompass multiple cells or other objects.

## Working with the Active Cell

The ActiveCell property of the Application object or the Window object returns a Range object that represents the active cell in the Excel application or in the specified window. If you use ActiveCell without specifying the window, VBA returns the active cell in the active window.

For example, the following statement returns the address of the active cell in the active workbook:

    ActiveCell.Address

The following statement returns the text in the active cell in the first window open on the workbook named Planning.xlsx:

    MsgBox Workbooks("Planning.xlsx").Windows(1).ActiveCell.Text

If no worksheet is active, or if a chart sheet is active, there is no active cell. If you try to access ActiveCell, VBA returns an error.
So before using code that assumes there is an active cell, check that ActiveCell is not Nothing:

    If ActiveCell Is Nothing Then End

### Getting and Setting the Value of the Active Cell

To get or set the value of the active cell, use the Value property. For example, the following statement sets the value of the active cell to 25:

    ActiveCell.Value = 25

And the following statement retrieves the value of the active cell:

    MsgBox ActiveCell.Value

### Moving the Active Cell to Another Address

The ActiveCell object is often convenient to work with in your code, so sometimes you'll want to make a different cell the active cell in order to work with it via the ActiveCell object. To make a cell the active cell, use the Activate method with the appropriate Range object. For example, the following statement makes cell B5 the active cell in the worksheet identified by the object variable myWorksheet:

    myWorksheet.Range("B5").Activate

Often, you'll need to move the active cell to a different range a specified number of rows or columns away (in other words, to an address _relative_ to the location of the active cell—as opposed to an _absolute_ address, such as C12). To do so, use the Offset property of the active cell object, specifying the number of rows with the RowOffset argument and the number of columns with the ColumnOffset argument. Use a positive offset to move the active cell right or down and a negative offset to move the active cell left or up. For example, the following statement moves the active cell up two rows (RowOffset:=-2) and four columns to the right (ColumnOffset:=4):

    ActiveCell.Offset(RowOffset:=-2, ColumnOffset:=4).Activate

In procedures that the user triggers (macros), it's often a good idea to return the active cell to where it was when the user started the procedure. To do so, you can store the location of the active cell and then return it to the stored location after your procedure is finished with its tasks. Here's an example:

    Set myActiveCell = ActiveCell
    Set myActiveWorksheet = ActiveSheet
    Set myActiveWorkbook = ActiveWorkbook

    'take actions here

    myActiveWorkbook.Activate
    myActiveWorksheet.Activate
    myActiveCell.Activate

* * *

Be Careful with Equations That Use Relative Cell Addresses

Always test your procedures carefully with various types of data. Errors can sometimes occur when you move cells that contain equations that use relative cell addresses.

* * *

### Working with the Region around the Active Cell

You can work with the range of cells around the active cell by using the CurrentRegion property, which returns a Range object representing the current region. The current region extends from the active cell to the first _blank_ row above and below and to the first blank column to the left and right. In other words, if there are no blank rows or columns in the entire worksheet, then the region is all the cells in the worksheet.

For example, the following statements use the Font property of the range returned by CurrentRegion to set the font of the current region to 12-point Times New Roman with no bold or italic:

    With ActiveCell.CurrentRegion.Font
        .Name = "Times New Roman"
        .Size = 12
        .Bold = False
        .Italic = False
    End With

## Working with the User's Selection

In macros designed to be run by a user, you will often need to work with cells that the user has selected. For example, a user might select a range of cells and then run a macro to manipulate the contents of the range.
To work with the range the user has selected, use the RangeSelection property of the appropriate Window object. For example, you might assign the RangeSelection property to a range so that you could work with it in a macro and then select it again at the end of the macro, leaving the user ready to work with their selection again. Here's an example:

    Dim myMacroRange As Range
    Set myMacroRange = ActiveWindow.RangeSelection
    With myMacroRange
        'take actions on the range here
    End With
    myMacroRange.Activate

# Working with Ranges

Within a worksheet, you'll often need to manipulate ranges of cells. You can work with _absolute_ ranges (ranges for which you specify the absolute addresses of the cells you want to affect, such as C12) or ranges relative to the active cell, where you merely describe an offset.

You can either specify a range by using the Range property or create a named range by using the Names collection. Excel also provides the UsedRange property for working with the used range on a worksheet, and the SpecialCells method of the Range object for working with cells that meet specific criteria.

## Working with a Range of Cells

To work with a range of cells, use the Range property of the appropriate Worksheet object to specify the cells. For example, the following statement sets the value of cell C12 on the active worksheet to 44:

    ActiveSheet.Range("C12").Value = 44

## Creating a Named Range

To create a named range, use the Add method with the Names collection. The syntax is as follows:

    _expression_.Add(Name, RefersTo, Visible, MacroType, ShortcutKey,
        Category, NameLocal, RefersToLocal, CategoryLocal, RefersToR1C1,
        RefersToR1C1Local)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Names object.
 * Name is an optional Variant argument that specifies the name to assign to the named range. Name is required if you don't specify the NameLocal argument (later in this list). The name cannot be a cell reference, nor can it contain spaces.
 * RefersTo is an optional Variant argument that specifies the range for the named range. You need to specify RefersTo unless you use the RefersToLocal argument, the RefersToR1C1 argument, or the RefersToR1C1Local argument.
 * Visible is an optional Variant argument that you can omit, set to True to have Excel make the name visible in the user interface (in the Go To dialog box, the Paste Name dialog box, and other locations), or set to False to make the name hidden.
 * MacroType is an optional Variant argument that you can use to assign a macro type to the range: 1 for a user-defined Function procedure, 2 for a Sub procedure, and 3 or omitted for no macro.
 * ShortcutKey is an optional Variant argument that specifies the shortcut key for a command macro assigned to the named range.
 * Category is an optional Variant argument that specifies the category of the macro or function specified by MacroType. You can specify one of the categories used by the Function Wizard, or specify another name to have Excel create a new category with that name.
 * NameLocal is an optional Variant argument that specifies the name for the range in the local language. Use NameLocal when you omit Name.
 * RefersToLocal is an optional Variant argument that specifies the range for the named range. Use RefersToLocal when you omit RefersTo, RefersToR1C1, and RefersToR1C1Local.
 * CategoryLocal is an optional Variant argument that you use to specify the category of the macro or function specified by MacroType. Use CategoryLocal when you omit Category.
 * RefersToR1C1 is an optional Variant argument that specifies the range for the named range using R1C1 notation (R1C1 would mean row 1, column 1). Use RefersToR1C1 when you omit RefersTo, RefersToLocal, and RefersToR1C1Local.
 * RefersToR1C1Local is an optional Variant argument that specifies the range for the named range using R1C1 notation in the local language. Use RefersToR1C1Local when you omit RefersTo, RefersToLocal, and RefersToR1C1.

For example, the following statement defines a range named myRange that refers to the range A1:G22 on the worksheet named Materials in the workbook named Building Schedule.xlsx:

    Workbooks("Building Schedule.xlsx").Names.Add Name:="myRange", _
        RefersTo:="=Materials!$A$1:$G$22"

## Deleting a Named Range

To delete a named range, use the Delete method with the appropriate Name object. For example, the following statement deletes the range named myRange in the workbook named Building Schedule.xlsx:

    Workbooks("Building Schedule.xlsx").Names("myRange").Delete

## Working with a Named Range

To work with a named range, specify the name with the Range object. For example, the following statements set the row height of the rows in the named range myRange to 20 points and apply 16-point Arial font to the cells:

    With Range("myRange")
        .RowHeight = 20
        .Font.Name = "Arial"
        .Font.Size = 16
    End With

## Working with the Used Range

If you need to work with all the cells on a worksheet, but not with any unoccupied areas of the worksheet, use the UsedRange property. For example, the following statement autofits all the columns in the used range in the active worksheet:

    ActiveSheet.UsedRange.Columns.AutoFit

## Working with the Special Cells

If you need to work with only some types of cells on a worksheet or in a range, use the SpecialCells method of the Range object to return the cells you need. The syntax is as follows:

    _expression_.SpecialCells(Type, Value)

These are the components of the syntax:

 * _expression_ is a required expression that returns a Range object.
 * Type is a required argument that specifies which cells you want. Table 22.4 lists the constants you can use.
 * Value is an optional Variant argument that you can use when Type is xlCellTypeConstants or xlCellTypeFormulas to control which cells Excel includes. Table 22.5 shows the constants and what they return.
Table 22.4 Constants for the Type argument for the SpecialCells method

**Constant** | **Returns This Kind of Cell**
---|---
xlCellTypeAllFormatConditions | All formats
xlCellTypeAllValidation | Cells that use validation
xlCellTypeBlanks | Empty
xlCellTypeComments | Containing notes
xlCellTypeConstants | Containing constants
xlCellTypeFormulas | Containing formulas
xlCellTypeLastCell | The last cell in the used range
xlCellTypeSameFormatConditions | Having the same format
xlCellTypeSameValidation | Containing the same validation criteria
xlCellTypeVisible | All visible

Table 22.5 Constants for the Value argument for the SpecialCells method

**Constant** | **Returns Cells Containing**
---|---
xlErrors | Errors
xlLogical | Logical values
xlNumbers | Numbers
xlTextValues | Text formulas

For example, the following statement activates the last cell in the used range of the worksheet referenced by the object variable myWorksheet:

    myWorksheet.Cells.SpecialCells(Type:=xlCellTypeLastCell).Activate

The following statement selects all the cells that contain formulas resulting in errors in the active worksheet:

    ActiveSheet.Cells.SpecialCells(Type:=xlCellTypeFormulas, _
        Value:=xlErrors).Activate

## Entering a Formula in a Cell

To enter a formula in a cell, set the Formula property of the appropriate Cell object. For example, the following statement enters the formula =SUM($G$12:$G$22) in the active cell:

    ActiveCell.Formula = "=SUM($G$12:$G$22)"

# Setting Options

Unlike with Word, in which most of the options that you find in the Word Options dialog box (click the File tab, then click Options) are available through the Options object, most of Excel's options are located in the Application object. Workbook-specific properties that appear in the Excel Options dialog box, however, are accessed through the appropriate Workbook object.

## Setting Options in the Application Object

The following sections show three examples of setting widely useful options in the Application object.

### Controlling Excel's Calculation

In complex worksheets that perform many calculations, you may need to turn off automatic calculation so that a procedure can enter data quickly without the calculations taking place.

To do so, set the Calculation property of the Application object to xlCalculationManual, enter the data, and then set the Calculation property back to its previous value:

    Dim varAutoCalculation As Variant
    varAutoCalculation = Application.Calculation
    Application.Calculation = xlCalculationManual
    'enter the data here
    Application.Calculation = varAutoCalculation

### Clearing the Recently Used Files List

Sometimes you may find it useful to clear all the entries from recently displayed documents (shown when you click the File tab on the Ribbon, then click Recent). Perhaps, for example, your macro creates some temporary files that you want to delete.

You can do this by setting the Maximum property of the RecentFiles object to 0. After doing so, you likely want to restore the user's previous setting, as the following example illustrates:

    Dim myMax As Long
    With Application.RecentFiles
        myMax = .Maximum 'store the user's preference, currently in effect
        .Maximum = 0
        .Maximum = myMax
    End With

After you execute this code and then click the File tab on the Ribbon and click Recent, no files will be displayed in the Recent Documents list.
+ +### Setting a Default File Location + +To set the default location for saving and opening files, use the DefaultFilePath property of the Application object, as in this example: + + Application.DefaultFilePath = "\\server3\users\mjones\files" + +## Setting Options in a Workbook + +Workbook-specific options include the following: + + * Security options (such as those shown in the following section and the sidebar "Setting Passwords and Read-Only Recommendations for a Workbook") + * Whether to update remote references in the workbook (the Boolean UpdateRemoteReferences property) and whether to save external link values (the Boolean SaveLinkValues property) + * Whether to use AutoRecover (the Boolean EnableAutoRecover property) + * Whether to accept labels in formulas (the Boolean AcceptLabelsInFormulas property) and whether to use the 1904 date system (the Boolean Date1904 property) + +### Forcing Excel to Remove Personal Information from the File Properties When You Save + +To make Excel remove personal information from a workbook's properties when you save it, set the RemovePersonalInformation property of the workbook to True: + + ActiveWorkbook.RemovePersonalInformation = True + +* * * + +**Setting Passwords and Read-Only Recommendations for a Workbook** + +Office's protection works well in a typical workplace. To protect a workbook against an unauthorized user opening it or modifying it, you can set a "password to open" (for reading only) or a "password to modify" on the workbook. You can also specify that when anyone opens a workbook, Excel will recommend that they open it as read-only rather than read/write. + +To set a "password to open," set the Password property of the Workbook object. For example, the following statement sets the active workbook to use the "password to open" 1mpass4: + + ActiveWorkbook.Password = "1mpass4" + +To set a "password to modify," set the WritePassword property of the Workbook object. For example, the following statement sets the active workbook to use the "password to modify" n0mods: + + ActiveWorkbook.WritePassword = "n0mods" + +To apply a read-only recommendation to a workbook, set its ReadOnlyRecommended property to True: + + Workbooks("Strategy.xlsx").ReadOnlyRecommended = True + +* * * + +## Accessing OneNote + +Earlier in this chapter you saw how to access SkyDrive and Dropbox. Simple enough. Dealing with OneNote is another matter because its contents are stored in the tricky XML format. When you write code to deal with XML, the words _efficient, straightforward_ , and _sensible_ do not come to mind. + +VBA isn't built into OneNote, but you can access OneNote from VBA in other Office applications. + +The following example gets the metadata (data about data) from your OneNote notebooks. + +Before you try this code, choose Tools ⇒ References in Excel's VBA Editor and ensure that both Microsoft OneNote 15.0 Object Library and Microsoft XML v6.0 are selected (checked) in the References dialog box. + + 1. Sub GetMetaData() + 2. + 3. 'If it's not currently running, OneNote will be launched + 4. Dim ONote As oneNote.Application + 5. Set ONote = New oneNote.Application + 6. + 7. Dim strXML As String + 8. + 9. ONote.GetHierarchy "", hsNotebooks, strXML, xs2010 'don't use xs2013 + 10. + 11. MsgBox strXML + 12. End Sub + +Lines 4 and 5 create an instance of OneNote and assign it to the ONote object variable. Next we create a string variable in line 7 to hold the metadata. Line 9 uses the GetHierarchy method to fill strXML with the metadata. 
hsNotebooks represents the collection of notebooks in OneNote. The message box displays the results.
+
+# The Bottom Line
+
+**Work with workbooks.**
+
+You often need to create a new, blank workbook in a macro (mimicking a user clicking the File tab on the Ribbon, then clicking the New button). And writing code that accomplishes this is not difficult. It requires only two words.
+
+**Master It**
+
+What code would you write to create a new, blank workbook?
+
+**Work with worksheets.**
+
+Most workbooks you access via VBA will contain one or more worksheets, so most procedures will need to work with worksheets—inserting, deleting, copying, or moving them, or simply printing the appropriate range from them.
+
+**Master It**
+
+Name the object you use in VBA code to represent a worksheet.
+
+**Work with the active cell or selection.**
+
+In a procedure that manipulates a selection that the user has made, you'll typically work with either the active cell or the current selection.
+
+**Master It**
+
+What is the difference between the active cell and a selection?
+
+**Work with ranges.**
+
+Within a worksheet, you'll often need to manipulate ranges of cells. Excel includes a special kind of range—represented by the UsedRange property.
+
+**Master It**
+
+What is unique about UsedRange?
+
+**Set options.**
+
+Word employs an Options object to contain most of the options that you find in the Word Options dialog box (click the File tab on the Ribbon, then click Options). Excel uses a different object to contain its options.
+
+**Master It**
+
+From which object do you access most of Excel's options?
+
+Chapter 23
+
+Working with Widely Used Objects in Excel
+
+In the previous chapter, you learned to work with some of the main objects in the Excel object model, such as Workbook objects, the ActiveCell object, Range objects, and the Options object. This chapter shows you how to expand your programming facility with VBA in Excel by working with charts, windows, and Find and Replace.
+
+In this chapter you will learn to do the following:
+
+ * Work with charts
+ * Work with windows
+ * Work with Find and Replace
+
+# Working with Charts
+
+The following sections show you how to use VBA to create and format charts, either as entire chart sheets in a workbook or as objects on an existing worksheet.
+
+## Creating a Chart
+
+VBA uses the Chart object to represent a chart on a chart sheet and a ChartObject object to represent an embedded chart on a worksheet. The ChartObject object contains a Chart object, which you can manipulate by accessing it through the ChartObject object. Confused? Object classification schemes can be a bit bewildering.
+
+When writing a macro, you create a chart or chart object in a different order than when working interactively and doing things by hand within Excel. Here are the steps you take when creating charts _programmatically_ (via code rather than interactively via a mouse and keyboard):
+
+1. Create a Chart object variable.
+
+2. Instantiate (bring into existence) the Chart object using the Set command.
+
+3. Specify the source range for its data using the SetSourceData method.
+
+4. Specify the chart type using the ChartType property.
+
+5. Specify any other items you need to.
+
+### Creating a Chart on a New Chart Sheet
+
+To create a chart on a new chart sheet, use the Add method with the Charts collection.
The syntax is as follows: + + _expression_.Add(Before, After, Count, Type) + +Here are the components of this syntax: + + * _expression_ is a required expression that returns a Charts collection. + * Before is an optional Variant argument that you can use to specify the sheet before which to add the new chart sheet. After is an optional Variant argument that you can use to specify the sheet after which to add the new sheet. Typically, you'll use either Before or After. If you omit both arguments, VBA adds the new chart sheet before the active sheet. + * Count is an optional Variant argument that you can use to specify how many chart sheets to add. The default is one. + * Type is an optional Variant argument that you can use to specify which kind of chart you want displayed. The choices are xlWorksheet, xlChart, xlExcel4MacroSheet, and xlExcel4IntlMacroSheet. The default value is xlWorksheet, so you have to specify xlChart in the following code example because it adds a chart, not an ordinary worksheet. + +The following code declares an object variable named myChartSheet as being of the Chart type (a chart worksheet) and then assigns to myChartSheet a new chart sheet added after the last existing sheet in the active workbook: + + Dim myChartSheet As Chart + Set myChartSheet = ActiveWorkbook.Sheets.Add _ + (After:=ActiveWorkbook.Sheets(ActiveWorkbook.Sheets.Count), _ + Type:=xlChart) + +### Creating a Chart on an Existing Worksheet + +To create a chart on an existing worksheet, use the Add method with the ChartObjects collection. The syntax is as follows: + + _expression_.Add(Left, Top, Width, Height) + +Here are the components of this syntax: + + * _expression_ is a required expression that returns a ChartObjects collection. + * Left is a required Double (variable type) argument that specifies the position of the upper-left corner of the chart in points from the left edge of cell A1. + * Top is a required Double argument that specifies the position of the upper-left corner of the chart in points from the top edge of cell A1. + * Width is a required Double argument that specifies the width of the chart in points. + * Height is a required Double argument that specifies the height of the chart in points. + +For example, the following statements declare a new ChartObject object named myChartObject and assign to it a new chart object (chart area) 400 points wide by 300 points deep, positioned 200 points from the left edge and 200 points from the top of the worksheet: + + Dim myChartObject As ChartObject + Set myChartObject = ActiveSheet.ChartObjects.Add(Left:=200, Top:=200, _ + Width:=400, Height:=300) + +To work with the chart inside the ChartObject, return the Chart property of the ChartObject object. + +## Specifying the Source Data for the Chart + +So far, the chart (on the chart sheet or in the Chart object) is blank. To give it contents, specify the chart's source data by using the SetSourceData method of the Chart object. For example, the following statement specifies the range A1:E5 on the worksheet named Chart Data in the active workbook as the source data of the Chart object in the ChartObject object named myChartObject: + + myChartObject.Chart. **SetSourceData** Source:= _ + ActiveWorkbook.Sheets("Chart Data").Range("A1:E5") + +## Specifying the Chart Type + +To specify the chart type, set the ChartType property of the Chart object. 
Excel offers too great a variety of charts to list here (73 different types), but you can easily identify the chart types from their enumeration-constant names. For example, the constant xl3DArea represents the 3-D Area chart type, xlColumnStacked represents the Stacked Column chart type, and xlDoughnutExploded represents the Exploded Doughnut chart type. + +The following statement sets the type of the chart represented by the object variable myChart to the Stacked Column type: + + myChart.ChartType = xlColumnStacked + +## Working with Series in the Chart + +To work with series in a chart, you use the SeriesCollection collection, which contains all the series in the specified chart. + +### Creating a New Series + +To create a new series, use the NewSeries method with the SeriesCollection collection. For example, the following statement adds a new series to the chart represented by the object variable myChart: + + myChart.SeriesCollection.NewSeries + +### Adding a New Series + +To add a new series to a SeriesCollection collection, use the Add method with the appropriate SeriesCollection object. The syntax is as follows: + + _expression_.Add(Source, Rowcol, SeriesLabels, CategoryLabels, Replace) + +Here are the components of this syntax: + + * _expression_ is a required expression that returns a SeriesCollection collection. + * Source is a required Variant argument that specifies the source of the data for the new series. You can supply the data either as a range or as an array of data points. + * Rowcol is an optional argument that you can set to xlRows to specify that the new values are in rows in the specified range, or use the default setting, xlColumns, to specify that the new values are in columns. If you omit this argument, Excel uses xlColumns. + * SeriesLabels is an optional Variant argument that you can set to True to specify that the first row or column in the source area contains the series labels, or False to specify that the first row or column in the source area contains the first data point for the series. If you omit this argument, Excel tries to work out whether the first row or column contains a series label. It's best to specify this argument to avoid confusion. However, if Source is an array, VBA ignores this argument. + * CategoryLabels is an optional Variant argument that you can set to True to specify that the first row or column contains the name for the category labels, or set to False to specify that it does not contain them. If you omit this argument, Excel tries to work out whether the first row or column contains a category label. It's best to specify this argument to avoid confusion. Again, if Source is an array, VBA ignores this argument. + * Replace is an optional Variant argument that you can set to True when CategoryLabels is True to make the categories replace the existing categories for the series, or set to False (the default value) to prevent the existing categories from being replaced. + +The following procedure brings together several elements used in the previous code examples in this chapter. It illustrates how to create a complete chart and add a new series to the chart identified by the object variable myChart. 
The procedure draws the data from the range A4:K4 on the active worksheet in the active workbook, using rows:
+
+    Sub test()
+
+        Dim myChartObject As ChartObject
+        Dim MyChart As Chart
+
+        Set myChartObject = ActiveSheet.ChartObjects.Add(Left:=100, Top:=100, _
+            Width:=400, Height:=300)
+
+        Set MyChart = myChartObject.Chart
+        MyChart.ChartType = xlConeBarStacked
+
+        MyChart.SeriesCollection.Add _
+            Source:=ActiveSheet.Range("A4:K4"), Rowcol:=xlRows
+
+    End Sub
+
+If you execute this example, you'll see results similar to those shown in Figure 23.1. A chart will be generated based on whatever data lies within the specified range.
+
+Figure 23.1 This chart was generated in a procedure, using the Add method of the SeriesCollection object.
+
+### Extending an Existing Series
+
+To extend an existing series, use the Extend method with the appropriate SeriesCollection object. The syntax is as follows:
+
+    _expression_.Extend(Source, Rowcol, CategoryLabels)
+
+Here are the components of this syntax:
+
+ * _expression_ is a required expression that returns a SeriesCollection object.
+ * Source is a required Variant argument that specifies the source of the data for the new series. You can supply the data either as a range or as an array of data points.
+ * Rowcol is an optional argument that you can set to xlRows to specify that the new values are in rows in the specified range, or use the default setting, xlColumns, to specify that the new values are in columns. If you omit this argument, Excel uses xlColumns.
+ * CategoryLabels is an optional Variant argument that you can set to True to specify that the first row or column contains the name for the category labels, or set to False to specify that it does not contain them. If you omit this argument, Excel tries to work out whether the first row or column contains a category label. It's best to specify this argument to avoid confusion. If Source is an array, VBA ignores this argument.
+
+For example, the following statement extends the series in the chart identified by the object variable myChart using the data in the cells P3:P8 on the worksheet named Chart Data:
+
+    myChart.SeriesCollection.Extend _
+        Source:=Worksheets("Chart Data").Range("P3:P8")
+
+## Adding a Legend to the Chart
+
+To add a legend to the chart, set the chart's HasLegend property to True. To manipulate the legend, work with the properties of the Legend object. Key properties include these:
+
+ * The Position property controls where the legend appears: xlLegendPositionBottom, xlLegendPositionCorner, xlLegendPositionLeft, xlLegendPositionRight, or xlLegendPositionTop.
+ * The Height property and the Width property control the height and width of the legend, respectively, in points.
+ * The Font property returns the Font object, whose properties you can set to specify the font size, name, and effects.
+
+For example, the following statements add the legend to the chart represented by the object variable myChart and apply 16-point Arial font to it. (Note that HasLegend belongs to the Chart object itself, not to the Legend object.)
+
+    With myChart
+        .HasLegend = True
+        .Legend.Font.Size = 16
+        .Legend.Font.Name = "Arial"
+    End With
+
+## Adding a Chart Title
+
+To add a title to the chart, set its HasTitle property to True, as in this example:
+
+    myChart.HasTitle = True
+
+Excel adds the title with the default text Chart Title. To change the text, set the Text property of the ChartTitle object, which represents the chart title.
Here's an example:
+
+    myChart.ChartTitle.Text = "Industrial Mixups in North Dakota"
+
+To position the title, set its Top property (specifying the number of points from the top edge of the worksheet) and its Left property (specifying the number of points from the left edge of the worksheet), as in this example:
+
+    With myChart.ChartTitle
+        .Top = 100
+        .Left = 150
+    End With
+
+To format the text of the title, work with its Font object, as follows:
+
+    myChart.ChartTitle.Font.Name = "Arial"
+
+## Working with a Chart Axis
+
+To work with an axis of a chart, use the Axes method to access the appropriate axis. The syntax is as follows:
+
+    _expression_.Axes(Type, AxisGroup)
+
+Here, _expression_ is a required expression that returns a Chart object. Type is an optional Variant argument that specifies the axis to return. Use xlValue to return the value axis, xlCategory to return the category axis, or xlSeriesAxis to return the series axis (on 3D charts only). AxisGroup is an optional argument that you can set to xlSecondary to specify the second axis group instead of xlPrimary (the default setting), which specifies the first axis group.
+
+For example, the following statements work with the category axis in the primary group of the chart, turning on its title, setting the title text, font, and font size, and turning major gridlines on and minor gridlines off. Note that this With structure should be placed within a second, outer With structure representing the chart itself:
+
+    With MyChart
+        With .Axes(Type:=xlCategory, AxisGroup:=xlPrimary)
+            .HasTitle = True
+            .AxisTitle.Text = "Years"
+            .AxisTitle.Font.Name = "Times New Roman"
+            .AxisTitle.Font.Size = 12
+            .HasMajorGridlines = True
+            .HasMinorGridlines = False
+        End With
+    End With
+
+## Formatting Headers and Footers
+
+You can manipulate headers and footers easily via VBA by using a built-in set of format and content constants. These include format specifications such as &U for underlining and &C for centering. Content constants include &D, which inserts the current date, &P for the page number, and &F for the document's name. The complete list of Excel 2013 header and footer constants can be found here:
+
+
+
+This code turns on italics and underlining, and on the right side of the header prints _Doctor Dancy Page_ followed by the current page number and the total number of pages: _Doctor Dancy Page 2 of 7_. If there is no header, one is created.
+
+    ActiveSheet.PageSetup.RightHeader = "&U&I Doctor Dancy Page &P of &N"
+
+# Working with Windows
+
+The Windows collection contains a Window object for every open window in the Excel application. Normally, when you open a workbook, Excel opens a window so that you can see it. You can also open further windows as necessary—for example, by clicking the Ribbon's View tab, then clicking the New Window button in the Window area.
+
+In most cases, using Window objects isn't a very useful way to access data via VBA because you can access it more easily using objects such as the ActiveSheet object or the ActiveCell object. However, you may want to open, close, activate, or arrange windows programmatically (via a procedure rather than having the user do it by hand interactively) to display data to the user in a particular way.
+
+## Opening a New Window on a Workbook
+
+To open a new window on a workbook, use the NewWindow method of the appropriate Window object. This method takes no arguments.
For example, the following statement opens a new window showing the contents of the first window open on the workbook identified by the object variable myWorkbook:
+
+    myWorkbook.Windows(1).NewWindow
+
+## Closing a Window
+
+To close a window, use the Close method with the appropriate Window object. The syntax is as follows:
+
+    _expression_.Close(SaveChanges, Filename, RouteWorkbook)
+
+Here, _expression_ is a required expression that returns a Window object. This syntax is the same as for closing a workbook (see "Closing a Workbook" in the previous chapter). The difference is that if two or more windows are open on the same workbook, closing the second or subsequent window does not close the workbook, so the arguments are not relevant. (If the window you're closing is the workbook's last window, however, you do need to specify the arguments—otherwise, Excel prompts the user to save any unsaved changes.) For example, the following statements close all windows open on the workbook referenced by the object variable myWorkbook except for one window:
+
+    Do While myWorkbook.Windows.Count > 1
+        myWorkbook.Windows(myWorkbook.Windows.Count).Close
+    Loop
+
+## Activating a Window
+
+To activate a window, use the Activate method of the appropriate Window object. For example, the following statement activates the first window open on the workbook Planning.xlsx:
+
+    Workbooks("Planning.xlsx").Windows(1).Activate
+
+Similarly, you can activate the previous window by using the ActivatePrevious method or the next window by using the ActivateNext method.
+
+## Arranging and Resizing Windows
+
+To arrange windows, use the Arrange method with the appropriate Windows collection. The syntax is as follows:
+
+    _expression_.Arrange(ArrangeStyle, ActiveWorkbook, SyncHorizontal, SyncVertical)
+
+Here are the components of this syntax:
+
+ * _expression_ is a required expression that returns a Windows collection.
+ * ArrangeStyle is an optional argument that you can set to xlArrangeStyleTiled to tile the windows (the default setting), xlArrangeStyleHorizontal to arrange the windows horizontally, xlArrangeStyleVertical to arrange the windows vertically, or xlArrangeStyleCascade to cascade the windows in an overlapping arrangement that lets you see the title bar of each window but the contents of only the front window.
+ * ActiveWorkbook is an optional Variant argument that you can set to True to make VBA arrange only the windows of the active workbook. The default value is False, which arranges all open windows.
+ * SyncHorizontal and SyncVertical are optional Variant arguments that you can set to True when you use ActiveWorkbook:=True to make the windows of the active workbook scroll horizontally or vertically in sync (when you scroll one window, the other windows scroll by the same amount in the same direction). The default is False.
+
+For example, the following statement arranges the windows in the workbook Budget.xlsx vertically and sets synchronized scrolling on them:
+
+    Workbooks("Budget.xlsx").Windows.Arrange _
+        ArrangeStyle:=xlArrangeStyleVertical, _
+        ActiveWorkbook:=True, SyncVertical:=True
+
+You can maximize, minimize, or restore the application window by setting the WindowState property of the Application object to xlMaximized, xlMinimized, or xlNormal. Similarly, within the application window, you can maximize, minimize, or restore a document by setting its WindowState property.
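+
+As a quick illustration, here is a minimal sketch (assuming at least one workbook window is open) that maximizes the Excel application window and then returns the active document window to its normal state:
+
+    'Maximize the application window, then un-maximize the active document window
+    Application.WindowState = xlMaximized
+    ActiveWindow.WindowState = xlNormal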
+
+When a window is in a "normal" state (xlNormal; not maximized or minimized), you can position it by using the Top and Left properties to specify the position of the upper-left corner of the window and size it by setting its Height and Width properties. Check the UsableWidth property and the UsableHeight property of the Application object to find the amount of space available in the Application window. (Similarly, you can check the UsableWidth property and the UsableHeight property of the Window object to see how much space is available in the window—for example, so that you can size or position an object correctly.)
+
+The following example declares two Window object variables, myWindow1 and myWindow2, and assigns myWindow1 to the active window and myWindow2 to a new window showing the same worksheet as myWindow1. The example then sizes and positions the two windows so that each is the full height available in the application window, with myWindow1 taking one-quarter of the available width and myWindow2 taking the remaining three-quarters of the available width:
+
+    Dim myWindow1 As Window, myWindow2 As Window
+    Set myWindow1 = ActiveWindow
+    Set myWindow2 = myWindow1.NewWindow
+    With myWindow1
+        .WindowState = xlNormal
+        .Top = 0
+        .Left = 0
+        .Height = Application.UsableHeight
+        .Width = Application.UsableWidth * 0.25
+    End With
+    With myWindow2
+        .WindowState = xlNormal
+        .Top = 0
+        .Left = (Application.UsableWidth * 0.25) + 1
+        .Height = Application.UsableHeight
+        .Width = Application.UsableWidth * 0.75
+    End With
+
+## Zooming a Window and Setting Display Options
+
+To change the zoom, set the Zoom property of the appropriate Window object. For example, the following statement zooms the active window to 150 percent:
+
+    ActiveWindow.Zoom = 150
+
+In some procedures, you may need to change the display of the Excel window to ensure that certain features are (or are not) available to the user. Use the Boolean properties DisplayScrollBars, DisplayStatusBar, and DisplayFormulaBar to control whether Excel displays the scroll bars, status bar, and formula bar. Use the DisplayFullScreen property to toggle full-screen view on and off.
+
+For example, the following statements make sure that the scroll bars and status bar are hidden and that the formula bar is displayed:
+
+    With Application
+        .DisplayScrollBars = False
+        .DisplayStatusBar = False
+        .DisplayFormulaBar = True
+    End With
+
+# Working with Find and Replace
+
+Excel's Find and Replace features can be useful for locating data in your procedures. In Excel, Find and Replace are implemented through methods rather than (as in Word) through a Find object.
+
+Both the Range object and the WorksheetFunction object have Find methods and Replace methods (but with different syntax). For most find and replace operations, you'll want to use the Range object—for example, to replace the contents of specific cells on a worksheet.
+
+## Searching with the Find Method
+
+The syntax for the Range object's Find method is as follows:
+
+    _expression_.Find(What, After, LookIn, LookAt, SearchOrder, SearchDirection, MatchCase, MatchByte, SearchFormat)
+
+Here are the components of this syntax:
+
+ * _expression_ is a required expression that returns a Range object.
+ * What is a required Variant argument that specifies the data to find. This data can be a string of text or any Excel data type.
+ * After is an optional Variant argument that you can use to specify the cell after which to begin searching. After must be a cell in the range that's being searched.
If you omit After, Excel begins the search at the upper-left cell in the range.
+ * LookIn is an optional Variant argument that you can use to specify whether to search in formulas (xlFormulas), values (xlValues), or comments (xlComments).
+ * LookAt is an optional Variant argument that you can set to xlWhole to search for the entire contents of a cell, or to xlPart to search for the match within the contents of cells.
+ * SearchOrder is an optional Variant argument that you can set to xlByRows to search by rows, or to xlByColumns to search by columns.
+ * SearchDirection is an optional Variant argument that you can set to xlNext to search downward, or to xlPrevious to search upward.
+ * MatchCase is an optional Variant argument that you can set to True to use case-sensitive searching. The default setting is False.
+ * MatchByte is an optional Variant argument used only if you've installed double-byte language support.
+ * SearchFormat is an optional Variant argument that controls whether Excel searches for specified formatting (True) or not (False).
+
+* * *
+
+**Practical Searching: Beware Persistent Settings**
+
+The LookIn, LookAt, SearchOrder, and MatchByte arguments of the Range object's Find method _persist_ —Excel retains them from one search to the next. Unless you know that the settings used in the previous search are suitable for your current needs, you should set these arguments explicitly for each new search to avoid getting unexpected results.
+
+Pay particular attention to the LookAt setting. This setting corresponds to the Match Entire Cell Contents check box in the Find And Replace dialog box. To see this option manually in an Excel window, click any cell, then click the Find And Select button on the Ribbon's Home tab. Choose Replace on the menu that drops down, then click the Options button in the Find And Replace dialog box, if necessary, to display all the options available in that dialog box.
+
+Remember that format settings such as font and subscript persist as well. So you may want to specify those explicitly too if you're concerned that they might have been set previously by your code or by the user.
+
+And, finally, remember to always be courteous to users by restoring their settings. Users know that Find and Replace settings persist, so they expect them to remain as they left them, no matter what you might do with them while your procedure executes. So at the start of your procedure, store the user's current settings in variables. Then at the end of your procedure, save these settings back to the various options.
+
+Excel has no global command equivalent to Word's ClearFormatting statement, described in Chapter 21, "Working with Widely Used Objects in Word."
+
+* * *
+
+The following example code searches for 2008 in formulas in cells after the active cell, matching the entire cell contents and the case of the search term, without searching for formatting:
+
+    Cells.Find(What:="2008", After:=ActiveCell, LookIn:=xlFormulas, LookAt _
+        :=xlWhole, SearchOrder:=xlByRows, SearchDirection:=xlNext, MatchCase:= _
+        True, SearchFormat:=False).Activate
+
+Notice that in this code each argument is named. And one, MatchByte, is omitted. Recall that if you leave out an argument in an argument list, you must either insert a comma as a placeholder or use named arguments. Given that Excel's Find arguments are persistent, it is a good idea to use named arguments here to remind yourself that they need to be restored to the user's previous settings.
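+
+One more precaution: the Find method returns Nothing when it finds no match, so chaining .Activate directly onto Find (as in the preceding statement) raises a runtime error on an empty result. Here's a minimal defensive sketch; the variable name foundCell is just for illustration:
+
+    Dim foundCell As Range
+    Set foundCell = Cells.Find(What:="2008", LookIn:=xlFormulas, _
+        LookAt:=xlWhole, SearchOrder:=xlByRows, MatchCase:=True, _
+        SearchFormat:=False)
+    'Activate the found cell only if the search succeeded
+    If Not foundCell Is Nothing Then foundCell.Activate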
+
+## Continuing a Search with the _FindNext_ and _FindPrevious_ Methods
+
+After you have executed a search using the Find method, you can use the FindNext method to find the next instance of the search item, or the FindPrevious method to find the previous instance. The syntax is as follows:
+
+    _expression_.FindNext(After)
+    _expression_.FindPrevious(After)
+
+Here, _expression_ is a required expression that returns a Range object, and After is an optional Variant argument that specifies the cell after which you want to search (for the FindNext method) or before which you want to search (for the FindPrevious method). After must be a single cell.
+
+For example, the following statement finds the next instance of the search item:
+
+    Cells.FindNext
+
+## Replacing with the _Replace_ Method
+
+To replace using VBA, use the Replace method with the Range object. The syntax is as follows:
+
+    _expression_.Replace(What, Replacement, LookAt, SearchOrder, MatchCase, MatchByte, SearchFormat, ReplaceFormat)
+
+The components of the syntax are the same as for the Find method except for the following:
+
+ * Replacement is a required Variant argument that specifies the replacement string for the search.
+ * ReplaceFormat is an optional Variant argument that controls whether Excel replaces formatting in the search (True) or not (False).
+
+For example, the following statement replaces the instances of the word Sales in column B of the active worksheet with the words Sales & Marketing, using case-sensitive matching:
+
+    ActiveSheet.Columns("B").Replace What:="Sales", _
+        Replacement:="Sales & Marketing", SearchOrder:=xlByColumns, _
+        MatchCase:=True
+
+## Searching for and Replacing Formatting
+
+To search for formatting, use the FindFormat property of the Application object to define the formatting, and then set the SearchFormat argument of the Find method to True. Similarly, use the ReplaceFormat property of the Application object to define the replacement formatting, and then set the ReplaceFormat argument of the Replace method to True.
+
+For example, the following statements use a With structure to set the Application.FindFormat.Font properties for which to search, a With structure to set the Application.ReplaceFormat.Font with which to replace them, and the Replace method of the Cells collection to effect the replacement:
+
+    With Application.FindFormat.Font
+        .Name = "Arial"
+        .Size = 12
+        .Bold = True
+    End With
+    With Application.ReplaceFormat.Font
+        .Name = "Arial Black"
+        .Bold = False
+    End With
+    Cells.Replace What:="5", Replacement:="5", LookAt:=xlPart, SearchOrder _
+        :=xlByColumns, MatchCase:=False, SearchFormat:=True, ReplaceFormat:=True
+
+# Adding Shapes
+
+It's easy to add shapes to a worksheet. This technique can be used to draw attention to important points or liven up statistical data for a presentation. Here's an example that adds two explosion graphics to a worksheet:
+
+    Sub AutoShapes()
+
+        ActiveSheet.Shapes.AddShape(msoShapeExplosion2, 425, 145, 86, 101).Select
+        ActiveSheet.Shapes.AddShape(msoShapeExplosion1, 265, 224, 190, 190).Select
+
+    End Sub
+
+The AddShape method takes the following arguments:
+
+    AddShape(Type, Left, Top, Width, Height)
+
+The Type argument specifies one of a set of msoShape constants that can be found in Excel's VBA Editor. There are dozens of shapes, including a moon, a heart, and a tear. Press F2 to display the Object Browser. In the list box at the top left of the Object Browser, you'll likely see All Libraries displayed by default.
Instead, open this list and select Office. (This list box specifies the library of objects that will be searched.) Now in the field directly below that, type **msoshape** and click the binoculars icon next to the field.
+
+# The Bottom Line
+
+**Work with charts.**
+
+You can create either full chart sheets or embedded charts within an ordinary Excel worksheet.
+
+**Master It**
+
+What object is used in a procedure to represent an embedded chart?
+
+**Work with windows.**
+
+To open a new window on a workbook, you use the NewWindow method of the appropriate Window object.
+
+**Master It**
+
+Does the NewWindow method take any arguments?
+
+**Work with Find and Replace.**
+
+When working with the Find and Replace features in Excel, you need to be aware of a phenomenon known as _persistence_.
+
+**Master It**
+
+What is persistence, and why should it concern you?
+
+Chapter 24
+
+Understanding the PowerPoint Object Model and Key Objects
+
+This chapter shows you how to start working with the PowerPoint object model, the architecture underlying PowerPoint, and how to perform common actions with the most immediately useful PowerPoint objects. These objects include the Presentations collection and the Presentation object, the ActivePresentation object, the Slides collection and Slide objects, Window objects, and Master objects.
+
+In this chapter you will learn to do the following:
+
+ * Understand the PowerPoint object model
+ * Understand PowerPoint's creatable objects
+ * Work with presentations
+ * Work with windows and views
+ * Work with slides
+ * Work with masters
+
+# Getting an Overview of the PowerPoint Object Model
+
+As with all Office applications that include VBA, you can write macros without understanding how the PowerPoint object model fits together, but most people find that familiarity with the main objects in the object model is helpful. Also, the code examples in the Help system's object-model reference can be invaluable. They show how and where to employ objects in your own programming.
+
+To begin exploring the PowerPoint object model, follow these steps:
+
+1. Launch or switch to PowerPoint, and then press Alt+F11 to launch or switch to the VBA Editor.
+
+2. Move your cursor to a blank space in the code window (to avoid context-sensitive help).
+
+3. Press F1 in the Editor to open the VBA language reference web page for Office 2013.
+
+4. In the Bing search field, type **powerpoint 2013 object model** and press Enter.
+
+5. Click the link _Object model reference_ (_PowerPoint 2013 Developer Reference_). You'll now have access to the whole collection of syntax specifications, useful descriptions, and code examples, as shown in Figure 24.1.
+
+Figure 24.1 The entries in the PowerPoint object-model reference will help you write your own VBA code.
+
+* * *
+
+Help When Migrating Legacy Code from Earlier Office Projects
+
+If you've inherited VBA code written in earlier versions of PowerPoint, those procedures might contain objects, methods, and properties that have been changed in Office 2013. Though changes to previous object models are generally few, some incompatibilities can crop up and "break" the code so it won't run correctly. Fortunately, you can download a free utility to assist you in mending the broken code. See the sidebar in Chapter 22 titled "Help When Migrating Legacy Code from Earlier Office Projects" for more information.
+ +* * * + +# Understanding PowerPoint's Creatable Objects + +In PowerPoint, the Application object gives you access to all the objects in the PowerPoint application. But for many operations, you can go directly through one of the "creatable" objects available in PowerPoint. (Recall that _creatable_ merely means you can optionally leave out the word Application when specifying a creatable object in your code.) The four most useful creatable objects are listed here: + + * The ActivePresentation object represents the active presentation, the presentation that would respond if you typed something. + * The Presentations collection contains the Presentation objects, each of which represents one of the currently open presentations. + * The ActiveWindow object represents the active window in the application. + * The SlideShowWindows collection contains the SlideShowWindow objects, each of which represents an open slide-show window. This collection is useful for manipulating a slide show that's currently displayed. + +Within a presentation, you'll typically find yourself working with the Slides collection, which contains all the Slide objects that represent the slides. On a slide, most items are represented by Shape objects gathered into the Shapes collection. For example, the text in a typical placeholder is contained in the Text property of the TextRange object in the TextFrame object within a Shape object on a slide. + +# Working with Presentations + +To get any work done in PowerPoint, you'll usually need to work with one or more presentations. VBA uses the Presentation object to represent a presentation and organizes the open Presentation objects into the Presentations collection. + +## Creating a New Presentation Based on the Default Template + +You can create a new presentation based on the default template. This is equivalent to clicking the File tab on PowerPoint's Ribbon, then clicking the New option in PowerPoint. To do this, use the Add method with the Presentations collection. The syntax is as follows: + + _expression_.Add(WithWindow) + +Here are the components of this syntax: + + * _expression_ is a required expression that returns a Presentations object. Often, it's easiest to use the Presentations object itself. + * WithWindow is an optional Long argument. Set WithWindow to msoFalse to prevent the new presentation from being visible—for example, so that you can create and manipulate it without the user seeing the details. (You may want to temporarily hide the presentation so that the user doesn't have to endure the irritating flickering effect that PowerPoint tends to exhibit while creating presentation objects programmatically.) The default value is msoTrue, making the new presentation visible. + +For example, the following statements declare an object variable of the Presentation type named myPresentation, create a new presentation, assign the new presentation to myPresentation, and make it invisible to the user: + + Dim myPresentation As Presentation + Set myPresentation = Presentations. **Add** (WithWindow:=msoFalse) + +* * * + +Understanding Tri-State Values + +This is a bit rarified, but let's leave no stone unturned. The Add method of the Presentations object allows you to set its WithWindow argument to four different states: msoFalse, msoTrue, msoTriStateToggle, or msoTriStateMixed. True and false are common and easily understood states. But PowerPoint makes extensive use of two unusual states called MsoTriState values, both of which represent a kind of super-Boolean state. 
Instead of being limited to merely True or False, a tri-state value can also be in special third and fourth states. msoTriStateMixed means that something is both true and false at the same time, like lovers. Here's an example: if a string contains three words, one of which is bold, is the string bold or not? Well, the answer in VBA is that it is a _mixed_ string. msoTriStateToggle means that the state is potentially true or false; in other words, the user could click a two-state control to reset the status either way, or your code could reset it.
+
+In most cases, you'll want to set a tri-state value to either msoTrue or msoFalse. In fact, I can't imagine a situation in which your code would actually ever have a need to _set_ msoTriState values; instead, you would only ever need to check this value (to _read_ it) to find out if the property you were dealing with contained a mixture of msoTrue and msoFalse values. Remember that _mixed_ means something is true and false at the same time; _toggle_ means that something is _potentially_ either true or false. Actually, don't remember this, because it's so infrequently useful. I mention it only because it pervades the documentation on PowerPoint programming.
+
+* * *
+
+## Creating a New Presentation Based on a Template
+
+To create a new presentation based on a template other than the default template, use the Open method of the Presentations collection. The syntax is as follows:
+
+    _expression_.Open(FileName, ReadOnly, Untitled, WithWindow)
+
+The components of the syntax are explained here (ReadOnly, Untitled, and WithWindow are all msoTriState values, but pay no attention to that):
+
+ * _expression_ is a required expression that returns a Presentations object. Often, it's easiest to use the Presentations object itself.
+ * FileName is a required String argument that specifies the path and name of the file to use as a template for the new presentation. This file can be either a template in the conventional sense or a presentation that you want to use as a template.
+ * ReadOnly is an optional argument that specifies whether the file is opened with read-only status (msoTrue) or with read/write status (msoFalse). When creating a new presentation based on a template, you don't need to specify ReadOnly.
+ * Untitled is an optional argument that specifies whether to open the file as itself (msoFalse) or as a copy (msoTrue). When creating a new presentation based on a template, set Untitled to msoTrue.
+ * WithWindow is an optional argument that you can set to msoFalse to prevent the new presentation from being visible. The default value is msoTrue, making the new presentation visible.
+
+For example, the following statement creates a new presentation based on the template named Capsules.potm in the C:\Users\_Richard_\Documents\Custom Office Templates\ folder:
+
+    Presentations. **Open** _
+        FileName:="C:\Users\_Richard_\Documents\Custom Office Templates\Capsules.potm", Untitled:=msoTrue
+
+As usual, replace my name, _Richard_, with your name.
+
+## Opening an Existing Presentation
+
+To open an existing presentation already on the hard drive, use the Open method of the Presentations collection. The syntax is as shown in the previous section. The difference is that you use the FileName argument to specify the presentation you want to open (as opposed to the file that you want to use as the template for creating a new presentation) and either omit the Untitled argument or set it to msoFalse.
You may also need to use the OpenConflictDocument argument to specify how to handle any conflict file that exists for the presentation you're opening.
+
+For example, the following statement opens the existing presentation named Train Time.pptm stored in the folder Z:\Public, opening the presentation for editing rather than opening it as read-only:
+
+    Presentations. **Open** FileName:="Z:\Public\Train Time.pptm", ReadOnly:=msoFalse
+
+## Opening a Presentation from the Cloud
+
+Chapters 20 and 22, "Understanding the Word Object Model and Key Objects" and "Understanding the Excel Object Model and Key Objects," demonstrated how to save documents to SkyDrive and Dropbox. Here we'll go the other way and open a presentation that's been stored on SkyDrive. The mechanics of contacting the cloud are, blessedly, handled for us by the various cloud services. There _are_ security issues—particularly during transmission to and from the storage servers—which I personally am glad to leave to these companies' programmers.
+
+All we VBA programmers have to do to store to or open from the cloud is to get the file path right. It's as if you are storing something on your hard drive—which in fact you are. The only difference is that the files in this location on your hard drive are also automatically stored (synced) somewhere else in the world, in a server farm.
+
+Let's assume you have a presentation named PX.pptm stored in SkyDrive. The file path will normally be "C:\Users\_Richard_\SkyDrive\PX.pptm".
+
+So, to open this PX presentation, you can use this code, replacing _Richard_ with whatever your name is:
+
+    Presentations.Open FileName:="C:\Users\_Richard_\SkyDrive\PX.pptm", ReadOnly:=msoFalse
+
+## Saving a Presentation
+
+The first time you save a presentation, you must specify the path and filename to use. After that, you can save the presentation under the same name or specify a different path, name, format, or all three. This is the same as the distinction between the Save and Save As options on the File tab of the Ribbon.
+
+### Saving a Presentation for the First Time or under a Different Name
+
+To save a presentation for the first time, or to save a presentation using a different path, name, or format, use the SaveAs method. The syntax is as follows:
+
+    _expression_. **SaveAs** (FileName, FileFormat, EmbedTrueTypeFonts)
+
+Here are the components of this syntax:
+
+ * _expression_ is a required expression that returns a Presentation object.
+ * FileName is a required String argument that specifies the filename under which to save the presentation. Normally, you include the path in FileName; if you omit the path, PowerPoint uses the current folder.
+ * FileFormat is an optional argument that specifies the file format to use. Although there are 27 total SaveAs constants, Table 24.1 lists only the 7 most widely useful formats.
+ * EmbedTrueTypeFonts is an optional argument that you can set to msoTrue to embed TrueType fonts in the presentation, or to msoFalse (the default) to not embed them.
+
+Table 24.1 Useful FileFormat constants for saving PowerPoint files
+
+**Format Name** | **Constant**
+---|---
+PowerPoint format | ppSaveAsPresentation
+Default format (set on the Save tab of the Options dialog box) | ppSaveAsDefault
+Single-file web page | ppSaveAsWebArchive
+Web page | ppSaveAsHTML
+Presentation | ppSaveAsPresentation
+Design template | ppSaveAsTemplate
+PowerPoint show | ppSaveAsShow
+
+For example, the following statement saves the presentation identified by the object variable myPresentation under the name HR.pptm in the folder Z:\Shared\Presentations, using the web-page format and not embedding fonts:
+
+    myPresentation. **SaveAs** FileName:="Z:\Shared\Presentations\HR.pptm", _
+        FileFormat:= **ppSaveAsHTML** , EmbedTrueTypeFonts:=msoFalse
+
+* * *
+
+Using the Object Browser to Quickly See Constants and Objects
+
+Here's a useful reminder. When you don't need code samples or extra details, you don't need to take the time to look through the full online Help system for an object's members or constants (such as the ppSaveAs constants shown in Table 24.1). Instead, just press F2 in the VBA Editor to bring up the Object Browser. Then, in the search field (to the left of the binoculars icon), type the object's name, a member (property or method), or a constant name. For example, you could type **ppSaveAsPresentation**, then click the binoculars icon. You would then see the entire list of 27 ppSaveAs constants.
+
+To see the full list in the online Help system, visit this web page:
+
+
+
+* * *
+
+### Saving a Presentation under Its Existing Name
+
+To save a presentation under its existing name, use the Save method. This method takes no arguments because it has only one possible behavior. For example, the following statement saves the active presentation:
+
+    ActivePresentation.Save
+
+If the presentation on which you use the Save method has never been saved, PowerPoint doesn't prompt the user to specify the filename and location. Instead, PowerPoint saves the presentation using the default name assigned to its window (for example, a presentation whose window is called Presentation11 will be saved as Presentation11.pptm) and in the current folder. To avoid this default name and location, check the Path property of the Presentation object before using the Save method to determine whether the presentation has been saved. If it has not been saved (if Path = ""), use the SaveAs method to specify the folder and title you want to use, as in this example:
+
+    If ActivePresentation.Path = "" Then
+        ActivePresentation.SaveAs FileName:="z:\public\presentations\Corporate.pptm"
+    Else
+        ActivePresentation.Save
+    End If
+
+### Saving a Copy of a Presentation
+
+Instead of using the SaveAs method to save a presentation under a different name, you can use the SaveCopyAs method to save a copy of the open presentation without affecting the open presentation (the presentation remains open, and any unsaved changes remain unsaved).
The syntax and arguments for the SaveCopyAs method are the same as for the SaveAs method:
+
+    _expression_.SaveCopyAs(FileName, FileFormat, EmbedTrueTypeFonts)
+
+For example, the following statement saves a copy of the active presentation under the name Copy 1.pptm in the folder Z:\Public\Presentations, using the same file format as the presentation currently uses:
+
+    ActivePresentation.SaveCopyAs FileName:="Z:\Public\Presentations\Copy 1.pptm"
+
+### Saving All Open Presentations
+
+The Presentations collection doesn't have a Save method, but you can save all open presentations by using a loop such as that shown in the following subroutine. This subroutine leaves unsaved any presentation that doesn't yet have a filename assigned.
+
+    Sub Save_All_Presentations()
+        Dim myPresentation As Presentation
+        For Each myPresentation In Presentations
+            If myPresentation.Path <> "" Then myPresentation.Save
+        Next myPresentation
+    End Sub
+
+## Closing a Presentation
+
+To close a presentation, use the Close method of the appropriate Presentation object. The Close method takes no arguments. For example, the following statement closes the active presentation:
+
+    ActivePresentation.Close
+
+If the presentation you're closing contains unsaved changes, PowerPoint prompts the user to save them. To avoid the user's being prompted, set the Saved property of the Presentation object to True before using the Close method. Here's an example:
+
+    With Presentations("Karelia Industry.pptm")
+        .Saved = True
+        .Close
+    End With
+
+## Exporting a Presentation or Some Slides to Graphics
+
+You can export an entire presentation, a single slide, or a range of slides by using the Export method of the Presentation object, the Slide object, or a SlideRange object. The syntax for the Export method with a Presentation object is as follows:
+
+    _expression_.Export(Path, FilterName, ScaleWidth, ScaleHeight)
+
+The syntax for the Export method with a Slide object or a SlideRange object is almost the same:
+
+    _expression_.Export(FileName, FilterName, ScaleWidth, ScaleHeight)
+
+Here are the components of this syntax:
+
+ * _expression_ is a required expression that returns a Presentation object, a Slide object, or a SlideRange object, as appropriate.
+ * Path (for a Presentation object) is a required String argument that specifies the path of the folder in which to save the graphics files of the slides.
+ * FileName (for a Slide object or a SlideRange object) is a required String argument that specifies the filename to use for the exported graphic. Include the path in FileName unless you want PowerPoint to use the current folder.
+ * FilterName is a required String argument that specifies the filter to use. Use the registered filename extension (JPG, TIF, BMP, or PNG) for FilterName.
+ * ScaleWidth is an optional Long argument that you can include to specify the width of the graphic in pixels.
+ * ScaleHeight is an optional Long argument that you can include to specify the height of the graphic in pixels.
+
+For example, the following statement exports all the slides in the active presentation to 800×600 JPG graphics in the Z:\Public\Presentations folder.
PowerPoint names the graphics Slide1, Slide2, and so on:
+
+    ActivePresentation.Export Path:="Z:\Public\Presentations", _
+        FilterName:="JPG", ScaleWidth:=800, ScaleHeight:=600
+
+The following statement exports the sixth slide in the active presentation to the file named Slide6.png in the Z:\Public\Presentations folder, using the PNG format:
+
+    ActivePresentation.Slides(6).Export _
+        FileName:="Z:\Public\Presentations\Slide6.png", FilterName:="PNG"
+
+## Printing a Presentation
+
+To print a presentation, use the PrintOut method of the appropriate Presentation object. The syntax is as follows:
+
+    _expression_.PrintOut(From, To, PrintToFile, Copies, Collate)
+
+Here are the components of this syntax:
+
+ * _expression_ is a required expression that returns a Presentation object.
+ * From and To are optional Integer arguments that specify the first slide and last slide to print. If you omit From, PowerPoint prints from the first slide; if you omit To, PowerPoint prints through the last slide.
+ * PrintToFile is an optional String argument that you can include to make PowerPoint print to the specified file rather than to the printer.
+ * Copies is an optional Integer argument that specifies how many copies of the presentation or slides to print. Omit Copies to use the default value, 1.
+ * Collate is an optional argument that you can set to msoFalse to prevent PowerPoint from collating multiple copies; collating is the default behavior.
+
+For example, the following statement prints all the slides in the active presentation:
+
+    ActivePresentation.PrintOut
+
+The following example prints slides 5 through 12 of the presentation identified by the object variable myPresentation:
+
+    myPresentation.PrintOut From:=5, To:=12
+
+## Applying a Template to a Presentation, to a Slide, or to a Range of Slides
+
+You can apply a design template to a presentation, to a single slide within a presentation, or to a range of slides by using the ApplyTemplate method with the Presentation object, the Slide object, or the SlideRange object. The syntax is as follows:
+
+    _expression_.ApplyTemplate(FileName)
+
+Here, _expression_ is a required expression that returns a Presentation object, a Slide object, or a SlideRange object. FileName is a required String argument that specifies the path and name of the design template.
+
+For example, the following statement applies the design template named Clouds.potm, stored in the C:\Users\_Richard_\AppData\Roaming\Microsoft\Templates\ folder, to the first slide in the active presentation:
+
+    ActivePresentation.Slides(1). **ApplyTemplate** FileName:= _
+        "C:\Users\_Richard_\AppData\Roaming\Microsoft\Templates\Clouds.potm"
+
+As usual, replace my name, _Richard_, with your name.
+
+The following statement applies the design template named Mountain Top.potm stored in the Z:\Public\Template folder to the first slide in the presentation named Success.pptm:
+
+    Presentations("Success.pptm").Slides(1). **ApplyTemplate** FileName:= _
+        "Z:\Public\Template\Mountain Top.potm"
+
+The following example applies the design template named Disaster.potm stored in the Z:\Public\Template folder to a range of slides consisting of the first, fourth, and sixth slides in the active presentation:
+
+    ActivePresentation.Slides.Range(Array(1, 4, 6)).ApplyTemplate _
+        FileName:="Z:\Public\Template\Disaster.potm"
+
+## Working with the Active Presentation
+
+The ActivePresentation property of the Application object returns a Presentation object that represents the active presentation (the presentation in the active window).
The ActivePresentation object can be very useful for procedures that the user starts. + +If no window is open, trying to use the ActivePresentation object returns an error. Unless you're sure that there is an active presentation, it's a good idea to check that a window is open before you access the ActivePresentation object, as in this example: + + If Windows.Count = 0 Then + MsgBox "Please open a presentation before running this macro." + End + End If + +# Working with Windows and Views + +To get the PowerPoint window into the state you want, you'll often need to work with the window and with the view. PowerPoint uses two types of windows: + + * _Document windows_ are windows that contain documents (presentation files) rather than slide shows. VBA considers document windows to be DocumentWindow objects organized into the DocumentWindows collection but represents them with Window objects organized into the Windows collection. (Sounds mad, but you'll see how this works shortly.) + * _Slide-show windows_ are windows that contain open slide shows. VBA uses SlideShowWindow objects and the SlideShowWindows collection to represent slide-show windows. + +The following sections show you how to work with document windows. You'll learn how to work with slide-show windows in "Setting Up and Running a Slide Show" in Chapter 25, "Working with Shapes and Running Slide Shows." + +The Windows collection contains a Window object for every open window in the PowerPoint application. When you open a presentation while working interactively, PowerPoint opens a window so that you can see the presentation. When you open a presentation via VBA, you can set the WithWindow argument of the Add method to msoFalse to prevent PowerPoint from displaying a window for the presentation. In the user interface, you can also open further windows as necessary—for example, by clicking the New Window button in the Window section of the Ribbon's View tab. + +## Working with the Active Window + +PowerPoint uses the ActiveWindow object to represent the window that is active (the window that currently has the _focus_ and is thus the one that accepts mouse clicks or typing). + +Only one window is active at a time. The active window is always the first Window object in the Windows collection—Windows(1). + +If no window is open at all, or all open windows are hidden, there is no active window, and using the ActiveWindow object causes VBA to return an error. To make sure that a window is open, check whether the Count property of the Windows collection is 0. Here's an example: + + If Windows.Count = 0 Then MsgBox "There is no active window.", vbOKOnly + _ + vbExclamation, "No Window Is Open" + +When you're working with presentations using VBA, you may sometimes find that the ActiveWindow object is a handy way to access a presentation, especially for a macro that the user runs after choosing the presentation, slide, or other object that they want to affect. In other cases, you may find that the ActivePresentation object is a more convenient way to access the presentation you need to work with, or you may prefer to access the presentation via the Presentations collection. + +## Opening a New Window on a Presentation + +To open a new window, use the NewWindow method of the appropriate Window object. This method takes no arguments.
For example, the following statement opens a new window showing the contents of the active window: + + ActiveWindow.NewWindow + +## Closing a Window + +To close a window, use the Close method with the appropriate Window object. In PowerPoint, the Close method takes no arguments. + +* * * + +Be Careful When Closing Windows Programmatically + +Recall that _programmatically_ means _by programming_, by executing code (as opposed to by user interaction). If the window you're closing is the last window open for the presentation, PowerPoint simply closes the window without prompting the user to save any unsaved changes. For this reason, be careful when closing windows, or your code can cause the user to lose data. + +* * * + +For example, you might close all windows but one on a presentation: + + Do While ActivePresentation.Windows.Count > 1 + ActivePresentation.Windows(ActivePresentation.Windows.Count).Close + Loop + +Alternatively, you might use the Save method to save a presentation before closing its last window, as in the next example. (More simply, you could use the Close method to close the presentation itself after saving it.) + + Dim myWindow As DocumentWindow + With ActivePresentation + If .Path = "" Then + MsgBox "Please save this presentation.", vbOKOnly + Else + .Save + ' Close each window explicitly; a bare .Close here would + ' close the presentation instead of the window + For Each myWindow In Windows + myWindow.Close + Next myWindow + End If + End With + +## Activating a Window + +To activate a window or one of its panes, use the Activate method of the appropriate Window object. For example, the following statement activates the first window open on the presentation Benefits.pptm: + + Presentations("Benefits.pptm").Windows(1).Activate + +## Arranging and Resizing Windows + +To arrange windows, use the Arrange method with the appropriate Windows collection. The syntax is as follows: + + _expression_.Arrange(ArrangeStyle) + +Here, _expression_ is a required expression that returns a Windows collection. ArrangeStyle is a required argument that specifies how to arrange the windows: ppArrangeCascade (cascade the windows in an overlapping arrangement that lets you see the title bar of each window but the contents of only the front window) or ppArrangeTiled (tile the windows; the default setting). + +You can maximize, minimize, or restore the application window by setting the WindowState property of the Application object to ppWindowMaximized, ppWindowMinimized, or ppWindowNormal. Similarly, within the application window, you can maximize, minimize, or restore a document by setting its WindowState property. + +When a window is in a "normal" state (ppWindowNormal, not maximized or minimized), you can position it by using the Top and Left properties to specify the position of the upper-left corner of the window and size it by setting its Height and Width properties. + +The following example maximizes the application window and cascades the document windows within it: + + Application.WindowState = ppWindowMaximized + Windows.Arrange ArrangeStyle:=ppArrangeCascade + +## Changing the View + +To change the view in a window, set the ViewType property of the appropriate Window object to one of these 12 constants: ppViewHandoutMaster, ppViewMasterThumbnails, ppViewNormal, ppViewNotesMaster, ppViewNotesPage, ppViewOutline, ppViewPrintPreview, ppViewSlide, ppViewSlideMaster, ppViewSlideSorter, ppViewThumbnails, or ppViewTitleMaster.
For example, the following statement switches the active window into Slide Sorter view: + + ActiveWindow.ViewType = ppViewSlideSorter + +To zoom the view, specify a value from 10 to 400 for the Zoom property of the View object for the appropriate window. The value represents the zoom percentage, but you don't include a percent sign. For example, the following statement zooms the active window to 150 percent: + + ActiveWindow.View.Zoom = 150 + +## Working with Panes + +The Pane object represents a pane of the PowerPoint window in Slide view. The Outline pane is represented by index number 1, the Slide pane by index number 2, and the Notes pane by index number 3. You can activate a pane by using the Activate method with the appropriate Pane object. The following example switches the view in the active window to Slide view and activates the Outline pane: + + With ActiveWindow + .ViewType = ppViewSlide + .Panes(1).Activate + End With + +To change the arrangement of the panes in a PowerPoint window in Slide view, use the SplitHorizontal property and the SplitVertical property of the Window object. + +The SplitHorizontal property controls the percentage of the document window's width that the Outline pane occupies, and the SplitVertical property controls the percentage of the document window's height that the Slide pane occupies. The following example sets the Outline pane to 25 percent of the width of the document window (leaving 75 percent to the Slide pane) and the Slide pane to 75 percent of the height of the window (leaving 25 percent to the Notes pane): + + With ActiveWindow + .SplitHorizontal = 25 + .SplitVertical = 75 + End With + +# Working with Slides + +Once you have created or opened the presentation you want to affect, you can access the slides it contains by using the Slides collection, which contains a Slide object for each slide in the presentation. Each slide is identified by its index number, but you can also refer to slides in three other ways: + +**Using object variables** + +Then you can refer to each slide by its object variable name. + +**Using ID numbers** + +See the section titled "Finding a Slide by Its ID Number" later in this chapter. + +**Using the Name property** + +See the section titled "Accessing a Slide by Name" later in this chapter. + +Having a unique name for a slide is especially useful when you add slides to or delete slides from a presentation, because this causes the index numbers of the slides to change. It's much easier to just name the slides than to try to keep track of their shifting index numbers. + +## Adding a Slide to a Presentation + +To add a slide to a presentation, use the Add method with the Slides collection. The syntax is as follows: + + _expression_.Add(Index, Layout) + +Here are the components of this syntax: + + * _expression_ is a required expression that returns a Slides collection. In many cases, it's easiest to use the Slides collection itself. + * Index is a required Long argument that specifies the index number for positioning the slide in the presentation. For example, the number 2 makes the new slide the second slide in the presentation. + * Layout is a required Long argument that specifies the layout for the new slide. The layout names correspond closely to the names you'll see in the Insert Slide dialog box or the Slide Layout task pane. For example, ppLayoutBlank specifies a blank slide, ppLayoutTitleOnly a title-only slide, and ppLayoutChartAndText a chart-and-text slide.
The following statements declare an object variable named mySlide and assign to it a new title slide added at the beginning of the active presentation: + + Dim mySlide As Slide + Set mySlide = ActivePresentation.Slides.Add(Index:=1, _ + Layout:=ppLayoutTitle) + +* * * + +Understanding the "Mixed" Constants + +If you look at the list of constants for the Layout property, you'll notice one is called ppLayoutMixed. There's no "Mixed" layout in PowerPoint's list of slide layouts, and if you try to apply ppLayoutMixed to a slide, VBA returns an error. This is because ppLayoutMixed is the value VBA returns for the Layout property of a slide range that contains multiple slides with different designs. + +Other properties have similar Mixed values to indicate that the objects use different values. For example, ppTransitionSpeedMixed means that the slides or shapes use different transition speeds. Don't try to set a property to a Mixed value, because doing so always gives an error. + +* * * + +## Inserting Slides from an Existing Presentation + +When creating presentations automatically, it's often useful to insert slides from an existing presentation. To do so, use the InsertFromFile method of the Slides collection. The syntax is as follows: + + _expression_.InsertFromFile(FileName, Index, SlideStart, SlideEnd) + +Here are the components of this syntax: + + * _expression_ is a required expression that returns a Slides collection. Often, you'll want to use the Slides collection itself. + * FileName is a required String argument that specifies the file from which to insert the slides. + * Index is a required Long argument that specifies the slide position in the open presentation at which to insert the slides. + * SlideStart is an optional Long argument that specifies the first slide to insert. If you omit SlideStart, PowerPoint starts at the first slide. + * SlideEnd is an optional Long argument that specifies the last slide to insert. If you omit SlideEnd, PowerPoint goes up to the last slide. + +For example, the following statement inserts slides 2 through 8 from the presentation named Handbook.pptm stored in the folder Z:\Transfer\Presentations, placing the slides starting at the fifth slide in the open presentation Corporate.pptm: + + Presentations("Corporate.pptm").Slides.InsertFromFile _ + FileName:="Z:\Transfer\Presentations\Handbook.pptm", Index:=5, _ + SlideStart:=2, SlideEnd:=8 + +## Finding a Slide by Its ID Number + +When working programmatically with a presentation, it can be difficult to track which slide is which, especially when you add, delete, insert, copy, or move slides—thereby changing the slides' index numbers. + +To help you, PowerPoint assigns a slide ID number to each slide when it's created. The slide ID number doesn't change when you move a slide to a different position in the presentation, unlike the index number, which always reflects the slide's position in the presentation. You can check a slide's ID number by returning the SlideID property of the appropriate Slide object. + +To find a slide by its ID number, use the FindBySlideID method of the Slides collection. The syntax is as follows: + + _expression_.FindBySlideID(SlideID) + +Here, _expression_ is a required expression that returns a Slides collection. SlideID is a required Long argument that specifies the ID number of the slide you want to return.
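+ +As a quick illustration (a minimal sketch using only the SlideID property and the FindBySlideID method just described), you can store a slide's ID number and then retrieve the same slide later, even after its index number has changed: + + Dim myID As Long + ' Remember the first slide's permanent ID number + myID = ActivePresentation.Slides(1).SlideID + ' ...slides may be added, moved, or deleted here... + ActivePresentation.Slides.FindBySlideID(myID).Select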
+ +The following example declares a Long variable named TargetSlide and assigns to it the ID number of a new slide added at the fifth index position in the active presentation, inserts a full presentation at the third index position, and then uses the FindBySlideID method to return the slide identified by TargetSlide and apply a different design template to it. This approach is similar to creating object variables for slides, as described earlier in this chapter. However, here you create Long variables to hold the ID numbers instead of object variables: + + Dim TargetSlide As Long + TargetSlide = ActivePresentation.Slides.Add(Index:=5, _ + Layout:=ppLayoutFourObjects).SlideID + Presentations("Corporate.pptm").Slides.InsertFromFile _ + FileName:="Z:\Transfer\Presentations\Handbook.pptm", Index:=3 + ActivePresentation.Slides.FindBySlideID(TargetSlide).ApplyTemplate _ + FileName:="C:\Program Files\Microsoft Office\Templates\" & _ + "Presentation Designs\Brain Blitz.potm" + +## Changing the Layout of an Existing Slide + +To change the layout of an existing slide, set its Layout property. For example, the following statement changes the layout of the first slide in the active presentation to the clip-art-and-vertical-text layout: + + ActivePresentation.Slides(1).Layout = ppLayoutClipArtAndVerticalText + +When you change the layout of a slide, PowerPoint rearranges its existing contents to make room for any new objects the layout requires. + +## Deleting an Existing Slide + +To delete an existing slide, use the Delete method with the appropriate Slide object. For example, the following statement deletes the first slide in the active presentation: + + ActivePresentation.Slides(1).Delete + +Be aware that PowerPoint doesn't confirm the deletion of a slide via VBA. + +## Copying and Pasting a Slide + +To copy a slide, use the Copy method of the appropriate Slide object. The Copy method takes no arguments. (You can also cut a slide by using the Cut method, which also takes no arguments.) + +To paste a slide, use the Paste method of the Slides collection. The Paste method takes an Index argument that specifies the slide position at which to paste in the slide. + +For example, the following statements copy the first slide in the active presentation and paste it in so that it is the fifth slide: + + ActivePresentation.Slides(1).Copy + ActivePresentation.Slides.Paste Index:=5 + +## Duplicating a Slide + +Instead of copying and pasting, you can directly duplicate a slide by using the Duplicate method of the Slide object. This method takes no arguments and places the duplicate of the slide immediately after the original in the index-number list. For example, the following statement duplicates the fourth slide in the active presentation, placing the copy at the fifth index position: + + ActivePresentation.Slides(4).Duplicate + +## Moving a Slide + +Instead of cutting and pasting a slide, you can move it directly by using the MoveTo method with the appropriate Slide object. Moving a slide has the same ultimate effect as cutting and pasting it but has the advantage of not changing the contents of the Clipboard (which you might need to preserve for the user or for other purposes). The syntax for the MoveTo method is as follows: + + _expression_.MoveTo(ToPos) + +Here, _expression_ is a required expression that returns a Slide object, and ToPos is a required Long argument that specifies the index position to which you want to move the slide.
+ +For example, the following statement moves the third slide in the presentation identified by the object variable myPresentation to the beginning of the presentation: + + myPresentation.Slides(3).MoveTo ToPos:=1 + +## Accessing a Slide by Name + +Instead of accessing a slide by its index number, you can assign a name to it by using the Name property of the Slide object. For example, the following statements assign the name Chairman's Introduction to the fifth slide in the active presentation and then use the Select method of the Slide object to select that slide by name: + + ActivePresentation.Slides(5).Name = "Chairman's Introduction" + ActivePresentation.Slides("Chairman's Introduction").Select + +## Working with a Range of Slides + +To work with a range of slides, use the Range method of the Slides collection to return a SlideRange object that represents the slides. The SlideRange object can represent a single slide, but you're usually better off using it to represent a range of slides. (You can access a single slide more easily by its index number or by a name you assign to it than through a SlideRange object.) + +To return a SlideRange object that encompasses two or more slides, use the Array function with a comma-delimited list of the slides. The list can use either the index numbers or the names of the slides. For example, the following statements declare the SlideRange object variable mySlideRange and assign to it the first five slides in the open presentation named HR.pptm: + + Dim mySlideRange As SlideRange + Set mySlideRange = _ + Presentations("HR.pptm").Slides.Range(Array(1, 2, 3, 4, 5)) + +The following statement assigns to the SlideRange object variable mySlideRange the slides named Intro and Outro in the active presentation: + + Set mySlideRange = ActivePresentation.Slides.Range(Array("Intro", "Outro")) + +## Formatting a Slide + +You can apply a design template to a slide by using the ApplyTemplate method, as discussed in "Applying a Template to a Presentation, to a Slide, or to a Range of Slides," earlier in this chapter. You can also apply a background or a color scheme, as discussed in the following sections. + +### Applying a Background to One or More Slides + +To apply a background to a slide or several slides, use the Background property of the appropriate Slide object or SlideRange object to return the ShapeRange object representing the background of the slide or slides. You can then use the Fill object to set a color, fill, gradient, or picture in the background. + +The following example applies the picture Winter.jpg from the folder C:\Sample Pictures to the fourth slide in the presentation named Corporate.pptm. The example sets the FollowMasterBackground property to msoFalse, making the slide use a different background than the slide master, and also sets the DisplayMasterShapes property to msoFalse, making the slide not display the shapes on the slide master: + + With Presentations("Corporate.pptm").Slides(4) + .FollowMasterBackground = msoFalse + .DisplayMasterShapes = msoFalse + With .Background + .Fill.ForeColor.RGB = RGB(255, 255, 255) + .Fill.BackColor.SchemeColor = ppAccent1 + .Fill.UserPicture "C:\Sample Pictures\Winter.jpg" + End With + End With + +### Applying a Color Scheme to a Slide + +A color scheme is a group of eight colors that are used to create the look of the title, background, and other elements of a slide, handout, or notes page. VBA uses an RGBColor object to represent each color, and a ColorScheme object to represent each color scheme.
The ColorScheme objects are gathered in a ColorSchemes collection for the entire presentation. + +To change the color scheme of a slide or several slides, use the ColorScheme property of the appropriate Slide object or SlideRange object to return the ColorScheme object, and then work with the Colors method to specify the color. The syntax is as follows: + + _expression_.Colors(SchemeColor) + +Here, _expression_ is a required expression that returns a ColorScheme object. SchemeColor is a required argument that specifies which color in the color scheme to set—for example, ppAccent1 (for the first accent in the color scheme), ppBackground (for the background color), or ppTitle (for the title color). + +The following statement sets the background color of the color scheme for the first three slides in the active presentation to black, which is RGB(0, 0, 0): + + ActivePresentation.Slides.Range(Array(1, 2, 3)) _ + .ColorScheme.Colors(ppBackground).RGB = RGB(0, 0, 0) + +## Setting a Transition for a Slide, a Range of Slides, or a Master + +To set a transition for a slide, a range of slides, or a master, use the SlideShowTransition property of the Slide object, the SlideRange object, or the Master object to return the SlideShowTransition object. + +To specify the speed at which the transition runs, set its Speed property to ppTransitionSpeedFast, ppTransitionSpeedMedium, or ppTransitionSpeedSlow. + +* * * + +Creating Effective Transitions between Slides + +Using transitions between slides can make a presentation look smooth and professional or awkward and amateurish. + +To specify the effect to use, set the EntryEffect property to the constant for the effect. There are too many constants to list here, but their names are generally descriptive enough to be easy to decipher. For example, the ppEffectBlindsHorizontal constant generates a transition that resembles an adjustment of window blinds, the ppEffectDissolve constant causes a rather crude kind of melting effect, and the ppEffectNone constant represents the No Transition setting. + +You should avoid all of these, and most of the other available transitions, unless you want to go back several decades to early TV transition effects. Contemporary television and movies employ smooth, subtle, and unobtrusive transitions between scenes. And slides are simple scenes. So, you should generally stay away from trick transitions like window blinds or crude transitions like the ppEffectDissolve, which is highly pixelated. + +To figure out which of the transition effects are sophisticated and discreet, try them out. You can preview all the transitions by clicking the Transitions tab on the PowerPoint Ribbon. Then click any slide transition you want to see. + +The default transition is quite a good dissolve, but if you want to try another classy transition, experiment with ppEffectFade. It's similar to the default. Also try experimenting with the Animations tab on the PowerPoint Ribbon, which governs how you animate the various shape objects on a slide. + +And also try experimenting with the transition speed to suit the animation to the subject of your presentation. + +* * * + +To control how the slide advances, set the AdvanceOnTime property to msoTrue (for automatic advancing) or msoFalse (for manual advancing). If you use automatic advancing, use the AdvanceTime property to specify the number of seconds. If you want the slide to advance when the user clicks, set the AdvanceOnClick property to msoTrue. 
(You can set both AdvanceOnTime and AdvanceOnClick to msoTrue. The slide advances manually if the user clicks before the AdvanceTime interval has elapsed.) + +To play a preset sound effect with the transition, use the SoundEffect property of the SlideShowTransition object to return the SoundEffect object, use the Name property to specify the name of the sound effect, and then use the Play method to play the sound effect. You can also play any compatible sound file by using the ImportFromFile method of the SoundEffect object and using the FullName argument to specify the path and filename of the sound file. + +PowerPoint 2013 can play any of the following audio-file types: .aiff, .au, .mid, .midi, .mp3, .m4a, .mp4, .wav, or .wma. But be aware that even if a file has one of these filename extensions, it still might not be playable if the proper codec isn't available. That's why everyone urges you to never give a _naked presentation_. Always first do a test run of any presentation on the equipment you'll be using for the official presentation when people are there closely watching you. + +If you want the sound to loop until the next sound, set the LoopSoundUntilNext property of the SlideShowTransition object to msoTrue. The default value is msoFalse. + +The following example sets up a transition for the second slide in the active presentation. The transition uses the Fade effect running at medium speed, sets advancing to either on click or after a delay of 30 seconds, and plays a sound file from an external source without looping: + + With ActivePresentation.Slides(2) + With .SlideShowTransition + .EntryEffect = ppEffectFade + .Speed = ppTransitionSpeedMedium + .AdvanceOnClick = msoTrue + .AdvanceOnTime = msoTrue + .AdvanceTime = 30 + .SoundEffect.ImportFromFile _ + FullName:="d:\Sounds\Crescendo.wav" + .LoopSoundUntilNext = msoFalse + End With + End With + +# Working with Masters + +VBA uses the Master object to represent the various masters that PowerPoint uses: the slide master, title master, handout master, and notes master. + +## Working with the Slide Master + +To work with the slide master for a presentation, use the SlideMaster property of the Presentation object. + +To return the slide master for a slide, use the Master property of the appropriate Slide object. For example, the following statement adds a title to the slide master for the active presentation (if the slide master already has a title, VBA returns an error): + + ActivePresentation.SlideMaster.Shapes.AddTitle.TextFrame.TextRange.Text = _ + "Orientation" + +## Working with the Title Master + +To find out whether a presentation has a title master, check the HasTitleMaster property. If it doesn't, you can use the AddTitleMaster method of the Presentation object to add a title master, as in the following example. If the presentation already has a title master, VBA returns an error when you try to add a title master: + + If Not ActivePresentation.HasTitleMaster Then _ + ActivePresentation.AddTitleMaster + +To return the title master for the presentation, use the TitleMaster property of the Presentation object.
The following example checks that the title master exists and, if it does, formats the date and time to be visible and to use the dMMMyy format with automatic updating: + + With myPresentation + If .HasTitleMaster Then + With .TitleMaster.HeadersFooters.DateAndTime + .Visible = msoTrue + .Format = ppDateTimedMMMyy + .UseFormat = msoTrue + End With + End If + End With + +## Working with the Handout Master + +To work with the handout master, use the HandoutMaster property of the Presentation object to return the Master object. The following example uses the HandoutMaster property of the ActivePresentation object to fill the background of the handout master with a picture: + + With ActivePresentation.HandoutMaster.Background + .Fill.ForeColor.RGB = RGB(255, 255, 255) + .Fill.BackColor.SchemeColor = ppAccent1 + .Fill.UserPicture "d:\igrafx\dawn.jpg" + End With + +## Working with the Notes Master + +To work with the notes master, use the NotesMaster property of the Presentation object to return the Master object. For example, the following statement clears the HeaderFooter objects in the notes master in the first open presentation: + + Presentations(1).NotesMaster.HeadersFooters.Clear + +## Deleting a Master + +You can delete the title master or handout master, but not the slide master or notes master. To delete the title master or handout master, use the Delete method of the Master object. The following example checks that the active presentation has a title master and then deletes it: + + If ActivePresentation.HasTitleMaster Then _ + ActivePresentation.TitleMaster.Delete + +# The Bottom Line + +**Understand PowerPoint's creatable objects.** + +Creatable objects are commonly used objects that can be employed in VBA code without requiring that you qualify them with the Application object. You can leave that word out of your code; it's optional, and rarely used. + +Master It + +Name one of the objects or collections that are creatable in PowerPoint procedures. + +**Work with presentations.** + +You can create a new presentation programmatically, but PowerPoint generates an annoying flicker on most systems while it brings the new presentation into view. You can block this unpleasant, strobelike effect to avoid disturbing your audience. + +Master It + +How do you prevent a newly created presentation from being visible so that you can create and manipulate it in your code without the user seeing the flickering effect onscreen? + +**Work with windows and views.** + +To get the PowerPoint window into the state you want, you'll often need to work with the window and with the view. + +Master It + +PowerPoint uses two types of windows. What are they? + +**Work with slides.** + +Once you have created or opened the presentation you want to manipulate, you can access the slides it contains by using the Slides collection. This collection contains a Slide object for each slide in the presentation. Each slide is identified by its index number, but you can also use object variables to refer to slides or assign names to slides. + +Master It + +Why would you want to assign names to slides rather than using the default index numbers that are automatically assigned to the slides? + +**Work with masters.** + +Before attempting to manipulate a master in your code, you should determine whether the master actually exists in the presentation. + +Master It + +How do you find out whether a presentation has a title master?
+Chapter 25 + +Working with Shapes and Running Slide Shows + +In the previous chapter you learned to work with Presentation objects, Slide objects, and Master objects. In this chapter you'll learn to work with Shape objects to manipulate the contents of slides and with HeaderFooter objects to control the contents of headers and footers. You'll also see how to set up and run a slide show using VBA. + +In this chapter you will learn to do the following: + + * Work with shapes + * Work with headers and footers + * Set up and run a slide show + +# Working with Shapes + +Most of the objects on a typical PowerPoint slide are Shape objects. For example, a title box is a Shape object, as is a picture or a Word table that you've pasted in. You access the Shape objects through the Shapes collection of a Slide object, a SlideRange object, or a Master object. + +## Adding Shapes to Slides + +Different methods of the Shapes collection add the different types of shapes. Table 25.1 lists the Shape objects you can add and the methods and arguments for adding them. The following sections explain the arguments, and a short sketch after the shared-arguments list shows several of them in use. You can find additional details about the Shapes object here: + + + +### Shared Arguments for Adding Shapes + +These are arguments that are shared among various shape-adding methods: + + * BeginX and EndX are required arguments (of the Single data type) that specify the horizontal starting position and ending position of the connector or line, measured in points from the left edge of the slide. + * BeginY and EndY are required Single data arguments that specify the vertical starting point and ending point of the connector or line, measured in points from the top of the slide. + * FileName is a required String argument used to specify the file to be used for creating the object (for example, the media file for creating a media object). + * Left is a required Single argument that specifies the position of the left edge of the shape from the left edge of the slide, measured in points. Top is a required Single argument that specifies the position of the top edge of the shape from the top edge of the slide, measured in points. + * Height is a required Single argument that specifies the height of the shape, measured in points. Width is a required Single argument that specifies the width of the shape, measured in points. + * LinkToFile is an optional argument that you can set to msoTrue to link the picture to its source file. + * NumColumns and NumRows are required Long arguments that specify the number of columns and rows in the table you're adding. + * Orientation is a required argument that specifies the orientation: msoTextOrientationHorizontal (horizontal) or msoTextOrientationVerticalFarEast (vertical). + * SafeArrayOfPoints is a required Variant argument that supplies an array of coordinate pairs that give the vertices and control points of a curve or polyline. The line begins at the first pair of coordinates and ends at the last pair. + * SaveWithDocument is a required argument that controls whether PowerPoint saves the linked picture in the presentation (msoTrue) or not (msoFalse). If you set LinkToFile:=msoFalse, you must set SaveWithDocument:=msoTrue.
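+ +To make these shared arguments concrete, here is a minimal sketch (the picture path is just an example, so point it at any image on your system) that uses the AddPicture method with the Left, Top, Width, Height, LinkToFile, and SaveWithDocument arguments: + + ' Embed a picture 100 points from the left edge and 50 points from the + ' top of the slide, 300 points wide by 200 points high + ActivePresentation.Slides(1).Shapes.AddPicture _ + FileName:="C:\Sample Pictures\Winter.jpg", LinkToFile:=msoFalse, _ + SaveWithDocument:=msoTrue, Left:=100, Top:=50, Width:=300, Height:=200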
+ +Table 25.1 Shapes and the methods for adding them to slides + +**To Add This Shape** | **Use This Method and These Arguments** +---|--- +Callout | AddCallout(Type, Left, Top, Width, Height) +Chart | AddChart(Type, Left, Top, Width, Height) +Chart2 | AddChart2(Style, Type, Left, Top, Width, Height, NewLayout) +Comment | AddComment(Left, Top, Width, Height) +Connector | AddConnector(Type, BeginX, BeginY, EndX, EndY) +Curve | AddCurve(SafeArrayOfPoints) +Label | AddLabel(Orientation, Left, Top, Width, Height) +Line | AddLine(BeginX, BeginY, EndX, EndY) +Media object | AddMediaObject(FileName, Left, Top, Width, Height) +Media object 2 | AddMediaObject2(FileName, LinkToFile, Left, Top, Width, Height) +Media object from embed tag | AddMediaObjectFromEmbedTag(EmbedTag, Left, Top, Width, Height) +OLE object | AddOLEObject(Left, Top, Width, Height, ClassName, FileName, DisplayAsIcon, IconFileName, IconIndex, IconLabel, Link) +Picture | AddPicture(FileName, LinkToFile, SaveWithDocument, Left, Top, Width, Height) +Placeholder | AddPlaceholder(Type, Left, Top, Width, Height) +Polyline | AddPolyline(SafeArrayOfPoints) +Shape | AddShape(Type, Left, Top, Width, Height) +Smart Art | AddSmartArt(Layout, Left, Top, Width, Height) +Table | AddTable(NumRows, NumColumns, Left, Top, Width, Height) +Textbox | AddTextbox(Orientation, Left, Top, Width, Height) +Text Effect | AddTextEffect(PresetTextEffect, Text, FontName, FontSize, FontBold, FontItalic, Left, Top) +Title | AddTitle + +### _Type_ Argument for Adding Shapes + +The Type argument is different for the various methods that use it. Here are some examples: + + * Type for the AddPlaceholder method is a required argument that specifies the type of placeholder to add. The names are self-explanatory: ppPlaceholderBitmap, ppPlaceholderBody, ppPlaceholderCenterTitle, ppPlaceholderChart, ppPlaceholderDate, ppPlaceholderFooter, ppPlaceholderHeader, ppPlaceholderMediaClip, ppPlaceholderMixed, ppPlaceholderObject, ppPlaceholderOrgChart, ppPlaceholderPicture, ppPlaceholderSlideNumber, ppPlaceholderSubtitle, ppPlaceholderTable, ppPlaceholderTitle, ppPlaceholderVerticalBody, ppPlaceholderVerticalObject, ppPlaceholderVerticalTitle + +* * * + +Limitations on Placeholders + +You can use the ppPlaceholderVerticalBody and ppPlaceholderVerticalTitle placeholders only on slides that use vertical text—the slide layouts ppLayoutVerticalText, ppLayoutClipArtAndVerticalText, ppLayoutVerticalTitleAndText, and ppLayoutVerticalTitleAndTextOverChart. + +* * * + + * Type for the AddCallout method is a required argument that specifies the type of callout line to add: msoCalloutOne (a one-segment line that can be vertical or horizontal), msoCalloutTwo (a one-segment line that rotates freely), msoCalloutThree (a two-segment line), or msoCalloutFour (a three-segment line). + * Type for the AddShape method is a required argument that specifies the type of AutoShape to add. There are too many constants to list here, but most are easy to identify from their names. For example, msoShapeHeart is a heart shape, msoShapeLightningBolt gives a lightning bolt, and so on. To see a list of the constants, search for the AddShape method in the VBA Editor Help file, and then click the link for the msoAutoShapeType entry. Or type **msoautoshapetype** in the editor's Object Browser search field.
+ * Type for the AddDiagram method is a required argument that specifies the diagram type: msoDiagramCycle (a cycle diagram), msoDiagramOrgChart (an org chart), msoDiagramPyramid (a pyramid diagram), msoDiagramRadial (a radial diagram), msoDiagramTarget (a target diagram), or msoDiagramVenn (a Venn diagram). + +* * * + +**What Is MSO? Practical Advice for the Perplexed Programmer** + +You may have noticed that many of the enumerations and constants you're running into in PowerPoint are prepended (the opposite of _appended_ ) by _mso_. This strange little acronym can stand for several things: Martha Stewart's stock market name (Martha Stewart Omnimedia), or Milwaukee Symphony Orchestra, or Microsoft Office. In this case, it stands for Microsoft Office. And why it is prepended to PowerPoint enumerations and not to other Office 2013 enumerations is just one of those mysteries that keep all us programmers on our toes. After all, even those of us who are semiconscious are likely aware that we're using VBA in Microsoft Office. + +Here's another example of a mystery. Throughout the decades of BASIC programming history, and in all other versions of BASIC and VBA, you use the words True and False to mean true and false. That makes a certain kind of sense when you think about it. However, in PowerPoint 2013, you can also use the built-in constants msoTrue and msoFalse to, for example, set the Visible property of a footer on a slide. Luckily, these constants are optional. You can still use the traditional True and False. There is no difference between Microsoft Office's truth and truth in general. (I'm speaking here strictly in the context of these constants.) + +_Mso_ also appears in the MsoTriState variable type—that bizarro uber-Boolean type that you ran into in the previous chapter. You remember it: it's like the famous quantum mechanical tri-state cat, which can be alive, dead, or a mixture of the two. + +In my opinion, you should not worry much over these weird usages such as msoTriStateMixed; tri-state entities have no precedent outside electronic chip diagrams and advanced physics. Think of them as Boolean (true or false). And remember that although you can _read_ the third (the mixed true and false) status of a tri-state type, you can't _set_ (assign) anything other than the traditional True or False values to it. + +* * * + +### Arguments Specific to the _AddTextEffect_ Method + +The following arguments apply only to the AddTextEffect method: + + * PresetTextEffect is a required argument that specifies the preset text effect to use. These preset text effects are identified by the constants msoTextEffect1 through msoTextEffect30, which correspond to the order in which the samples appear in the WordArt Gallery dialog box (1 through 6 are the first row, 7 through 12 the second row, and so on). + * Text is a required String argument that specifies the text to use in the WordArt object. + * FontBold is a required argument that you set to msoTrue to make the font bold or msoFalse to make it not bold. + * FontItalic is a required argument that you set to msoTrue to make the font italic and msoFalse to make it not italic. + * FontName is a required String argument that specifies the name of the font to use. + * FontSize is a required Single argument that specifies the font size to use. 
+ +### Arguments Specific to the _AddOLEObject_ Method + +The following arguments apply only to the AddOLEObject method (a short sketch later in this section shows several of them in use): + + * ClassName is an optional String argument that specifies the program ID (the ProgID) or OLE long class name for the object. You must use either ClassName or FileName, but not both. In most cases, it's easiest to use FileName. + * DisplayAsIcon is an optional argument that you can set to msoTrue to display the OLE object as an icon rather than as itself (the default). + * IconFileName is an optional String argument that you can use with DisplayAsIcon:=True to specify the filename of the icon you want to display for the object. + * IconIndex is an optional Integer argument that specifies the index of the icon to use within the icon file specified by IconFileName. If you omit the IconIndex argument, VBA uses the first icon in the icon file, the icon at position 0. + * IconLabel is an optional String argument that you can use to specify the caption (or label) to display under the icon. + * Link is an optional argument that you can set to msoTrue to link the OLE object to its source file when you use the FileName argument. Link must be msoFalse when you use ClassName to specify a class name. + +### An Example of Using the _AddShape_ Method + +The following statement uses the AddShape method to add a bent up-arrow to the upper-right corner of the last slide in the active presentation. Before executing this example, click the File tab on PowerPoint's Ribbon, then click the New option in the left pane to see some of the available templates and themes. Double-click one of the templates so you'll have some slides to work with in this example. (In versions of PowerPoint prior to 2013, you'll have to open the Sample Templates folder before choosing a template.) + +Open the Visual Basic Editor by pressing Alt+F11. Locate the project in the Project window, right-click on its name (it will be boldface), and choose Insert ⇒ Module. Type the following into the new module, and then press F5 with your blinking cursor inside this subroutine to execute the code and see the effect: + + Sub test() + + ActivePresentation.Slides(ActivePresentation.Slides.Count) _ + .Shapes.AddShape Type:=msoShapeBentUpArrow, Left:=575, Top:=10, _ + Width:=150, Height:=75 + + End Sub + +To see what happened, look at the last slide and notice that a shape has been added to it—a bent up-arrow. + +### An Example of Using the _AddTextEffect_ Method + +The following example uses the AddTextEffect method to superimpose a WordArt item onto the third slide. Ensure that you have at least three slides by pressing Ctrl+M a few times to add some new slides. + +This code draws the text _Questions_ & _Answers_ (on three lines) on the slide. This WordArt item is instructed in our code to use 54-point bold Garamond: + + ActivePresentation.Slides(3).Shapes.AddTextEffect _ + PresetTextEffect:=msoTextEffect14, _ + Text:="Questions" + Chr$(CharCode:=13) + _ + "&" + Chr$(CharCode:=13) + "Answers", _ + FontName:="Garamond", FontSize:=54, FontBold:=msoTrue, _ + FontItalic:=msoFalse, Left:=230, Top:=125 + +There are 30 msoTextEffect constants you can experiment with. They range from msoTextEffect1 to msoTextEffect30. msoTextEffect14 is nice; it provides a kind of metallic effect.
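+ +### An Example of Using the _AddOLEObject_ Method + +The AddOLEObject method, whose arguments were described earlier in this chapter, follows the same pattern. The following is a minimal sketch (the workbook path is hypothetical, so substitute a file that exists on your system); it embeds an Excel workbook on the first slide, displaying it as an icon with a custom label: + + ' Embed a workbook as an icon; Link:=msoFalse embeds it rather than + ' linking it to the source file + ActivePresentation.Slides(1).Shapes.AddOLEObject _ + Left:=100, Top:=100, Width:=300, Height:=200, _ + FileName:="C:\Data\Budget.xlsx", DisplayAsIcon:=msoTrue, _ + IconLabel:="Budget Workbook", Link:=msoFalse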
+ +### An Example of Using the _AddTextbox_ Method + +The following example adds a text box to the second slide in the active presentation and assigns text to it: + + Dim myTextBox As Shape + + With ActivePresentation.Slides(2) + Set myTextBox = .Shapes.AddTextbox _ + (Orientation:=msoTextOrientationHorizontal, Left:=100, Top:=50, _ + Width:=400, Height:=100) + myTextBox.TextFrame.TextRange.Text = "Corrective Lenses" + End With + +## Deleting a Shape + +To delete a shape, use the Delete method with the appropriate Shape object. For example, the following statement deletes the first Shape object on the second slide in the active presentation: + + ActivePresentation.Slides(2).Shapes(1).Delete + +## Selecting All Shapes + +To select all the shapes on a slide, use the SelectAll method of the appropriate Shapes collection. For example, the following statement selects all the Shape objects on the first slide in the active presentation: + + ActivePresentation.Slides(1).Shapes.SelectAll + +## Repositioning and Resizing a Shape + +To reposition a shape, set its Left property (to specify the distance in points from the left edge of the slide to the left edge of the shape) and its Top property (to specify the distance in points from the top edge of the slide to the top edge of the shape). + +To change the size of a shape, set its Width and Height properties to the appropriate number of points. + +For example, the following statements position the first shape on the first slide in the active presentation 200 points from the left side of the slide and 100 points from its top and make the shape 300 points wide by 200 points high: + + With ActivePresentation.Slides(1).Shapes(1) + .Left = 200 + .Top = 100 + .Width = 300 + .Height = 200 + End With + +You can also move a shape relative to its current location by using the IncrementLeft method and the IncrementTop method. Rotate it by using the IncrementRotation method. Note that these methods don't specify absolute locations within a slide; instead, they work relative to the shape's current position or rotation. Each of these methods takes an Increment argument: + + * For the IncrementLeft and IncrementTop methods, the Increment argument specifies the number of points to move the shape. A negative number moves the shape to the left or upward, while a positive number moves the shape to the right or downward. + * For the IncrementRotation method, the Increment argument specifies the number of degrees to rotate the shape. A positive number rotates the shape clockwise; a negative number rotates the shape counterclockwise. + +The following example works with the first shape on the third slide of the active presentation, moving it 100 points to the left and 200 points down and rotating it 90 degrees counterclockwise: + + With ActivePresentation.Slides(3).Shapes(1) + .IncrementLeft Increment:=-100 + .IncrementTop Increment:=200 + .IncrementRotation Increment:=-90 + End With + +## Copying Formatting from One Shape to Another + +Often, it's useful to be able to apply the same formatting to multiple shapes. When one shape has the formatting you want, you can use the PickUp method of the Shape object to copy the formatting from that shape and then use the Apply method to apply that formatting to another shape. + +Neither the PickUp method nor the Apply method uses any arguments.
The following example copies the formatting from the first shape on the second slide in the active presentation and applies it to the third shape on the fourth slide: + + With ActivePresentation + .Slides(2).Shapes(1).PickUp + .Slides(4).Shapes(3).Apply + End With + +## Working with Text in a Shape + +The text within a shape is contained in a TextRange object, which itself is contained in a TextFrame object. To work with the text in a shape, you use the TextFrame property of the Shape object to return the TextFrame object and then use the TextRange property of the TextFrame object to return the TextRange object. Got it? + +Within the TextRange object, the Text property contains the text, the Font object contains the font formatting, the ParagraphFormat object contains the paragraph formatting, and the ActionSettings collection contains the action settings for the text range. + +### Finding Out Whether a Shape Has a Text Frame + +Not every shape has a text frame, so prior to manipulating text it's a good idea to first determine whether the shape you're dealing with in fact even _has_ a text frame. + +To do so, check that the HasTextFrame property of the Shape object is msoTrue, as in this example: + + If ActivePresentation.Slides(1).Shapes(1).HasTextFrame = msoTrue Then + MsgBox "The shape contains a text frame." + End If + +You may also need to check whether the text frame contains text. To do so, check that the HasText property of the TextFrame object is msoTrue. Here's an example: + + With ActivePresentation.Slides(1).Shapes(1).TextFrame + If .HasText = msoTrue Then MsgBox .TextRange.Text + End With + +### Returning and Setting the Text in a Text Range + +To return (read) or set (specify) the text in a text range, you can simply use the Text property of the TextRange object. For example, the following statement sets the text in the first shape on the fourth slide in the presentation identified by the object variable myPresentation to Strategic Planning Meeting: + + Sub Test() + + Dim myPresentation As Presentation + Set myPresentation = Presentations(1) + + myPresentation.Slides(4).Shapes(1).TextFrame.TextRange.Text _ + = "Strategic Planning Meeting" + + End Sub + +You can also return parts of the text by using the Paragraphs method, the Sentences method, the Lines method, the Words method, the Characters method, or the Runs method. The syntax for these methods is shown here, using the Paragraphs method as the example: + + _expression_.Paragraphs(Start, Length) + +The components of the syntax are as follows: + + * _expression_ is a required expression that returns a TextRange object. + * Start is an optional Long argument that specifies the first item (paragraph, sentence, line, word, character, or text run) to return. + * Length is an optional Long argument that specifies how many items to return—for example, two paragraphs, three sentences, or four words. + +* * * + +Understanding Text Runs + +A _text run_ is a sequence of characters that have the same font formatting. Text runs can be useful for picking out parts of text ranges that are formatted in a particular way.
+ +* * * + +The following code example returns the second through fifth words (the four words starting with the second word) from the first shape on the first slide in the active presentation: + + MsgBox ActivePresentation.Slides(1).Shapes(1).TextFrame _ + .TextRange.Words(Start:=2, Length:=4) + +The next code example sets the text of the second paragraph in the second shape on the sixth slide in the presentation identified by the object variable myPresentation to VP of Business Development: + + myPresentation.Slides(6).Shapes(2).TextFrame.TextRange _ + .Paragraphs(Start:=2, Length:=1).Text = "VP of Business Development" + +### Formatting the Text in a Text Range + +To format the text in a text range, use the ParagraphFormat object to control the paragraph formatting (including the alignment and the space before and after) and the Font object to control the font formatting. + +These are the most useful properties of the ParagraphFormat object: + + * The Alignment property controls the alignment. Use ppAlignLeft for left alignment, ppAlignCenter for centering, ppAlignJustify for justified alignment, ppAlignDistribute for distributed alignment (justified using all available space), or ppAlignRight for right alignment. + * The Bullet property returns the BulletFormat object, which represents the bullet formatting. See the next section for details. + * The LineRuleBefore property, the LineRuleAfter property, and the LineRuleWithin property determine whether the measurements set by the SpaceBefore property, the SpaceAfter property, and the SpaceWithin property use lines (msoTrue) or points (msoFalse). + * The SpaceBefore property and the SpaceAfter property control the amount of space before and after each paragraph. The SpaceWithin property controls the amount of space between baselines in a paragraph. All measurements are in points. + +The following example sets left alignment, 18 points of spacing before and after paragraphs, and 12 points of spacing between lines for the second shape on the slide identified by the object variable mySlide: + + Dim mySlide As Slide + Set mySlide = Presentations(1).Slides(2) + + With mySlide.Shapes(2).TextFrame.TextRange.ParagraphFormat + .Alignment = ppAlignLeft + .LineRuleAfter = msoFalse + .SpaceAfter = 18 + .LineRuleBefore = msoFalse + .SpaceBefore = 18 + .LineRuleWithin = msoFalse + .SpaceWithin = 12 + End With + +### Formatting the Bullets for a Text Range + +Bullets and numbers are vital to the lists used in many PowerPoint slides. To control whether and how bullets and numbers appear, use the Bullet property of the TextRange object to return the BulletFormat object, and then work with the BulletFormat object's properties and methods. + +To make bullets and numbers visible, set the Visible property of the BulletFormat object to msoTrue; to hide bullets and numbers, set Visible to msoFalse. + +To specify which type of bullet or numbering to use, set the Type property of the BulletFormat object to ppBulletUnnumbered (for a bullet), ppBulletNumbered (for numbers), ppBulletPicture (for a picture), or ppBulletNone (for no bullet). + +* * * + +Another Mixed Data Type + +The Type property of the BulletFormat object returns the value ppBulletMixed when the selection includes multiple types of bullets. You can't set Type to ppBulletMixed. You can only read it. + +* * * + +To specify the bullet character, use the Character property and the character number.
You can find out the character number from the Symbol dialog box or the Character Map applet, which you can run by pressing the Windows key to get to the Modern view in Windows 8 and then typing **Character Map**. + +Getting to this applet in previous versions of Windows is somewhat clumsier (yay! Windows 8 demonstrates that it is an improvement in some ways). For Windows 7 and earlier, click Start ⇒ All Programs ⇒ Accessories ⇒ System Tools ⇒ Character Map. + +Unfortunately, the character codes are given in the _hexadecimal_ numbering system. If you look up the check-box symbol for the Wingdings font that's used in the following code example, the Character Map utility doesn't say 254 in our human decimal numbering system. Instead, it says Character Code: 0xFE (the hex way of expressing 254). + +This tedious holdover from the early days of computing serves no particular purpose in character codes, but you have to deal with it. Why? Because some people think that pointless complexity is cute, or it helps make programming seem somehow more mysterious than it in fact is. This type of thing can also help with job security because managers will usually be dazzled by what they assume are complicated programming mysteries like hex. Hex (short for hexadecimal) is based on 16 digits: 0 1 2 3 4 5 6 7 8 9 A B C D E F. People with eight fingers on each hand have an advantage here. + +To solve the hex-character-code problem, you can either use a calculator that can translate between hex and decimal or just prepend the characters &H in front of the hex code and let VBA translate it for you when executing your procedure. + +For example, in the following code example, I used 254 (a decimal number) because I can translate hex (well, my HP programming calculator can). But if you can't, or more likely don't want to be bothered, just click the character you want to use in the Character Map dialog box and then look at its hex code in the lower left of the dialog box. In this example, it's listed as 0xFE (which means, you guessed it, decimal 254). Since the Wingdings font has only 256 characters, ignore the _0x_ part and use the _FE_, like this, in your code: + + .Character = &HFE + +Use the Font property to specify the font name, size, and color. The following example sets the bullet for the first shape on the slide identified by the object variable mySlide to Wingdings character 254, a check box, using the color white, which is RGB(255, 255, 255), and 44-point size: + + With mySlide.Shapes(1).TextFrame.TextRange.ParagraphFormat.Bullet + .Type = ppBulletUnnumbered + .Character = 254 + With .Font + .Name = "Wingdings" + .Size = 44 + .Color = RGB(255, 255, 255) + End With + End With + +Color is of course an important element in any design. You can easily find out which RGB values you need to employ for various colors by visiting this web page: + + + +To use your own custom picture as a bullet, set the Type property of the BulletFormat object to ppBulletPicture and then use the Picture method with the Picture argument, a required String argument that specifies the path and filename of the file to use as the bullet. You can use most common types of graphics files, including .bmp, .eps, .gif, .jpg, .jpeg, .pcx, .png, .tiff, and .wmf files.
The following example uses the file Face1.jpg stored in the folder Z:\Public\Pictures as the bullet for the first shape on the slide identified by the object variable mySlide:

    With mySlide.Shapes(1).TextFrame.TextRange.ParagraphFormat.Bullet
        .Type = ppBulletPicture
        .Picture Picture:="Z:\Public\Pictures\Face1.jpg"
    End With

## Animating a Shape or a Range of Shapes

To animate a shape or a range of shapes, use the AnimationSettings property of the Shape object or the ShapeRange object to return the AnimationSettings object.

To specify the animation effect to use, set the EntryEffect property to the constant for the effect. Let's see how to figure out which animation effect looks best for the shape you're working with. First, click a shape in a slide to select the shape. Now display the Add Animation pane.

There are too many animation constants to list here, but they are easy to understand from the names listed in the Add Animation pane. To open this pane, click the Animations tab in PowerPoint's Ribbon, then click the Add Animation icon in the Advanced Animation section.

A pane drops down in PowerPoint's window, as shown in Figure 25.1.

Figure 25.1 Here's a selection of animation effects available for use in PowerPoint.

As usual with animations, less is more. Choose subtle effects unless you're presenting to an audience of louts who will appreciate vulgarity.

To write code that creates an animation, set the Animate property to msoTrue. (To turn off an animation, set Animate to msoFalse.)

To control how the text in a shape is animated, set the TextLevelEffect property to ppAnimateLevelNone (no animation), ppAnimateByFirstLevel, ppAnimateBySecondLevel, ppAnimateByThirdLevel, ppAnimateByFourthLevel, ppAnimateByFifthLevel, or ppAnimateByAllLevels.

If you set TextLevelEffect to any value other than ppAnimateByAllLevels or ppAnimateLevelNone, you can use the TextUnitEffect property to specify how to animate the text. Use ppAnimateByParagraph to animate by paragraph, ppAnimateByWord to animate by word, or ppAnimateByCharacter to animate by character.

To reverse the order of the animation, set the AnimateTextInReverse property to msoTrue. (The default is msoFalse.)

To control how the animation advances, set the AdvanceMode property to ppAdvanceOnTime (for automatic advancing using a timing) or ppAdvanceOnClick (for manual advancing). If you use automatic advancing, use the AdvanceTime property to specify the number of seconds to wait before advancing.

To play a built-in sound effect with the transition, use the SoundEffect property of the AnimationSettings object to return the SoundEffect object, use the Name property to specify the name of the sound effect, and then use the Play method to play the sound effect. You can also play your own sound file by using the ImportFromFile method of the SoundEffect object and using the FullName argument to specify the path and filename of the sound file.

To control how a media clip is played, use the PlaySettings property of the AnimationSettings object to return the PlaySettings object. For example, if you want the sound to loop until the next sound, set the LoopSoundUntilNext property of the PlaySettings object within the AnimationSettings object to msoTrue. The default value is msoFalse.

You can find all these options by pressing F2 to display the Object Browser in the VBA Editor and then searching for them.
For example, search for ppEntryEffect to see all possible constants for the various possible lead-in animations.

The following example applies a custom animation to the first shape on the slide identified by the object variable mySlide. The animation uses the entry effect Fly In From Right, plays a sound effect from a file, animates the text by first-level paragraphs and by whole paragraphs, and advances when the user clicks:

    Dim mySlide As Slide
    Set mySlide = Presentations(1).Slides(2)

    With mySlide.Shapes(1).AnimationSettings
        .EntryEffect = ppEffectFlyFromRight
        .AdvanceMode = ppAdvanceOnClick
        .SoundEffect.ImportFromFile FullName:="D:\Media\Whistle4.wav"
        .TextLevelEffect = ppAnimateByFirstLevel
        .TextUnitEffect = ppAnimateByParagraph
    End With

To test this (or other code examples you try in PowerPoint), just press F5 in the main PowerPoint window, and then repeatedly click the screen to activate the various transitions and effects. Press Esc when you're done.

# Working with Headers and Footers

PowerPoint uses HeaderFooter objects to represent the headers, footers, slide numbers, and date and time on slides. The HeaderFooter objects are organized into the HeadersFooters collection, which you access through the HeadersFooters property of the Master object, a Slide object, or a SlideRange collection.

Be warned: Before you can execute the following code examples, you must first _add a footer to the slides in your active presentation_. The code examples expect to modify an existing footer, not to create it (unlike in Excel, where a new header or footer _will_ be created automatically).

So, before executing these examples, click the Insert tab on PowerPoint's Ribbon, and then in the Text area, click the Header And Footer button to open the Header And Footer dialog box. In this dialog box, click the Date And Time check box and the Footer check box. Then click the Apply To All button.

## Returning the Header or Footer Object You Want

To access the object you want, use the appropriate property of the HeadersFooters collection:

 * Use the DateAndTime property to return the date and time.
 * Use the Footer property to return the footer itself.
 * Use the Header property to return the header on a notes page or handout. Slides themselves can't have a header.
 * Use the SlideNumber property to return the slide number on a slide or the page number on a notes page or a handout.

The following example uses the Footer property to set the text of the HeaderFooter object of the first slide in the active presentation:

    ActivePresentation.Slides(1).HeadersFooters.Footer.Text = "Sentence 102"

## Displaying or Hiding a Header or Footer Object

To display the HeaderFooter object, set its Visible property to msoTrue (or just True). To hide the HeaderFooter object, set its Visible property to msoFalse. For example, the following statement hides the footer on the fifth slide in the active presentation:

    ActivePresentation.Slides(5).HeadersFooters.Footer.Visible = False

## Setting the Text in a Header or Footer

To set the text that you want in a HeaderFooter object, assign a string containing the text to the object's Text property. For example, the following statement sets the text of the footer of the fifth slide in the active presentation to Confidential:

    ActivePresentation.Slides(5).HeadersFooters.Footer.Text = "Confidential"

If you executed the previous example code, executing this example will trigger an error message.
That's because the previous code made the footer on that slide (#5) invisible. Before you can set the footer's text, the footer must first be visible:

    ActivePresentation.Slides(5).HeadersFooters.Footer.Visible = **True**
    ActivePresentation.Slides(5).HeadersFooters.Footer.Text = "Confidential"

## Setting the Format for Date and Time Headers and Footers

If your slides, notes pages, or handouts use dates and times in their footers or headers, use the Format property to specify how the dates and times should appear. Table 25.2 lists the constants you can use.

Table 25.2 Format property constants for date and time headers and footers

**Format** | **Example**
---|---
ppDateTimeddddMMMMddyyyy | Saturday, October 05, 2013
ppDateTimedMMMMyyyy | 5 October 2013
ppDateTimedMMMyy | 5-Oct-13
ppDateTimeFormatMixed | 10/5/2013
ppDateTimeHmm | 10:17
ppDateTimehmmAMPM | 10:17AM
ppDateTimeHmmss | 10:17:16
ppDateTimehmmssAMPM | 10:17:16AM
ppDateTimeMdyy | 10/5/2013
ppDateTimeMMddyyHmm | 10/5/2013 10:17
ppDateTimeMMddyyhmmAMPM | 10/5/2013 10:17AM
ppDateTimeMMMMdyyyy | October 5, 2013
ppDateTimeMMMMyy | October 13
ppDateTimeMMyy | Oct-13

Set the UseFormat property of the HeaderFooter object to msoTrue if you want the date and time to be updated automatically. Set UseFormat to msoFalse if you want the date and time to remain unchanged.

The following example displays the current date in the format Friday, April 12, 2013:

    Sub SetFooter()

        Dim objPresTation As Presentation
        Set objPresTation = Application.ActivePresentation

        With objPresTation.Slides(2).HeadersFooters.DateAndTime

            .UseFormat = True

            **.Format = ppDateTimeddddMMMMddyyyy**

        End With

    End Sub

# Setting Up and Running a Slide Show

Not only can you assemble and format a slide show using VBA; you can also run it using VBA. To set up a slide show, use the SlideShowSettings property of the Presentation object to return the SlideShowSettings object. When you run the slide show, VBA creates a SlideShowWindow object, which you can then manipulate to control the slide show.

## Controlling the Show Type

To specify the type of show, set the ShowType property of the SlideShowSettings object to ppShowTypeSpeaker (for a standard full-screen presentation presented by a speaker), ppShowTypeKiosk (for a kiosk presentation), or ppShowTypeWindow (for a "browsed by an individual" presentation that appears in a window). For a show in a window, you can use the Left and Top properties to specify the position of the upper-left corner of the window and the Height and Width properties to specify its size.

To control whether animation and narration are used, set the ShowWithAnimation property and the ShowWithNarration property of the SlideShowSettings object to msoTrue or msoFalse.

To control whether the presentation loops until stopped, set the LoopUntilStopped property of the SlideShowSettings object to msoTrue or msoFalse.

To control how the presentation advances, set the AdvanceMode property to ppSlideShowManualAdvance (for manual advancing), ppSlideShowUseSlideTimings (for automatic advancing using timings already set), or ppSlideShowRehearseNewTimings (to rehearse new timings while the show plays).
The following example sets the active presentation running as a kiosk presentation that will advance automatically using its timings and loop until it is stopped:

    With ActivePresentation.SlideShowSettings
        .LoopUntilStopped = msoTrue
        .AdvanceMode = ppSlideShowUseSlideTimings
        .ShowType = ppShowTypeKiosk
        .Run
    End With

This next example sets the presentation named Corporate.pptm running in speaker (full-screen) mode, sizing the image to 800×600 pixels and positioning it at the upper-left corner of the screen. The show uses manual advancing:

    With Presentations("Corporate.pptm").SlideShowSettings
        .LoopUntilStopped = msoFalse
        .ShowType = ppShowTypeSpeaker
        .AdvanceMode = ppSlideShowManualAdvance
        With .Run
            .Height = 600
            .Width = 800
            .Left = 0
            .Top = 0
        End With
    End With

## Creating a Custom Show

Custom shows within a presentation are represented by the NamedSlideShows collection within the SlideShowSettings object. Use the NamedSlideShows property of the SlideShowSettings object to return the NamedSlideShows collection.

To create a custom show, use the Add method of the NamedSlideShows collection. The syntax is as follows:

    _expression_.Add(Name, SafeArrayOfSlideIDs)

Here, _expression_ is a required expression that returns a NamedSlideShows object. Name is a required String argument that specifies the name to assign to the new custom show. SafeArrayOfSlideIDs is a required Variant argument that specifies the IDs of the slides to include in the custom show.

For example, the following statements declare an array of the Long data type; assign to it the slide IDs of slides 2, 4, 5, and 10 from the open presentation named Corporate.pptm; and create a new custom show named Short Show from that array:

    Dim myArray(4) As Long
    With Presentations("Corporate.pptm")
        myArray(1) = .Slides(2).SlideID
        myArray(2) = .Slides(4).SlideID
        myArray(3) = .Slides(5).SlideID
        myArray(4) = .Slides(10).SlideID
        .SlideShowSettings.NamedSlideShows.Add Name:="Short Show", _
            SafeArrayOfSlideIDs:=myArray
    End With

## Deleting a Custom Show

To delete a custom show, use the Delete method with the appropriate NamedSlideShow object. For example, the following statement deletes the custom show named Overview from the active presentation:

    ActivePresentation.SlideShowSettings.NamedSlideShows("Overview").Delete

## Starting a Slide Show

To start a slide show using the whole presentation, use the Run method of the SlideShowSettings object. For example, the following statement starts the slide show running in the presentation identified by the object variable myPresentation:

    myPresentation.SlideShowSettings.Run

To show only a range of slides from a presentation, set the RangeType property of the SlideShowSettings object to ppShowSlideRange, use the StartingSlide property of the SlideShowSettings object to specify the first slide and the EndingSlide property to specify the last slide, and then use the Run method to run the presentation. The following example shows slides 4 through 8 in the presentation named Corporate.pptm:

    With Presentations("Corporate.pptm").SlideShowSettings
        .RangeType = ppShowSlideRange
        .StartingSlide = 4
        .EndingSlide = 8
        .Run
    End With

To start running a custom show, set the RangeType property of the SlideShowSettings object to ppShowNamedSlideShow, use the SlideShowName property to specify the name of the custom show, and then use the Run method to run the custom show.
The following example shows the custom show named Short Show in the active presentation:

    With ActivePresentation.SlideShowSettings
        .RangeType = ppShowNamedSlideShow
        .SlideShowName = "Short Show"
        .Run
    End With

When you start a slide show, VBA creates a SlideShowWindow object representing the running show. You can access the SlideShowWindow object either through the SlideShowWindows collection (a creatable object that contains a SlideShowWindow object for each open slide show) or through the SlideShowWindow property of the Presentation object. If you know which presentation is running, it's easier to go through the appropriate Presentation object.

## Changing the Size and Position of a Slide Show

To find out whether a slide show is displayed full screen or in a window, check the IsFullScreen property of the SlideShowWindow object. If the IsFullScreen property returns -1, the presentation is full screen; if the property returns 0, the presentation is running in a window.

To set the height and width of the slide-show window in pixels, use the Height property and the Width property. To set its position, use the Top property to specify the distance in pixels of the top edge of the presentation from the top of the window or screen, and the Left property to specify the distance in pixels of the left edge of the presentation from the left edge of the window or the screen.

## Moving from Slide to Slide

Apart from controlling the position and size of the presentation, most of the actions you can take with a running presentation involve the View object. To find out which slide is displayed, return the CurrentShowPosition property:

    MsgBox ActivePresentation.SlideShowWindow.View.CurrentShowPosition

To display the first slide in the presentation, use the First method. To display the last slide, use the Last method:

    ActivePresentation.SlideShowWindow.View.First
    ActivePresentation.SlideShowWindow.View.Last

To display the next slide, use the Next method. To display the previous slide, use the Previous method. Here's an example:

    ActivePresentation.SlideShowWindow.View.Previous

To display a particular slide in the slide show, use the GotoSlide method of the View object, using the Index argument to specify the slide number. For example, the following statement displays slide 5 in the first open slide-show window:

    Application.SlideShowWindows(1).View.GotoSlide Index:=5

## Pausing the Show and Using White and Black Screens

To display a white screen, set the State property of the View object to ppSlideShowWhiteScreen; to display a black screen, set it to ppSlideShowBlackScreen:

    ActivePresentation.SlideShowWindow.View.State = ppSlideShowWhiteScreen
    ActivePresentation.SlideShowWindow.View.State = ppSlideShowBlackScreen

To toggle the black screen or white screen off and start the show running again, set the State property to ppSlideShowRunning.

To pause the presentation, set the State property of the View object to ppSlideShowPaused. To start the show again, set the State property to ppSlideShowRunning, as in this example:

    With ActivePresentation.SlideShowWindow.View
        .State = ppSlideShowPaused
        .State = ppSlideShowRunning
    End With

## Starting and Stopping Custom Shows

To start a custom show running, use the GotoNamedShow method and use the SlideShowName argument to specify the name of the custom show.
For example, the following statement starts the custom show named New Show running:

    SlideShowWindows(1).View.GotoNamedShow SlideShowName:="New Show"

To exit a custom show, use the EndNamedShow method and then use the Next method to advance the presentation. PowerPoint then displays the first slide in the full presentation:

    With ActivePresentation.SlideShowWindow.View
        .EndNamedShow
        .Next
    End With

## Exiting a Slide Show

To exit the slide show, use the Exit method of the SlideShowWindow object's View object. For example, the following statement exits the slide show in the active presentation:

    ActivePresentation.SlideShowWindow.View.Exit

# The Bottom Line

**Work with shapes.**

PowerPoint VBA provides many ways to access and manipulate shapes.

**Master It**

Describe what the following line of code does:

    ActivePresentation.Slides(2).Shapes(1).Delete

**Work with headers and footers.**

Using PowerPoint headers and footers can be a convenient way to provide continuity for presentations as well as to identify each element.

**Master It**

In this chapter, you worked with several examples showing how to manipulate footers for slides. Why were there no examples illustrating how to manipulate headers for slides?

**Set up and run a slide show.**

To create a custom slide show, you use the Add method of the NamedSlideShows collection.

**Master It**

The syntax when using the Add method of the NamedSlideShows collection is

    _expression_.Add(Name, SafeArrayOfSlideIDs)

Explain what the four components of this line of code are and do.

Chapter 26

Understanding the Outlook Object Model and Key Objects

In this chapter, you'll begin to come to grips with the Outlook object model and start using VBA to manipulate Outlook. You'll learn where Outlook stores VBA items, meet the VBA objects for Outlook's creatable objects and main user-interface items, and work with some of the main Outlook objects. You'll explore a variety of objects, from the Application object that represents the entire application through the objects that represent individual messages, calendar items, and tasks. You'll also learn how to search programmatically.

In this chapter you will learn to do the following:

 * Work with the Application object
 * Work with messages
 * Work with calendar items
 * Work with tasks and task requests
 * Search for items

# Getting an Overview of the Outlook Object Model

Many people find Outlook harder to work with programmatically than other Office applications, so it's particularly helpful to explore the Outlook object model to see which objects Outlook uses and how they're related. Above all, when working with objects, seeing VBA code examples in the Help system or online can be invaluable.

You can find the Outlook object-model reference by following these steps:

1. Launch or switch to Outlook, and then press Alt+F11 to launch or switch to the VBA Editor.

2. Move your cursor to a blank space in the code window (to avoid context-sensitive help).

3. Press F1 in the editor to launch MSDN (the Microsoft Developer Network). At the time of this writing, you'll see a message that the page you requested cannot be found. This is because Outlook has the incorrect link built into its Help feature. Never mind. We just want to use the Bing search anyway.

4. In the Bing search field, type **outlook 2013 object model** and press Enter.

5. Click the link _Object Model_ ( _Outlook 2013 Developer Reference_ ).
You'll now have access to the whole collection of Outlook 2013 VBA syntax specifications, useful descriptions, and code examples (one of which is shown in Figure 26.1).

Figure 26.1 Sample code found in the Outlook object-model reference will help you write your own VBA code.

## Understanding Where Outlook Stores VBA Macros

As you've seen earlier in this book, Word and Excel let you store VBA projects either in a global location (the Normal.dotm template in Word or the Personal Macro Workbook in Excel) or in individual templates or document files. PowerPoint lets you store VBA projects in presentation files and templates.

Outlook, by contrast, doesn't let you store VBA projects in individual items (such as Outlook's email messages or contacts). Instead, Outlook saves all projects in a single VBA project called VbaProject.OTM, which is stored in the following folder (instead of _Richard_ in this path, substitute your username):

    C:\Users\ _Richard_ \AppData\Roaming\Microsoft\Outlook

## Understanding Outlook's Most Common Creatable Objects

In Outlook VBA, the Application object represents the entire Outlook application, so you can access any Outlook object by going through the Application object. However, Outlook also exposes various creatable objects, allowing you to reach some of the objects in its object model without explicitly going through the Application object. Recall that "creatable" merely means that when you're writing code involving these objects, using the word Application is optional. You can get the same result by using either of the following versions:

    Application.Explorers

or more simply,

    Explorers

Here is a list of Outlook's most common creatable objects; you'll work with most of them in more detail later in this chapter and in the next chapter:

 * The Explorers collection contains an Explorer object for each window that displays the contents of a folder.
 * The Inspectors collection contains an Inspector object for each window that's open displaying an Outlook item.
 * The COMAddIns collection contains a COMAddIn object for each COM (Component Object Model) add-in loaded in Outlook.
 * The Reminders collection contains a Reminder object for each reminder.

The most prominent objects in the Outlook user interface are represented in VBA by items whose names are descriptive of their purpose, such as these:

 * The MailItem object represents a mail item.
 * The ContactItem object represents a contact.
 * The TaskItem object represents a task.
 * The AppointmentItem object represents an appointment.
 * The JournalItem object represents a journal entry.
 * The NoteItem object represents a note.

You'll learn how to work with these objects later in this chapter and in the next chapter.

# Working with the Application Object

You can have only one instance of Outlook running at a time. (By contrast, you can run multiple instances of Word or Excel at the same time.) You probably won't find this a limitation when you're writing macros that work within Outlook. But if you create a procedure in another application (such as Word) that will communicate with and manipulate Outlook, you will need to check whether there is an instance of Outlook currently running on the computer before you create an instance programmatically. (See Chapter 30, "Accessing One Application from Another Application," for instructions on accessing one application programmatically from another application.)
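As a preview, here is a minimal sketch of such a check, written to run from another Office application via late binding (so no reference to the Outlook object library is needed); the variable name olApp is simply illustrative:

    Dim olApp As Object
    On Error Resume Next
    ' Try to grab an instance of Outlook that is already running
    Set olApp = GetObject(, "Outlook.Application")
    On Error GoTo 0
    If olApp Is Nothing Then
        ' No running instance was found, so start a new one
        Set olApp = CreateObject("Outlook.Application")
    End If
    MsgBox olApp.GetNamespace("MAPI").CurrentUser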
## Working with the NameSpace Object

Here is a new concept: the _NameSpace_. Among all the VBA-enabled Office applications, only Outlook employs this technique. That the NameSpace approach is unique to Outlook demonstrates beyond all doubt that the various Microsoft Office application teams work at least partly independently when building their object-model structures.

Many Outlook VBA activities, such as accessing email messages, tasks, or contacts programmatically, require that you use the GetNameSpace method of Outlook's Application object to return the NameSpace object that represents the root object of the data source. Anyway, that's the official version. Just remember that you use the following syntax to get most jobs done in Outlook VBA:

    _expression_.GetNameSpace(Type)

Here, _expression_ is a required expression that returns an Application object. Type is a required String argument that specifies the type of namespace you want to return. Outlook supports only the MAPI data source, so you always use Type:="MAPI" with the GetNameSpace method. For example, the following statement returns the NameSpace and uses the CurrentUser property to display the name of the current user in a message box:

    MsgBox Application.**GetNamespace("MAPI")**.CurrentUser

* * *

What Is an API?

MAPI stands for Messaging Application Programming Interface. It's a collection of functions written by Microsoft that can be used in programming related to email. There are all kinds of APIs used for various purposes. _API_ is just another term for a library of built-in functions. Come to think of it, _namespace_ is a near synonym. The general term _namespace_ in computer programming means a collection of functions that is self-contained. This allows you to have functions with identical names that are distinguished by their individual namespaces. That way VBA or another language knows which function to trigger when it appears in the code. It's similar to a teacher using full names to distinguish John Thompson from John Ortega.

* * *

### Accessing Default Folders within the NameSpace Object

The NameSpace object contains the folders that Outlook uses—both the collection of default folders used to store default items such as email messages, tasks, and contacts as well as any other folders created by the user or by custom procedures. These folders are represented in Outlook's VBA by MAPIFolder objects that are organized into a Folders collection.

You'd probably expect that to locate the current default folders, you would use a method of the Folders collection. Nope. Given that we're in a special situation here (dealing with email), GetDefaultFolder is a method of the NameSpace object. The syntax is as follows:

    _expression_.GetDefaultFolder(FolderType)

Here, _expression_ is a required expression that returns a NameSpace object. FolderType is a required argument that specifies which default folder you want to return. The constants are self-explanatory: olFolderCalendar, olFolderConflicts, olFolderContacts, olFolderDeletedItems, olFolderDrafts, olFolderInbox, olFolderJournal, olFolderJunk, olFolderLocalFailures, olFolderManagedEmail, olFolderNotes, olFolderOutbox, olFolderRSSFeeds, olFolderSentMail, olFolderServerFailures, olFolderSuggestedContacts, olFolderSyncIssues, olFolderTasks, olFolderToDo, or olPublicFoldersAllPublicFolders.
The following example creates the object variable myCal and assigns the default calendar folder to it:

    Dim myCal As MAPIFolder
    Set myCal = Application.GetNamespace("MAPI") _
        .GetDefaultFolder(FolderType:=olFolderCalendar)

### Accessing Other Folders within the NameSpace Object

Accessing the default folders in the NameSpace object via the GetDefaultFolder method is easy, but often you'll need to access other folders. In this case, you _do_ use the Folders collection.

The following example displays a message box (see Figure 26.2) containing a list of all the folders contained in the namespace:

    Sub List_All_NameSpace_Folders()
        Dim myNS As NameSpace
        Dim myFolder As MAPIFolder
        Dim mySubfolder As MAPIFolder
        Dim strFolderList As String

        strFolderList = "Your Outlook NameSpace contains these folders:" _
            & vbCr & vbCr

        Set myNS = Application.GetNamespace("MAPI")
        With myNS
            For Each myFolder In .Folders
                strFolderList = strFolderList & myFolder.Name & vbCr
                For Each mySubfolder In myFolder.Folders
                    strFolderList = strFolderList & "* " & mySubfolder.Name & vbCr
                Next mySubfolder
            Next myFolder
        End With

        MsgBox strFolderList, vbOKOnly + vbInformation, "Folders in NameSpace"

    End Sub

Figure 26.2 Listing the folders contained in the NameSpace object

### Creating a New Folder

To create a new folder, use the Add method with the Folders collection. The syntax is as follows:

    _expression_.Add(Name, Type)

Here, _expression_ is a required expression that returns a Folders collection. Name is a required String argument that specifies the display name to assign to the new folder. Type is an optional Long argument that you can use to specify the type of folder to create: olFolderCalendar, olFolderContacts, olFolderDrafts, olFolderInbox, olFolderJournal, olFolderNotes, or olFolderTasks. If you omit Type, Outlook assigns the new folder the same type as its parent folder (the folder in which you create the new folder).

The following statement creates a new folder named Personal Tasks in the Tasks folder, assigning the new folder the olFolderTasks folder type explicitly for clarity:

    Application.GetNamespace("MAPI").GetDefaultFolder(olFolderTasks) _
        .Folders.Add Name:="Personal Tasks", Type:=olFolderTasks

### Deleting a Folder

To delete a folder, use the Delete method with the appropriate MAPIFolder object. This method takes no arguments. The following example deletes the folder named Personal Tasks in the Tasks folder:

    Application.GetNamespace("MAPI").GetDefaultFolder(olFolderTasks) _
        .Folders("Personal Tasks").Delete

* * *

Deletion Is Dangerous

Be careful when deleting objects in Outlook. First, Outlook doesn't request any confirmation before deleting an object. Second, the deletion is permanent; there's no Recycle Bin backup.

* * *

## Understanding Inspectors and Explorers

VBA uses two major Outlook objects that most users wouldn't recognize from working with the Outlook user interface alone:

 * An Inspector is an object that represents a window displaying a specific Outlook item, such as an email message or an appointment.
 * An Explorer object represents a window that displays the contents of a folder, such as a list of emails.

* * *

Objects within Objects

Unlike the behavior of many collections, an Explorer object is included in the Explorers collection even if it is not visible.
* * *

### Opening an Inspector Window

To open an inspector window for an item, use the item's Display method. For example, the following statement displays an inspector window for the object referenced by the object variable myItem:

    myItem.Display

### Returning the Inspector Associated with an Item

To return the inspector associated with an item, use the GetInspector property of the appropriate object. The following example returns the inspector for the item identified by the object variable myItem:

    myItem.GetInspector

### Returning the Active Window, Inspector, or Explorer

Unlike Word, Excel, and PowerPoint, Outlook doesn't have an ActiveWindow object that represents the active window. However, Outlook's Application object does have an ActiveWindow method, which returns the topmost Outlook window. (If there is no window, ActiveWindow returns Nothing.)

This window will be either an Inspector object or an Explorer object. Similarly, the ActiveExplorer method of the Application object returns the active explorer, and the ActiveInspector method of the Application object returns the active inspector. Got it?

You can use the TypeName function to determine which type of window is active. The following example displays a message box that states which window type is active _if_ there is an active window:

    If Not TypeName(ActiveWindow) = "Nothing" Then
        MsgBox "An " & TypeName(ActiveWindow) & " window is active."
    End If

Notice that we say here If Not... Nothing. The double negative means "if the active window isn't nothing."

### Working with the Active Inspector

In many procedures, you'll need to determine what the topmost inspector in the Outlook application is, either so that you can work with that inspector or so that you can restore the inspector to the topmost position at the end of a procedure that manipulates other inspectors. (Remember, you should always try to restore an application to the state it was in when your procedure started execution. This is a courtesy to the user and evidence of careful, quality programming.)

To find out which is the topmost inspector, use the ActiveInspector method of the Application object. For example, the following statement maximizes the window of the topmost inspector:

    Application.ActiveInspector.WindowState = olMaximized

Note that this example attempts to maximize an inspector window, so there must actually _be_ an inspector window open when you run the code. In other words, double-click an email message in Outlook to open it in a window separate from the Outlook window. This separate window, showing a single email, is an inspector. If you want to _trap_ this error (and you should) to prevent your macro from crashing when no inspector exists, check whether the TypeName function returns "Nothing" when run on the ActiveInspector method of the Application object, and bail out if it does, like this:

    If TypeName(Application.ActiveInspector) = "Nothing" Then
        MsgBox "No item is currently open."
        End 'shut down the macro
    End If

## Creating Items

To create new items in Outlook, you use the CreateItem method or the CreateItemFromTemplate method of the Application object. The CreateItem method creates default items, while the CreateItemFromTemplate method creates items based on the templates you specify.

* * *

You Can Use Custom Forms to Create New Objects

You can also create new objects using a custom form.
To do so, use the Add method with the Items collection.

* * *

### Using the CreateItem Method to Create Default Items

The syntax for the CreateItem method is as follows:

    _expression_.CreateItem(ItemType)

Here, _expression_ is a required expression that returns an Application object. ItemType is a required argument that specifies the type of item to create: olAppointmentItem, olContactItem, olDistributionListItem, olJournalItem, olMailItem, olMobileItemMMS, olMobileItemSMS, olNoteItem, olPostItem, or olTaskItem.

The following example creates a new email message; assigns a recipient (by setting the To property), a subject (by setting the Subject property), and body text (by setting the Body property); and then displays the message window:

    Dim myMessage As MailItem
    Set myMessage = Application.CreateItem(ItemType:=olMailItem)
    With myMessage
        .To = "test@example.com"
        .Subject = "Test message"
        .Body = "This is a test message."
        .Display
    End With

### Using the CreateItemFromTemplate Method to Create Items Based on Templates

Instead of creating a default item by using the CreateItem method, you can alternatively use the CreateItemFromTemplate method of the Application object to create a new item based on a template. The syntax for the CreateItemFromTemplate method is as follows:

    _expression_.CreateItemFromTemplate(TemplatePath, InFolder)

Here, _expression_ is a required expression that returns an Application object. TemplatePath is a required String argument that specifies the path and filename of the template on which to base the new item. InFolder is an optional Variant argument that you can use to specify the folder in which to create the item. If you omit the InFolder argument, Outlook creates the item in the default folder for that item type.

Before you can test the following example, you must create a note template to work with. Press Ctrl+Shift+N in Outlook to create a new note, then choose File ⇒ Save As and choose the Outlook Template option in the Save As Type list box in the Save As dialog box. Save it as tpltNote.oft.

The following example creates a new note item based on the custom template tpltNote.oft you just stored in the C:\Users\ _Richard_ \AppData\Roaming\Microsoft\Templates folder within the user's user profile (substitute your username for _Richard_ ). The example then displays the new note item:

    Dim myNoteItem As NoteItem

    Set myNoteItem = Application.CreateItemFromTemplate _
        ("C:\Users\ _Richard_ \AppData\Roaming\" _
        & "Microsoft\Templates\tpltNote.oft")
    myNoteItem.Display

## Quitting Outlook

To quit Outlook, use the Quit method of the Application object. This method takes no arguments:

    Application.Quit

You may also want to work with the events available to the Application object. See Chapter 27, "Working with Events in Outlook," for a discussion of how to work with these application-level events and with item-level events.

# Understanding General Methods for Working with Outlook Objects

Many of the objects in Outlook use the methods covered in the following sections. You'll see brief examples showing you how to use the methods, as well as further examples on the individual types of objects—email messages, appointments, contacts, tasks, and so on—later in this chapter and in the next.

## Using the Display Method

To open an item in an inspector window, use the Display method.
The syntax is as follows:

    _expression_.Display(Modal)

Here, _expression_ is a required expression that returns the type of object you want to display—for example, a ContactItem object or a MailItem object. Modal is an optional Variant argument that you can set to True to make the window modal. A window is modeless by default, or becomes modeless if you set Modal to False. Making the window modal means that users must close the window before they can work with another window.

Note that the Modal argument isn't available for Explorer and MAPIFolder objects.

For example, the following statement uses the Display method to display the Inbox:

    Application.GetNamespace("MAPI").GetDefaultFolder(olFolderInbox).Display

## Using the Close Method

To close a window, use the Close method. The syntax is as follows:

    _expression_.Close(SaveMode)

Here, _expression_ is a required expression that returns the object you want to close. SaveMode is a required argument that specifies whether to save changes (olSave), discard the changes (olDiscard), or prompt the user to decide whether to save the changes (olPromptForSave).

The following example closes the active inspector and saves any changes to its contents:

    ActiveInspector.Close SaveMode:=olSave

Remember that this code requires that an inspector be currently open. See the warning earlier in this chapter in the section titled "Working with the Active Inspector."

## Using the Delete Method

To delete an item, use the Delete method. This method takes no arguments. The following example deletes the item with the index number 1 in the Contacts folder. Be careful if you want to give this code a test run. It will delete a contact, but exactly _which_ contact is unpredictable. The sidebar "Practical Programming" explains why. So, if you value your contacts list, don't test this example. Take my word for it; I had to test this code and I still don't know which of my contacts were deleted. I hope it was some long-ago acquaintance.

    Application.GetNamespace("MAPI").GetDefaultFolder(olFolderContacts) _
        .Items(1).Delete

* * *

**Practical Programming: The Items Collection Is Unsorted**

You often need to sort and search data. Be warned that the index numbers in the Items collection of your contacts are not ordered in any way. The collection is not alphabetical, nor is it ordered in any other fashion (by the date the contact was entered or modified, or by any other criterion). Using the Delete, Display, or other methods with the Items collection accesses what to us, as programmers, will be a random item. In the previous example, Items(1) will almost certainly not be the first contact in your list of contacts. Or, as the Outlook online Help system puts it, "The items in the Items collection object are not guaranteed to be in any particular order."

However, you _can_ sort items yourself if you wish, and then search the sorted list that results. You do this by using the Sort method, as the following example illustrates. These statements sort your contacts alphabetically by the Full Name field in the Contacts dialog box. You can optionally sort by due date (for tasks), by last name (for contacts), and many other ways.
    Sub SortContacts()
        Dim strNames As String
        Dim myNameSpace As Outlook.NameSpace
        Dim myFolder As Outlook.Folder
        Dim myItem As Outlook.ContactItem
        Dim myItems As Outlook.Items

        Set myNameSpace = Application.GetNamespace("MAPI")
        Set myFolder = myNameSpace.GetDefaultFolder(olFolderContacts)
        Set myItems = myFolder.Items
        myItems.Sort "[FullName]", False

        For Each myItem In myItems

            strNames = strNames & ", " & myItem.FullName

        Next myItem

        MsgBox strNames

    End Sub

Notice that you could use this For Each... Next loop to search for a particular item in the collection of items.

Alternatively, you can use the AdvancedSearch method of the Application object, as described in the section "Searching for Items" later in this chapter.

* * *

## Using the PrintOut Method

To print an item, use the PrintOut method. This method takes no arguments. The following example prints the item with the index number 1 in the Inbox:

    Application.GetNamespace("MAPI").GetDefaultFolder(olFolderInbox) _
        .Items(1).PrintOut

## Using the Save Method

To save an item, use the Save method. This method takes no arguments. The following example creates a new task; assigns it a subject, start date (today), and due date (a week from today); turns off the reminder for the task; and then saves it:

    Dim myTask As TaskItem
    Set myTask = Application.CreateItem(ItemType:=olTaskItem)
    With myTask
        .Subject = "Arrange Review Meeting"
        .StartDate = Date
        .DueDate = Date + 7
        .ReminderSet = False
        .Save
    End With

This item will appear in the To-Do List of the My Tasks section of your Outlook Tasks.

## Using the SaveAs Method

To save an item as a separate file, use the SaveAs method. The syntax is as follows:

    _expression_.SaveAs(Path, Type)

Here, _expression_ is a required expression that returns the object to be saved. Path is a required String argument that specifies the path and filename under which to save the file. Type is an optional Variant argument that you can use to control the file type used for the file, as shown in Table 26.1.

Table 26.1 Type arguments for the SaveAs method

**Argument** | **Type of File**
---|---
olHTML | HTML file
olMSG | Outlook message format (.msg filename extension)
olRTF | Rich Text format
olTemplate | Template
olDoc | Word document format (email messages using WordMail)
olTXT | Text file
olVCal | vCal file
olVCard | vCard file
olICal | iCal file
olMSGUnicode | Outlook Unicode message format (.msg filename extension)

The following example saves the message open in the active inspector. So before testing this example, be sure that a message has been double-clicked and is thus open in its own window separate from the main Outlook window. Remember that code involving the active inspector requires that an inspector be currently open. See the warning, and a way to error-trap this, earlier in this chapter in the section titled "Working with the Active Inspector."

If the IsWordMail property of the ActiveInspector object returns True, the example saves the message as a .doc file; if the IsWordMail property returns False, the example saves the message as an .rtf file.
If no inspector window is active, the example displays a message box pointing out the problem to the user:

    If TypeName(ActiveInspector) = "Nothing" Then
        MsgBox "This macro cannot run because " & _
            "there is no active window.", vbOKOnly, "Macro Cannot Run"
        End
    Else
        If ActiveInspector.IsWordMail Then
            ActiveInspector.CurrentItem.SaveAs "c:\keep\message.doc"
        Else
            ActiveInspector.CurrentItem.SaveAs "c:\keep\message.rtf"
        End If
    End If

# Working with Messages

If you or your colleagues use Outlook's email capabilities extensively, you may be able to save time by programming Outlook to create or process messages automatically. The following sections show you how to create a new message, work with its contents, add an attachment, and send the message.

## Creating a New Message

To create a new message, use the CreateItem method of the Application object and specify olMailItem for the ItemType argument. The following example creates a MailItem object variable named myMessage and assigns to it a new message:

    Dim myMessage As MailItem
    Set myMessage = Application.CreateItem(ItemType:=olMailItem)

## Working with the Contents of a Message

To work with the contents of a message, set or get the appropriate properties. These are the most widely useful properties:

 * To is the recipient or recipients of the message.
 * CC is the recipient or recipients of copies of the message.
 * BCC is the recipient or recipients of blind copies of the message.
 * Subject is the subject line of the message.
 * Body is the body text of the message.
 * BodyFormat is the message's formatting type: olFormatPlain for text only, olFormatRichText for text with formatting, and olFormatHTML for HTML formatting.
 * Importance is the relative importance of the message. Set it to olImportanceHigh, olImportanceNormal, or olImportanceLow.

The following example creates a new message item and assigns it to the object variable myMessage. It then adds an addressee, a subject, and body text; applies the HTML format; sets the importance to high; and sends the message:

    Dim myMessage As MailItem
    Set myMessage = Application.CreateItem(ItemType:=olMailItem)
    With myMessage
        .To = "petra_smith@ourbigcompany.com"
        .Subject = "Preparation for Review"
        .Body = "Please drop by tomorrow and spend a few minutes" _
            & " discussing the materials we need for Darla's review."
        .BodyFormat = olFormatHTML
        .Importance = olImportanceHigh
        .Send
    End With

When this message, shown in Figure 26.3, arrives at Petra's machine, Outlook 2013 briefly displays it in the upper-right corner.

Figure 26.3 A portion of a message of high importance is briefly displayed in Outlook.

## Adding an Attachment to a Message

To add an attachment to a message, use the Add method with the Attachments collection, which you return by using the Attachments property of the MailItem object. The syntax is as follows:

    _expression_.Add(Source, Type, Position, DisplayName)

Here are the components of the syntax:

 * _expression_ is a required expression that returns an Attachments collection.
 * Source is a required String argument that specifies the path and filename of the attachment.
 * Type is an optional argument that you can use to specify the type of attachment.
 * Position is an optional argument that you can use with rich-text messages to specify the character at which the attachment is positioned in the text.
Use character 0 to hide the attachment, 1 to position the attachment at the beginning of the message, or a higher value to position the attachment at the specified character position. To position the attachment at the end of the message, use a number higher than the number of characters in the message. + * DisplayName is an optional String argument that you can specify to control the name displayed for the attachment in the message. + +The following example attaches to the message referenced by the object variable myMessage the file Corporate Downsizing.pptm stored in the folder Y:\Sample Documents, positioning the attachment at the beginning of the message and setting its display name to Downsizing Presentation: + + myMessage.Attachments. **Add** _ + Source:="Y:\Sample Documents\Corporate Downsizing.pptm", _ + Position:=1, DisplayName:="Downsizing Presentation" + +To test this, insert this code into the example code from the previous section ("Working with the Contents of a Message"), like this: + + Dim myMessage As MailItem + Set myMessage = Application.CreateItem(ItemType:=olMailItem) + + **myMessage.Attachments.Add _** + **Source:="Y:\Sample Documents\Corporate Downsizing.pptm", _** + **Position:=1, DisplayName:="Downsizing Presentation"** + + With myMessage + .To = "petra_smith@ourbigcompany.com" + .Subject = "Preparation for Review" + .Body = "Please drop by tomorrow and spend a few minutes" _ + & " discussing the materials we need for Darla's review." + .BodyFormat = olFormatHTML + .Importance = olImportanceHigh + .Send + End With + +## Sending a Message + +To send a message, use the Send method. This method takes no arguments. The following example sends the message referenced by the object variable myMessage: + + myMessage.Send + +* * * + +Multiple Sends + +The Send method applies to the AppointmentItem, MeetingItem, and TaskItem objects as well as to the MailItem object. + +* * * + +To check whether a message has been sent, check its Sent property. This Boolean property returns True if the message has been sent and False if it has not. + +# Working with Calendar Items + +If you create or receive many calendar items, you may be able to save time or streamline your scheduling by using VBA. The following sections show you how to create a calendar item and work with its contents. + +## Creating a New Calendar Item + +To create a new calendar item, use the CreateItem method of the Application object and specify olAppointmentItem for the ItemType argument. The following example creates an AppointmentItem object variable named myAppointment and assigns to it a new appointment item: + + Dim myAppointment As AppointmentItem + Set myAppointment = Application.CreateItem(ItemType:=olAppointmentItem) + +## Working with the Contents of a Calendar Item + +To work with the contents of a calendar item, set or get the appropriate properties. These are the most widely useful properties: + + * Subject is the subject of the appointment. + * Body is the body text of the appointment. + * Start is the start time of the appointment. + * End is the end time of the appointment. + * BusyStatus is your status during the appointment: olBusy, olFree, olOutOfOffice, or olTentative. + * Categories is the category or categories assigned to the item. + * ReminderSet determines whether the appointment has a reminder (True) or not (False). + * ReminderMinutesBeforeStart is the number of minutes before the event that the reminder should occur. 
The following example creates a new AppointmentItem object and assigns it to the object variable myAppointment. It then sets the subject, body, start date (2:30 p.m. on the day seven days after the present date), and end date (one hour after the start); marks the time as busy; assigns the Personal category; sets a reminder 30 minutes before the appointment; and saves the appointment:

    Dim myAppointment As AppointmentItem
    Set myAppointment = Application.CreateItem(ItemType:=olAppointmentItem)
    With myAppointment
        .Subject = "Dentist"
        .Body = "Dr. Schmitt " & vbCr & "4436 Acacia Blvd."
        .Start = Str(Date + 7) & " 2:30 PM"
        .End = Str(Date + 7) & " 3:30 PM"
        .BusyStatus = olBusy
        .Categories = "Personal"
        .ReminderMinutesBeforeStart = 30
        .ReminderSet = True
        .Save
    End With

The AppointmentItem object has a grand total of 71 properties. If you want to explore more of them, take a look at this MSDN web page:

 

* * *

Allowing Users to Manually Assign Categories

Assigning categories to an item programmatically can be difficult, especially because many users create custom categories or assign categories in an idiosyncratic manner. In many cases, it's better to allow each user to assign their preferred categories manually by displaying the Categories dialog box at the appropriate point in your procedure. You can do so by using the ShowCategoriesDialog method of the item—for example, myAppointment.ShowCategoriesDialog for an item referenced by the object variable myAppointment.

* * *

# Working with Tasks and Task Requests

VBA can automate tasks and task requests. The following sections show you how to create a task, work with the contents of a task item, and send a task request.

## Creating a Task

To create a new task item, use the CreateItem method of the Application object and specify olTaskItem for the ItemType argument. The following example creates a TaskItem object variable named myTask and assigns to it a new task item:

    Dim myTask As TaskItem
    Set myTask = Application.CreateItem(ItemType:=olTaskItem)

## Working with the Contents of a Task Item

To work with the contents of a task item, set or get the appropriate properties. These are the most widely useful properties:

 * Subject is the subject of the task.
 * Body is the body text of the task.
 * Start is the start time of the task.
 * DueDate is the due date of the task.
 * Importance is the importance of the task. Set it to olImportanceHigh, olImportanceNormal, or olImportanceLow.
 * Status is the status of the task: olTaskNotStarted, olTaskWaiting, olTaskDeferred, olTaskInProgress, or olTaskComplete.
 * PercentComplete is the percentage of the task completed.
 * Companies is the companies associated with the task.
 * BillingInformation is the company or department to bill for the task.

The following example creates a TaskItem object variable named myTask and assigns to it a new task item. It then sets the subject and body of the task, specifies a due date, sets the status to olTaskInProgress and the percentage complete to 10, specifies the company involved and whom to bill, sets the importance to High, and then saves the task:

    Dim myTask As TaskItem
    Set myTask = Application.CreateItem(ItemType:=olTaskItem)
    With myTask
        .Subject = "Create a business plan"
        .Body = "The business plan must cover the next four years." & _
            vbCr & vbCr & "It must provide a detailed budget, " & _
            "staffing projections, and a cost/benefit analysis."
        .DueDate = Str(Date + 28)
        .Status = olTaskInProgress
        .PercentComplete = 10
        .Companies = "Acme Polyglot Industrialists"
        .BillingInformation = "Sales & Marketing"
        .Importance = olImportanceHigh
        .Save
    End With

The TaskItem object has 69 properties. If you want to explore more of them, take a look at this MSDN web page:

 

## Assigning a Task to a Colleague

To assign a task to a colleague, use the Assign method of the TaskItem object, and then use the Add method of the Recipients collection to add one or more recipients. Finally, you can use the Send method to send the task to your colleague.

The following example creates a task, uses the Assign method to indicate that it will be assigned, specifies a recipient, and sends the task:

    Dim myTaskAssignment As TaskItem
    Set myTaskAssignment = Application.CreateItem(ItemType:=olTaskItem)
    With myTaskAssignment
        .Assign
        .Recipients.Add Name:="Peter Nagelly"
        .Subject = "Buy Bagels for Dress-Down/Eat-Up Day"
        .Body = "It's your turn to get the bagels on Friday."
        .Body = .Body & vbCr & vbCr & "Remember: No donuts AT ALL."
        .DueDate = Str(Date + 3)
        .Send
    End With

# Searching for Items

To search for items, use the AdvancedSearch method of the Application object. The syntax is as follows:

    _expression_.AdvancedSearch(Scope, Filter, SearchSubFolders, Tag)

Here are the components of the syntax:

 * _expression_ is a required expression that returns an Application object.
 * Scope is a required String argument that specifies the scope of the search (which items to search). Usually you'll search a particular folder. For example, you might search the Inbox for messages that match certain criteria, or you might search the Tasks folder for particular tasks.
 * Filter is an optional String argument that specifies the search filter. While this argument is optional, you will need to use it unless you want to return all the items within the scope you've specified.
 * SearchSubFolders is an optional Variant argument that you can set to True to search through any subfolders of the folder specified by the Scope argument, or False to search only the specified folder. The default is False.
 * Tag is an optional Variant argument that you can use to specify a name for the search you're defining. If you create a name, you can call the search again.

The following example searches the Inbox (Scope:="Inbox") for messages with the subject line containing _Office_. If any messages are found, the procedure produces a list of sender names, which it assigns to the String variable strMessages and displays in a message box.

Note that at the time of this writing, there appears to be a timing bug in the advanced search feature. If you press F5 to execute the following code, no search hits are found. However, if you press F8 repeatedly to step through the code, it works as expected and hits are found.

I'm including this code because _it should work_. If you find a way to insert an effective delay or otherwise fix the problem, please email me at my address in the introduction to this book. Or perhaps by the time this book is published Microsoft will have fixed it.
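If the root cause is that AdvancedSearch runs asynchronously, the Results collection is being read before the search has finished. One avenue worth exploring is the Application object's AdvancedSearchComplete event, which fires when a search completes. Here is a rough sketch (not a guaranteed fix); it belongs in ThisOutlookSession, and it assumes the search was started with the Tag argument set to "OfficeSubjectSearch":

    ' In ThisOutlookSession
    Private Sub Application_AdvancedSearchComplete(ByVal SearchObject As Search)
        ' Fires when an AdvancedSearch finishes; the Results
        ' collection should be safe to read at this point
        If SearchObject.Tag = "OfficeSubjectSearch" Then
            MsgBox SearchObject.Results.Count & " item(s) found.", _
                vbOKOnly, "Search Complete"
        End If
    End Sub

With that caveat noted, here is the search example described above: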
    Sub Sample_Advanced_Search()

        Dim mySearch As Search
        Dim myResults As Results
        Dim intCounter As Integer
        Dim strMessages As String
        Dim intTotal As Integer

        Dim strFilter As String
        strFilter = Chr(34) & "urn:schemas:httpmail:subject" & Chr(34) & " like '%Office%'"

        Set mySearch = Application.AdvancedSearch(Scope:="Inbox", Filter:=strFilter)

        Set myResults = mySearch.Results
        intTotal = myResults.Count

        For intCounter = 1 To intTotal
            strMessages = strMessages & _
                myResults.Item(intCounter).SenderName & vbCr
        Next intCounter

        MsgBox strMessages, vbOKOnly, "Search Results"

    End Sub

* * *

You Can Execute 100 Searches Simultaneously, But Should You?

If necessary, you can run two or more searches at the same time. To do so, use the AdvancedSearch method in successive lines of code. Actually, you can run up to 100 searches at the same time, but doing so puts a considerable load on your computer and may make it run slowly or appear to stop responding.

* * *

# The Bottom Line

**Work with the Application object.**

VBA uses two major Outlook objects that most users wouldn't recognize from working with the Outlook user interface alone.

Master It

One of these objects represents a window that displays the contents of a folder. The other represents a window displaying an Outlook item, such as an email message or an appointment. What are the names of these two objects?

**Work with messages.**

To work with the contents of a message in VBA, you set or get various properties.

Master It

Name one of the most widely useful properties employed when manipulating the contents of a message in a procedure.

**Work with calendar items.**

You can create new calendar appointment items via VBA.

Master It

To create a new calendar item, you use a particular method of the Application object and specify olAppointmentItem for the ItemType argument. What is the method?

**Work with tasks and task requests.**

You can assign a task to a colleague and then add one or more recipients. You can then send the task to your colleague and, optionally, the additional recipients.

Master It

What methods do you use to assign, add, and send a task to others?
Chapter 27

Working with Events in Outlook

If you want to automate the way that Outlook works, you may sometimes need to write code that responds to Outlook events. Outlook has two classes of events, application-level events and item-level events, and between them, they enable you to write code that responds to almost anything that happens in Outlook. In this chapter, you will learn how to work with both types of events, and you will see code examples showing how to manage some of the events.

* * *

How Event-Handler Procedures Differ from Ordinary Macros

Both writing and testing an event-handler procedure differ from the techniques you've been employing throughout this book when creating and testing ordinary macro procedures. If you intend to test the examples in this chapter, I suggest that you first read the sidebar titled "How to Test Event-Handler Procedures" later in this chapter.

The following points summarize the qualities of event-handler procedures that differ from ordinary procedures:

  * An event handler must be located within a class module, not in an ordinary macro module. Therefore, you're entering the world of object-oriented programming (OOP). And in spite of some useful qualities, OOP can sink us programmers into a quagmire of complexity.
A complete example demonstrating how to add an event handler to Outlook and then test it can be found in the sidebar "How to Test Event-Handler Procedures."
  * You must declare an object variable.
  * You must initialize the object variable to connect it to an actual object.
  * You then write code in a procedure triggered by the event you're interested in.
  * You test your code differently than you would in ordinary modules. In a class module, you cannot simply test the event handler by pressing F5 to run it directly. Pressing F5 brings up the Macros dialog box. Instead you test your code indirectly by triggering the event it's designed to service—for example, by modifying a contact in the Contacts folder.

* * *

In addition to the events discussed in this chapter, Outlook supports form events such as those discussed in "Using Events to Control Forms" in Chapter 15, "Creating Complex Forms." However, as is so often the case with Outlook and Access, the forms in Outlook are somewhat unique and are different from the VBA forms you've worked with earlier in this book. Outlook's custom form is described later in this chapter in the sidebar titled "What Is VBScript?" As you'll see, you even use a special variation on the VBA language when programming these custom forms.

We'll conclude this chapter with a brief look at Outlook's Quick Steps feature. For those who don't wish to, or can't, write macros, Quick Steps provides an alternative, if seriously limited, way to automate some tasks. This tool is similar to Access's Macro Designer, though even more simplistic.

In this chapter you will learn to do the following:

  * Create event handlers
  * Work with application-level events
  * Work with item-level events
  * Understand the Quick Steps tool

# Working with Application-Level Events

By default, macros are disabled in Outlook 2013. To work with the examples in this chapter, or to use macros in general, you must select an enabling option in Outlook's Trust Center. To do so, follow these steps:

1. Click the File tab on the Ribbon.

2. Choose Options in the left pane of the File window.

3. Click Trust Center in the left pane of the Outlook Options dialog box.

4. Click the Trust Center Settings button.

5. Click Macro Settings in the left pane of the Trust Center dialog box.

6. Now choose one of the two lower options: Notification For All Macros (which gets old quickly) or Enable All Macros.

Recall that an event is something that happens to an object, such as a click, a mouse drag, a keystroke, and so on. You can write code (an _event procedure_, as it's called) to respond to the click or other event.

An application-level event is an event that happens to the Outlook application as a whole rather than to an individual item within it. For example, the Startup event is an application-level event that occurs when Outlook starts, and the Quit event is an application-level event that occurs when Outlook closes. By contrast, item-level events represent things that happen to individual items—for example, the opening of a particular email message or contact record, or a user's switching from one folder to another.

The application-level events are easier to access than the item-level events because the Application object is the topmost object and is always available when Outlook is running. This means that you don't have to use an event handler to create the Application object. It just always exists.
You do, however, have to write code to create an object for an item-level event.

To access the application-level events, you use the built-in ThisOutlookSession class module, which is automatically included in the Outlook VBA project. Look in the Project Explorer and expand the Project1 item that represents the Outlook VBA project, then expand the Microsoft Outlook Objects item. You now see the ThisOutlookSession item. Double-click it to open a Code window showing its contents. (If this is the first time you've opened the ThisOutlookSession class module, it will have no contents.)

Each of the events described in the following sections works with the Application object. For simplicity, most of the following examples directly use the Outlook Application object itself, but you could declare an object variable, then use it to return the Application object if you wish.

Recall that you can find the Application object in the drop-down list on the top left of the VBA Editor's Code window. All the events available to the Application object can be selected from the drop-down list on the top right of the Code window, as shown in Figure 27.1.

Figure 27.1 The drop-down list on the right shows all the events available in the Application object.

You can select these various events from the drop-down list (causing the editor to type in the procedure structure for you) or just type the event name yourself as a sub directly in the Code window. However, if you select from the drop-down list, the VBA Editor will automatically add any necessary arguments as well. So that's the easier approach.

Also, if you declare object variables using the WithEvents statement, like this, the Editor's drop-down lists will include these objects and their available events:

    Public WithEvents myInspectors As Inspectors
    Public WithEvents myInspector As Inspector

That can be a useful shortcut while programming because you can view every event available in an object—and also have the editor type in any necessary arguments. Later in this chapter you'll experiment with the Inspectors collection and the Inspector argument.

## Using the Startup Event

The Startup event, which takes no arguments, occurs when Outlook starts. In other words, every time the user starts Outlook, any code you might have written in the Sub Application_Startup() procedure will automatically execute.

The Startup event is useful for making sure that Outlook is correctly configured for the user to start work. Say that someone always starts off by writing notes, and the first note is always a reminder about time cards. The following example creates a new NoteItem object (a note), assigns text to its Body property, and uses the Display method to display it:

    Private Sub **Application_Startup**()
        Dim myNoteItem As NoteItem
        Set myNoteItem = Application.CreateItem(ItemType:=olNoteItem)
        myNoteItem.Body = "Please start a new time card for the day."
        myNoteItem.Display
    End Sub

You can also put the Startup event to good use by writing code with the Set command to connect an object variable to a real object it is supposed to represent. More on this later in this chapter, in the section titled "Declaring an Object Variable and Initializing an Event."

## Using the Quit Event

The Quit event occurs when Outlook is shut down. This event can be triggered in three ways:

  * By the user choosing Exit in the File tab of the Ribbon.
  * By the user clicking the red X icon in the upper right of the Outlook window.
  * By the programmer using the Quit method of the Application object in VBA.

By the time that the Quit event fires (is triggered), all of Outlook's windows have already been closed and all global variables have been released, so there's little left for a programmer to access via code in this event procedure. One possibility, however, is to display a parting message to the user, as in the following example, which displays a message on the workday that precedes a national holiday to remind the user of the holiday:

    Private Sub Application_Quit()

        Dim strMessage As String
        Select Case Format(Date, "MM/DD/YYYY")
            Case "01/18/2013"
                strMessage = "Next Monday is Martin Luther King Day."
            Case "02/15/2013"
                strMessage = "Next Monday is President's Day."
            Case "05/24/2013"
                strMessage = "Next Monday is Memorial Day."
            Case "07/03/2013"
                strMessage = "Tomorrow is Independence Day." & _
                    " Friday is a company holiday."
            Case "08/30/2013"
                strMessage = "Next Monday is Labor Day."
            'other National Holidays here
        End Select

        If strMessage = "" Then Exit Sub

        MsgBox strMessage, vbOKCancel + vbExclamation, "Don't Forget..."

    End Sub

## Using the ItemSend Event

The ItemSend event occurs when an item is sent, either by the user issuing a Send command (for example, by clicking the Send button in a message window) or by executing the Send method in VBA code. The syntax for the ItemSend event is as follows:

    Sub _expression_ _ItemSend(ByVal Item As Object, Cancel As Boolean)

Here, _expression_ is a required expression that returns an Application object. Item is a required argument that specifies the item that's being sent. Cancel is an optional Boolean argument that you can set to True to prevent the item from being sent.

The following example examines the Subject property of the Item object being sent. If the Subject property is an empty string, the message box prompts the user to add a subject line, and the Cancel = True statement cancels the sending of the item:

    Private Sub Application_ItemSend(ByVal Item As Object, Cancel As Boolean)
        If Item.Subject = "" Then
            MsgBox "Please add a subject line to this message before sending it."
            Cancel = True
        End If
    End Sub

## Using the NewMail and NewMailEx Events

The NewMail event occurs when one or more new mail items arrive in the Inbox. The NewMail event can be useful for processing messages automatically as they arrive. (You can also define Outlook rules to sort messages automatically without code.) The NewMail event takes no arguments.

The following example displays a message box that offers to show the Inbox when new mail arrives, triggering the NewMail event:

    Private Sub Application_NewMail()
        If MsgBox("You have new mail. Do you want to see your Inbox?", _
            vbYesNo + vbInformation, "New Mail Alert") = vbYes Then

            Application.GetNamespace("MAPI").GetDefaultFolder(olFolderInbox).Display
        End If
    End Sub

The NewMailEx event is a more complex version of the NewMail event that passes to your code a list of the items received in the Inbox since that event last fired. The NewMailEx event passes this list only for Exchange Server and other mailboxes that provide notification of messages received. The syntax is as follows:

    Sub _expression_ _NewMailEx(EntryIDCollection As String)

Here, _expression_ is a required expression that returns an Application object. EntryIDCollection is a string that contains the entry IDs of the messages that have been received.
Each entry ID is separated from the next by a comma; if there is a single entry ID, there is no comma in the EntryIDCollection string.

The following example of a NewMailEx event procedure uses a Do While...Loop to separate the individual message IDs (by using the InStr function to identify each section of the EntryIDCollection string, up to the next comma, in turn). Then the code builds a string that contains introductory text followed by the subject line of each message, one message to a line. Because the final (or only) entry ID has no trailing comma, the code picks it up separately after the loop. Finally, the procedure displays the string in a message box so that when Outlook receives new mail, the user receives an executive summary of the subject lines:

    Private Sub Application_NewMailEx(ByVal EntryIDCollection As String)

        Dim myMailItem As Object
        Dim intMsgIDStart As Integer, intMsgIDEnd As Integer
        Dim intCutPoint As Integer, strMailItemID As String, strMailList As String

        intMsgIDStart = 1
        intCutPoint = Len(EntryIDCollection)

        intMsgIDEnd = InStr(intMsgIDStart, EntryIDCollection, ",")
        strMailList = "You have the following messages:"

        Do While intMsgIDEnd <> 0
            strMailItemID = Strings.Mid(EntryIDCollection, intMsgIDStart, _
                (intMsgIDEnd - intMsgIDStart))
            Set myMailItem = Application.Session.GetItemFromID(strMailItemID)
            strMailList = strMailList & vbCr & myMailItem.Subject
            intMsgIDStart = intMsgIDEnd + 1
            intMsgIDEnd = InStr(intMsgIDStart, EntryIDCollection, ",")
        Loop

        'The last entry ID has no trailing comma, so extract it here
        If intMsgIDStart <= intCutPoint Then
            strMailItemID = Strings.Mid(EntryIDCollection, intMsgIDStart, _
                intCutPoint - intMsgIDStart + 1)
            Set myMailItem = Application.Session.GetItemFromID(strMailItemID)
            strMailList = strMailList & vbCr & myMailItem.Subject
        End If

        MsgBox strMailList, vbOKOnly + vbInformation, "Mail Alert"

    End Sub

* * *

An Alternative to the NewMail Events

Instead of using a NewMail or NewMailEx event, you can use an ItemAdd event with the items in the Inbox to process each new message that arrives.

* * *

## Using the AdvancedSearchComplete and the AdvancedSearchStopped Events

Outlook provides two events for working with advanced searches created using the AdvancedSearch method. The AdvancedSearchComplete event fires when the AdvancedSearch method is run via VBA and finishes searching. The AdvancedSearchStopped event fires when the AdvancedSearch method is run via VBA and is stopped by using the Stop method of the search.

The syntax for the AdvancedSearchComplete event is as follows:

    Private Sub _expression_ _AdvancedSearchComplete(ByVal SearchObject As Search)

Here, _expression_ is a required expression that returns an Application-type object variable that has been declared with events in a class module. SearchObject is the Search object that the AdvancedSearch method returns.

The following example uses the AdvancedSearchComplete event to return the number of search results that were found by the AdvancedSearch method:

    Private Sub Application_AdvancedSearchComplete(ByVal SearchObject As Search)
        MsgBox "The search has finished running and found " & _
            SearchObject.Results.Count & " results.", vbOKOnly + vbInformation, _
            "Advanced Search Complete Event"
    End Sub

The following example uses the AdvancedSearchStopped event to inform the user that the search has been stopped:

    Private Sub Application_AdvancedSearchStopped(ByVal SearchObject As Search)
        MsgBox "The search was stopped by a Stop command.", vbOKOnly
    End Sub

## Using the MAPILogonComplete Event

The MAPILogonComplete event occurs when the user has successfully logged on to Outlook. You can use the MAPILogonComplete event to ensure that Outlook is configured correctly for the user or simply to display some information in a message. The MAPILogonComplete event takes no arguments.
The following example of a MAPILogonComplete procedure displays a message about current trading conditions when the user has successfully logged on to Outlook. The code includes a commented line indicating where the String variables strPubDowBegin and strPubForecast would be declared and assigned data in a real-world implementation of this example:

    Private Sub Application_MAPILogonComplete()

        Dim strMsg As String

        'strPubDowBegin and strPubForecast declared and assigned strings here

        strMsg = "Welcome to the UltraBroker Trading System!" & vbCr & vbCr
        strMsg = strMsg & "Today's starting value is " & strPubDowBegin & "." _
            & vbCr & vbCr
        strMsg = strMsg & "Today's trading forecast is " & strPubForecast & "."
        MsgBox strMsg, vbOKOnly + vbInformation, _
            "UltraBroker Trading System Logon Greeting"
    End Sub

## Using the Reminder Event

The Reminder event fires immediately before the reminder for a meeting, task, or appointment is displayed to the user. You can use the Reminder event to take an action related to the reminder. Because the reminder itself is usually adequate for reminding the user of the meeting, task, or appointment, the Reminder event tends to be more useful when accessing Outlook programmatically than when a user is working interactively with Outlook. The syntax is as follows:

    Sub _expression_ _Reminder(ByVal Item As Object)

Here, _expression_ is a required expression that returns an Application object, and Item is the AppointmentItem, MailItem, ContactItem, or TaskItem object associated with the reminder.

## Using the OptionsPagesAdd Event

The OptionsPagesAdd event occurs when either the Options dialog box (File ⇒ Options) or the Properties dialog box for a folder, such as the Inbox, is opened. (To open the Properties dialog box for a folder, right-click the folder, and then choose Properties from the context menu.) You can use this event to add a custom page (which is contained in a COM [Component Object Model] add-in that you have created) to the Options dialog box or the Properties dialog box. The syntax for the OptionsPagesAdd event is as follows:

    Sub _expression_ _OptionsPagesAdd(ByVal Pages As PropertyPages, _
        ByVal Folder As MAPIFolder)

Here, _expression_ is a required expression that returns an Application object or a NameSpace object. Pages is a required argument that gives the collection of custom property pages added to the dialog box. Folder is a required argument used when _expression_ returns a NameSpace object. Folder returns the MAPIFolder object for which the Properties dialog box is being opened.

# Working with Item-Level Events

In addition to the application-level events discussed so far, Outlook supports a wide variety of _item-level events_ —events that fire when specific items are manipulated, as opposed to events related to Outlook as a whole.

You can handle item-level events in Outlook in two ways:

  * By declaring an event in a class module and running an initialization procedure so that VBA then traps the event when it fires. This chapter takes this approach.
  * By creating Visual Basic Script (VBScript) code and placing it in a "custom form" used by the item. Custom forms are not to be confused with the UserForms we've been working with in the VBA Editor throughout this book. You create a custom form in Outlook by clicking the Developer tab on the Ribbon, then choosing options displayed in the Custom Forms group of the Ribbon.

* * *

What Is VBScript?
Script versions of computer languages were originally designed to execute when a user visits a web page, so these languages are deliberately given fewer capabilities than ordinary languages. For example, VBScript doesn't have a command that deletes a folder in Outlook, whereas VBA does (FolderRemove). Why? You don't want Outlook folders deleted—or similar damaging actions triggered—just because you opened a malicious web page in your browser.

Although the original intent was that script languages would be lightweight, Web-oriented versions of their parent languages, as always seems to happen with mission creep, they have changed over time to perform various tasks and to have a variety of implementations. This sort of corruption is typical in computer software: there are many versions of "standards" like XML, HTML, and the like. They start out with the intention to be uniform across platforms, to be governed by certain laws, and so on. Then they deconstruct. It reminds you of Mae West's famous remark: "I used to be Snow White, but I drifted."

In spite of VBScript's limitations, you might want to employ it for one specialized job in Outlook: sharing items with others. VBScript code is contained within its custom form, so you can send it to other people. You can't directly export VBA to others inside items you share.

If you're interested in pursuing Outlook's Custom Forms and the VBScript that drives them, consult the useful tutorial here:



* * *

## Declaring an Object Variable and Initializing an Event

Follow these steps to declare an object variable and initialize an event:

1. Use a class module to contain your object-variable declaration, in one of the following three ways:

  * Use the built-in ThisOutlookSession module. In the Project Explorer, expand the project name (it's in boldface and by default is named Project1). Expand the Microsoft Outlook Objects item, and double-click the ThisOutlookSession item to open its Code window.
  * Create a new class module by right-clicking the project name in the Project Explorer and choosing Insert ⇒ Class Module from the context menu. The VBA Editor automatically opens a Code window for the class.
  * If there is one, you can open an existing class module by double-clicking it in the Project Explorer.

2. In the declarations area at the beginning of your class module (at the top of the Code window), declare a variable to represent the object to which the event applies. Use the WithEvents keyword to specify that this object has events. The following example creates a public variable named myPublicContactItem:

    Public **WithEvents** myPublicContactItem As ContactItem

3. Initialize the object variable by setting it to represent the appropriate object. The following example sets our myPublicContactItem variable to represent the first item in the default contacts folder:

    Set myPublicContactItem = Application.GetNamespace("MAPI") _
        .GetDefaultFolder(olFolderContacts).Items(1)

Once you've initialized the object variable, the event-handler procedure will run whenever the event fires.

You can initialize the object variable manually if necessary, and you may find it convenient to do so when you're writing and testing code to handle events. But if you need to handle the event each time Outlook runs—if you want to make the macro a permanent part of your macro collection—it's obviously best to run the code to initialize the object variable automatically.
For example, you might use the Startup event of the Application object (discussed in "Using the Startup Event," earlier in this chapter) to run event-handling initialization code automatically each time Outlook starts. In other words,

    Private Sub Application_Startup()

        Set myPublicContactItem = Application.GetNamespace("MAPI") _
            .GetDefaultFolder(olFolderContacts).Items(1)

    End Sub

## Understanding the Events That Apply to All Message Items

Table 27.1 lists the common message events. I'm using the term _message_ here to refer to the AppointmentItem, MailItem, ContactItem, and TaskItem objects. In other words, Table 27.1 lists the most common events that are available to these four objects.

But be aware that there are additional "item" objects in Outlook, such as the DocumentItem, DistListItem, JournalItem, MeetingItem, and so on. To view these various items, and see descriptions of their events, visit this web page:



Also note that although Table 27.1 describes 16 common events, each of the "item" objects actually has 26 events. As an example, the complete list of events for the MailItem object in Outlook 2013 is provided on this web page:



Table 27.1 Common item-level events

**Event** | **Event Occurs**
---|---
AttachmentAdd | After an attachment is added to the item
AttachmentRead | When the user opens an email attachment for reading
BeforeAttachmentSave | When the user chooses to save an attachment but before the command is executed
BeforeCheckNames | Before Outlook checks the names of the recipients of an item being sent
BeforeDelete | Before an item is deleted
Close | When an inspector is being closed but before the closing occurs
CustomAction | When the custom action of an item is executed
CustomPropertyChange | When a custom property of an item is changed
Forward | When the user forwards an item
Open | When an item is opened in an inspector
PropertyChange | When a standard property (as opposed to a custom property) in the item is changed
Read | When an item is opened for editing in an inspector window or is selected for editing in-cell
Reply | When the user issues a Reply command for an item
ReplyAll | When the user issues a Reply All command
Send | When a Send command has been issued but before the item is sent
Write | When an item is saved, either explicitly by the user or implicitly by Outlook

Note that the Close event applies to the Inspector object and the Explorer object as well as to the objects just mentioned.

The events that fire before an action occurs allow you to cancel the action, preventing it from happening at all. The syntax for these events uses a Boolean argument named Cancel that you can set to True to prevent the action from taking place. For example, the syntax for the BeforeDelete event is as follows:

    Sub _expression_ _BeforeDelete(ByVal Item As Object, **Cancel** As Boolean)

Here, _expression_ is a required expression that returns one of the message items to which the event applies (for example, a TaskItem object). The following example uses the BeforeDelete event to see whether the TaskItem object that's open in an inspector is marked as complete when the user tries to delete it.
If the task is not marked as complete, a message box prompts the user to complete the task, and the example then sets the Cancel argument to True to prevent the deletion:

    Private Sub myTaskItem_BeforeDelete(ByVal Item As Object, Cancel As Boolean)
        If myTaskItem.Complete = False Then
            MsgBox "Please complete the task before deleting it.", _
                vbOKOnly + vbExclamation, "Task Is Incomplete"
            **Cancel = True**
        End If
    End Sub

* * *

The Difference between the Read and Open Events

The Read event and the Open event both occur when the user opens an existing item for editing. The difference between the two events is that the Open event occurs only when the item is being opened in an inspector window, whereas the Read event occurs both when the item is being opened in an inspector window and also when it is being selected for editing in a cell.

* * *

## Understanding the Events That Apply to Explorers, Inspectors, and Views

Table 27.2 lists the events that apply to explorers, inspectors, and views. Some events apply to both explorers and inspectors.

Table 27.2 Events that apply to explorers, inspectors, or views

**Event** | **Applies To** | **Event Occurs**
---|---|---
BeforeFolderSwitch | Explorer | Before the explorer displays a new folder
BeforeItemCopy | Explorer | When the user issues a Copy command but before the Copy operation takes place
BeforeItemCut | Explorer | When an item is cut from a folder
BeforeItemPaste | Explorer | Before an item is pasted
BeforeViewSwitch | Explorer | Before the view changes in the Outlook window
Close | Explorer, Inspector | When an explorer window or an inspector window is closing
FolderSwitch | Explorer | After an explorer displays a new folder
SelectionChange | Explorer | When the focus is moved to a different item in a folder, or when Outlook selects the first item in a folder when the user selects that folder
AttachmentSelectionChange | Explorer, Inspector | When a new or different attachment is selected
ViewSwitch | Explorer | When the view changes in the explorer window
Activate | Explorer, Inspector | When an explorer window or an inspector window is activated (becomes the active window)
Deactivate | Explorer, Inspector | When an explorer window or an inspector window is deactivated (stops being the active window)
BeforeMaximize | Explorer, Inspector | When the user maximizes the explorer or inspector but before maximization takes place
BeforeMinimize | Explorer, Inspector | When the user minimizes the explorer or inspector but before minimization takes place
BeforeMove | Explorer, Inspector | When the user moves an explorer window or an inspector window but before the action takes place
BeforeSize | Explorer, Inspector | When the user resizes the explorer window or inspector window but before the resizing takes place
PageChange | Inspector | When the active form page changes
InlineResponse | Explorer | When an inline response appears in the reading pane
InlineResponseClose | Explorer | When an inline response in the reading pane closes
NewExplorer | Explorers | When a new explorer window is opened
NewInspector | Inspectors | When a new inspector window is opened
ViewAdd | Views | When a view is added to the Views collection
ViewRemove | Views | When a view is removed from the Views collection

If you work on a small screen (for example, a laptop screen), you might prefer to use the NewInspector event to maximize each inspector window you open and to hide any toolbars you don't need.
The first procedure in the following example (which includes the necessary declarations) uses the NewInspector event to make sure the Standard toolbar is displayed, hide the Advanced toolbar, and assign the Inspector object representing the new inspector to the Public object variable myInspector. The second procedure uses the Activate event of the myInspector object to maximize its window by setting the WindowState property to olMaximized.

The net effect of these two event procedures is to configure the toolbars as described earlier and maximize the inspector window. Put more simply, if you, for example, double-click an email, it opens in a new window. That window is the "inspector" object. The Activate event procedure is necessary because the NewInspector event runs before the inspector window is displayed, which means the NewInspector event procedure cannot maximize the inspector window.

    Public WithEvents myInspectors As Inspectors
    Public WithEvents myInspector As Inspector

    Private Sub myInspectors_NewInspector(ByVal Inspector As Outlook.Inspector)
        With Inspector
            With .CommandBars
                .Item("Standard").Visible = True
                .Item("Advanced").Visible = False
            End With
            Set myInspector = Inspector
        End With
    End Sub

    Private Sub myInspector_Activate()
        myInspector.WindowState = olMaximized
    End Sub

* * *

**How to Test Event-Handler Procedures**

You don't test event handlers the same way that you test ordinary VBA modules. In an ordinary module, you click to put the blinking insertion cursor inside the macro you want to execute, then press F5 to execute that procedure.

In a _class_ module, by contrast, pressing F5 merely opens the Macros dialog box rather than directly running the code.

If you are confused about where to put handler code, and how to test it, don't be discouraged. An event handler must be put into a class module. And whenever you use classes, you're venturing into OOP. OOP, whatever its merits, always adds a layer of complexity for the programmer. So, let's briefly review this topic so you'll see how to write, then test, event handlers.

In this next example, you want to respond to any changes the user might make to one of the user's contacts. In other words, you need to write some code in the ItemChange event of the Contacts folder. Perhaps you want your code to alert the user that they need to make further changes. Or that they need to send this new information to their assistant. Whatever the reason, your purpose is to write code that executes when a Contact item changes—when the user modifies a contact, then clicks the Save button, thereby triggering the ItemChange event.

"Handling" an event (writing your own code that executes when an event takes place) requires that you take three steps:

1. **Create an object variable**—using the WithEvents command—that will represent the object whose event you want to handle. Where does this code go? At the top of a class module in the General Declarations section above any subs. Outlook has that special built-in class module named ThisOutlookSession. So instead of creating a new class module, let's keep things simple and just use the existing ThisOutlookSession class module to declare our object variable.

2. **Point or connect (Set) your new object variable to the actual object whose event you want to handle**. In our example, we want to handle an event of the Items collection in Outlook's Contacts folder. Where does this code go? It could be put into a macro.
Or, because we want to have this connection made automatically, let's put it in Outlook's startup event. That way the connection is made whenever the user runs Outlook. Remember that the various Office applications have specially reserved names: if you name a procedure in Word AutoExec, for example, its code executes when you start Word. If you name a procedure Application_Startup in Outlook, that's the equivalent of Word's AutoExec.

3. **Write the event-handler code—the actions you want taken when this event occurs**. Where does this code go? In the same class module where you declared the object variables (step 1, above).

There are other ways to handle events, but this is a straightforward example. To keep it simple, we'll put the code for all three steps in Outlook's built-in ThisOutlookSession class module. Now let's follow the preceding steps, only this time we'll insert the actual code:

1. First open Outlook's VBA Editor by pressing Alt+F11. Expand Project1 in the project window until you see the ThisOutlookSession class module (under Microsoft Outlook Objects). Double-click ThisOutlookSession to open its code window.

2. At the top of the ThisOutlookSession Code window, type the object variable's declaration:

    Public **WithEvents** objContacts As Items

3. Now, in the Application Startup event, we'll write code that connects the object variable to the real Outlook object we're interested in: the Items collection of the Contacts folder:

    Private Sub Application_Startup()

        **Set** objContacts = Application.GetNamespace("MAPI") _
            .GetDefaultFolder(**olFolderContacts**).**Items**

    End Sub

4. Finally, we'll write the event-handler code that does the job we want done. This code also goes in the ThisOutlookSession module:

    Private Sub objContacts_**ItemChange**(ByVal Item As Object)

        MsgBox "This Contact Item Has Been Changed"

    End Sub

Now to test this event handler. First restart Outlook (or click inside the Application_Startup procedure and press F5) so that the Set statement actually runs and connects objContacts to the Contacts folder. Then, in Outlook, open your Contacts folder. In Outlook 2013 this folder is named _People_ and is found in the lower-left corner next to the Mail and the Calendar links. Double-click some random contact and type something in the Notes field, then click the Save button. This should cause your event-handler code to execute, displaying a message box telling you that the contact info has changed.

I'm not going to pretend that any of this is easy. Although OOP has its merits, writing code employing OOP rules can be a real wrestling match. Complexities involving diction, punctuation, reference, scope, precedence, and other issues will often draw you into a world of multiplying interactions—leading to unpredictable and perplexing test-code-retest cycles. Your best bet when working with class modules is to try to find working example code online that's close to what you're trying to accomplish, then modify it to suit your purposes.

* * *

## Understanding the Events That Apply to Folders

Outlook provides three events (see Table 27.3) that apply to folders.

Table 27.3 Events that apply to folders

**Event** | **Event Occurs**
---|---
FolderAdd | When a folder is added to the specified Folders collection
FolderChange | When a folder in the specified Folders collection is changed
FolderRemove | When a folder is removed from the specified Folders collection

## Understanding the Events That Apply to Items and Results

Table 27.4 lists the events that apply to items and results.
Table 27.4 Events that apply to items and results

**Event** | **Event Occurs**
---|---
ItemAdd | When one or more items are added to the collection but not when a large number of items are added all at once
ItemChange | When an item in the Items collection or the Results collection is changed
ItemRemove | When an item is deleted from the Items collection or the Results collection but not when 16 or more items are deleted at once from a Personal Folders file, an Exchange mailbox, or an Exchange public folder; also not when the last item in a Personal Folders file is deleted

The example in the sidebar "How to Test Event-Handler Procedures" earlier in this chapter employs the ItemChange event to monitor when any contact is changed in the Contacts folder.

## Understanding the Events That Apply to Reminders

Table 27.5 explains the events that Outlook provides for reminders. You can use these events to take actions when a reminder fires, before the reminder dialog box appears, when the user clicks the Snooze button to defer a reminder, or when reminders are added, changed, or removed.

Table 27.5 Events that apply to reminders

**Event** | **Event Occurs**
---|---
BeforeReminderShow | Before Outlook displays the Reminder dialog box
ReminderAdd | When a reminder is added
ReminderChange | After a reminder has been changed
ReminderFire | Before a reminder is executed
ReminderRemove | When a reminder is removed from the Reminders collection
Snooze | When the user defers a reminder by clicking the Snooze button

## Understanding the Events That Apply to Synchronization

If you write procedures to synchronize Outlook, you may need to use the events that apply to the SyncObject object, which represents a Send/Receive group for a user. (You can access the SyncObject object by using the SyncObjects property of the NameSpace object to return the SyncObjects collection.) Table 27.6 explains the events that apply to the SyncObject object.

Table 27.6 Events that apply to the SyncObject object

**Event** | **Event Occurs**
---|---
SyncStart | When Outlook starts synchronizing a user's folders
Progress | Periodically during the synchronization of Outlook folders
SyncEnd | After synchronization ends
OnError | When an error occurs during synchronization

The following example uses the OnError event with the object variable mySyncObject. If an error occurs during synchronization of the SyncObject represented by mySyncObject, the procedure displays an error message giving the error code and description:

    Private Sub mySyncObject_OnError(ByVal Code As Long, _
        ByVal Description As String)

        Dim strMessage As String
        strMessage = "An error occurred during synchronization:" & vbCr & vbCr
        strMessage = strMessage & "Error code: " & Code & vbCr
        strMessage = strMessage & "Error description: " & Description
        MsgBox strMessage, vbOKOnly + vbExclamation, "Synchronization Error"

    End Sub

# Understanding Quick Steps

The Quick Steps feature allows non-programmers to combine actions in Outlook without having to record a macro (Outlook has no macro recorder anyway) or write a procedure using VBA.

While looking at the Mail page in Outlook, click the Home tab on the Ribbon. You'll see the Quick Steps area right in the middle of the Ribbon.
The rationale for Quick Steps is the same as the rationale for writing or recording macros: After you've specified and saved a set of actions, you need not manually repeat those actions in the future—you merely run the macro and the behaviors are carried out automatically.

Quick Steps is similar to Access's Macro Designer: You're presented with a list of common actions and you can choose to combine two or more of them into a macro-like little "program." And, like a macro, a Quick Steps one-click button saves time by launching the "program" anytime the user chooses. Non-programmers can build the Quick Steps "programs" out of actions that they frequently perform—thus saving time.

Although Quick Steps is not nearly as flexible and powerful as writing macros in VBA, for a common task you might consider seeing whether a Quick Step can handle it.

Some sample Quick Steps are already available in the Ribbon, and when you first click them you're asked to customize their behavior to suit your way of working. Click, for example, the _MoveTo: ?_ sample, and the First Time Setup dialog box opens, as shown in Figure 27.2.

Figure 27.2 Experiment with the sample Quick Steps to get an idea how to create and customize them.

As you see in Figure 27.2, you're allowed to customize this Quick Step by changing its name, specifying the target folder, and deciding whether or not to mark it as read. So this little program performs two actions at the click of a button. That could be a time-saver if you frequently store read email in a particular folder. Also notice the Options button, where you can further modify the behavior of this Quick Step. You can add more actions, delete actions, specify a shortcut key, and write a tooltip.

Quick Steps makes 20 actions available to you, so it's no competition for the thousands of things you can do with VBA. Nonetheless, you might want to consider employing the Quick Steps tool for quick and easy automation of common mail-related tasks in Outlook.

# The Bottom Line

**Work with application-level events.**

Event handlers are procedures that contain code that responds to an event. In other words, if a user modifies one of their contacts, an event can detect this modification and execute code you've written to respond to the modification.

Master It

Event-handler procedures are unlike ordinary macro procedures in several ways. Name one of the differences.

**Work with item-level events.**

Outlook has two primary kinds of events.

Master It

What are the two types of events in Outlook? And how do they differ?
Chapter 28

Understanding the Access Object Model and Key Objects

If you work with Access databases, forms, or reports, you'll find many opportunities for customizing Access using VBA to streamline your work and that of your colleagues. Depending on the purposes for which you use Access, you might program Access to automatically extract data sets you need, to create custom reports on a regular schedule, and to perform many other tasks.

Even if your work in Access consists simply of entering data into databases and checking that it is correct, you may be able to program VBA to make mundane tasks less onerous. For example, you might use VBA to simplify the process of data entry or to validate the data that the user enters to avoid problems further down the line.

This chapter first shows you how to get started with VBA in Access because Access implements VBA in a different way from the other applications this book has discussed.
You'll then come to grips with the Access object model and learn about its most important creatable objects. After that, the chapter shows you how to open and close databases, set startup properties for a database, work with the Screen object, and use the DoCmd object to run Access commands.

The next chapter discusses how to manipulate the data in an Access database via VBA.

In this chapter you will learn to do the following:

  * Get started with VBA in Access
  * Understand Access-style macros
  * Open and close databases
  * Work with the Screen object
  * Use the DoCmd object to run Access commands

# Getting Started with VBA in Access

Access implements VBA differently than the other Office applications do. Here are the main differences:

  * Collections in Access are zero-based—the first item in a collection is numbered 0 (zero) rather than 1. For example, Forms(0).Name returns the Name property of the first Form object in the Forms collection. Zero-based collections make your job as a programmer more difficult, particularly when employing loops.
  * The term _macro_ is used in a special way in Access, unlike the way it's used in other Office applications, not to mention all other forms of computing. An Access "macro" is a historical entity—a holdover from the early days of this database system. Some consider the whole approach rather harebrained because it's limited to a subset of the available programming statements, and it's not nearly as useful or flexible or efficient (in most cases) as just writing VBA code. With an Access macro, you enter a list of actions that you want to perform by using a special utility—the Macro Designer (formerly known as the Macro Builder)—that's built into Access. You choose these actions from a list, then type in arguments in the next cell in a table displayed by the Macro Designer. So it's all a bit like filling in a form and not that much like real programming. It's similar to Outlook's Quick Steps tool described in Chapter 27, "Working with Events in Outlook."
  * Access's so-called "macros" are created by clicking the Macro button on the Ribbon's Create tab, which opens the Macro Designer. From now on we'll call these self-styled "macros" _Access_ - _style macros_ , to distinguish them from the true macros we've worked with throughout this book.
  * When you write VBA code in the Access VBA Editor—as you would in the other Office 2013 applications—you create true macros, properly so called. (Just remember that Access doesn't describe these VBA procedures as _macros_. You just have to get used to the difference in terminology.) We'll focus our attention on the VBA capabilities in Access rather than on the legacy Macro Designer.
  * For a user to execute a macro Sub, you must first create a function that calls the subprocedure. While you, the programmer, are working on a macro in the VBA Editor, you can debug and run the subprocedure by using the VBA Editor's usual commands (for example, press F5 to run and test the subprocedure). But a user will not be able to run the macro directly from the Access user interface. Instead, you must employ the RunCode action, as you'll see. There is an exception to this rule. In Chapter 31, "Programming the Office 2013 Ribbon" (see the section titled "Direct Communication with VBA"), you'll learn how to directly trigger VBA by modifying the Access Ribbon.

The following sections provide a complete, start-to-finish example of how to work with VBA in Access.
You create a module, then write a procedure in that module, and finally, use the Macro Designer to create an Access-style macro whose sole purpose is to start the execution of the VBA procedure.

This chapter shows you how to create "macros" in Access, so first you need to ensure that macros are, in fact, enabled in Access. Follow these steps to enable Access macros:

1. Click the File tab on the Ribbon.

2. Click Options in the left pane.

3. Click Trust Center in the Access Options dialog box.

4. Click the Trust Center Settings button.

5. Click Macro Settings in the left pane of the Trust Center dialog box.

6. Click the Enable All Macros option button.

7. Click OK twice to close the dialog boxes.

## Creating a Module in the VBA Editor

To create a module where you can write VBA code, open an Access database and click the Ribbon's Database Tools tab. Click the Visual Basic button on the Ribbon (or simply press Alt+F11).

The VBA Editor opens. Choose Insert ⇒ Module in the VBA Editor or right-click the project's name (it's boldface) in the Project Explorer pane, and choose Insert ⇒ Module from the shortcut menu.

## Creating a Function

After creating a VBA module in the VBA Editor, you can create a function within it as described earlier in this book. The following example creates a function named Standard_Setup that simply displays a message box to indicate that it is running (a later section uses this function as an example):

    Public Function Standard_Setup()
        'put your choice of commands here
        MsgBox "The Standard_Setup macro is running."
    End Function

You can test this code as usual by clicking somewhere inside the procedure, then pressing F5.

After creating the function, switch back to Access by pressing Alt+F11 or clicking the View Microsoft Access button on the far left of the Standard toolbar in the VBA Editor. Of course, you could also use the traditional Windows Alt+Tab shortcut.

## Using the Macro Designer

Although this and the next chapter focus on automating Access via the more flexible and powerful VBA language, some readers may be interested to know how to work with the Macro Designer tool. So we'll explore it briefly before moving on to VBA examples.

## Creating an Access-Style Macro to Run a Function

Recall that a user can't directly trigger a VBA procedure interactively from the main Access interface (although you, the programmer, can press F5 to test procedures in the VBA Editor). You'll find no Macros dialog box like the one in Word and other Office 2013 applications. True, there _is_ a Run Macro button on the Database Tools tab of the Access Ribbon, but this feature cannot directly trigger a VBA procedure. (It only triggers an Access-style macro.)

For a user to run a VBA procedure, you have to create an Access-style macro in Access's Macro Designer and use its RunCode action (command) to call the VBA procedure. We'll see how to do that now:

1. Display the database window if it's not already displayed. For example, click the word _View_ (the _word_ with the small black down-arrow, not the icon) on the Ribbon's Home tab, then select Datasheet View from the options displayed.

2. Click the Macro button on the Ribbon's Create tab to open the Macro Designer window (see Figure 28.1). This also opens a Design tab on the Ribbon.

Figure 28.1 Use the Macro Designer window to create a new Access-style "macro" in Access.

3.
In the Action Catalog pane on the right, open the Macro Commands folder and double-click the RunCode item. This inserts the RunCode command into the middle pane. (The RunMacro command, by contrast, can execute only Access-style macros. Likewise, a button added to the Quick Access Toolbar above the Ribbon can execute only Access-style macros.)

4. In the Function Name field, type **Standard_Setup()**, the name of the VBA test function you created earlier in this chapter. _The empty parentheses are required, so don't omit them._

5. Click the Save icon in the Quick Access Toolbar above the Ribbon, or press Ctrl+S.

6. Type the name **test** in the Save As dialog box, and click the OK button. (Tip: If you modify the macro later and want to change its name, choose File ⇒ Save As ⇒ Save Object As, then click the Save button. Isn't Access remarkably roundabout sometimes? Or you can right-click the macro's name in the left pane of the main Access window, then choose Rename.)

7. Now test this macro (and consequently the VBA procedure it triggers) by clicking the Run icon on the Ribbon. It's the icon with the red exclamation point. This icon appears on the Design tab of the Ribbon only when the Macro Designer is active. You now see the message box telling you that your macro is running.

The user can execute Access-style macros when the Macro Designer is closed. Just double-click _test_ in the All Access Objects list (the pane on the left side of the main window). It may be necessary to click the small down arrow at the top of this pane and choose Show All.

Or the user can click the Database Tools tab of the Ribbon, then click the Run Macro icon in the Macro section (it too has a red exclamation point). Access's Run Macro dialog box opens. Select _test_ as the macro name you want to run, then click OK to close the dialog box and execute your macro.

* * *

Three Ways to Execute an Access-Style Macro

To sum up, a user can execute an Access-style macro in only three ways:

  * Choose the Run Macro option from the Ribbon's Database Tools tab. This opens a small Run Macro dialog box from which you can select an Access-style macro and execute it.
  * Double-click the Access-style macro's name in the All Access Objects list in the left pane of the main Access window.
  * Add a button to the Quick Access Toolbar that will execute the Access-style macro.

Add a button to the Quick Access Toolbar by following these steps:

1. Click the Customize Quick Access Toolbar button (the down arrow icon on the right of the Quick Access Toolbar at the top left of the Access window).

2. Click the More Commands option in the drop-down list. The Access Options dialog box opens.

3. Select Macros in the Choose Commands From drop-down list.

4. Double-click your macro's name to move it into the list on the right side (where the toolbar's displayed items are listed).

5. Click OK to close the dialog box and put your macro on the toolbar.

Note that you can't trigger a macro from a keyboard shortcut (Access doesn't permit you to create custom keyboard shortcuts).

* * *

## Translating an Access-Style Macro into a VBA Macro

Given that VBA is far more powerful than the Access-style macros, you might want to convert an Access-style macro into VBA to enhance it. You can have Access automatically translate Access-style macros into VBA functions. Follow these steps:

1. Display the database window if it's not already displayed.
For example, click the word _View_ (the _word_ with the small black down-arrow, not the icon) on the Ribbon's Home tab, then select Datasheet View from the options displayed.

2. Click the tab named _test_ at the top of the main Access window to view the Access-style macro you created earlier in this chapter. With the Macro Designer active, the Design tab appears on the Ribbon.

3. On the left side of the Ribbon, click Convert Macros To Visual Basic.

4. A dialog box appears in which you can optionally choose not to include error handling or comments.

5. Click the Convert button.

6. Press Alt+F11 to open the VBA Editor.

7. In the Project Explorer, locate and double-click the module named Converted Macro-test. You now see the translated code:

    '------------------------------------------------------------
    ' test1
    '
    '------------------------------------------------------------
    Function test1()
    On Error GoTo test1_Err

        Run_SampleProcedure

    test1_Exit:
        Exit Function

    test1_Err:
        MsgBox Error$
        Resume test1_Exit

    End Function

If you opted to omit the error trapping and commenting, it's simpler:

    '------------------------------------------------------------
    ' test1
    '
    '------------------------------------------------------------
    Function test1()

        Run_SampleProcedure

    End Function

## Using an _AutoExec_ Macro to Initialize an Access Session

To set up preconditions for an Access session, you can use an AutoExec macro. When Access starts, it checks whether there is a macro named AutoExec; if so, that macro runs automatically. This AutoExec feature is also available in other Office applications, like Word.

For example, you might choose to maximize the application window, open a particular item (for example, a table), or display a particular record. Note that AutoExec must be the name of an Access-style macro, not a VBA procedure.

By the way, you can prevent an AutoExec macro from running when you open a database by holding down the Shift key while the database opens.

To create an AutoExec macro, start a new macro as described in the previous section, add to it the actions that you want the macro to perform, and save it with the special reserved name AutoExec. The macro then runs the next time you open the database.

We'll now turn our attention to regular VBA programming, but if you're interested in learning more about the Macro Designer, see the tutorial on this web page:



## Running a Subprocedure

Until now, you've mostly created traditional subs when writing or recording a macro. And for consistency, the Access VBA code examples in this chapter and elsewhere will also be subs.

But beware. If you want to permit the user to execute Access VBA procedures, they must be turned into functions. Just replace the word Sub with Function in your code. VBA will then automatically change the line at the end of your procedure from End Sub to End Function. Easy enough.

So, just remember that in this way, and many others, Access differs from other Office applications. When you're writing VBA code in Access, there's no good reason to put it in a subprocedure rather than in a function, because a sub cannot be triggered directly in Access.

Only functions can be directly triggered, as the example in the previous section illustrated.
If you feel you must create a Sub, the only way to execute it is to create a function whose single job is, in turn, to execute your subprocedure. So what is the point?

This function-triggering indirection is clumsy, but it can be made to work if for some unimaginable reason you want to use a subprocedure. Here is a simple example:

1. In the VBA Editor, create a subprocedure that performs the actions you want:

    Sub SampleProcedure()
        MsgBox "The subprocedure named Sample Procedure is running."
    End Sub

2. Still in the VBA Editor, create a function that runs the subprocedure:

    Public Function Run_SampleProcedure()
        Call SampleProcedure
    End Function

3. Then switch to Access and create an Access-style macro that uses the RunCode action to run the function that runs the subprocedure. (See the section earlier in this chapter titled "Creating an Access-style Macro to Run a Function.")

## Understanding the _Option Compare Database_ Statement

When you launch the VBA Editor in Access (by pressing Alt+F11 or clicking the Visual Basic button on the Ribbon's Database Tools tab) and then insert a code module, you'll notice that Access automatically enters an Option Compare Database statement in the General Declarations area of the Code window.

As an aside, recall that if you've selected the Require Variable Declaration check box on the Editor tab of the VBA Editor Options dialog box (Tools ⇒ Options) to make the VBA Editor force you to declare all variables explicitly, you'll see an Option Explicit statement in the General Declarations area as well.

Access supports three different ways of comparing text strings: Option Compare Database, Option Compare Binary, and Option Compare Text. Here's what these options mean:

  * Option Compare Database is the default comparison type for Access databases and performs string comparisons using the sort order for the locale that Windows is using (for example, U.S. English). Sorting is not case-sensitive. Access automatically inserts an Option Compare Database statement in the declarations section of each module that you insert. You can delete the Option Compare Database statement, in which case Access will use Option Compare Binary instead.
  * Option Compare Binary performs case-sensitive sorting. To use Option Compare Binary, either delete the Option Compare Database statement in the declarations section or change it to an Option Compare Binary statement.
  * Option Compare Text performs case-insensitive sorting. To use Option Compare Text, change the Option Compare Database or Option Compare Binary statement to an Option Compare Text statement.

# Getting an Overview of the Access Object Model

It's not crucial to understand how the Access object model fits together in order to work with VBA in Access, but most people find it helpful to know the main objects in the object model. And sometimes the code examples in the Help system's object-model reference can be invaluable—showing you how and where to employ objects in your own programming.

To explore the Access object model, follow these steps:

1. Launch or activate Access, and then press Alt+F11 to launch or activate the VBA Editor.

2. Move your cursor to a blank space in the code window (to avoid context-sensitive help).

3. Press F1 in the editor to launch the Help web page for the VBA language reference for Office 2013.

4. In the Bing search field, type **Access 2013 object model** and press Enter.
5. Click the link _Access object model reference_ (_Access 2013 developer reference_). You now see the list of primary Access objects, as shown in Figure 28.2.

Figure 28.2 The entries in the Access object-model reference will help you write your own VBA code.

# Understanding Creatable Objects in Access

Access _exposes_ (makes available for your use in code) various _creatable_ objects, meaning that you can employ most of the important objects in its object model without explicitly going through (mentioning in your code) the Application object.

For most programming purposes, these creatable objects are the most commonly used objects. The main creatable objects in Access are as follows:

  * The Forms collection contains all the Form objects, which represent the open forms in a database. Because it's creatable, you need not write Application.Forms in your code. You can leave off the Application and merely write Forms.
  * The Reports collection contains all the Report objects, which represent the open reports in a database.
  * The DataAccessPages collection contains all the DataAccessPage objects, which represent the open data access pages in a project or a database. (An Access _project_ is a file that connects to a SQL Server database.)
  * The CurrentProject object represents the active project or database in Access.
  * The CurrentData object represents the objects stored in the current database.
  * The CodeProject object represents the project containing the code database of a project or database.
  * The CodeData object represents the objects stored in the code database.
  * The Screen object represents the screen object that currently has the focus (the object that is receiving input or ready to receive input). The object can be a form, a report, or a control.
  * The DoCmd object enables you to run Access commands.
  * The Modules collection contains the Module objects, which represent the code modules and class modules in a database.
  * The References collection contains the Reference objects, which represent the references set in the Access application.
  * The DBEngine object represents the Microsoft Jet Database Engine and is the topmost object in the Data Access Objects (DAO) hierarchy. The DBEngine object provides access to the Workspaces collection, which contains all the Workspace objects available to Access, and to the Errors collection, which contains an Error object for each operation involving DAO.
  * The Workspace object contains a named session for a given user. When you open a database, Access creates a workspace by default and assigns the open database to it. You can work with the current workspace or create more workspaces as needed.
  * The Error object contains information about the data-access errors that have occurred in a DAO operation.

# Opening and Closing Databases

The following sections show you how to open and close databases. You can use the CurrentDb method to return the current database, open a database and treat it as the current database, or even open multiple databases at once. You can also create and remove workspaces.

## Using the CurrentDb Method to Return the Current Database

To work with the database that's currently open in Access, use the CurrentDb method on the Application object or an object variable representing the Application object. The CurrentDb method returns a Database object variable representing the currently open database that has the focus.
The following example declares an object variable of the Database type named myDatabase and then uses the CurrentDb method to assign the active database to it:

    Dim myDatabase As Database
    Set myDatabase = Application.CurrentDb

## Closing the Current Database and Opening a Different Database

In Access, you can choose from among several ways of opening and closing a database. This section discusses the simplest method of opening and closing a database—by treating it as the current database. This method is similar to opening and closing a database when working interactively in Access. See the next section for another method of opening and closing databases that lets you have two or more databases open at the same time.

To open a database as the current database, use the OpenCurrentDatabase method of the Application object. The syntax is as follows:

    _expression_.OpenCurrentDatabase(Filepath, Exclusive, bstrPassword)

Here are the components of the syntax:

  * _expression_ is a required expression that returns an Application object.
  * Filepath is a required String argument that specifies the path and filename of the database to open. You should specify the filename extension; if you omit it, Access assumes the extension is .accdb.
  * Exclusive is an optional Boolean argument that you can set to True to open the database in Exclusive mode rather than in Shared mode (the default, or the result of an explicit False setting).
  * bstrPassword is an optional String argument that specifies the password required to open the database.

To close the current database, use the CloseCurrentDatabase method with the Application object. This method takes no arguments.

You can run the CloseCurrentDatabase method from the current database, but you can't do anything after that because the code stops after VBA executes the CloseCurrentDatabase method and the database containing the code closes. To close the current database and open another by using the OpenCurrentDatabase method, you must run the code from outside the databases involved—for example, by using automation from another application. Chapter 30, "Accessing One Application from Another Application," describes this technique.

* * *

Prepare the Northwind Database to Use with This Book's Examples

To test and experiment with some of the Access code examples in this and the following chapters, you need to do a little preliminary housekeeping. Put simply, we all need to be experimenting with the same database so we get the same results.

Traditionally, when authors have written about Access, they've employed a sample database named Northwind that Microsoft included with Access. Northwind is a full-featured and therefore useful example database. It can be particularly valuable when you want to experiment with Access but don't want to use your own database (both to keep it safe and because your database might not have some of the features that Northwind has).

I'll use Northwind in some of the examples in this book so that all readers can be working with the same data and the same structures. Therefore, before you test some of the upcoming code examples, please put a copy of Northwind.accdt in your C:\Temp directory so the example code in this book can locate it. If you don't have a C:\Temp directory, create one.

You may already have Northwind on your hard drive. To see if you do, press the Start button to display the Windows 8 Modern home page, and type **Northwind.accdt**.
Then click the Files search option in the right pane. If it shows up in the search, right-click it, choose Open File Location, then copy it and paste it into your C:\Temp directory.

Then double-click this Northwind.accdt file. It will open in Access. Give the database the name **Northwind** and save it to C:\Temp. You want to end up with a file named Northwind.accdb in your C:\Temp directory.

If you don't find Northwind.accdt on your hard drive, you can download it from Microsoft's website:



The downloaded file will be named TS01128997.accdt. At some point Windows may ask your permission to download or install an ActiveX object. Agree to that; don't worry. Just double-click TS01128997.accdt to open Northwind in Access. You'll see the File New Database dialog box open. In the File Name field, change the name to **Northwind.accdb** and click OK to close the File New Database dialog box and save Northwind.accdb to C:\Temp. As I said earlier, you want to end up with a file named Northwind.accdb in your C:\Temp directory.

Now the code examples in this book can reference this file path to open Northwind:

    filepath:="C:\Temp\Northwind.accdb"

Next you'll want to remove the default login dialog box so you can work with the database more easily from code. Open Northwind.accdb by double-clicking its name in Windows Explorer.

By default a login dialog box appears asking you to select one of the "employees" from this imaginary company. Click the Login button to close the dialog box and see Northwind in Access.

If it's not already open, click the >> symbol in Access's left pane to open the _Navigation pane_. Locate the Supporting Objects entry in the Navigation pane and click it to expand it. Scroll down until you locate the macro named AutoExec. Right-click AutoExec, choose Cut from the context menu, and then close Access.

Now that login dialog box won't interrupt you anymore when you open the Northwind example database.

* * *

There's an additional requirement when you're writing code that communicates _between_ Office applications. You can't simply declare an object variable to point to an application object, like this:

    Dim myAccess As Access.Application

This code will run only if you first provide a _reference_ in the host application. For example, if you're trying to manipulate Access from VBA code within a Word macro, you need to set a reference in Word's VBA Editor.

The following example illustrates a way to contact and manipulate Access from another VBA host—for example, from Excel or from Word. But before you can execute this code from Word or some other application, you must first choose Tools ⇒ References in the Word VBA Editor, then select _Microsoft Access 15.0 Object Library_ in the list of available references. For this example, you must also have a database currently loaded and running in an instance of Access.

This next example declares the object variable myAccess as the Access.Application type and the object variable myDatabase as the Object type. The example uses the GetObject method to assign to myAccess the copy of Access that's running, uses the CloseCurrentDatabase method to close this database, and then uses the OpenCurrentDatabase method to open another database, namely Northwind, in Exclusive mode.
The final statement uses the CurrentDb method to assign the open database to the myDatabase object variable:

    Dim myAccess As Access.Application
    Dim myDatabase As Object

    Set myAccess = GetObject(, "Access.Application")
    myAccess.CloseCurrentDatabase
    myAccess.OpenCurrentDatabase _
        filepath:="C:\Temp\Northwind.accdb", Exclusive:=True
    Set myDatabase = myAccess.CurrentDb

When you test this code by executing it in the Word VBA Editor, you'll know it works because whatever database was open in Access will be replaced by Northwind (see the "Prepare the Northwind Database to Use with This Book's Examples" sidebar). Also note that when running this code, you might get an error message saying "User-defined type not defined." And the Editor will highlight this line of code:

    Dim myAccess As Access.Application

This means that the editor can't locate the object named Access. For reasons unknown, a newly added library is sometimes deselected in References. To fix this problem, just repeat the steps described previously to use Tools ⇒ References to add a reference to the _Microsoft Access 15.0 Object Library_ again.

## Opening Multiple Databases at Once

Instead of using the OpenCurrentDatabase method to open a database as the current database, you can use the OpenDatabase method of the Workspace object to open another database and return a reference to the Database object representing it. The syntax for the OpenDatabase method is as follows:

    Set _database_ = _workspace_.OpenDatabase(Name, Options, ReadOnly, Connect)

* * *

Creating New Databases, Forms, and Reports in Access

The discussions of the other Office applications in this part of the book (Part 6) have emphasized creating and saving new files—for example, creating new documents in Word or new workbooks in Excel and saving them under suitable names and in the appropriate formats.

Access, too, has its own VBA commands for creating new databases, forms, reports, tables, and other objects programmatically:

  * To create a new database, use the NewCurrentDatabase method of the Application object.
  * To create a new form, use the CreateForm method. To place controls on the form, use the CreateControl method.
  * To create a new report, use the CreateReport method. To place controls on the report, use the CreateReportControl method.

While creating a new database programmatically is quite feasible, it is not only complex but also something that you probably won't need to do often, if ever. In most cases, the goal of your Access VBA programming will be to manipulate existing databases and objects that you have built manually.

* * *

Here are the components of the syntax:

  * _database_ is an object variable that will represent the database you open.
  * _workspace_ is an optional object variable that specifies the workspace in which you want to open the database. If you omit _workspace_, Access opens the database in the default workspace. Although you can open the database in the default workspace without problems, you may find it more convenient to create another workspace and use it to keep the database separate. See "Creating and Removing Workspaces" later in this chapter for details.
  * Name is a required String argument that specifies the name of the database to open. An error results if the database doesn't exist or isn't available or if another user has opened the database for exclusive access.
  * Options is an optional Variant argument that specifies any options you want to set for the database. For an Access database, you can specify True to open the database in Exclusive mode or False (the default) to open it in Shared mode. For ODBCDirect workspaces, you can use other options; see the Access Visual Basic Help file for details.
  * ReadOnly is an optional Variant argument that you can set to True to open the database in read-only mode. The default value is False, which opens the database in read/write mode.
  * Connect is an optional Variant that you can use to pass any necessary connection information, such as a password for opening the database.

The following example "opens" the Northwind database in a special sense: it's opened behind the scenes for our code to contact it and have access to its data, structure, and other features. But it _is not opened in Access where the user can see it_. In other words, an instance of the database is fully exposed to our code, but there's no user interface. There's no display in Access of the Northwind database. For this reason, I've included a message box in the code example to prove to you that the code example has actually opened Northwind and fetched some data from it.

Also, when you use this invisible database technique, it's a good idea to finish up by closing any recordsets or other objects you've opened, as well as closing the database instance itself. This way, unattached and useless entities aren't left floating in your computer's memory.

This example will not work if you have Northwind open in Access. You must test this code while a different database is open in Access.

This example declares a Workspace object variable named myWorkspace and a Database object variable named myDatabase, assigns to myWorkspace the first Workspace object in the Workspaces collection (the default workspace), and assigns to myDatabase the database Northwind.accdb, which it opens in Exclusive mode with read/write access.

To show you that Northwind did come into existence, we fetch the City data from the first record in the Customers table. Finally, we display the city name, then clean up memory by closing both the recordset and the database instance.

You can try this by entering this code in a module in the Access VBA Editor, but just do this while some database other than Northwind is open in Access. Press F5, and you'll see the city data.

    Sub test()

        Dim myWorkspace As Workspace
        Set myWorkspace = DBEngine.Workspaces(0)

        Dim myDatabase As Database
        Dim RecSet As Recordset

        Set myDatabase = myWorkspace.OpenDatabase _
            (Name:="C:\temp\northwind.accdb", _
            Options:=True, ReadOnly:=False)

        Set RecSet = myDatabase.OpenRecordset("Customers", dbOpenDynaset)

        MsgBox RecSet!City

        RecSet.Close
        myDatabase.Close

    End Sub

## Closing a Database

To close a database that you've opened by using the OpenDatabase method, use the Close method of the object variable to which you've assigned the database. For example, the following statement closes the database assigned to the object variable myDatabase:

    myDatabase.Close

## Creating and Removing Workspaces

To keep different databases in separate sessions, you can create a new workspace as needed and remove it when you have finished working with it.

### Creating a New Workspace

To create a new workspace, use the CreateWorkspace method of the DBEngine object.
The syntax is as follows:

    Set _workspace_ = CreateWorkspace(Name, UserName, Password, UseType)

Here are the components of the syntax:

  * _workspace_ is the object variable to which you want to assign the workspace you're creating.
  * Name is a required String argument that specifies the name to assign to the new workspace.
  * UserName is a required String argument that specifies the owner of the new workspace.
  * Password is a required String argument that specifies the password for the new workspace. The password can be up to 14 characters long. Use an empty string if you want to set a blank password.
  * UseType is an optional argument that indicates the type of workspace to create. Use dbUseJet to create a Microsoft Jet workspace. Use dbUseODBC to create an ODBCDirect workspace. Omit this argument if you want the DefaultType property of the DBEngine object to determine the type of data source connected to the workspace.

The following example declares an object variable named myWorkspace of the Workspace type and assigns to it a new Jet workspace named Workspace2. The example makes the admin account the owner of the new workspace:

    Dim myWorkspace As Workspace
    Set myWorkspace = CreateWorkspace(Name:="Workspace2", _
        UserName:="admin", Password:="", UseType:=dbUseJet)

After creating a new workspace, you can use it to open a new database (as described earlier in this chapter).

### Removing a Workspace

Before removing a workspace from the Workspaces collection, you must close all the open connections and databases. You can then use the Close method to close the Workspace object. For example, the following statement closes the Workspace object identified by the object variable myWorkspace:

    myWorkspace.Close

# Working with the _Screen_ Object

If you've used VBA in the other Office applications, you've probably written code that works with whichever object is currently active. For example, in Word you can use the ActiveDocument object to work with the active document or the Selection object to work with the current selection. In PowerPoint you can work with the ActivePresentation object to work with whichever presentation happens to be active.

In Access, you can use the Screen object to work with the form, report, or control that has the focus. The Screen object has various properties, including the following:

  * The ActiveForm property returns the active form. If there is no active form, trying to use the ActiveForm property returns the error 2475.
  * The ActiveDatasheet property returns the active datasheet. If there is no active datasheet, trying to use the ActiveDatasheet property returns the error 2484.
  * The ActiveReport property returns the active report. If there is no active report, trying to use the ActiveReport property returns the error 2476.
  * The ActiveDataAccessPage property returns the active data access page. If there is no active data access page, trying to use the ActiveDataAccessPage property returns the error 2022.
  * The ActiveControl property returns the active control. If there is no active control, trying to use the ActiveControl property returns the error 2474.
  * The PreviousControl property lets you access the control that previously had the focus.

To avoid errors, you should check which object is active before trying to manipulate it by using the Screen object.
The following example uses the error numbers listed above to determine whether a form, report, datasheet, or data access page is active and then displays a message box identifying the item and giving its name:

    On Error Resume Next

    Dim strName As String
    Dim strType As String
    strType = "Form"
    strName = Screen.ActiveForm.Name
    If Err = 2475 Then
        Err = 0
        strType = "Report"
        strName = Screen.ActiveReport.Name
        If Err = 2476 Then
            Err = 0
            strType = "Data access page"
            strName = Screen.ActiveDataAccessPage.Name
            If Err = 2022 Then
                Err = 0
                strType = "Datasheet"
                strName = Screen.ActiveDatasheet.Name
            End If
        End If
    End If

    MsgBox "The current Screen object is a " & strType & vbCr _
        & vbCr & "Screen object name: " & strName, _
        vbOKOnly + vbInformation, "Current Screen Object"

If you test this, use the Create tab on the Ribbon (and click the Form icon) to ensure that there is a form active in Access.

# Using the _DoCmd_ Object to Run Access Commands

The DoCmd object enables you to execute normal Access commands, such as Find or Rename, in your VBA code.

To run a command, you use one of the methods of the DoCmd object. Table 28.1 lists the 66 DoCmd methods available in Access 2013 and explains briefly what they do.

The following sections include examples showing how to use some of the methods described in Table 28.1.

Table 28.1 Methods of the DoCmd object

**Method** | **Explanation**
---|---
AddMenu | Adds a menu to the global menu bar or to a custom menu bar.
ApplyFilter | Applies a filter so that only records that match certain criteria are displayed.
Beep | Makes the computer beep—for example, to attract the user's attention when an error has occurred.
BrowseTo | BrowseTo is an Access-style macro action that helps you either create a custom user interface on top of an existing wizard navigation control or build your own.
CancelEvent | Cancels the event that has occurred.
ClearMacroError | Use after you handle an Access-style macro error to reset the data about the error so you can check for any future errors (in the MacroError object) while the macro continues to execute.
Close | Closes the specified object—for example, a form or a report.
CloseDatabase | Closes the database, just as if you'd clicked the File tab on the Ribbon and chosen the Close Database option. A Save dialog box will appear if necessary, asking for your disposition of any unsaved objects.
CopyDatabaseFile | Copies the database connected to the current project to a SQL Server file.
CopyObject | Copies the specified object (for example, a query or a table) into the specified database (or to a new table in the current database).
DeleteObject | Deletes the specified object from the database.
DoMenuItem | Performs a command from a menu or toolbar. This is an older command that has been replaced by the RunCommand method (described later in this table).
Echo | Provides backward compatibility for running the Echo action in earlier versions of VBA. It's better to use Application.Echo now.
FindNext | Finds the next record matching the search criteria specified by the FindRecord method.
FindRecord | Performs a search for a record that matches the specified criteria.
GoToControl | Moves the focus to the specified control or field in a form or datasheet.
GoToPage | Moves the focus to the specified page of a form.
GoToRecord | Makes the specified record the current record.
Hourglass | Changes the mouse pointer to an hourglass (a wait pointer) or back to a normal pointer.
LockNavigationPane | Prevents the user from right-clicking a database object displayed in the left pane (Navigation pane) and then selecting the Cut or Delete option from the context menu that appears. Other options on that menu, such as Copy and Paste, are still enabled.
Maximize | Maximizes the active window.
Minimize | Minimizes the active window.
MoveSize | Moves or resizes (or both) the active window.
NavigateTo | Allows you to specify how objects are displayed in the Navigation pane (left pane). For example, you could reorganize the list of objects, or even prevent some objects from being displayed at all.
OpenDataAccessPage | Opens the specified data access page in the specified view.
OpenDiagram | Opens the specified database diagram.
OpenForm | Opens the specified form and optionally applies filtering.
OpenFunction | Opens the specified user-defined function in the specified view (for example, datasheet view) and mode (for example, for data entry).
OpenModule | Opens the specified VBA module at the specified procedure.
OpenQuery | Opens the specified query in the specified view and mode.
OpenReport | Opens a report in Design view or Print Preview. Alternatively, you can use this method to print a hard copy of the report.
OpenStoredProcedure | A macro action that opens a stored procedure in Design view, Datasheet view, or Print Preview.
OpenTable | Opens the specified table in the specified view and mode.
OpenView | Opens the specified view in the specified view and mode.
OutputTo | Outputs the data in the specified object (for example, a report or a data access page) in the specified format.
PrintOut | Prints the specified object.
Quit | Provides backward compatibility with Access 95. With later versions of Access, use Application.Quit instead.
RefreshRecord | Refreshes a record.
Rename | Renames the specified object with the name given.
RepaintObject | Repaints the specified object, completing any screen updates that are pending.
Requery | Updates the data in the specified control by querying the data source again.
Restore | Restores the active window to its nonmaximized and nonminimized size.
RunCommand | Runs the specified built-in menu command or toolbar command.
RunDataMacro | Calls a named data macro.
RunMacro | Runs the specified macro.
RunSavedImportExport | Runs a saved import or export specification.
RunSQL | Runs an Access action query using the specified SQL statement.
Save | Saves the specified object or (if no object is specified) the active object.
SearchForRecord | Searches for a specific record in a table, form, query, or report.
SelectObject | Selects the specified object in the database window or in an object that's already open.
SendObject | Sends the specified object (for example, a form or a report) in an email message.
SetDisplayedCategories | Specifies which categories are displayed under the Navigate To Category option in the Navigation pane. If you click anywhere in the Navigation pane's title bar, you'll see the various options.
SetFilter | Applies a filter to the records in the active datasheet, form, or report by specifying a WHERE clause.
SetMenuItem | Sets the state of a menu item—for example, enabling or disabling a menu item.
SetOrderBy | Sorts the records in the active datasheet, form, or report in ascending or descending order.
SetParameter | Sets the values of parameters.
SetProperty | Sets various properties of a control or field, such as BackColor, Width, Enabled, and Caption.
SetWarnings | Turns system messages on or off.
ShowAllRecords | Removes any existing filters from the current form, query, or table.
ShowToolbar | Displays or hides the specified toolbar.
SingleStep | Pauses the currently executing macro and displays a Macro Single Step dialog box.
TransferDatabase | Imports data into or exports data from the current database or project.
TransferSharePointList | Imports (or links) data from a Microsoft Windows SharePoint Services 3.0 site.
TransferSpreadsheet | Imports data from or exports data to a spreadsheet.
TransferSQLDatabase | Transfers the specified SQL Server database to another SQL Server database.
TransferText | Imports data from or exports data to a text file.

## Using the OpenForm Method to Open a Form

To open a form, use the OpenForm method of the DoCmd object. The syntax is as follows:

    _expression_.OpenForm(FormName, View, FilterName, WhereCondition, DataMode, WindowMode, OpenArgs)

Here are the components of the syntax:

  * _expression_ is a required expression that returns a DoCmd object. In many cases, it's easiest to use the DoCmd object itself.
  * FormName is a required Variant argument that specifies the name of the form you want to open. The form must be in the current database.
  * View is an optional argument that specifies the view to use: acNormal (the default), acDesign, acFormDS, acFormPivotChart, acFormPivotTable, or acPreview.
  * FilterName is an optional Variant argument that you can use to specify the name of a query. The query must be stored in the current database.
  * WhereCondition is an optional Variant that you can use to specify a SQL WHERE clause. Omit the word WHERE from the clause.
  * DataMode is an optional argument for specifying the mode in which to open the form: acFormPropertySettings, acFormAdd, acFormEdit, or acFormReadOnly. acFormPropertySettings is the default setting and opens the form using the mode set in the form.
  * WindowMode is an optional argument for specifying how to open the form. The default is acWindowNormal, a normal window. You can also open the form as a dialog box (acDialog) or as an icon (acIcon) or keep it hidden (acHidden).
  * OpenArgs is an optional Variant that you can use to specify arguments for opening the form—for example, to move the focus to a particular record.

The following example uses the DoCmd object to open a form in the Northwind sample database (you must have this database open in Access for this to work). Press Alt+F11 to open Access's VBA Editor, and then type in this code. When you execute the code by pressing F5, Access displays the first record for which the Employee field matches Jan Kotas:

    Sub test()

        DoCmd.OpenForm FormName:="Sales Analysis Form", View:=acNormal, _
            WhereCondition:="Employee ='Jan Kotas'"

    End Sub

## Using the PrintOut Method to Print an Object

To print an object, use the PrintOut method. The syntax is as follows:

    _expression_.PrintOut(PrintRange, PageFrom, PageTo, PrintQuality, Copies, CollateCopies)

Here are the components of the syntax:

  * _expression_ is a required expression that returns a DoCmd object.
  * PrintRange is an optional argument that specifies what to print: all of the object (acPrintAll, the default), specific pages (acPages), or the selection (acSelection).
  * PageFrom and PageTo are optional Variant arguments that you use with PrintRange:=acPages to specify the starting and ending page numbers of the print range.
  * PrintQuality is an optional argument that you can use to specify the print quality. The default setting is acHigh, but you can also specify acLow, acMedium, or acDraft (draft quality, to save ink and time).
  * Copies is an optional Variant argument that you can use to specify how many copies to print. The default is 1.
  * CollateCopies is an optional Variant argument that you can set to True to collate the copies, and False not to. The default setting is True.

The following example prints one copy (the default) of the first page in the active object at full quality without collating the copies:

    DoCmd.PrintOut PrintRange:=acPages, _
        PageFrom:=1, PageTo:=1, CollateCopies:=False

Be sure to add error trapping to this code in case you've requested a printout of something that doesn't exist—such as a range of 1 to 4 for a single-page form. In fact, it's always a good idea to trap errors in code that contacts peripherals such as printers or hard drives. What if the printer isn't turned on or the hard drive is full? Your code should anticipate and manage situations like these.

## Using the _RunMacro_ Method to Run an Access-Style Macro

To run an Access-style macro, use the RunMacro method. The syntax is as follows:

    _expression_.RunMacro(MacroName, RepeatCount, RepeatExpression)

Here are the components of the syntax:

  * _expression_ is a required expression that returns a DoCmd object.
  * MacroName is a required Variant argument that specifies the macro name.
  * RepeatCount is an optional Variant argument that you can use to specify an expression to control the number of times that the macro should run. The default is 1.
  * RepeatExpression is an optional Variant argument that contains a numeric expression to be evaluated each time the macro runs. The macro stops when this expression evaluates to 0 (False).

The following example runs an Access-style macro named RemoveDuplicates:

    DoCmd.RunMacro "RemoveDuplicates"

# The Bottom Line

**Get started with VBA in Access.**

Access allows you to write macros in the VBA Editor using VBA code. But it also features a legacy Macro Designer utility (formerly known as the Macro Builder) with which you create an entirely different kind of macro, what we've been calling an Access-style macro.

Master It

The term _macro_ is used in a special way in Access (referring to only one of the two types of custom procedures Access permits you to construct: VBA and Macro Designer). This usage of _macro_ is unlike the way the term _macro_ is used in other Office applications, not to mention all other forms of computing. Describe what Access means by the term _macro_.

**Open and close databases.**

Access permits you to open a database in several ways.

Master It

Two common commands that open a database in Access are OpenCurrentDatabase and OpenDatabase. What is the difference between these two commands?

**Work with the Screen object.**

You became familiar with using ActiveDocument objects in Word to access the document that currently has the focus. Or you used the ActivePresentation object to work with whichever presentation happened to be active in PowerPoint. Access, however, employs the Screen object as the parent of whatever object has the focus.
Master It

The Screen object represents the screen object that currently has the focus in Access (that is, the object that is receiving input or ready to receive input). Three types of common Access objects can have the focus when you employ the Screen object. What are they?

**Use the DoCmd object to run Access commands.**

Many of the tools that Access makes available to users, such as printing a report or maximizing a window, are also available to the programmer via the methods of the DoCmd object.

Master It

The DoCmd object has 66 methods in Office 2013. Describe the purpose of the DoCmd object's Beep method.

Chapter 29

Manipulating the Data in an Access Database via VBA

This chapter shows you how to begin manipulating the data in an Access database. You can do so either from within Access or from another VBA-enabled application—for example, from Excel or from Word. This chapter shows you how to work from within Access.

There are two main ways to manage data in an Access database: via Data Access Objects (DAO) or via ActiveX Data Objects (ADO). DAO is the older technology for accessing data, and it works for both Microsoft Jet databases (Microsoft Jet is the Access database engine) and ODBC-compliant data sources. (ODBC is Open Database Connectivity, a long-established standard for accessing databases. ODBC is also useful for accessing open-source solutions, such as MySQL.) ADO is a high-level programming interface that can be used with a wide range of data sources.

Access offers you the choice of methods, but you will probably find it easier to use ADO than DAO. Additional information about choosing between these two technologies can be found at the following location:



In this chapter you will learn to do the following:

  * Open a recordset
  * Access a particular record in a recordset
  * Search for a record
  * Edit a record
  * Insert and delete records

# Understanding How to Proceed

Once you've chosen between ADO and DAO, you take the following primary steps to manipulate the data in the database from Access:

1. Add a reference to the object library you'll be using.

2. Create a recordset that contains the records with which you want to work.

3. Work with the records in the recordset.

4. Close the recordset.

All the steps work in more or less the same way for ADO and DAO, except that you create the recordset in different ways. The following sections take you through these steps, splitting the path where necessary to cover the differences between ADO and DAO.

# Preparing to Manage the Data in a Database

Given that there are two distinct ways to manage data in Access—ADO and DAO—you have to specify which one you're planning to employ. You can think of libraries as collections of prewritten functions.

Why bother fooling around with multiple libraries? The answer is that there can't be a single, massive, all-purpose library because, among other issues, there would be name confusion. Two different functions in two different libraries might well share the same name. But they could perform different tasks or perform the same task differently. It's like having various libraries in a large university. The word _positive_ means entirely different things in the law library than it does in the medical library.

Note that some of the following code examples will work just fine no matter which library you are currently referencing.
However, to ensure consistency and avoid bugs, create a reference to the object library you want to use (ADO or DAO). And in your code you'll specify the appropriate connection to the data source—the Microsoft ActiveX Data Objects 6.1 Library for an ADO connection or the Microsoft DAO 3.6 Object Library for a DAO connection. (Note that these 6.1 and 3.6 version numbers might not match the versions of these libraries available on your machine. Just choose the latest, highest version number you see.)

## Adding a Reference to the Appropriate Object Library

To create a reference to the object library you need, follow these steps:

1. Launch Access.

2. Launch or activate the VBA Editor by pressing Alt+F11.

3. In the VBA Editor, choose Tools ⇒ References to display the References dialog box.

4. Scroll down the Available References list box to the appropriate object library item, and then select its check box and click OK to close the References dialog box:

  * For an ADO connection, select the check box for the Microsoft ActiveX Data Objects 6.1 Library item.
  * For a DAO connection, select the check box for the Microsoft DAO 3.6 Object Library item.

You can't select both libraries at the same time. And if you don't include the correct library, you'll get a compile error when you try to execute one of the objects in that library (such as a DAO.Recordset). The message will refer to this as a "user-defined" object because the Editor can't find the object in the currently referenced libraries—so it assumes the object is a new one introduced by you, the programmer, and that you forgot to declare it.

## Establishing a Connection to the Database

It's possible to establish connections to databases in a variety of ways, but in this chapter we'll use a simple, direct line of code. In Chapter 28, "Understanding the Access Object Model and Key Objects," you saw what steps to take to go online and obtain the Northwind.accdb sample database and where to store it on your hard drive so you could experiment with the example code in these final chapters of the book. If you haven't already taken those steps, see the sidebar in Chapter 28 titled "Prepare the Northwind Database to Use with This Book's Examples."

To open a connection (but not make it visible to the user in Access) to the Northwind sample database, you can use this code if you're employing DAO:

    Dim myDatabase As DAO.Database
    Set myDatabase = DBEngine.OpenDatabase("C:\temp\Northwind.accdb")

You'll see this approach used in examples later in this chapter. You'll also see how to manipulate Northwind while it's loaded into Access where the user can see it. Recall from the previous chapter that you can open a database in two ways: to get to its data but not display it in Access, or to load it into Access and make it visible to the user.

# Opening a Recordset

To get to the records in the database to which you're establishing the connection, you must open a recordset. ADO and DAO use different approaches. The following subsections give you the details.

## Opening a Recordset Using ADO

To open a recordset using ADO, you use the Open method of the RecordSet object. The syntax for the Open method is as follows:

    _recordset_.Open Source, ActiveConnection, CursorType, LockType, Options

Here are the components of the syntax:

  * _recordset_ is the RecordSet object that you want to open. Often, you'll use an object variable that references the RecordSet object.
  * Source is an optional Variant argument that specifies the table, command, SQL statement, or file that contains the recordset.
  * ActiveConnection is an optional Variant argument. This can be either an object variable of the Connection type or a Variant/String containing parameters for the connection.
  * CursorType is an optional argument for specifying the type of cursor to use in the recordset. Table 29.1 explains the cursor types.
  * LockType is an optional argument for specifying how to lock the recordset while it is open. Table 29.2 explains the lock options.
  * Options is an optional Long argument that you can use to control how the Source value is evaluated if it is not a Command object. Table 29.3 explains the available constants, which fall into two categories: command-type options and execute options. You can use two or more constants for the Options argument.

* * *

An Alternative to Providing Arguments for the Open Method

Instead of specifying the arguments with the Open method, you can set the Source, ActiveConnection, CursorType, and LockType properties of the RecordSet object you're opening and then use the Open method without arguments. You may find that this approach makes your code easier to read.

* * *

Table 29.1 Cursor-type constants for opening a recordset

**Constant** | **Cursor Type and Explanation**
---|---
adOpenForwardOnly | Forward-only cursor. You can scroll through the recordset only forward. This is the default cursor and provides the best performance when you need to go through the records only once.
adOpenDynamic | Dynamic cursor. You can move freely through the recordset, and you can see changes that other users make to records.
adOpenKeyset | Keyset cursor. You can move freely through the recordset and see changes that other users make to records. You cannot see records that other users add, and records that other users delete are inaccessible.
adOpenStatic | Static cursor. You can't see changes that other users make. Use a static cursor when you need only to search for data or create reports from the data that exists when you open the recordset.

Table 29.2 Lock options for opening a recordset via ADO

**Constant** | **Opens the Recordset With**
---|---
adLockReadOnly | Data in read-only mode, so you cannot alter it. Use this constant if you need to search or analyze the data but not manipulate it.
adLockOptimistic | Optimistic locking, which locks a record only when you run the Update method to update it explicitly.
adLockBatchOptimistic | Optimistic batch locking, which enables you to perform a simultaneous update on several records that you've changed.
adLockPessimistic | Pessimistic locking, which locks a record immediately after you change it.

Table 29.3 Choices for the Options argument when opening a recordset

**Constant** | **Explanation**
---|---
**Command-Type Options** |
adCmdText | Evaluates Source as text specifying a command or stored procedure call.
adCmdTable | Evaluates Source as the name of a table consisting of columns returned by an internally generated SQL query.
adCmdStoredProc | Evaluates Source as the name of a stored procedure.
adCmdFile | Evaluates Source as the filename of a stored recordset.
adCmdTableDirect | Evaluates Source as a table name and returns all columns of the table. Do not use with adAsyncExecute.
adCmdUnknown | This means that the type is unknown. This is the default.
**Execute Options** |
adAsyncExecute | Executes the command asynchronously. Does not work with adCmdTableDirect.
adAsyncFetch | Retrieves the rows specified by the CacheSize property synchronously and the remaining rows asynchronously.
adAsyncFetchNonBlocking | Prevents the main thread from blocking other data access while retrieving data.
adExecuteRecord | The CommandText (adCmdText, described earlier in this table) is a stored procedure or a command that fetches a single row of data. It is returned as a Record object.
adExecuteNoRecords | Used to improve performance when you know that no records will be returned (for example, you're merely adding, not fetching, data).
adExecuteStream | Treats the data returned by Source as a single row that becomes a Record object.

You'll see examples of opening a recordset a little later in this chapter. First, you must decide how to access the data in the recordset. The easiest methods are to use an existing table or a SQL SELECT statement.

## Choosing How to Access the Data in an ADO Recordset

How you actually get to the data in the recordset you open depends on whether you want to fetch all the data in a table or just part of it. If you want all the data in a table, you can use a table to access the data. If you want to return only particular records, you can use an SQL SELECT statement to fetch them.

### Using a Table to Access the Data in an ADO Recordset

To open a whole table from a database in a recordset, specify the table name as the Source argument in the Open statement. The following example declares a RecordSet object variable, uses a Set statement to assign the appropriate recordset type to it, uses the ActiveConnection property to connect to the currently active database (whatever you have loaded into Access at the time), and then uses the Open method to open the entire Customers table. We'll use the Northwind sample database (which you installed in Chapter 28), which has a Customers table.

This example demonstrates how to bring into an ADO recordset the data from an entire table and then move around within this recordset. Your code will not need to instantiate a database object but instead will work with the Northwind database that's currently loaded into Access. (This is a very simple example to illustrate some basic concepts. Normally when accessing a database, you'll want to employ an SQL statement and check for recordset boundary conditions—using the BOF and EOF properties. SQL and BOF/EOF are described later in this chapter. For now, just consider the following example code an illustration of elementary principles, to which you'll add real-world maneuvers demonstrated in the code examples later in this chapter.)

As always, it's necessary for you to first ensure that the ADO library is referenced. So in the VBA Editor, choose Tools ⇒ References and select the check box next to Microsoft ActiveX Data Objects 6.1 Library. Finally, load the Northwind.accdb sample database into Access.

    1.  Sub ExploreRecordset()
    2.      Dim myRecordset As ADODB.Recordset
    3.      Set myRecordset = New ADODB.Recordset
    4.
    5.      'point to the currently loaded database
    6.      myRecordset.ActiveConnection = CurrentProject.Connection
    7.      myRecordset.CursorType = adOpenStatic
    8.      myRecordset.Open Source:="Customers"
    9.
    10.     'Display the First Name from the first row
    11.     MsgBox myRecordset("First Name")
    12.
    13.     'Move to the last row and show the Last Name
    14.     myRecordset.MoveLast
    15.     MsgBox myRecordset("Last Name")
    16.
    17.     'Move to the previous row and display the Job Title
    18.     myRecordset.MovePrevious
    19.     MsgBox myRecordset("Job Title")
    20.
    21.     'Move back to the first row and display the Phone Number
    22.     myRecordset.MoveFirst
    23.     MsgBox myRecordset("Business Phone")
    24.
    25.     'Move to the next row and show the Last Name
    26.     myRecordset.MoveNext
    27.     MsgBox myRecordset("Last Name")
    28.
    29.
    30.     myRecordset.Close
    31.     Set myRecordset = Nothing
    32. End Sub

In this code, you first declare a recordset variable, and in line 6 you point it to the database currently loaded in Access. Line 7 defines the cursor type as static, and line 8 loads the data—the entire Customers table—into your recordset.

Line 11 doesn't move anywhere within the recordset, so by merely supplying the recordset's name, myRecordset, along with one of the table's field names, First Name, to a MsgBox function, you can display data from the first record in the table.

Line 14 does move to a different record within the recordset—the last record—before displaying the data in that record's Last Name field. Line 18 moves to the penultimate record, line 22 moves to the first record, and line 26 moves to the second record. Finally, line 30 closes the recordset and line 31 assigns Nothing to the object variable, which has the effect of eliminating it.

* * *

What Is a "First Record" in a Table?

It's important for beginners to understand the practical difference between a table of raw data in a database and an organized _recordset_ extracted from that database. The concept of a "first record" within a relational database is essentially meaningless until you use an SQL statement to organize (sort or group) the records in some fashion.

Records in a relational database (the type of database Access employs) are not necessarily organized. For example, they are not necessarily alphabetized by any particular field (such as LastName) or numerically listed by an ID number, or organized using some other scheme. True, data is stored in tables, and a table _does_ have structure: its fields separate the data into logical categories such as LastName, Address, CellPhone, and so on. But its records (rows of actual data) are _not necessarily_ maintained in any particular order.

A set of records (a _recordset_) is extracted from a database when you execute an SQL statement. This statement allows you to specify how you want to see the records organized (grouped by city, alphabetized, or whatever). SQL is flexible: You can organize records in many ways when you extract a recordset from a database. You can sort records by any of their fields; you can also sort in either ascending (the default) or descending order (specify DESC for descending). Which record is first also depends on which field you sort the recordset by, as specified in the ORDER BY statement.

In the example in the section "Using a Table to Access the Data in an ADO Recordset" in this chapter, the records are moved into the recordset unsorted. As each action is carried out in this code—moving forward and backward through the recordset—message boxes display the records in their unsorted order.
However, if you want to organize the records in alphabetical order by each customer's last name, add an ORDER BY clause to your SQL statement, like this:

    myRecordset.Open "SELECT * FROM Customers ORDER BY [Last Name]"

(The brackets are required because the field name contains a space.)

Just remember that you can get a recordset without using an SQL statement, like this:

    myRecordset.Open Source:="Customers"

But the concept of a "first record" in this recordset probably will have no meaning.

However, you can get a recordset by using an SQL statement, like this:

    myRecordset.Open strSQL

In this case, the "first record" will have meaning to you—based on the criteria you specified in the SQL statement (strSQL here would be a string you previously defined that contains an SQL statement). The section titled "Using an SQL SELECT Statement to Access a Subset of the Data in an ADO Recordset," later in this chapter, explains how to use an SQL statement.

* * *

### Using an SQL _SELECT_ Statement to Access a Subset of the Data in an ADO Recordset

If you want to add to your recordset only those records that match criteria you specify, use an SQL SELECT statement. SELECT statements can be constructed in complex ways, but you can also create straightforward statements with a little practice using this syntax:

    SELECT [DISTINCT] _fields_ FROM _table_ WHERE _criteria_ ORDER BY _fields_ [DESC]

The words in uppercase are the SQL keywords, and the words in lowercase italics are placeholders for the data you supply, such as the actual name of a real table. Here are the details:

  * The SELECT keyword indicates that you're creating a statement to select records (as opposed to, say, delete records).
  * You can include the optional DISTINCT keyword (the brackets indicate that it is optional) to make the statement return only unique records, discarding any duplicates that the statement would otherwise return. If you omit DISTINCT, you get any duplicates as well.
  * _fields_ is a list of the fields that you want to have appear in the recordset. If you use two or more field names, separate them with commas—for example, contact, company, address. To return all field names, enter an asterisk (*).
  * FROM _table_ specifies the name of the table from which to draw the data.
  * WHERE _criteria_ specifies the criteria for filtering the records. Enter the field name, an equal sign, a single straight quote, the value you're looking for, and another single straight quote. For example, WHERE City = 'Taos' returns only the results where Taos appears in the City field.
  * ORDER BY _fields_ specifies the field or fields on which to sort the results. If you use two or more fields, put them in the order of precedence you want (the first sort field first, the second sort field second, and so on) and separate them with commas. The default sort order is ascending, but you can force a descending sort by adding the DESC keyword. For example, ORDER BY Zip DESC produces a descending sort by the Zip field, while ORDER BY State, City produces an ascending sort by the State field and, within that, by City.

Because SQL SELECT statements contain so many elements, putting a SELECT statement as an argument in an Open statement can create uncomfortably long lines of code. You can break the lines of code in the editor with the underscore symbol as usual, but you may find it easier to use the properties of the RecordSet object to specify the details of the recordset rather than using the Open arguments.
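Here's a minimal sketch of that property-based approach, mentioned in the earlier sidebar "An Alternative to Providing Arguments for the Open Method." It assumes the Northwind Customers table from the earlier examples is available through the current connection; the field and table names are just the ones used elsewhere in this chapter:

    Dim myRecordset As ADODB.Recordset
    Set myRecordset = New ADODB.Recordset

    'Set the recordset's properties individually instead of
    'passing a long list of arguments to the Open method.
    myRecordset.ActiveConnection = CurrentProject.Connection
    myRecordset.Source = "SELECT * FROM Customers ORDER BY [Last Name]"
    myRecordset.CursorType = adOpenStatic
    myRecordset.LockType = adLockReadOnly

    'With the properties already set, Open needs no arguments.
    myRecordset.Open

Each property here corresponds directly to one of the Open arguments described earlier, so this is purely a readability choice rather than a change in behavior.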
Another way to avoid using a large SQL statement as an argument for the Open method is to first assign the SELECT statement to a String variable and then use that string to supply the argument. The following code illustrates that approach. Before executing this example, press Ctrl+G in the VBA Editor to open the Immediate window, where the results will be displayed.

    Sub SubSet()

        Dim strSQL As String

        Dim myRecordset As ADODB.Recordset
        Set myRecordset = New ADODB.Recordset
        myRecordset.ActiveConnection = CurrentProject.Connection
        **strSQL = "Select * FROM Customers WHERE ID > 17"**
        myRecordset.Open strSQL

        Do Until myRecordset.EOF
            Debug.Print myRecordset("Last Name")
            myRecordset.MoveNext
        Loop

    End Sub

In this example, you want to import into the recordset only those records that have an ID higher than 17, so you set up an SQL statement that specifies that condition. Then you loop through the recordset until EOF (end of file), displaying each last name in the Immediate window.

#### _Opening a Recordset Using DAO_

When working with DAO, you use a different approach than the ADO techniques explored so far in this chapter. You use the OpenRecordset method of the Database object to create a new recordset and add it to the Recordsets collection.

The syntax for the OpenRecordset method is as follows:

    Set _recordset_ = _object_.OpenRecordset(Name, Type, Options, LockEdit)

Here are the components of the syntax:

  * _recordset_ is an object variable representing the RecordSet object you're opening.
  * _object_ is an object variable representing the database from which to create the new RecordSet object.
  * Name is a required String argument that specifies the table, query, or SQL statement that provides the records for the recordset. If you're using a Jet database and returning a table-type recordset, you can use only a table name for the Name argument.
  * Type is an optional argument that you can use to specify the type of recordset you're opening. Table 29.4 explains the constants you can use for Type.
  * Options is an optional argument that you can use to specify constants that control how Access opens the recordset. Table 29.5 explains the constants you can use for Options.
  * LockEdit is an optional constant that you can use to specify how the recordset is locked. Table 29.6 explains the constants you can use for LockEdit. (Access 2013 no longer supports ODBCDirect workspaces. So if you need to connect to external data stores directly (not through Access's database engine), then you must use ADO rather than DAO.)

Table 29.4 Constants for the Type argument for the OpenRecordSet method

**Constant** | **Opens This Type of Recordset**
---|---
dbOpenTable | Table-type. This works only in Microsoft Jet workspaces. This is the default setting if you open a recordset in a Jet workspace without specifying the Type.
dbOpenDynamic | Dynamic-type. This works only in ODBCDirect workspaces. The recordset is similar to an ODBC dynamic cursor and enables you to add, remove, or edit rows from a database table.
dbOpenDynaset | Dynaset-type. This recordset is similar to an ODBC keyset cursor and enables you to add, remove, or edit rows from a database table. You can also move freely through the rows in the dynaset.
dbOpenSnapshot | Snapshot-type. This recordset is similar to an ODBC static cursor. It opens a snapshot of the records but does not update them when other users make changes. To update the snapshot, you must close the recordset and reopen it.
dbOpenForwardOnly | Forward-only. You can move only forward through the recordset.

Table 29.5 Constants for the Options argument

**Constant** | **Explanation** | **Limitations**
---|---|---
dbAppendOnly | Users can add new records but cannot edit or delete existing records. | Jet dynaset-type recordsets only
dbSQLPassThrough | Passes an SQL statement to an ODBC data source connected via Jet. | Jet snapshot-type recordsets only
dbSeeChanges | Causes a runtime error if a user attempts to change data that another user is already editing. | Jet dynaset-type recordsets only
dbDenyWrite | Prevents other users from adding or modifying records. | Jet recordsets only
dbDenyRead | Prevents other users from reading data. | Jet table-type recordsets only
dbForwardOnly | Forces a forward-only recordset. This is an older option included for backward compatibility. Use Type:=dbOpenForwardOnly instead. | Jet snapshot-type recordsets only
dbReadOnly | Prevents users from changing the recordset. This is an older option included for backward compatibility. Use LockEdits:=dbReadOnly instead. If you must use Options:=dbReadOnly, do not include the LockEdits argument. | Jet recordsets only
dbRunAsync | Runs a query asynchronously (so that some results are returned while others are still pending). | ODBCDirect workspaces only
dbExecDirect | Runs a query by calling SQLExecDirect. | ODBCDirect workspaces only
dbInconsistent | Permits inconsistent updates, enabling you to update a field in one table of a multitable recordset without updating another table in the recordset. You can use either this constant or dbConsistent, but not both. | Jet dynaset-type and snapshot-type recordsets only
dbConsistent | Permits only consistent updates so that shared fields in tables underlying a multitable recordset must be updated together. You can use either this constant or dbInconsistent, but not both. | Jet dynaset-type and snapshot-type recordsets only
dbFailOnError | If an error occurs, updates are rolled back. | Jet recordsets only

Table 29.6 Constants for the LockEdit argument

**Constant** | **Explanation** | **Default or Limitations**
---|---|---
dbPessimistic | Uses pessimistic locking, which locks a record immediately after you change it. | Default for Jet workspaces
dbOptimistic | Uses optimistic locking, which locks a record only when you run the Update method to update it explicitly. |
dbOptimisticValue | Uses optimistic concurrency, comparing the data values in old and new records to find out if changes have been made since the record was last accessed. The concurrency is based on row values. | ODBCDirect workspaces only
dbOptimisticBatch | Uses optimistic batch locking, which enables you to perform a simultaneous update on several records that you've changed. | ODBCDirect workspaces only

### Opening a DAO Recordset Using a Table

The easiest way to open a DAO recordset is to open an entire table by specifying the table name for the Name argument and using Type:=dbOpenTable to explicitly state that you're opening a table.
The following example declares the object variable myRecordset as a DAO.Recordset object and then assigns to it the records from the Customers table in the database identified by the myDatabase object variable:

    Sub DAOTest()
        Dim myRecordset As DAO.Recordset
        Dim myDatabase As DAO.Database

        'Open the copy of Northwind on the hard drive
        Set myDatabase = DBEngine.OpenDatabase("C:\temp\Northwind.accdb")

        'Create the DAO-style Recordset

        **Set myRecordset = myDatabase.OpenRecordset(Name:="Customers", _**
        **Type:=dbOpenTable)**

        MsgBox myRecordset("ID")
        MsgBox myRecordset("Company")
        MsgBox myRecordset("Address")
        MsgBox myRecordset("City")

        Set myRecordset = Nothing
    End Sub

### Opening a DAO Recordset Using an SQL _SELECT_ Statement

If you want to return only a subset of records rather than an entire table, use an SQL SELECT statement to open the DAO recordset. (See "Using an SQL SELECT Statement to Access a Subset of the Data in an ADO Recordset," earlier in this chapter, for an explanation of the essentials of SQL SELECT statements.)

Specify the SQL statement as the Name argument for the OpenRecordset method, as the following example illustrates. This code declares a Database object variable, assigns the Northwind sample database to it, declares a RecordSet object variable, and then assigns to the object variable the results of a SELECT statement run on the database:

    Sub DAOSelect()

        Dim myDatabase As DAO.Database
        Set myDatabase = DBEngine.OpenDatabase("C:\temp\Northwind.accdb")

        Dim myRecordset As DAO.Recordset
        Set myRecordset = myDatabase.OpenRecordset _
            (Name:="SELECT * FROM Customers WHERE City = 'Boston'", _
            Type:=dbOpenDynaset)
        Do Until myRecordset.EOF
            Debug.Print myRecordset("Last Name")
            myRecordset.MoveNext
        Loop

        Set myRecordset = Nothing
    End Sub

Note that the results in this example are printed in the VBA Editor's Immediate window, so press Ctrl+G to open that window before pressing F5 to test this procedure.

# Accessing a Particular Record in a Recordset

To work with a particular record in a recordset, you can either move through (loop) the records until you find the one you want or search for the record using the Seek or Find methods. The RecordSet object includes these methods for moving about the records in the recordset:

**Method** | **Moves to Record**
---|---
MoveFirst | First
MoveNext | Next
MovePrevious | Previous
MoveLast | Last
Move | A specified number of records forward or backward from the current record

## Using the MoveFirst, MoveNext, MovePrevious, and MoveLast Methods

The MoveFirst method and MoveLast method are always safe to use because as long as the recordset contains one or more records, there's always a first record and a last record. (If the recordset contains only one record, that record is considered both first and last.)

But if you use the MovePrevious method from the first record in the recordset or the MoveNext method from the last record, you move beyond the recordset, accessing what is sometimes called a "phantom record"—one that isn't there. When you try to access the contents of such a record, VBA gives the runtime error 3021 ("No current record"). Figure 29.1 shows this error.

Figure 29.1 The runtime error "No current record" usually means that you've moved outside the recordset.

BOF means beginning of file, and EOF means end of file. Note that you can visualize the end of a recordset as a point just beyond the last record. EOF, therefore, is not the same as the last record.
BOF, likewise, is not the first record, but a point just before it. (I mention this because we have a tendency to view the first item in a set as the "beginning" of the set; we would consider the first float as the beginning of a parade. Recordsets aren't like that.)

To check whether you're at the beginning or end of the recordset, use the BOF property or the EOF property of the RecordSet object. The BOF property returns True when the current record is at the beginning of the file, and the EOF property returns True when the current record is at the end of the file. To avoid errors, after using the MovePrevious method, check whether the beginning of the file has been reached, as in this example:

    With myRecordset
        .MovePrevious
        **If .BOF = True Then .MoveNext**
    End With

Similarly, after using the MoveNext method, check whether the end of the file has been reached:

    myRecordset.MoveNext
    **If myRecordset.EOF Then myRecordset.MovePrevious**

## Using the _Move_ Method to Move by Multiple Records

To move by several records at once, but not to the first record or last record in the recordset, use the Move method. The syntax for ADO differs from that used with DAO.

Here's the syntax for the Move method with ADO:

    _recordset_.Move NumRecords, Start

The syntax for the Move method with DAO is as follows:

    _recordset_.Move Rows, StartBookmark

Here, _recordset_ is the recordset involved, NumRecords or Rows is the number of records by which to move (use a positive number to move forward or a negative number to move back), and Start or StartBookmark is an optional argument that you can use to specify a bookmark from which you want to start the movement. If you omit Start or StartBookmark, movement starts from the current record.

For example, the following statement moves 10 records forward from the current record in an ADO recordset:

    myRecordset.Move NumRecords:=10

The following statement moves 5 records backward from the current record in a DAO recordset:

    myRecordset.Move Rows:=-5

To create a bookmark, move to the record that you want to mark, and then use the Bookmark property of the RecordSet object. The following example declares a Variant variable named myBookmark and then assigns to it a bookmark representing the current record in an ADO recordset:

    Dim myBookmark As Variant
    myBookmark = myRecordset.**Bookmark**

After setting a bookmark, you can use it as the starting point of a move. For example, the following statement moves to the eighth record after the bookmark myBookmark in an ADO recordset:

    myRecordset.Move NumRecords:=8, **Start:=myBookmark**

# Searching for a Record

The process of searching for a record in a recordset differs in ADO and in DAO. The following sections show you how to search using either technology.

* * *

Also Consider the Seek Method

Both ADO recordsets and DAO recordsets include a method called Seek, which is more complex and more powerful than the Find method for ADO and the four Find methods for DAO discussed here. Consult the Access VBA Help file for additional details on the Seek method.

* * *

## Searching for a Record in an ADO Recordset

To search for a record in an ADO recordset, you can use the Find method of the RecordSet object. The syntax is as follows:

    _recordset_.Find Criteria, SkipRows, SearchDirection, Start

Here are the components of the syntax:

  * _recordset_ is the recordset involved.
  * Criteria is a required String argument that specifies the column name, type of comparison, and value to use. For example, to locate a record where the state is California, you could specify that the State column is equal (=) to CA.
  * SkipRows is an optional Long value that you can use to specify an offset from the current row (or from the bookmark specified by the Start argument) at which to start searching instead of starting from the current row. For example, an offset of 3 starts the search three rows later than the current row.
  * SearchDirection is an optional argument for specifying whether to search forward or backward. The default is adSearchForward; specify adSearchBackward to search backward instead.
  * Start is an optional Variant argument that specifies the bookmark from which to start the search. If you omit Start, the search starts from the current row.

When you run the search, it stops at the first matching record. If no record matches and you're searching forward, it stops at the end of the recordset; if you're searching backward, it stops at the beginning of the recordset. If the end or beginning of the recordset is reached, you know that there was no match for the search.

The following example begins by moving to the first record in the recordset that is represented by the object variable myRecordset. Then the code searches for the first record that matches the criterion "City = 'Denver'". The example checks the EOF property to ensure that the end of the recordset has not been reached. If it has not, a record containing Denver in the City field was found, so the example displays a message box with the last name data from that record. If the end of the recordset has been reached, the example displays a message box stating that no match was found:

    Sub SearchADO()

        Dim strSQL As String

        Dim myRecordset As ADODB.Recordset
        Set myRecordset = New ADODB.Recordset
        myRecordset.ActiveConnection = CurrentProject.Connection

        myRecordset.Open Source:="Select * from Customers", _
            Options:=adCmdText

        With myRecordset
            .MoveFirst
            .**Find Criteria:="City='Denver'"**

            If Not .EOF Then
                MsgBox .Fields("Last Name")
            Else
                MsgBox "No matching record was found."
            End If

        End With

    End Sub

To continue your search for the same criteria, you can use the SkipRows argument to specify an offset so that you don't simply find the current record again. For example, you'll likely want to move ahead just one row, like this:

    myRecordset.Find Criteria:="City='Denver'", SkipRows:=1

## Searching for a Record in a DAO Recordset

To search for a record in a DAO recordset, you can use one of these four methods:

  * The FindFirst method starts searching at the beginning of the recordset and searches forward.
  * The FindNext method starts searching at the current record and searches forward.
  * The FindPrevious method starts searching at the current record and searches backward.
  * The FindLast method starts searching at the end of the recordset and searches backward.

The syntax for these four methods is as follows:

    _recordset_.FindFirst _Criteria_
    _recordset_.FindNext _Criteria_
    _recordset_.FindPrevious _Criteria_
    _recordset_.FindLast _Criteria_

Here, _recordset_ is a required object variable that represents the RecordSet object involved. _Criteria_ is a required String argument that specifies the criteria for the search.
_Criteria_ works in the same way as the WHERE clause in an SQL statement, except that it does not use the word WHERE.

The following example uses the FindFirst method to search from the beginning of the recordset for the first record that matches the criterion City = 'Las Vegas':

    Sub DAOSearch()

        Dim myDatabase As DAO.Database
        Set myDatabase = DBEngine.OpenDatabase("C:\temp\Northwind.accdb")
        Dim myRecordset As DAO.Recordset
        Set myRecordset = myDatabase.OpenRecordset _
            (Name:="SELECT * FROM Customers", _
            Type:=dbOpenDynaset)

        myRecordset.**FindFirst "City = 'Las Vegas'"**

        MsgBox myRecordset("Last Name")

        Set myRecordset = Nothing
    End Sub

After you run one of the four Find methods in a DAO recordset, the NoMatch property of the RecordSet object tells you whether the search succeeded: NoMatch is False if the method found a match and True if it did not. So you can test the NoMatch property to tell whether or not the search found a match, as in this example:

    If myRecordset.NoMatch = False Then
        MsgBox myRecordset("Last Name")
    End If

# Returning the Fields in a Record

Once you've moved to a record, you can return the fields it contains by using the appropriate Field object from the Fields collection. Fields is the default property of the RecordSet object, so you can omit it if you choose. For example, both the following statements return the Last Name field from the current record:

    myRecordset.Fields("Last Name")
    myRecordset("Last Name")

# Editing a Record

To change the data in a record, first use the Edit method to make the record available for editing, then assign the new value to the field, and finally use the Update method of the RecordSet object to save the change to the underlying table. The following example prepares a record for editing with the Edit method, changes the value in the Last Name field to Schmidtz, and then uses the Update method to save the change:

    With myRecordset
        .Edit
        .Fields("Last Name").Value = "Schmidtz"
        .Update
    End With

# Inserting and Deleting Records

To insert a new record, use the AddNew method of the RecordSet object. You can then assign data to the fields in the record. After that, use the Update method to save the data to the table in the database. The following example uses a With statement to perform these actions:

    Sub AddOne()

        Dim myDatabase As DAO.Database
        Set myDatabase = DBEngine.OpenDatabase("C:\temp\Northwind.accdb")
        Dim myRecordset As DAO.Recordset
        Set myRecordset = myDatabase.OpenRecordset _
            (Name:="SELECT * FROM Customers", _
            Type:=dbOpenDynaset)

        With myRecordset
            .AddNew
            .Fields("ID").Value = 32
            .Fields("Last Name").Value = "Murphy"
            .Fields("First Name").Value = "Andrea"
            .Fields("Company").Value = "Company RP"
            .Fields("City").Value = "City of Industry"
            'add data for the other fields here
            .Update
        End With

        Set myRecordset = Nothing

    End Sub

After you press F5 in the VBA Editor to test this code, switch to Access and display the Customers table; _you will need to press F5 to refresh the view in Access before you can see this new record_.

To delete a record, identify it by either moving to it or searching for it, and then use the Delete method. In a DAO recordset, the deletion takes effect immediately; you don't need to call Edit or Update (in fact, calling Update without a pending Edit or AddNew raises an error):

    myRecordset.Delete

# Closing a Recordset

After working with an object, you should close it.
To close a recordset, use the Close method with the appropriate RecordSet object or the object variable that represents the RecordSet object. The following example closes the recordset represented by the object variable myRecordset:

    myRecordset.Close

After closing the recordset, set its object variable to Nothing to release the memory it occupied:

    Set myRecordset = Nothing

# Saving a Recordset to the Cloud

You might want to store a recordset on your hard drive or in the cloud. As you've seen in cloud-access examples in previous chapters, saving files to the cloud is much the same as saving to an ordinary hard-drive folder. By the way, this example also illustrates how to use the Save method of the RecordSet object:

    1. Sub SaveToCloud()
    2.
    3. Dim myRecordset As ADODB.Recordset
    4. Set myRecordset = New ADODB.Recordset
    5. myRecordset.ActiveConnection = CurrentProject.Connection
    6.
    7. Dim strSQL As String
    8. Dim strFilepath As String
    9. strFilepath = "C:\Users\Richard\**SkyDrive**\Cities.xml"
    10.
    11. strSQL = "SELECT city FROM Employees"
    12. myRecordset.Open strSQL
    13.
    14. myRecordset.**Save** strFilepath, adPersistXML
    15.
    16. Set myRecordset = Nothing
    17.
    18. End Sub

To test this, open Northwind and press Alt+F11 to open the VBA Editor. Paste this code into a module, but change _Richard_ in line 9 to your own name.

Most of this code should be understandable from previous examples in this chapter. Line 9 specifies a folder on my hard drive whose contents are automatically synced to SkyDrive after files are saved there. You could just as easily save this recordset to any ordinary hard-drive folder, like this:

    myRecordset.Save "c:\temp\Cities.xml", adPersistXML

The Save method used here stores this recordset in the XML format, about which I'll have much more to say in Chapter 31, "Programming the Office 2013 Ribbon." If you're curious, open the saved Cities.xml file in a text editor or browser: you'll see a schema section describing the recordset's structure, followed by one row element for each of the nine city values in the Employees table.

# The Bottom Line

**Open a recordset.**

You can open an ADO recordset in two different ways.

**Master It**

One way to open an ADO recordset is to provide an argument list following the Open method. What is the other way to open an ADO recordset, which doesn't involve using arguments? Some people say that this second approach makes their code easier to read.

**Access a particular record in a recordset.**

Both ADO and DAO technologies have methods that allow you to move around within a recordset.

**Master It**

One method you can use to traverse a recordset is the MoveFirst method. It takes you to the first record in the recordset. What does the _first record_ mean in a recordset in a relational database? Is it the record that's the lowest numerically, the lowest alphabetically, or what?

**Search for a record.**

Both ADO and DAO offer methods to directly search for a particular record.

**Master It**

ADO offers a Find method. How many methods does DAO offer, and what are they?

**Edit a record.**

When editing a record, you first use the Edit method, and then you can change the value in a field.

**Master It**

After you have made a change to a value in a record, what method do you use to save this change to make it part of the database?

**Insert and delete records.**

It's not difficult to insert new records or delete existing ones.
In both situations, you use the Update method when finished to save the changes to the database.

**Master It**

To insert a new record into a recordset, what method do you use before you can assign data to the fields in the new record?

Chapter 30

Accessing One Application from Another Application

So far, this book has focused on how to work with VBA to perform actions _within_ a VBA host application, such as Word or Access.

But you might sometimes (perhaps often) need to communicate between applications as well. This chapter shows you the tools for contacting and manipulating one application from another: Automation, data objects, Dynamic Data Exchange (DDE), and SendKeys.

In this chapter you will learn to do the following:

  * Use Automation to transfer information
  * Use the Shell function to run an application
  * Use data objects to store and retrieve information
  * Communicate via DDE
  * Communicate via SendKeys

# Understanding the Tools Used to Communicate between Applications

Most VBA host applications (such as the Office applications that this chapter uses as examples) offer several tools for communicating with other applications:

**Automation**

Formerly known as Object Linking and Embedding (OLE), Automation is usually the most effective method for transferring information from one Windows application to another. If the applications you're using support Automation, use it in preference to the alternatives, DDE and SendKeys.

**Dynamic Data Exchange (DDE)**

An older method of transferring information between applications that remains a good fallback when Automation isn't available. DDE is available in only some applications.

**SendKeys**

The oldest and most primitive method of communicating between applications, SendKeys relies on sending keystroke equivalents to the other application. It's an attempt to pretend that someone is typing on the keyboard, but this can cause timing and other issues. Although rudimentary by comparison to Automation and DDE, SendKeys can still be effective in some situations.

Beyond these three communications tools, this chapter discusses the DataObject object, which you can use to store information and to transfer information to and from the Windows Clipboard.

* * *

Don't Forget the Command Line

If an application doesn't offer any of the control methods discussed in this chapter, you may be able to control it through the command line. For example, you can use the /p command-line switch in many applications to print a file without any user interaction. Search the Web for "command line, vba" and the application's name to find relevant tutorials.

* * *

# Using Automation to Transfer Information

Automation is the most powerful and efficient way to communicate between applications. Each application that supports Automation offers one or more Component Object Model (COM) objects that you can access programmatically—usually an object representing the application, an object representing the various types of files the application uses, objects representing its major components, and so on.

For any Automation transaction, there's a _server application_ that provides the information or tools and a _client application_ that receives or employs them. (There's also another pair of terms that distinguish between two communicating applications: the server application is also sometimes known as the _object application_, and the client application is known as the _controlling application_.)
Automation lets the client application harness the built-in capabilities of the server application. For example, Excel has better calculation features than Word and can generate useful charts, data maps, and so on based on its calculations and data. By using Automation, Word can borrow Excel's calculation engine and then insert the results into a Word document. Or Word could use Excel to create a chart that it then inserts into a document as well. Word can also take more-limited actions, such as causing Excel to open a workbook, copy a group of cells from a spreadsheet in it, and paste-link them into a document.

To use Automation through VBA, you create an object in VBA that references the application you want to work with. You use the CreateObject function to create a new object in another application and the GetObject function to retrieve an existing object in another application.

When using Automation, you can choose whether to display the server application or keep it hidden from the user. For some procedures, you'll need to display it—for example, the user might need to choose a file or a folder or make another choice that requires live intervention. In other situations, it can be best to keep the server application hidden so that the user isn't distracted by an application suddenly launching itself spontaneously and robotically carrying out actions in front of the user's startled eyes. This can make some users uneasy, as if the computer has gotten out of control. A colleague of mine, something of a prankster, used to torment new hires by inserting a procedure in their word processor that caused individual characters in a document to start swinging and then drop off the bottom of the screen. As if they'd "come loose." Then he would walk over and tell them that this wouldn't be a problem as long as they didn't jar their desk while typing.

But even if you decide to hide a server application from the user when the procedure runs, in most cases it's helpful to display the server application to yourself while you're writing and testing the procedure. Doing so makes it much easier to see what's going wrong when your code doesn't work as expected.

## Understanding Early and Late Binding

When you use Automation to access another application, you can choose which type of _binding_ to use—that is, how to establish the connection between the client application and the server application.

_Early binding_ involves adding a reference to the application's object library by using the References dialog box (Tools ⇒ References) at design time and then declaring an object at the start of the code by using a Dim statement that declares the specific object class type rather than declaring the object generically As Object.

For example, the following code connects to a slide within a PowerPoint presentation by using early binding:

    Dim myPowerPoint As PowerPoint.Application
    Dim myPresentation As Presentation
    Dim mySlide As Slide
    Set myPowerPoint = CreateObject("PowerPoint.Application")
    Set myPresentation = myPowerPoint.Presentations.Add
    Set mySlide = myPresentation.Slides.Add(Index:=1, Layout:=ppLayoutTitleOnly)

With late binding, you create an object that references the other application when you run the code. If you declare the object explicitly, you declare it as a generic object—As Object—rather than declare it as a specific object class type.
For example, the following statements declare the Object variable myOutlook and then assign to it a reference to an Outlook.Application object:

    Dim myOutlook As Object
    Set myOutlook = CreateObject("Outlook.Application")

* * *

Early Binding Isn't Universal

Not all applications that support Automation support early binding. Some applications cannot provide direct access to their functions at design time, while you're writing your code, as is required for early binding. They provide access to their functions only at runtime, when the code itself is executing. With such applications, you have no choice; you must use late binding.

* * *

If the server application you're using supports early binding, use it in preference to late binding. There are three advantages to early binding:

  * Once you've added to the project the reference to the application's object library, you can work in your code with the outside (server) application's objects, properties, and methods through the VBA Editor in the client application. This makes it much easier to use the Editor's built-in IntelliSense features to find the objects, properties, and methods you need in the application you're referring to, and to avoid mistakes such as typos and missing arguments.
  * Because you specify the particular type of object when you declare the object variable, you're less likely to attempt to work with the wrong object by mistake.
  * Because VBA can compile more information about the object, elements of its methods and properties need not be resolved during runtime, so the code runs faster.

On the other hand, late binding can avoid object-library issues such as having to make the right references and other library-version problems.

## Creating an Object with the _CreateObject_ Function

The CreateObject function creates and returns a reference to an Automation object exposed to other applications. The syntax is as follows:

    CreateObject( _class_ [, _servername_ ])

Here, _class_ is a required argument specifying the class (the formal definition) of the object to create. The _class_ argument consists of the name of the library that will provide the object and the type of object to be provided, so it looks like this:

    applicationname.objecttype

For example, to specify the Excel Application object as a class, use a _class_ argument of Excel.Application. Here, Excel is the name of the application that provides the object, and Application is the type of object that we want Excel to provide. Likewise, Excel.Sheet would specify a worksheet object in Excel.

_servername_ is an optional string Variant that specifies the name of the network server on which to create the object. If you merely want to connect to an application located on the user's machine (in other words, if both applications—the client and server applications—are located on the same hard drive), omit _servername_ or specify an empty string. To connect with an application located on a remote server machine, you must have DCOM (the Distributed Component Object Model) installed, and the object on the server computer must be configured to allow remote creation.

Typically, you'll use a CreateObject function with a Set statement to assign to an object variable the object that you create.
For example, the following statements declare an object variable named myNewSheet and assign an Excel worksheet object to it:

    Dim myNewSheet As Object
    Set myNewSheet = CreateObject("Excel.Sheet")

* * *

Can Be Used with Any COM Object

You can use the CreateObject function with any COM object on your computer system, not just with application objects.

* * *

## Returning an Object with the _GetObject_ Function

The GetObject function returns a reference to an existing Automation object. The syntax is as follows:

    GetObject([ _pathname_ ] [, _class_ ])

You can provide either argument—but you must provide _one_ of them. Here, _pathname_ is an optional string Variant specifying the full path and name of the file that contains the object you want to retrieve. _pathname_ is optional, but if you don't specify it, you must specify the _class_ argument. _class_ (which is optional if you specify _pathname_, but required if you don't) is a string Variant specifying the class of the object you want to return.

As with CreateObject, typically you'll use a GetObject function with a Set statement to assign to an object variable the object that you return with the GetObject function. For example, in the second of the following statements, the GetObject function returns an object consisting of the workbook Z:\Finance\Revenue.xlsm. The Set statement assigns this object to the object variable named Revenue declared in the first statement:

    Dim Revenue As Object
    Set Revenue = GetObject("Z:\Finance\Revenue.xlsm")

Here, the workbook is associated with Excel. When this code runs, VBA starts Excel if it isn't already running and activates the workbook. You can then reference the object by referring to its object variable; in this example, you could manipulate the Revenue object to affect the Z:\Finance\Revenue.xlsm workbook.

## Examples of Using Automation with the Office Applications

The following sections show three examples of using Automation with Office applications.

### Transferring Information from an Excel Spreadsheet to a Word Document

This example transfers information from an Excel spreadsheet to a Word document.

First, you need to add to the target Word project (the client project that will contain the code that accesses Excel) a reference to the Excel object library. Follow these steps:

1. Start or activate Word, and then press Alt+F11 to launch the VBA Editor.

2. In the Project Explorer, click the project to which you want to add the reference. For example, if the procedure or procedures will reside in the Normal.dotm template, select the Normal project in the Project Explorer before adding the reference. Or just choose Insert ⇒ Module to create a brand-new module to play around with.

3. Choose Tools ⇒ References to display the References dialog box.

4. Select the check box for the Microsoft Excel 15.0 Object Library item.

5. Click the OK button to close the References dialog box.

Once you've added the reference, you can use the VBA Editor's Object Browser to browse Excel objects. Display the Object Browser as usual by pressing F2 or choosing View ⇒ Object Browser, and then choose Excel in the Object Browser's Project/Library drop-down list. The Object Browser will display the contents of the Excel object library, as shown in Figure 30.1. You can display the help (code examples, syntax) for a selected Excel object by clicking the Help button (the question-mark icon) in the Object Browser.
Figure 30.1 Once you've loaded the Excel object library, you can view its contents in the Object Browser from the VBA Editor session launched from the host application (in this case, Microsoft Word).

To create and test the next code example, first set up in Excel the preconditions that this procedure expects: namely, a range object named SalesTotal. To do this, open Excel, and right-click a cell anywhere in the displayed sheet in Book1 (the default name of the first blank workbook). If you don't see a workbook named Book1, choose File ⇒ New, then click the blank workbook icon in the displayed templates.

In the context menu that opens when you right-click a cell in Book1, choose the Define Name option. In the New Name dialog box that opens, type **SalesTotal** in the Name field. Then click OK to close the dialog box.

Now double-click the same cell you just named and type in **145** or some other value. It's this value that your macro in Word will pluck from this workbook. Now click the File tab in the Ribbon, choose Save As, and save this workbook as **Book1.xlsx** in the C:\temp subdirectory. (Note that you're saving it as an .xlsx file.) Now you can either leave Excel running or just close it. It won't matter because your macro will open the file on the hard drive.

Okay, now in Word's VBA Editor, add the code. Because you used early binding, you have the Editor's IntelliSense assistance and code-completion features available. Create the procedure shown in Listing 30.1. This procedure uses the GetObject function to retrieve the information from the specified cell in the Excel spreadsheet you previously created and inserts this data in the active Word document at the current insertion point (where the blinking cursor is).

**Listing 30.1**: Getting data from an Excel cell and inserting it into Word

    1. Sub Return_a_Value_from_Excel()
    2.
    3. Dim mySpreadsheet As Excel.Workbook
    4. Dim strSalesTotal As String
    5.
    6. Set mySpreadsheet = _
    7. **GetObject("C:\Temp\Book1.xlsx")**
    8.
    9. strSalesTotal = **mySpreadsheet.Application.Range("SalesTotal").Value**
    10.
    11. Set mySpreadsheet = Nothing
    12.
    13. Selection.TypeText "Current sales total: $" & strSalesTotal & "."
    14.
    15. Selection.TypeParagraph
    16.
    17. End Sub

This subprocedure retrieves one piece of information from an Excel spreadsheet that's on the hard drive in the C:\temp directory. Here's what happens in the subprocedure:

  * Line 3 declares the object variable mySpreadsheet of the type Excel.Workbook. Line 4 declares the String variable strSalesTotal.
  * Line 6 uses a Set statement and the GetObject function to make mySpreadsheet reference the spreadsheet C:\Temp\Book1.xlsx.
  * Line 9 assigns to the String variable strSalesTotal the Value property (the actual data) of the Range object named SalesTotal in the Excel Application object. You defined the SalesTotal range as a single cell, so strSalesTotal receives the value of that cell.
  * Line 11 assigns to the mySpreadsheet object the special value Nothing, releasing the memory it occupied. (Because the procedure ends almost immediately afterward, this statement isn't strictly necessary here; VBA destroys the object at the end of the procedure anyway. But it's good practice to free the memory assigned to an object when you no longer need the object, just to get into the habit.)
  * Line 13 uses the TypeText method of the Selection object in Word to enter a string of text and the strSalesTotal string at the current selection.
Line 15 uses the TypeParagraph method to insert a paragraph after the text.

If you have trouble getting this example to work, double-check the following:

  * Choose Tools ⇒ References in the editor to ensure that the check box next to Microsoft Excel 15.0 Object Library is checked.
  * If you see an error message stating "Run-time error '432': File name or class name not found during Automation operation," it means that there's something wrong in this line of code:

    Set mySpreadsheet = _
    GetObject("C:\Temp\Book1.xlsx")

Either you've mistyped this path in your code (such as typing C:\Docs rather than C:\Temp) or you have not saved an Excel file named Book1.xlsx to this folder.

  * If you see an error message stating "Run-time error '1004': Method 'Range' of object '_Application' failed," this is an error in the following line of code:

    strSalesTotal = mySpreadsheet.Application.Range("SalesTotal").Value

A failure of this code means either you've got a typo in the code, such as specifying the wrong range name, or there is no range by the name SalesTotal in the Excel workbook you're opening.

### Transferring Information from a Word Document to an Excel Workbook

We managed to send data from Excel to Word in the previous section. Now let's go the other way.

This next procedure (Listing 30.2) runs as a macro in Word. The procedure requires that Excel be currently running, so the procedure checks for the possibility that Excel isn't executing and handles the problem itself by starting Excel if necessary. The procedure creates a new Excel workbook and then transfers information from Word to the workbook.

For this example to work, you must store a Word .docm file named test.docm in your C:\temp directory.

As before, you'll find creating this procedure easier if you first add to the current Word project a reference to the Excel object library. (See the previous section for instructions.)

**Listing 30.2**: Sending data from Word to Excel

    1. Sub Send_Word_Count_to_Excel_Spreadsheet()
    2.
    3. Dim WordCount As Variant
    4. Dim strPath As String
    5. Dim strFile As String
    6. Dim docCurDoc As Document
    7. Dim myXL As Excel.Application
    8. Dim myXLS As Excel.Workbook
    9. Const errExcelNotRunning = 429
    10. Const errDocNotAvailable = 5174
    11.
    12. On Error GoTo Handle
    13.
    14. ' open the Word document:
    15. strPath = "C:\temp"
    16. strFile = "test.docm"
    17. Set docCurDoc = Documents.Open(strPath & "\" _
    18. & strFile, AddToRecentFiles:=False)
    19.
    20.
    21. 'is Excel already running?
    22. Set myXL = GetObject(, "Excel.application")
    23.
    24. myXL.Visible = True
    25. Set myXLS = myXL.Workbooks.Add
    26. myXL.ActiveCell.Range("A1").Select
    27. myXL.ActiveCell = "Word Count"
    28.
    29. WordCount = docCurDoc _
    30. .BuiltInDocumentProperties(wdPropertyWords)
    31.
    32. myXL.ActiveCell.Range("A2").Select
    33. myXL.ActiveCell = WordCount
    34.
    35. docCurDoc.Close SaveChanges:=wdDoNotSaveChanges
    36.
    37. Shutdown:
    38. Set myXL = Nothing
    39. Set myXLS = Nothing
    40.
    41. Exit Sub
    42.
    43. Handle:
    44. If Err.Number = errExcelNotRunning Then
    45. 'If no instance of Excel is running, then run it:
    46. Set myXL = CreateObject("Excel.Application")
    47. Err.Clear
    48. Resume Next
    49. ElseIf Err.Number = errDocNotAvailable Then
    50. MsgBox "No Word Document named Test.docm Found"
    51. GoTo Shutdown
    52. Else
    53. Resume Next
    54. End If
    55.
    56. End Sub

Here's what happens in Listing 30.2:

  * Line 2 is a spacer.
In fact, all blank lines are just spacers—so I won't mention them again.

  * Line 3 declares the Variant variable that will hold the number of words in the Word document. Later, in line 33, this variable's value is assigned to an Excel cell. Line 4 declares the String variable strPath that will hold the file path to the Word document, and line 5 declares the String variable strFile that will hold the Word document's filename.
  * Line 6 declares the Document variable docCurDoc; it will point to the Word document when it is opened using the Open method of the Documents object. Line 7 declares an Excel.Application object variable myXL, and line 8 declares an Excel.Workbook object variable myXLS.
  * Line 9 declares the constant errExcelNotRunning, setting its value to 429. This error number indicates that the procedure attempted to manipulate Excel while no instance of Excel was currently executing. Line 10 declares the constant errDocNotAvailable, setting its value to 5174. This error number indicates that the Word document your procedure attempted to open could not be found.
  * Line 12 starts error handling for the procedure, directing execution to the code below the label Handle in the event of an error.
  * Line 17 opens the Word document specified by strPath, a backslash, and strFile, assigning the document object to the docCurDoc variable. If the document isn't available, an error occurs and execution is transferred to the error-handler code that starts in line 43. The resulting error number (5174) matches the constant defined in the procedure as errDocNotAvailable, so a message box informs the user that the Word document wasn't found. Then execution is transferred to the Shutdown label, where the two object variables are destroyed and the procedure is exited.
  * Line 22 can also potentially trigger an error condition. It attempts to assign a currently executing instance of Excel to the object variable myXL. If this attempt fails, execution is transferred to the Handle label. If Excel isn't running at this point, error 429 ("ActiveX component cannot create object") occurs, so line 44 in the error handler checks for this error by using the constant errExcelNotRunning. If it matches the error number, line 46 assigns to myXL a _new_ instance of Excel that it creates by using the CreateObject function. Line 47 then uses an Err.Clear statement to clear the error, and line 48 contains a Resume Next statement to make VBA resume execution at the statement following the one that caused the error.
  * One way or another, by the time line 24 is executed, myXL refers to a running instance of Excel. Line 24 sets the Visible property of myXL to True so that it appears onscreen.
  * Line 25 assigns to myXLS a new workbook created by using the Add method of the Workbooks object in myXL.
  * Line 26 positions the insertion pointer in the first cell.
  * Line 27 assigns to the active cell in myXL the text Word Count.
  * Line 29 assigns the document's word count to the variable WordCount. This value is accessed by using the wdPropertyWords property from the BuiltInDocumentProperties collection of docCurDoc.
  * Line 32 moves the insertion cursor down one row in Excel to cell A2, and line 33 displays the word count in that cell.
  * Finally, line 35 closes the Word document without saving any changes that may have been made to it while it was opened for inspection.
  * Line 41 contains an Exit Sub statement to exit the procedure at this point—to avoid permitting execution to continue down into the zone where the error-handling statements are. Using an Exit Sub like this is common when a procedure includes an error handler at the end.

### Placing a PowerPoint Slide in an Outlook Message

The next procedure shows how to communicate between PowerPoint and Outlook. This procedure, run from PowerPoint, returns the existing instance of Outlook or (if there is none) creates a new instance. The procedure then uses PowerPoint to send a message that gives details drawn from the presentation.

Listing 30.3 shows the procedure. There's one complication: Because PowerPoint doesn't have a central macro storage project like Word's Normal.dotm or Excel's Personal Macro Workbook, the code must be stored in an open presentation. This could be the presentation that is the subject of the email, but it is much more convenient to maintain a code-only presentation that you open at the beginning of all PowerPoint sessions that require the use of code. This becomes your own personal macro-storage system.

In any case, you need some slides from which to pick information that will be sent (and you also need to provide your email address), so follow these steps to set up the necessary preconditions for the upcoming example.

First, prepare the target PowerPoint project (the project that will contain the code that accesses Outlook and will contain the slides you're accessing):

1. Start PowerPoint. Click the Photo Albums link at the top of the sample templates (just below the search field). Click the Contemporary Photo Album presentation, then click the Create button to load it into PowerPoint.

2. Launch the VBA Editor by pressing Alt+F11.

3. In the VBA Editor, choose Insert ⇒ Module to open a code module where you can put this procedure.

4. Choose Tools ⇒ References to display the References dialog box.

5. Select the check box for the Microsoft Outlook 15.0 Object Library item.

6. Click OK to close the References dialog box.

Now enter the code from Listing 30.3 into the module you inserted in step 3. Be sure to replace my email address in line 23 with your email address.

**Listing 30.3**: Placing a PowerPoint Slide in an Outlook Message

    1. Sub Notify_of_New_Presentation()
    2.
    3. Dim myPresentation As Presentation
    4. Dim strPresentationFilename As String
    5. Dim strPresentationTitle As String
    6. Dim strPresentationPresenter As String
    7. Dim myOutlook As Outlook.Application
    8. Dim myMessage As Outlook.MailItem
    9. Const errOutlookNotRunning = 429
    10.
    11. On Error GoTo ErrorHandler
    12.
    13. Set myPresentation = ActivePresentation
    14. With myPresentation
    15. strPresentationFilename = .FullName
    16. strPresentationTitle = _
        .Slides(1).Shapes(3).TextFrame.TextRange.Text
    17. strPresentationPresenter = _
        .Slides(1).Shapes(1).TextFrame.TextRange.Text
    18. End With
    19.
    20. Set myOutlook = GetObject(, "Outlook.Application")
    21. Set myMessage = myOutlook.CreateItem(ItemType:=olMailItem)
    22. With myMessage
    ' replace the following line with your email address:
    23. **.To = "richard41@pri.r.com"**
    24.
    25. .Subject = "Presentation for review: " & strPresentationTitle
    26. .BodyFormat = olFormatHTML
    27. .Body = "Please review the following presentation:" & _
        vbCr & vbCr & "Title: " & strPresentationTitle & vbCr & _
        "Presenter: " & strPresentationPresenter & vbCr & vbCr & _
        "The presentation is in the file: " & _
        strPresentationFilename
    28. .Send
    29. End With
    30.
    31. myOutlook.Quit
    32.
    33. Set myMessage = Nothing
    34. Set myOutlook = Nothing
    35. Exit Sub
    36. ErrorHandler:
    37. If Err.Number = errOutlookNotRunning Then
    38. Set myOutlook = CreateObject("Outlook.Application")
    39. Err.Clear
    40. Resume Next
    41. Else
    42. MsgBox Err.Number & vbCr & Err.Description, vbOKOnly + _
        vbCritical, "An Error Has Occurred"
    43. End If
    44.
    45. End Sub

Here's what happens in Listing 30.3:

  * Line 3 declares a Presentation object variable named myPresentation. Line 4 declares a String variable named strPresentationFilename, which is used for storing the path and filename of the presentation. Line 5 declares a String variable named strPresentationTitle, which is used to store the title of the presentation. Line 6 declares a String variable named strPresentationPresenter, which is used to store the name of the presenter of the presentation.
  * Line 7 declares an Outlook.Application object variable named myOutlook that is used to represent the Outlook application. Line 8 declares an Outlook.MailItem object variable named myMessage that is used to represent the message that the procedure creates. Line 9 declares a constant named errOutlookNotRunning and assigns to it the number 429, the error number returned if no instance of Outlook is available when the GetObject function tries to access it.
  * Line 11 starts error handling for the procedure, directing execution to the label ErrorHandler (in line 36) in the event of an error.
  * Line 13 assigns the active presentation to the myPresentation object variable. Lines 14 through 18 contain a With structure that works with myPresentation. Line 15 assigns the FullName property of myPresentation to strPresentationFilename.
  * Line 16 assigns to strPresentationTitle the Text property of the TextRange object in the TextFrame object in the third Shape object on the first Slide object—in other words, the text from the first placeholder shape on the first slide in the presentation. Similarly, line 17 assigns to strPresentationPresenter the text from the first shape on the first slide.
  * Line 20 assigns to myOutlook the current instance of Outlook, which it returns using the GetObject function. If Outlook isn't running at this point, error 429 ("ActiveX component cannot create object") occurs, so line 37 in the error handler checks for this error by using the constant errOutlookNotRunning. If it matches, line 38 assigns to myOutlook a new instance of Outlook that it creates by using the CreateObject function. Line 39 then uses an Err.Clear statement to clear the error, and line 40 contains a Resume Next statement to cause VBA to jump back up in the code and resume execution where it left off (at the statement after the offending statement).
  * Line 21 uses the CreateItem method of the Outlook Application object (represented by myOutlook) to create a new mail item (a new email), which it assigns to myMessage. Lines 22 through 29 contain a With structure that works with myMessage.
  * Line 23 assigns recipients by setting the To property. (_You should change this line to your own email address so you can test this code and receive the message it sends._)
  * Line 24 is a placeholder.
  * Line 25 enters text for the Subject property. Line 26 specifies that the message use HTML formatting (.BodyFormat = olFormatHTML). Line 27 assigns text to the body of the message by using the Body property. Line 28 then uses the Send method to send the message.
  * Line 31 uses the Quit method to close myOutlook.
  * Line 33 sets myMessage to Nothing, releasing the memory it occupied. Similarly, line 34 sets myOutlook to Nothing. Line 35 then exits the procedure.
  * As discussed earlier in this list, the primary function of the error handler is to launch an instance of Outlook if none is currently running. If any error other than error 429 occurs, execution branches to the Else statement in line 41, and line 42 displays a message box that gives the error number and description.

If you test this example, be sure to remember to change line 23 from my email address to your email address. When the procedure finishes execution, look in your Inbox in Outlook for the new email message.

# Using the _Shell_ Function to Run an Application

Instead of using the CreateObject function to start an application and return a reference to it, you can use the Shell function to run an application. Shell can run any executable program, and its syntax is straightforward:

    Shell(pathname[,windowstyle])

Here, pathname is the file path and program name of the program you want the Shell command to execute. Also include in the pathname any necessary command-line switches or arguments required by that program.

This example opens Internet Explorer, maximizes its window, then switches the focus to it:

    Sub OpenIE()

        Dim id

        id = Shell("c:\program files\internet explorer\iexplore.exe", vbMaximizedFocus)

    End Sub

* * *

_Shell_ Launches Executable Programs

Shell expects an executable program as its pathname argument. Unlike double-clicking a file in Windows Explorer, Shell does not consult filename-extension associations, so passing a document such as testfile.txt typically raises a runtime error rather than opening Notepad. To open a document in its associated application, pass the document to the program explicitly (for example, Shell "notepad.exe testfile.txt") or launch it through the command processor. If Shell can't find the specified program, it returns a runtime error.

* * *

windowstyle is an optional integer Variant that you use to specify the type of window in which to run the application and whether to switch focus to the newly launched application. Table 30.1 lists the constants and values for windowstyle.

* * *

Using the _Sleep_ Function to Avoid Problems with Shell's Asynchrony

The Shell function runs other programs _asynchronously_ rather than _synchronously_. In other words, Shell doesn't halt all other activity until it is finished with its job. So when VBA executes a Shell statement, it registers the statement as an action to be performed—but that action may not necessarily be finished before the next statement in your code executes.

This asynchrony can cause errors in your procedures if subsequent commands depend on the Shell statement having already been executed. If you run into this type of problem, a crude but often-effective fix is to just allow extra time for the Shell function to execute before taking any dependent action.
For example, you might run the Shell function earlier in the procedure than you otherwise would have done rather than running it right before the dependent actions. But a better solution is to use an API call (such as Sleep) to delay the execution of further statements for a few seconds so that the Shell function can finish executing. Place this declaration in the declarations section at the top of the Code window: + + Public Declare Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As Long) + +Then call the Sleep function at the appropriate point in your code, specifying the number of milliseconds you want the code to wait. The following statement uses Sleep to implement a 2-second delay: + + Sleep (2000) + +* * * + +Table 30.1 Constants and values for the windowstyle argument + +**Constant** | **Value** | **Window Style** +---|---|--- +vbHide | 0 | Minimized and hidden, but with focus +vbNormalFocus | 1 | Normal ("restored") with focus +vbMinimizedFocus | 2 | Minimized with focus (the default) +vbMaximizedFocus | 3 | Maximized with focus +vbNormalNoFocus | 4 | Normal ("restored") without focus +vbMinimizedNoFocus | 6 | Minimized without focus + +# Using Data Objects to Store and Retrieve Information + +As you've seen so far in this book, you can store information in many places using VBA. But what you will find uniquely useful about the _data object_ is its ability to copy information to, and retrieve information from, the Clipboard. This chapter is all about ways to communicate between applications, and the Clipboard is one such way. + +A data object is logically attached to a UserForm object in the Microsoft Forms object model, but you can use a data object by itself with no user form displayed. This is similar to the way that you can create and manipulate a hidden Access database with no visible interface displayed to the user. (This phenomenon is described in the section titled "Opening Multiple Databases at Once" in Chapter 28.) + +A data object, which is represented in VBA by the DataObject object, is used to store data. Each data object can hold multiple pieces of text information, and each piece must be in a different, defined format. You can create and use multiple data objects to store multiple pieces of data in the same format, or you can cheat and tell VBA that information is in a different format when really it's not. + +At any given time, the Clipboard can contain one text item and one item in another format, such as a graphical object. If you copy another text item to the Clipboard, that item will overwrite the previous text item, but any graphical item on the Clipboard will remain unscathed. Likewise, if you copy a graphical item to the Clipboard, it will overwrite any previous graphical item (or indeed any item in a non-text format) stored in the Clipboard, but any text item in the Clipboard won't be affected. + +The data object works in a way similar to the Clipboard. However, a data object can't store graphical information. It _can_ store multiple pieces of text information, each _defined_ as being in a different format. + +## Creating a Data Object + +To create a data object, declare an object variable of the DataObject type and then use a Set statement to assign a new DataObject object to it. 
For example, the following statements declare a DataObject variable named myDObj and assign a new DataObject to it: + + Dim myDObj As DataObject + Set myDObj = New DataObject + +## Storing Information in a Data Object + +To store information in a data object, use the SetText method, which has the following syntax: + + _object_.SetText(StoreData [,format]) + +The components of the syntax are as follows: + + * _object_ is a required argument specifying a valid object. + * StoreData is a required argument specifying the data to store in the data object. + * format is an optional argument containing an Integer value or a String specifying the format of the information in StoreData. A value of 1 indicates text format; a value other than 1 or a String indicates a user-defined format. + +For example, the following statement stores the text Sample text string in the DataObject named myDObj: + + myDObj.SetText "Sample text string" + +The following statement stores the text Sample formatted text string in the DataObject named myDObj, defining and using the custom format myFormat: + + myDObj.SetText "Sample formatted text string", "myFormat" + +Once the custom format has been defined and stored in the data object, you can access the data stored in that format by specifying the format. In this case, no formatting is actually involved—the code simply uses the format argument to create and identify a different data slot in the data object so that the new string doesn't overwrite the existing text string. It's a trick. + +## Returning Information from a Data Object + +To return information from a data object, use the GetText method of the DataObject object. The GetText method has the following syntax: + + _object_.GetText([format]) + +The components of the syntax are as follows: + + * _object_ is a required argument specifying a valid object. + * format is an optional argument containing a String or an Integer specifying the format of the data to retrieve. + +For example, the following statement displays a message box containing the plain-text string stored in the DataObject named myDObj: + + MsgBox myDObj.GetText + +The following statement assigns to the String variable strTemp the text stored with the myFormat format in the DataObject named myDObj: + + strTemp = myDObj.GetText("myFormat") + +Here's a working code example that illustrates how to create a data object and then uses it to store and retrieve information. First, choose Tools ⇒ References in the editor to ensure that the check box next to Microsoft Forms 2.0 Object Library is checked. Note that it's likely this library will not be in its correct alphabetic location in the list of libraries in the References dialog box. Instead, it will probably be already checked and, thus, found in the first 10 or so libraries at the top of the References list. + +Type this working example into an application's VBA Editor, and press F5 to see it execute: + + Sub StoreText() + + Dim myDObj As DataObject + + Set myDObj = New DataObject + + myDObj.SetText "Sample text string" + + MsgBox myDObj.GetText + + End Sub + +## Assigning Information to the Clipboard + +To assign text to the Clipboard from a data object, use the PutInClipboard method of the DataObject. 
For example, the following procedure creates a new data object named myDO, assigns to it the text Nasta Louise Gomes, and then assigns that text to the Clipboard:
+
+ Sub StoreText()
+
+ Dim myDO As DataObject
+ Set myDO = New DataObject
+ myDO.SetText "Nasta Louise Gomes"
+ myDO.PutInClipboard
+
+ End Sub
+
+Test this by pressing F5, and then press Ctrl+V to display the Clipboard contents in the Editor, or Word, or some other text application.
+
+## Returning Information from the Clipboard to a Data Object
+
+To fetch whatever text information is in the Clipboard and store it in a data object, use the GetFromClipboard method of the DataObject object. The following example creates a data object referenced by the variable aDO, assigns to it the text from the Clipboard, and then displays the text:
+
+ Sub GetClipboardText()
+
+ Dim aDO As DataObject
+ Set aDO = New DataObject
+ aDO.GetFromClipboard
+
+ MsgBox aDO.GetText
+
+ End Sub
+
+To return formatted information from the Clipboard and store it in a data object, use the GetFromClipboard method as usual and then specify the format you want when you retrieve the text with GetText. You can first check whether a given format is present by using the GetFormat method of the DataObject object.
+
+## Finding Out Whether a Data Object Contains a Given Format
+
+To find out whether a data object contains a given format, use the GetFormat method of the DataObject object. The syntax for the GetFormat method is as follows:
+
+ _object_.GetFormat(format)
+
+Here are the components of the syntax:
+
+ * _object_ is a required expression that returns a valid DataObject object.
+ * format is an Integer or String specifying the format you're looking for. If the DataObject contains the format, GetFormat returns True; if not, GetFormat returns False.
+
+For example, the following statement checks to see if the DataObject named myDO contains the format myHTML and assigns the format's contents to the string strHTMLText if it does:
+
+ If myDO.GetFormat("myHTML") = True Then _
+ strHTMLText = myDO.GetText(Format:="myHTML")
+
+# Communicating via DDE
+
+If the application with which you want to communicate doesn't support Automation, you can try Dynamic Data Exchange (DDE). DDE is a protocol that establishes a channel between two applications through which they can automatically exchange data. DDE can be tricky to set up, but once you get it working, it is usually reliable.
+
+Not all applications support DDE. Among the Office applications, Word, Excel, and Access support DDE, but PowerPoint and Outlook do not. What's more, Microsoft warns that DDE is not a secure technology. So use it only in situations where you aren't vulnerable to outside intrusion.
+
+In the following descriptions of DDE statements, I'll use the term _method_ in its more generic, non-OOP sense. Back long, long ago when DDE was introduced (in Windows 3.0!), object-oriented programming wasn't yet fashionable.
+
+A typical DDE conversation can contain the following actions:
+
+ * Using the DDEInitiate method to start a DDE connection and establish the channel on which the connection operates
+ * Using the DDERequest method to return text from the other application or the DDEPoke method to send text to the other application
+ * Using the DDEExecute method to execute a command in the other application
+ * Using the DDETerminate method to close the current DDE channel or using the DDETerminateAll method to close all the DDE channels
+
+## Using _DDEInitiate_ to Start a DDE Connection
+
+To start a DDE connection, you use the DDEInitiate method.
The DDEInitiate method employs the following syntax:
+
+ _expression_.DDEInitiate(App, Topic)
+
+The components of the syntax are as follows:
+
+ * _expression_ is an optional expression specifying an Application object.
+ * App is a required String argument specifying the name of the application with which the DDE connection is to be started.
+ * Topic is a required String argument specifying the DDE topic (such as an open file) in the application. To discover the list of topics available for an application, you send a DDE request (via the DDERequest method, discussed in the next section) to the System object in the application.
+
+DDEInitiate returns the number of the DDE channel established. You then use this number for subsequent DDE calls.
+
+For example, the following statements declare the Long variable lngDDEChannel1 and assign to it a DDE channel established with the workbook Sales Results.xlsm in Excel:
+
+ Dim lngDDEChannel1 As Long
+ lngDDEChannel1 = DDEInitiate("Excel", "Sales Results.xlsm")
+
+## Using _DDERequest_ to Return Text from Another Application
+
+To return a string of text from another application, you use the DDERequest method. The DDERequest method has the following syntax:
+
+ _expression_.DDERequest(Channel, Item)
+
+The components of the syntax are as follows:
+
+ * _expression_ is an optional expression that returns an Application object.
+ * Channel is a required Long argument specifying the DDE channel to use for the request.
+ * Item is a required String argument specifying the item requested.
+
+To get the list of topics available via DDE, request the Topics item from the System topic. For example, the following statements establish a DDE channel to FrontPage (by using DDEInitiate) and return the list of DDE topics, assigning the list to the String variable strDDETopics:
+
+ Dim lngDDE1 As Long
+ Dim strDDETopics As String
+ lngDDE1 = DDEInitiate(App:="FrontPage", Topic:="System")
+ strDDETopics = DDERequest(Channel:=lngDDE1, Item:="Topics")
+
+Open Excel, click the File tab on the Ribbon, and then click the New option. Click Monthly Family Meal Planner in the display of templates. Then click the Create button.
+
+Now open Word's VBA Editor and type in the following procedure. The following statements establish a DDE channel to the Monthly family meal planner1 workbook in Excel and return the contents of cell D11 (R11C4) in the String variable strResult:
+
+ Sub DDEtoExcel()
+
+ Dim lngDDEChannel1 As Long, strResult As String
+ lngDDEChannel1 = DDEInitiate("Excel", "Monthly family meal planner1")
+ strResult = DDERequest(lngDDEChannel1, "R11C4")
+ MsgBox strResult
+ DDETerminateAll
+
+ End Sub
+
+When you press F5 to test this, you should see a message box displaying "Beef and Mushroom Skillet Supper," which sounds pretty nasty.
+
+For DDE to work, you have to use the correct, full name of the target document as it appears in the title bar of the application. In this case, your target document is an Excel workbook named _Monthly family meal planner1._
+
+The previous code works only if you haven't yet saved the Monthly family meal planner1 workbook because before it's saved, a new workbook has no filename extension appended to its name. However, if you _have_ already saved this workbook, you must append whatever filename extension you employed, such as .xlsm. Here's an example:
+
+ lngDDEChannel1 = DDEInitiate("Excel", "Monthly family meal planner1.xlsm")
+
+The DDETerminateAll statement is explained shortly.
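+
+If you aren't sure whether the target workbook has been saved yet, you can simply try both forms of the name. Here's a minimal sketch of that approach (it assumes Word as the host and reuses the workbook name from the preceding example; the procedure name is just illustrative):
+
+ Sub DDEInitiateEitherName()
+
+ Dim lngChannel As Long
+ Dim strResult As String
+
+ On Error Resume Next
+ ' Try the unsaved form of the name first
+ lngChannel = DDEInitiate("Excel", "Monthly family meal planner1")
+ If Err.Number <> 0 Then
+ ' That failed, so assume the workbook has been saved
+ ' and append the filename extension
+ Err.Clear
+ lngChannel = DDEInitiate("Excel", "Monthly family meal planner1.xlsm")
+ End If
+ On Error GoTo 0
+
+ strResult = DDERequest(lngChannel, "R11C4")
+ MsgBox strResult
+ DDETerminateAll
+
+ End Sub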
+
+## Using _DDEPoke_ to Send Text to Another Application
+
+To send text to another application, use the DDEPoke method. The DDEPoke method has the following syntax:
+
+ _expression_.DDEPoke(Channel, Item, Data)
+
+The components of the syntax are as follows:
+
+ * _expression_ is an optional expression that returns an Application object.
+ * Channel is a required Long argument specifying the DDE channel to use.
+ * Item is a required String argument specifying the item to which to send the data.
+ * Data is a required String argument specifying the data to be sent.
+
+Continuing to use the previous example, the following statements use the DDEPoke method to assign the data Potato Salad Surprise to cell R11C4 in the worksheet:
+
+ Sub DDEPokeExcel()
+
+ Dim lngDDEChannel1 As Long, strResult As String
+
+ lngDDEChannel1 = DDEInitiate("Excel", "Monthly family meal planner1")
+ strResult = DDERequest(lngDDEChannel1, "R11C4")
+
+ DDEPoke Channel:=lngDDEChannel1, Item:="R11C4", _
+ Data:="Potato Salad Surprise"
+ DDETerminateAll
+
+ End Sub
+
+Now look at the Excel worksheet and you'll see that "Beef and Mushroom Skillet Supper" has been replaced with the even more dubious-sounding "Potato Salad Surprise."
+
+## Using _DDEExecute_ to Have One Application Execute a Command in Another
+
+To execute a command in another application, use the DDEExecute method. The DDEExecute method has the following syntax:
+
+ _expression_.DDEExecute(Channel, Command)
+
+The components of the syntax are as follows:
+
+ * _expression_ is an optional expression that returns an Application object.
+ * Channel is a required Long argument specifying the DDE channel to use.
+ * Command is a required String argument specifying the command or series of commands to execute.
+
+For example, the following statements establish a DDE channel to Excel and issue a Close command to close the active workbook:
+
+ Sub DDEExec()
+
+ Dim lngMyChannel
+ lngMyChannel = DDEInitiate(App:="Excel", Topic:="System")
+ DDEExecute lngMyChannel, Command:="[Close]"
+
+ End Sub
+
+If the workbook you're closing has unsaved data, Excel will display a message box prompting you to save it—thus preventing it from closing until the prompt is satisfied.
+
+## Using _DDETerminate_ to Close a DDE Channel
+
+When you've finished a DDE communication, use the DDETerminate method to close the DDE channel you opened. The syntax for the DDETerminate method is as follows:
+
+ _expression_.DDETerminate(Channel)
+
+Here are the components of the syntax:
+
+ * _expression_ is an optional expression that returns an Application object.
+ * Channel is a required Long argument specifying the DDE channel to close.
+
+The following statements employ the previous example, closing the DDE channel that was opened:
+
+ Dim lngMyChannel
+ lngMyChannel = DDEInitiate(App:="Excel", Topic:="System")
+ DDEExecute lngMyChannel, Command:="[Close]"
+ DDETerminate lngMyChannel
+
+## Using _DDETerminateAll_ to Close All Open DDE Channels
+
+To close all open DDE channels, use the DDETerminateAll method:
+
+ DDETerminateAll
+
+Because VBA doesn't automatically close DDE channels when a procedure ends, it's a good idea to use a DDETerminateAll statement to make sure you haven't inadvertently left any DDE channels open.
+
+# Communicating via _SendKeys_
+
+The SendKeys statement is a basic and limited form of communication between applications. You may find SendKeys useful if neither Automation nor DDE works with the target application.
But SendKeys does have shortcomings, as you'll see momentarily.
+
+SendKeys transmits specified keystrokes to the destination application. It impersonates someone typing at the keyboard.
+
+For example, to use SendKeys to send the command to create a new file in Notepad, you send the keystrokes for Alt+F, N (to execute the File ⇒ New command), and Notepad reacts as if you had pressed the keys manually. In Office 2013 applications, Alt+F opens the File tab on the Ribbon.
+
+SendKeys works only with currently running Windows applications: You can't use SendKeys to start another application running (for that you need to use Shell, as discussed earlier in this chapter), nor can you use SendKeys to communicate with DOS applications running in a virtual DOS machine under Windows.
+
+The syntax for the SendKeys statement is as follows:
+
+ SendKeys string[, wait]
+
+Here, string is a required String expression specifying the keystrokes to be sent to the destination application. wait is an optional Boolean value specifying whether to wait after sending the keystrokes until the application has executed them (True) or to immediately return control to the procedure sending the keystrokes (False, the default setting). The True setting, however, can prevent some kinds of timing problems.
+
+Typically, string consists of a series of keystrokes (rather than a single keystroke). All alphanumeric characters that appear on the regular keyboard are represented by the characters themselves: To send the letter _H_ , you specify **H** in the string, and to send the word _Hello_ , you specify **Hello** in the string. To denote the movement (arrow) and editing keys, SendKeys uses keywords enclosed within braces ({}), as described in Table 30.2.
+
+Table 30.2 SendKeys keywords for movement and editing keys
+
+**Key** | **Code**
+---|---
+Down arrow | {DOWN}
+Left arrow | {LEFT}
+Right arrow | {RIGHT}
+Up arrow | {UP}
+Backspace | {BACKSPACE}, {BS}, or {BKSP}
+Break | {BREAK}
+Caps Lock | {CAPSLOCK}
+Delete | {DELETE} or {DEL}
+End | {END}
+Enter | {ENTER}
+Esc | {ESC}
+F1, F2, etc. | {F1}, {F2}, etc. (up to {F16})
+Help | {HELP}
+Home | {HOME}
+Insert | {INSERT} or {INS}
+NumLock | {NUMLOCK}
+Page Down | {PGDN}
+Page Up | {PGUP}
+Print Screen | {PRTSC}
+Scroll Lock | {SCROLLLOCK}
+Tab | {TAB}
+
+To send Shift, Control, and Alt, use the symbols shown in Table 30.3.
+
+Table 30.3 SendKeys symbols for meta keys
+
+**Key** | **Code**
+---|---
+Shift | +
+Ctrl | ^
+Alt | %
+
+SendKeys automatically combines the meta key with the keystroke that follows it, imitating, for example, pressing and holding the Alt key while simultaneously pressing S.
+
+In other words, to send a Ctrl+O keystroke combination, you would specify **^O** , and SendKeys imitates holding down Ctrl while pressing O. Then, the next keystroke after the _O_ is considered to be struck separately. If you need to assign multiple keystrokes to the meta key, enter the keystrokes in parentheses after the meta key. For example, to send Alt+F, I, I, you'd write **%(FII)** , not **%FII**.
+
+As you can see, SendKeys has special uses for the plus sign (+), caret (^), percent sign (%), and parentheses (). The tilde (~) gets special treatment as well. To use these characters to merely represent themselves instead of their special uses, enter them within braces: {+} sends a regular plus sign, {^} a regular caret, {%} a percent sign, {~} a tilde, and {()} parentheses.
Likewise, you must enclose brackets (which have a special meaning in DDE in some applications) within braces; braces themselves also go within braces.
+
+Using SendKeys is much less complex than these details initially make it appear—but with that reassurance, there's one more trick you should know: To repeat a key, enter the key and the number of repetitions in braces. For example, to send five up-arrow keystrokes, you'd specify {UP 5}; to send 10 zeroes, you'd specify {0 10}.
+
+Listing 30.4 shows an example of how to use SendKeys to send some text to Notepad after first starting it with the Shell command.
+
+* * *
+
+Warnings about SendKeys
+
+SendKeys is an old technology and it has two serious drawbacks. First, you can run into timing issues. SendKeys was created when computers ran far more slowly than they do today. For this reason, in some circumstances, executing the code in Listing 30.4 creates a problem when it displays the Save dialog box. Execution stops, failing to complete the file-saving. Or the filename is saved as _og file_ rather than _log file_. These are timing problems. The second drawback relates to testing your code. Because SendKeys needs to activate the target application, you can't step through your code (repeatedly pressing F8) in the VBA Editor—the editor grabs the focus back at the wrong point, becomes perplexed, and the keystrokes are dumped into the Editor rather than into Notepad, the intended target. Instead, you must run the procedure either from the VBA Editor (by pressing F5) or from the host application as a macro. Technically, this second behavior—absorbing keystrokes into the Editor rather than Notepad—is a result of what SendKeys is actually doing: It's pushing keystrokes into the key buffer. Then they pop back out wherever they can.
+
+* * *
+
+**Listing 30.4**: Automating Notepad with SendKeys
+
+ 1. Sub Send_to_Notepad()
+ 2. Dim strLogDate As String
+ 3. Dim strSaveLog As String
+ 4. Dim strMsg As String
+ 5. Dim appNotepad As Variant
+ 6. strMsg = "Sample log text here."
+ 7. strLogDate = Month(Now) & "-" & Day(Now) & "-" & Year(Now)
+ 8. strSaveLog = "Log file for " & strLogDate & ".txt"
+ 9. appNotepad = Shell("notepad.exe", vbNormalFocus)
+ 10. AppActivate appNotepad
+ 11. SendKeys strMsg & "%FS" & strSaveLog & "{Enter}" & "%{F4}", True
+ 12. End Sub
+
+Here's how the code works:
+
+ * The Send_to_Notepad procedure starts by declaring (in lines 2, 3, and 4) three String variables—strLogDate, strSaveLog, and strMsg—and (in line 5) one Variant variable, appNotepad.
+ * Line 6 then assigns to strMsg a sample string of text.
+ * Line 7 assigns to strLogDate a date built of the Month, Day, and Year values for Now (which returns the current date and time). For example, if the date is July 11, 2013, Month(Now) will return 7, Day(Now) will return 11, and Year(Now) will return 2013, so the strLogDate string will contain 7-11-2013.
+ * Line 8 then assigns to the strSaveLog string (which will be used to supply the filename for the log file) text describing the file, the strLogDate string, and the .txt filename extension (to continue our example, Log file for 7-11-2013.txt).
+ * In line 9, the procedure finally gets down to business, using the Shell statement to run Notepad in a "normal" (not maximized or minimized) window with focus and storing the task ID of the Notepad session in the variable appNotepad.
+ * Line 10 then uses an AppActivate statement to activate Notepad.
+ * Line 11 uses a SendKeys statement to send to Notepad the following:
+ * The information contained in the String variable strMsg.
+ * An Alt+F keystroke (to pull down the File menu), followed by an S keystroke to choose the Save item on the menu. This keystroke displays the Save As dialog box with the File Name text box selected.
+ * The strSaveLog String variable, which is entered in the File Name text box.
+ * An Enter keystroke to choose the Save button in the Save As dialog box.
+ * An Alt+F4 keystroke to quit Notepad.
+ * Line 12 ends the procedure.
+
+When you run this procedure (again, you need to run the procedure by pressing F5 rather than stepping into it with F8), you'll see the following:
+
+1. Notepad springs to life.
+
+2. The contents of the Msg string appear in the Notepad window.
+
+3. The Save As dialog box displays itself, enters the filename in the File Name text box, and then dismisses itself.
+
+4. Notepad closes. The .txt file is saved to the currently active folder on your hard drive.
+
+Because SendKeys was historically most often employed to open an application's menus and select an option from the menus (the way that Notepad still behaves), you might think that applications since Vista—which are largely menu-free and employ the Ribbon instead—would seriously curtail the flexibility of the SendKeys technique. However, this isn't true. Many of the features of the Ribbon, for example, are accessible via key combinations. Try pressing the sequence Alt, W, Q, 2, and the Enter key in Word; it will switch to the View tab on the Ribbon, select the Zoom option, and switch to a 200% zoom. The difference here is that instead of employing the traditional approach of simultaneously pressing the Alt key while pressing other keys (such as Alt+V to open a View menu), in current Windows operating systems you press and release Alt by itself, then you press the W key to switch to the View tab on the Ribbon. At this point, additional keystrokes are possible to activate the various options on the View tab. To exit from this mode, press Esc.
+
+Here's another code example, which illustrates how to manipulate Ribbon-based applications. This time Excel, not Notepad, is the target, and the Ribbon, not a menu, is manipulated. The code sends an Alt key by itself (this activates the shortcut key feature on the Ribbon and the Quick Access Toolbar as well, displaying a variety of keys you can choose from). Then the code switches to the View tab (a W does that), and finally full-screen mode is turned on by sending an E:
+
+ Sub Send_to_Excel()
+
+ Dim appExcel As Variant
+
+ appExcel = Shell("Excel.exe", vbNormalFocus)
+ AppActivate appExcel
+
+ SendKeys "%", True 'send Alt by itself
+ SendKeys "W", True 'W for the View tab
+ SendKeys "E", True 'E for full screen mode
+
+ End Sub
+
+# Going beyond VBA
+
+VBA is not limited to its own library of functions. In this chapter you've seen how to use the Editor's Tools ⇒ References feature to make Office applications' object libraries available to VBA's built-in capabilities. But wait. There's more.
+
+VBA can also access the entire Windows API (application programming interface). This isn't as simple as adding a library via Tools ⇒ References. And the necessary code is verbose.
But if you want very complete control over Windows internals (for example, to perfectly manage timing issues such as waiting for an outside application to complete its task, among other advanced techniques), the Windows API functions are up to such jobs (and plenty more besides).
+
+Windows API programming is beyond the scope of this book, but if you're interested, copy and paste the sample code from this MSDN web page:
+
+
+
+That sample code works fine in Word's or Access's VBA Editors. And the links provided on that web page are your doorways into further, deeper study of the topic. If, like me, you have major geek tendencies, it's great fun to wander around and experiment in an immense compendium like the API. You can make Windows do things you wouldn't believe.
+
+# The Bottom Line
+
+**Use Automation to transfer information.**
+
+Automation sets up communication between two applications, designating one of them as the _server_ and the other as the _client_.
+
+Master It
+
+Of the various ways to communicate between applications, which is generally the most effective?
+
+**Use the _Shell_ function to run an application.**
+
+Although the Shell function can prove useful in a variety of inter-application communication situations, Shell can also present the programmer with a timing problem.
+
+Master It
+
+Describe the timing issues that the Shell function raises, and describe a good solution to this problem.
+
+**Use data objects to store and retrieve information.**
+
+This book has described a variety of ways to store and retrieve information when working with the VBA language. Using data objects is one of these useful techniques.
+
+Master It
+
+How is the data-object technology special as a way of storing and retrieving information; what can a data object do that's unique?
+
+**Communicate via DDE.**
+
+Dynamic Data Exchange (DDE) is a technology introduced back in May 1990 with Windows 3.0. Use it if other, more efficient communication technologies are unavailable to the applications you are working with.
+
+Master It
+
+Not all applications support DDE. Which Office 2013 applications don't support DDE communication?
+
+**Communicate via _SendKeys_.**
+
+Using SendKeys is a fairly simple but rather awkward and limited way to communicate between applications. It imitates typing in keystrokes, thereby allowing you to manipulate an application by accessing some of its features using, for example, Alt+key combinations, such as Alt+F to open the File tab on the Ribbon.
+
+Master It
+
+SendKeys was historically most often employed to open an application's menus and select an option from the menus. Since Vista, Windows applications have largely done away with traditional menus, so is SendKeys of even more limited use now than in the past?
+Chapter 31
+
+Programming the Office 2013 Ribbon
+
+VBA programmers may want to customize the Office applications' Ribbons programmatically (via macro code as opposed to the user employing the Options dialog box). Perhaps your organization wants to hide certain features in Excel, add a step-through wizard to Word, create a Ribbon that is custom-designed for working with a particular presentation, add a special tab containing capabilities relevant to your business, or otherwise automate management of this major part of the user interface.
+
+Or you might want to create dynamic Ribbon effects, such as hiding, disabling, revealing, or modifying Ribbon elements—labels, groups, controls, or whatever—based on the user's behaviors in the application or on some other criterion.
+
+This chapter explores all aspects of Ribbon customization so you'll be able to fully exploit the Ribbon's capabilities programmatically.
+
+Note that the Ribbon can be programmatically modified in two ways: The most efficient approach is to create XML code and make it interact with VBA procedures. This chapter employs this technique and describes how to customize the Ribbon in Word, Excel, and PowerPoint. A second, more complex approach requires writing COM add-ins, a technique that is beyond the scope of this book.
+
+The Access Ribbon can't be modified in the same way that you modify the Ribbon in Word, Excel, and PowerPoint. Access requires a unique approach, including creating a specialized table to hold the XML code that modifies the Ribbon. Modifying the Access Ribbon is covered at the end of this chapter.
+
+The Ribbon's contents are described in the XML language, but you don't need to know how to write XML to manipulate the Ribbon. Throughout this chapter, you can just copy and paste XML code examples, making modifications to them to suit your needs.
+
+As you'll see shortly, there's also a handy utility you can download that helps you avoid several tedious steps when modifying the Ribbon and verifies that your XML statements are "well formed" (that they follow the rules of XML and thus should work).
+
+In this chapter you will learn to do the following:
+
+ * Understand what XML is
+ * Hide a group on the Ribbon
+ * Add a new, custom group
+ * Create callbacks for event handling
+ * Manipulate the Access Ribbon
+ * Debug Ribbon programming
+
+# What Is XML?
+
+XML means _extensible markup language_. It's a way to combine data with descriptions of that data.
+
+Think of a file cabinet holding various documents, each of which is stored in a folder with a label describing the meaning of its document: Telephone Bill, Boat Insurance, Bobby's Arrest, and so on.
+
+But XML takes this a step further, becoming more granular (more finely detailed) in its marking (labeling) of data. Each paragraph, sentence, or even individual words can also be contained within descriptive "tags," like this:
+
+ <trip>
+ <date>12,1,2013</date>
+ <location>Sao Paulo, Brazil</location>
+ </trip>
+
+ <trip>
+ <date>12,14,2013</date>
+ <location>Miami Airport</location>
+ </trip>
+
+ ...
+
+You get the idea: descriptive tags, then the data contained, followed by closing tags. For example, <location> is a tag presumably containing some kind of geographical data; </location> is a tag with a slash, meaning that this is the end of the information about location. Any opening tag must be paired with a closing tag, and they thus surround the data that they describe.
+
+XML is "extensible," meaning anybody can make up their own tags. XML is a way of storing information along with descriptions of the meaning of that information. You can think of it as similar to a record in a database.
+
+Contrast this with HTML (the markup language that underlies web pages), which describes how to _display_ information and contains standardized tags, such as <i> for italic, understood by all browsers.
+
+If you want to know more about XML, you'll find a good introductory tutorial here:
+
+
+
+# Hiding the Editing Group on the Word Ribbon
+
+To get an idea of how to modify the Ribbon, let's assume that you want to remove the Editing group in the Word Ribbon's Home tab.
This group has three options: Find, Replace, and Select. However, you decide that you just don't need to display these options because you always press Ctrl+F to open the Find dialog box and Ctrl+H to open the Replace dialog box, and you select by simply dragging the mouse. To you, this Editing group is useless, just wasting valuable space on the Ribbon.
+
+To hide the Editing group on the Ribbon, follow these steps:
+
+1. First you'll want to download a free utility that makes working with the Ribbon much easier. Go to
+
+ http://openxmldeveloper.org/blog/b/openxmldeveloper/archive/2010/08/10/23248.aspx
+
+and download, then install, the Office Custom UI Editor tool. This utility can be downloaded via a link on this web page named OfficeCustomUIEditorSetup.zip. (The file is just above the comments.) When you extract the contents of this zip file, you'll have an installer (.msi) file. Just double-click it to install the Custom UI Editor.
+
+2. Start Word.
+
+3. Press Alt, F, N (or click the File tab, then click the New option). Then click the blank document template.
+
+4. Press Alt, F, A and save the document as RibbonMod.docm to your Desktop (or some other location such as C:\temp where you can easily locate it).
+
+5. Press Alt, F, C to close this document. Closing the document is necessary because if it's still open when you attempt to store your XML code in it (by choosing File ⇒ Save in the Custom UI Editor for Microsoft Office), you'll get an error message.
+
+* * *
+
+Why You Should Use Macro-Enabled File Types
+
+Note that you could also save the document as the default .docx file type, but in this chapter you'll always use the macro-enabled .docm type (and the other "m" type, such as .xlsm for macro-enabled Excel files and .pptm for PowerPoint). These types of files can include macros, and in some of the examples in this chapter, you'll need to write procedures to handle events—triggered when the user clicks a control that you've added to the Ribbon.
+
+* * *
+
+6. Run the Custom UI Editor for Microsoft Office.
+
+7. Choose File ⇒ Open.
+
+8. Browse to the RibbonMod.docm file that you saved in step 4, and open it.
+
+9. In the right pane of the Custom UI Editor, type the following XML code:
+
+ <customUI xmlns="http://schemas.microsoft.com/office/2009/07/customui">
+ <ribbon>
+ <tabs>
+ <tab idMso="TabHome">
+ <group idMso="GroupEditing" visible="false" />
+ </tab>
+ </tabs>
+ </ribbon>
+ </customUI>
+
+Identifiers (idMso), images (imageMso), and other attributes in Ribbon XML code can have an Mso suffix. Mso is short for Microsoft Office, and when appended to an attribute it means _built_ - _in_. So, a tab with an idMso attribute is one of the tabs on the Ribbon by default. A tab with a plain id attribute is a new tab you've added to the Ribbon. Likewise, an imageMso is one of the set of built-in Office 2013 icons, but an image is an icon you created by importing a graphics file (see "Creating Your Own Icons" later in this chapter).
+
+* * *
+
+**Watch Out for Special Characters**
+
+XML will choke on special characters—it expects plain vanilla text with none of those slanted quotation marks (called "smart quotes") or other fancy formatting. You used to be able to paste code into Notepad, then copy it from Notepad and paste it into the VBA Editor or the Custom UI Editor. When text was dipped into Notepad like this, all special characters were stripped off. Slanted quotation marks (which are two distinct characters, open and close quotes) turned into a single, vertical quotation-mark character. This was quite a good way to wash text. No more.
Those at Microsoft who fiddle with good tools and make them less useful decided to justify their salaries by _not leaving Notepad alone_. After all, they're getting paid to do _something_ , so they get restless. Until the latest version, Notepad had been left alone, unchanged for decades.
+
+How do you get rid of characters like smart quotes (“ and ”) that XML (and the VBA Editor) cannot work with, replacing them with straight quotes (")? There are three ways, but #3 is the best:
+
+1. Hand-edit each bad character by selecting it, then pressing the " key. If you press this key in Notepad or a code editor, it will appear as the correct " simple quotation mark (no slant).
+
+2. If you're working with a large piece of code with many quotation marks, paste it into Notepad, then press Ctrl+H to open the Replace dialog box. Paste one of the bad, open-quote (“) slanted quotation-mark characters into the Find What field, then click the Replace With field and press the " key. (Notepad by default uses the straight-quotes character.) Note that you'll have to repeat this process with the close-quote (”) slanted quotation-mark character.
+
+3. What do we do when faced with a repetitive and tedious task? Anyone?
+
+Yes. Write a macro. Here's a macro that opens a new, blank Word document, pastes in the text that needs changing, then makes the necessary replacements:
+
+ 1. Sub StraightenQuotes()
+ 2. ' Changes smart quotes (slanted) to straight quotes
+ 3.
+ 4. On Error GoTo Problem
+ 5. Dim aDO As DataObject
+ 6. Set aDO = New DataObject
+ 7. aDO.GetFromClipboard
+ 8. aDO.GetText
+ 9.
+ 10. Dim bQuotesOn As Boolean
+ 11. bQuotesOn = Options.AutoFormatAsYouTypeReplaceQuotes
+ 12.
+ 13. Options.AutoFormatAsYouTypeReplaceQuotes = False
+ 14.
+ 15. Documents.Add Template:="Normal", NewTemplate:=False, DocumentType:=0
+ 16.
+ 17. Selection.Paste
+ 18.
+ 19. Selection.WholeStory
+ 20.
+ 21. Selection.Find.ClearFormatting
+ 22. Selection.Find.Replacement.ClearFormatting
+ 23.
+ 24. With Selection.Find
+ 25. .Text = ChrW(8221)
+ 26. .Replacement.Text = """"
+ 27. .Wrap = wdFindStop
+ 28. .Forward = True
+ 29. End With
+ 30. Selection.Find.Execute Replace:=wdReplaceAll
+ 31.
+ 32. Selection.Find.ClearFormatting
+ 33. Selection.Find.Replacement.ClearFormatting
+ 34.
+ 35. With Selection.Find
+ 36. .Text = ChrW(8220)
+ 37. .Replacement.Text = """"
+ 38. .Wrap = wdFindStop
+ 39. .Forward = True
+ 40. End With
+ 41. Selection.Find.Execute Replace:=wdReplaceAll
+ 42.
+ 43. Options.AutoFormatAsYouTypeReplaceQuotes = bQuotesOn
+ 44.
+ 45. Exit Sub
+ 46.
+ 47. Problem:
+ 48. MsgBox "There was a problem. Be sure that you have copied some text into the Clipboard before executing this macro."
+ 49.
+ 50. End Sub
+
+To test this, just copy some text (that contains the unwanted slanted quotation marks) into the Windows Clipboard (select the text, then press Ctrl+C). Then run the macro. Here's what the code does:
+
+ * Line 4 says that if something goes wrong, jump down to the label named Problem at the end of the procedure. The most likely problem is that the user has a graphic in the Clipboard (they pressed PrtScn, for example) rather than text.
+ * Lines 5–8 fetch the text from the Clipboard.
+ * Lines 10 and 11 save the user's setting for smart quotes so we can restore it at the end of the macro.
+ * Line 13 turns off Word's Smart Quotes feature so when in our code the slanted quotation marks are replaced by straight quotation marks, Word will permit this. Line 15 opens a new, blank document.
This is important because you might currently also be working on a second, ordinary text document where you want smart quotes.
+ * Line 17 pastes the text from the Clipboard into the blank document.
+ * Line 19 selects all the text.
+ * Lines 21 through 41 carry out the find and replace. Remember, this code must be executed twice, once for the open-quote and a second time for the close-quote characters.
+ * Line 43 restores the user's setting for the Smart Quotes option.
+ * Line 45 exits the procedure so we don't fall into the error handler after successfully running the procedure without error.
+ * Line 47's label identifies the error-handler code.
+ * Line 48 handles the error by reminding the user that there must be text in the Clipboard for this macro to work.
+
+Yes, I used Word's Macro Recorder to help me write this code. Having been programming in BASIC and writing books on it for 25 years, I'm almost freakishly proficient in the language. But I had only a vague idea what kind of code would turn off Word's Smart Quotes feature. So, I turned on the Macro Recorder, then went to File ⇒ Options in Word and turned off Smart Quotes. VBA created this code:
+
+ Options.AutoFormatAsYouTypeReplaceQuotes = False
+
+So I just copied the code into my macro. I used the same trick to get the code that opens a new document and does the finding and replacing. Unless you're Martha Stewart and can remember everything you've ever read or done, you'll need to rely on the Macro Recorder and online code samples to write macros of even moderate complexity.
+
+* * *
+
+10. In the Custom UI Editor, click the icon with the red check mark.
+
+This tool validates your XML code (a very handy feature).
+
+If you don't now see the message "Custom UI XML is well formed," you've made a typo in the XML code or included bad special characters. Retype it (or better yet, copy and paste it from this book's web page—see this book's introduction for information on copying code).
+
+If you see an error message stating that "”" is an unexpected token, you need to fix the quotation marks in the XML code to make them straight, not "smart," quotation marks, as described in the sidebar in this chapter titled "Watch Out for Special Characters."
+
+You should always validate your XML code because if there _is_ an error of some kind, your Ribbon customization simply won't happen. You will be given no error message or other warning when executing the customization itself. It just won't work.
+
+11. Choose File ⇒ Save (which saves your Word document), then File ⇒ Exit to close the UI Editor.
+
+12. Now, to see the effect, open the RibbonMod document by clicking the File tab on Word's Ribbon and then clicking Open. In the list of recent documents, choose RibbonMod.docm (or double-click that filename in Windows Explorer).
+
+If you entered the correct XML code, you'll see a Ribbon like the one on the bottom of Figure 31.1.
+
+Figure 31.1 Word's Ribbon with (top) and without its Editing group (bottom)
+
+The key lines in the XML code are these:
+
+ <tab idMso="TabHome">
+ <group idMso="GroupEditing" visible="false" />
+
+The line of code that begins with <group is an XML _element_ (immediately following the < is the name of the element).
+
+And just as objects in ordinary programming (such as VBA) have properties and methods, XML's elements have _attributes_. So, in the following example code, a button element is defined, and it has four attributes: label, size, onAction, and imageMso:
+
+ <button label="Press Me" size="large" onAction="MyMacro" imageMso="HappyFace" />
Of the four, three are analogous to properties (qualities), with onAction similar to an object's method (a behavior). But in XML, they are all simply referred to as attributes. + +
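+
+Before we move on, it's worth seeing how an onAction attribute connects to VBA. When the user clicks the button, Office looks for a VBA procedure whose name matches the onAction value and calls it, passing in the control that was clicked. Here's a minimal sketch of such a callback, using the standard button-callback signature (the procedure name MyMacro and the message text are merely placeholders):
+
+ ' Callback for a button whose XML specifies onAction="MyMacro"
+ ' Office passes in an IRibbonControl object representing the clicked control
+ Sub MyMacro(control As IRibbonControl)
+ MsgBox "You clicked the control whose id is " & control.ID
+ End Sub
+
+This procedure goes in an ordinary module in the macro-enabled document that carries the XML.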
+
+<SELECT statement A>
+
+EXCEPT
+
+<SELECT statement B>
+
+
+If you were paying attention when we reviewed EXISTS and NOT EXISTS, you can probably translate this to its NOT EXISTS equivalent, which would logically look something like this:
+
+<SELECT statement A>
+
+WHERE NOT EXISTS
+
+(SELECT 1
+
+FROM <Table B>
+
+WHERE <Table B column> = <Table A column> [,...])
+
+We'll see this in an example in the section following the INTERSECT syntax.
+
+INTERSECT
+
+The INTERSECT operator provides all data that matches on both sides of the INTERSECT. As with EXCEPT, the syntax is straightforward and works similarly to a UNION:
+
+
+<SELECT statement A>
+
+INTERSECT
+
+<SELECT statement B>
+
+
+Again, you can translate this to an EXISTS (this time without the NOT), which would logically look something like this:
+
+<SELECT statement A>
+
+WHERE EXISTS
+
+(SELECT 1
+
+FROM <Table B>
+
+WHERE <Table B column> = <Table A column> [,...])
+
+Now that we've seen the syntax for both EXCEPT and INTERSECT, let's move on to a set of examples that show them both in action and compare them to the versions based on the EXISTS operator.
+
+Comparing EXCEPT and INTERSECT with Their EXISTS and NOT EXISTS Equivalents
+
+As I indicated when discussing the basic concepts of EXCEPT and INTERSECT, both can, in terms of end result, be replicated via appropriate use of the EXISTS or NOT EXISTS operators. Let's run an example of each form, along with a simple UNION so we can see how similar the syntax is. We'll start by populating some small test data tables, then take a look at the UNION, then move on to the EXCEPT and INTERSECT operators with their EXISTS equivalents.
+
+SET NOCOUNT ON; -- Eliminate the row counts after each query to save space
+
+\-- Create our test tables and populate them with a few relevant rows
+
+CREATE TABLE UnionTest1
+
+(
+
+idcol int IDENTITY,
+
+col2 char(3)
+
+);
+
+CREATE TABLE UnionTest2
+
+(
+
+idcol int IDENTITY,
+
+col4 char(3)
+
+);
+
+INSERT INTO UnionTest1
+
+VALUES
+
+('AAA'),
+
+('BBB'),
+
+('CCC');
+
+INSERT INTO UnionTest2
+
+VALUES
+
+('CCC'),
+
+('DDD'),
+
+('EEE');
+
+PRINT 'Source and content of both tables:';
+
+PRINT '';
+
+SELECT 1 AS SourceTable, col2 AS Value
+
+FROM UnionTest1
+
+UNION ALL
+
+SELECT 2, col4
+
+FROM UnionTest2;
+
+PRINT 'Results with classic UNION';
+
+SELECT col2
+
+FROM UnionTest1
+
+UNION
+
+SELECT col4
+
+FROM UnionTest2;
+
+PRINT 'Results with EXCEPT';
+
+PRINT '--------------------------';
+
+SELECT col2
+
+FROM UnionTest1
+
+EXCEPT
+
+SELECT col4
+
+FROM UnionTest2;
+
+PRINT 'Equivalent of EXCEPT but using NOT EXISTS';
+
+PRINT '--------------------------';
+
+SELECT col2
+
+FROM UnionTest1 ut1
+
+WHERE NOT EXISTS
+
+(SELECT col4 FROM UnionTest2 WHERE col4 = ut1.col2);
+
+PRINT 'Results with INTERSECT';
+
+PRINT '--------------------------';
+
+SELECT col2
+
+FROM UnionTest1
+
+INTERSECT
+
+SELECT col4
+
+FROM UnionTest2;
+
+PRINT 'Equivalent of INTERSECT but using EXISTS';
+
+PRINT '--------------------------';
+
+SELECT col2
+
+FROM UnionTest1 ut1
+
+WHERE EXISTS
+
+(SELECT col4 FROM UnionTest2 WHERE col4 = ut1.col2);
+
+\-- Clean up after ourselves
+
+DROP TABLE UnionTest1;
+
+DROP TABLE UnionTest2;
+
+SET NOCOUNT OFF; -- Don't forget to turn this back to the default!
+
+Let's walk through the results of this a bit at a time—focusing on the points specific to EXCEPT and INTERSECT as well as their EXISTS-related equivalents.
+
+First, let's check out the results of the EXCEPT operator and its related NOT EXISTS version:
+
+Results with EXCEPT
+
+\--------------------------
+
+col2
+
+\----
+
+AAA
+
+BBB
+
+Equivalent of EXCEPT but using NOT EXISTS
+
+\--------------------------
+
+col2
+
+\----
+
+AAA
+
+BBB
+
+As you can see, the results were the same. It is, however, worth noting that the query plans were different. For example, on my system, the cost (you can find more on this in the chapter on Performance Tuning) of the EXCEPT was more than twice that of the NOT EXISTS approach. If you're in a performance-sensitive environment, you may want to test out both methods on a realistic set of data for your application, and see what you wind up with.
+
+We'll see this same theme of the EXISTS version performing better than the EXCEPT/INTERSECT equivalent as we look at INTERSECT.
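+
+If you want to verify the cost difference on your own system, you can ask SQL Server for the estimated plan instead of executing the queries. Here's a quick sketch (run it before the DROP TABLE cleanup shown above, since it reuses the test tables; note that a SHOWPLAN setting must be the only statement in its batch):
+
+\-- Display the estimated plan as text rather than executing the queries
+
+SET SHOWPLAN_TEXT ON;
+
+GO
+
+SELECT col2
+
+FROM UnionTest1
+
+EXCEPT
+
+SELECT col4
+
+FROM UnionTest2;
+
+SELECT col2
+
+FROM UnionTest1 ut1
+
+WHERE NOT EXISTS
+
+(SELECT col4 FROM UnionTest2 WHERE col4 = ut1.col2);
+
+GO
+
+SET SHOWPLAN_TEXT OFF;
+
+GO
+
+Comparing the relative costs in the graphical estimated plan (Ctrl+L in Management Studio) works just as well.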
As of this writing, every example I've seen personally or on the web yields a plan that is either more efficient with the EXISTS approach, or is identical; never have I seen the EXCEPT/INTERSECT approach perform better.
+
+Does this mean you shouldn't use EXCEPT and INTERSECT? Well, perhaps, but I don't believe things are quite that easy to decide. For example, in your development community, which reads more easily? Which is easier to understand? If the performance you're seeing is slower, but insignificantly so or "close enough," then you may be interested in using EXCEPT and INTERSECT because they make the desired result much clearer to someone who is reviewing the code later. EXISTS and NOT EXISTS are not that hard, but they have many more possible uses, so are slightly less intuitive; the right choice is often a matter of opinion.
+
+Now let's move on to the INTERSECT results:
+
+Results with INTERSECT
+
+\--------------------------
+
+col2
+
+\----
+
+CCC
+
+Equivalent of INTERSECT but using EXISTS
+
+\--------------------------
+
+col2
+
+\----
+
+CCC
+
+The results were, again, a match; we are able to replicate the functionality of the INTERSECT by using the EXISTS operator.
+
+Much like with EXCEPT, the EXISTS version performs much better (about 30% of the cost of the INTERSECT). The result will vary somewhat depending on the amount of data you're looking at. As I will so often say, "your mileage may vary," by which I mean, make sure you've tested the impact in your environment.
+
+In general, the EXISTS approach will perform at least as well as the EXCEPT/INTERSECT approach. The latter is, however, somewhat more readable. Take your specific situation into account when choosing between the two.
+
+Common Table Expressions (CTEs)
+
+Common Table Expressions (CTEs) were first introduced back in SQL Server 2005. They provide a means to refer to a temporary result set by name, and thus utilize it as a table (albeit both temporary and virtual in nature). Perhaps the coolest thing about them is that you define them before actually using them, so you can avoid separate physical steps storing and re-referencing the table (as you would do with a temporary table—or even a table variable). This can have very favorable performance impacts since SQL Server can plan the work between the CTE and the queries that utilize it as part of one logical operation rather than as a series of separate activities. In their simplest form, CTEs are similar to views created on the fly, but a CTE can also enable other things that you can't really do with a view (for example, see the following section on recursive queries).
+
+The basic syntax for a CTE utilizes the WITH keyword followed by a name and definition:
+
+WITH <expression name> [ ( <column name> [,...n] ) ]
+
+AS
+
+( <CTE query definition> )
+
+<statement that uses the CTE>
+
+After the CTE is defined, you can refer to it by name just as if it were a table.
+
+Note that while a CTE can nest, and a CTE can refer to a parent CTE, you cannot have completely independent CTEs at the same time, nor can you reference forward in your nested CTEs. Indeed, whatever statement is going to use the CTE must immediately follow the CTE declaration.
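+
+Here's a minimal sketch of that rule in action (the CTE name and column are just illustrative):
+
+\-- Legal: the statement that uses the CTE immediately follows it
+
+WITH Nums (n)
+
+AS
+
+( SELECT 1 UNION ALL SELECT 2 )
+
+SELECT n
+
+FROM Nums;
+
+\-- Illegal: putting any other statement (even a PRINT) between the CTE
+
+\-- declaration and the query that uses it ends the CTE's scope, so a
+
+\-- later reference to Nums would fail to resolve.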
+
+So, as an example of CTE use, we could replace part of our earlier derived table with a CTE reference:
+
+USE AdventureWorks2008;
+
+WITH pumps (BusinessEntityID)
+
+AS
+
+(
+
+SELECT sc.PersonID AS BusinessEntityID
+
+FROM Sales.Customer sc
+
+JOIN Sales.SalesOrderHeader AS soh
+
+ON sc.CustomerID = soh.CustomerID
+
+JOIN Sales.SalesOrderDetail AS sod
+
+ON soh.SalesOrderID = sod.SalesOrderID
+
+JOIN Production.Product AS p
+
+ON sod.ProductID = p.ProductID
+
+WHERE p.Name = 'Minipump'
+
+)
+
+SELECT DISTINCT pp.FirstName, pp.LastName
+
+FROM Person.Person AS pp
+
+JOIN pumps
+
+ON pp.BusinessEntityID = pumps.BusinessEntityID
+
+JOIN ( SELECT sc.PersonID AS BusinessEntityID
+
+FROM Sales.Customer sc
+
+JOIN Sales.SalesOrderHeader AS soh
+
+ON sc.CustomerID = soh.CustomerID
+
+JOIN Sales.SalesOrderDetail AS sod
+
+ON soh.SalesOrderID = sod.SalesOrderID
+
+JOIN Production.Product AS p
+
+ON sod.ProductID = p.ProductID
+
+WHERE p.Name = 'AWC Logo Cap') caps
+
+ON pp.BusinessEntityID = caps.BusinessEntityID;
+
+Notice that I was able to cut the first derived table out entirely and replace it with the CTE reference. I cannot, however, also replace the caps derived table, as I can only make one CTE reference at a time. I can replace pumps, or I can replace caps, but not both.
+
+It's worth noting that certain constructs cannot be used within a CTE. These include:
+
+ * COMPUTE and COMPUTE BY
+ * ORDER BY
+ * INTO
+ * The FOR XML, FOR BROWSE, and OPTION query clauses
+
+CTEs may seem a bit worthless at first given all these restrictions, but they show their power as we begin to work with recursive queries (which are effectively impossible without CTEs). Having said that, let's move right into looking at those....
+
+Recursive Queries
+
+Historically, one of the more tricky things to deal with in a relational system has been hierarchical data. Microsoft has done much in the last two releases to ease the pain in this area. One of the pieces of functionality that is very powerful is the notion of a recursive query. A query or piece of code is considered to be recursive when it calls itself either directly or indirectly. We have long had the ability to have recursive stored procedures and functions, but the notion of a recursive query didn't become available until SQL Server 2005.
+
+Prior to the native hierarchical data type that is new with this release (we'll examine the new HierarchyID data type extensively in Chapter 7), most hierarchical data was stored in what is called a unary relationship—that is, a table that has a relationship where both the parent and the child columns are in the same table. A need for recursion is best seen in such unary relationships where the hierarchical data represented is "ragged" in structure. That is, the depth of each branch of the tree may vary, so you need to recurse until you find the bottom of the hierarchical structure—however deep that may be. Recursive queries make that possible.
+
+Recursive queries are made possible by using a properly constructed CTE. A recursive CTE needs to have at least two major parts: a foundation or "anchor" member, and a recursive member. The anchor member establishes the foundation to which the rest of the query data can be added. The recursive member handles the repetitive calls and provides the recursion check.
+
+As an example, let's look at a very typical ragged hierarchy—employee reporting chains.
To take a look at this, we'll create a version of the AdventureWorks2008 Employees table where the reporting structure is represented in the older schema style (the 2008 version of AdventureWorks uses the newer HierarchyID data type). We'll generate this using data from the existing Employees table, so our data will easily match that used elsewhere in the AdventureWorks2008 database. + +CREATE TABLE HumanResources.Employee2 + +( + +BusinessEntityID int NOT NULL PRIMARY KEY, + +ManagerID int NULL, + +JobTitle nvarchar(50) NULL + +); + +INSERT INTO HumanResources.Employee2 + +SELECT hre.BusinessEntityID, + +(SELECT BusinessEntityID + +FROM HumanResources.Employee hre2 + +WHERE hre.OrganizationNode.GetAncestor(1) = hre2.OrganizationNode + +) AS ManagerID, + +JobTitle + +FROM HumanResources.Employee hre; + +This should get 290 rows into a new table called HumanResources.Employee2, which we'll use for the remainder of our CTE examples. + +So, now that we have your typical mix where a few employees (your basic "C" level staff) report to the CEO, and then managers report to those executives, supervisors report to the managers, and so on, we're ready to begin. The exact depth of the managerial chain varies by individual department and group. We can use a recursive query to crawl that chain for us. + +First, we need to build the root—or "anchor"—of the hierarchy. In this case, that would obviously be the CEO (no one is higher than he is!), but the way we'll format it is to grab any record where the employee has no one that they report to: + +\-- Establish the "Anchor Member" + +\-- This essentially defines the top node of the + +\-- recursion hierarchy + +SELECT hre.ManagerID, + +hre.BusinessEntityID, + +hre.JobTitle, + +hredh.DepartmentID, + +0 AS Level + +FROM HumanResources.Employee2 AS hre + +JOIN HumanResources.EmployeeDepartmentHistory AS hredh + +ON hre.BusinessEntityID = hredh.BusinessEntityID + +AND hredh.EndDate IS NULL -- Current employees only! + +WHERE hre.ManagerID IS NULL; + +Now, we need to add to that all the various employees that report to this root node, and then recurse down the tree until we get to the bottom. We'll UNION these results to those we just got for the root: + +UNION ALL + +\-- Define the piece that actually recurses + +SELECT hre.ManagerID, + +hre.BusinessEntityID, + +hre.JobTitle, + +hredh.DepartmentID, + +r.Level + 1 + +FROM HumanResources.Employee2 AS hre + +JOIN HumanResources.EmployeeDepartmentHistory AS hredh + +ON hre.BusinessEntityID = hredh.BusinessEntityID + +AND hredh.EndDate IS NULL -- Current employees only! + +JOIN Reports AS r + +ON hre.ManagerID = r.BusinessEntityID + +Now, let's put that all together, and then create a statement to make use of our CTE. I can add a WHERE clause to the calling statement, so I can filter my data down to just the groups, departments, or positions I want the reporting information on—for example: + +USE AdventureWorks2008; + +GO + +\-- Establish the CTE foundation for the recursion + +WITH Reports (ManagerID, BusinessEntityID, JobTitle, DepartmentID, Level) + +AS + +( + +\-- Establish the "Anchor Member" + +\-- This essentially defines the top node of the + +\-- recursion hierarchy + +SELECT hre.ManagerID, + +hre.BusinessEntityID, + +hre.JobTitle, + +hredh.DepartmentID, + +0 AS Level + +FROM HumanResources.Employee2 AS hre + +JOIN HumanResources.EmployeeDepartmentHistory AS hredh + +ON hre.BusinessEntityID = hredh.BusinessEntityID + +AND hredh.EndDate IS NULL -- Current employees only! 
WHERE hre.ManagerID IS NULL

UNION ALL

-- Define the piece that actually recurses
SELECT hre.ManagerID,
hre.BusinessEntityID,
hre.JobTitle,
hredh.DepartmentID,
r.Level + 1
FROM HumanResources.Employee2 AS hre
JOIN HumanResources.EmployeeDepartmentHistory AS hredh
ON hre.BusinessEntityID = hredh.BusinessEntityID
AND hredh.EndDate IS NULL -- Current employees only!
JOIN Reports AS r
ON hre.ManagerID = r.BusinessEntityID
)
-- Code to get it all started.
SELECT ManagerID, BusinessEntityID, JobTitle, Level
FROM Reports r
JOIN HumanResources.Department AS dp
ON r.DepartmentID = dp.DepartmentID
WHERE dp.GroupName LIKE '%Admin%'
ORDER BY Level, ManagerID, JobTitle;
GO

Note that the CTE is not controlling what group names are returned; instead, that is being driven from the calling query. The WHERE clause is, however, merged into the plan prior to execution, and therefore the query will be optimized differently depending on the specific makeup of the calling query.

Let's take a look at the results:

ManagerID   BusinessEntityID JobTitle                             Level
----------- ---------------- ------------------------------------ -----------
NULL        1                Chief Executive Officer              0
1           234              Chief Financial Officer              1
1           263              Information Services Manager         1
25          227              Facilities Manager                   2
...
...
264         266              Network Administrator                3
228         229              Janitor                              4
228         230              Janitor                              4
228         231              Janitor                              4
228         232              Janitor                              4

(35 row(s) affected)

"What is the level?" you may ask. It is something that I've inserted arbitrarily here to give you a feel for the depth each row has relative to the overall hierarchy. We could just as easily have left it out.

The key thing to understand here is that recursive queries are now not only possible, but also relatively easy. The trick is to understand your root node and how to build off of that anchor.

MERGE

In previous versions of SQL Server, when you heard the word "merge" you generally thought of merge replication. With SQL Server 2008, however, we have a whole new way of thinking about the word merge and, more importantly, of thinking about DML statements.

With MERGE, we have the prospect of combining multiple DML action statements (INSERT, UPDATE, DELETE) into one overall action, improving performance (they can share many of the same physical operations) and simplifying transactions. MERGE makes use of a special USING clause that winds up working somewhat like a CTE. The result set in the USING clause can then be used to conditionally apply your INSERT, UPDATE, and DELETE statements. The basic syntax looks something like this:

MERGE <target table> [AS <alias>]
USING
(
<source query>
) AS <source alias>
ON <condition for matching source rows to target rows>
WHEN [NOT] MATCHED THEN
<DML statement>
[WHEN [NOT] MATCHED THEN
<DML statement> [...n]];

Let's use the example of receiving a shipment for inventory. We'll assume that we're keeping a special rollup table of our sales for reporting purposes. We want to run a query daily that will add any new sales to our monthly rollup. On the first night of the month, this is pretty much a no-brainer, since there are no other rollup records for the month; any sales for the day are just rolled up and inserted. On the second day, however, we have a different scenario: We need to roll up and insert new records as we did the first day, but we need to just update existing records (for products that have already sold that month).

Let's take a look at how MERGE can manage both actions in one overall step.
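To make the shape of that syntax concrete before we work through the real example, here is a minimal hedged sketch of the pattern; the dbo.SomeTarget and dbo.SomeSource tables and their columns are hypothetical stand-ins, not AdventureWorks objects:

MERGE dbo.SomeTarget AS t
USING
(
SELECT ID, Amount
FROM dbo.SomeSource
) AS s
ON (s.ID = t.ID)
WHEN MATCHED THEN -- the row already exists in the target, so accumulate
UPDATE SET t.Amount = t.Amount + s.Amount
WHEN NOT MATCHED THEN -- no such row yet, so create one
INSERT (ID, Amount)
VALUES (s.ID, s.Amount);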
Before we get going on this, however, we need to create our rollup table:

USE AdventureWorks2008;

CREATE TABLE Sales.MonthlyRollup
(
Year smallint NOT NULL,
Month tinyint NOT NULL,
ProductID int NOT NULL
FOREIGN KEY
REFERENCES Production.Product(ProductID),
QtySold int NOT NULL,
CONSTRAINT PKYearMonthProductID
PRIMARY KEY
(Year, Month, ProductID)
);

This is a pretty simple example of a monthly rollup table—making it very easy to get sales totals by product for a given year and month. To make use of this, however, we need to regularly populate it with rolled-up values from our detail table. To do this, we'll use MERGE.

First, we need to establish a result set that will figure out what rows we need to be sourcing our rollup data from. For purposes of this example, we'll focus on August of 2003, and start with our query for the first day of the month:

SELECT soh.OrderDate, sod.ProductID, SUM(sod.OrderQty) AS QtySold
FROM Sales.SalesOrderHeader soh
JOIN Sales.SalesOrderDetail sod
ON soh.SalesOrderID = sod.SalesOrderID
WHERE soh.OrderDate >= '2003-08-01'
AND soh.OrderDate < '2003-08-02'
GROUP BY soh.OrderDate, sod.ProductID;

This gets us the total sales, by ProductID, for every date in our range (our range just happens to be limited to one day).

There is a bit of a trap built into how we've done this up to this point. I've set the GROUP BY to use the OrderDate, but OrderDate is a datetime data type as opposed to a date data type. If our orders were to start coming in with actual times on them, it would mess with our assumption that all orders will group nicely into one date. If this were a production environment, we would want to cast the OrderDate to a date data type or use DATEPART to ensure that the grouping was by day rather than by time.

With this, we're ready to build our merge:

MERGE Sales.MonthlyRollup AS smr
USING
(
SELECT soh.OrderDate, sod.ProductID, SUM(sod.OrderQty) AS QtySold
FROM Sales.SalesOrderHeader soh
JOIN Sales.SalesOrderDetail sod
ON soh.SalesOrderID = sod.SalesOrderID
WHERE soh.OrderDate >= '2003-08-01' AND soh.OrderDate < '2003-08-02'
GROUP BY soh.OrderDate, sod.ProductID
) AS s
ON (s.ProductID = smr.ProductID)
WHEN MATCHED THEN
UPDATE SET smr.QtySold = smr.QtySold + s.QtySold
WHEN NOT MATCHED THEN
INSERT (Year, Month, ProductID, QtySold)
VALUES (DATEPART(yy, s.OrderDate),
DATEPART(m, s.OrderDate),
s.ProductID,
s.QtySold);

Note that the semicolon is required at the end of the MERGE statement. While the semicolon remains optional on most SQL statements for backward compatibility reasons, you'll find it working its way into more and more statements as a required delimiter of the end of the statement; this is particularly true for multipart statements such as MERGE.

When you run this, you should get 192 rows affected, assuming you haven't been altering the data in AdventureWorks2008. Since our Sales.MonthlyRollup table was empty, there wouldn't have been any matches, so all rows were inserted. We can verify that by querying our Sales.MonthlyRollup table:

SELECT *
FROM Sales.MonthlyRollup;

This gets us back the expected 192 rows:

Year   Month ProductID   QtySold
------ ----- ----------- -----------
2003   8     707         242
2003   8     708         281
2003   8     711         302
...
...
2003   8     997         43
2003   8     998         138
2003   8     999         103

(192 row(s) affected)

Every row that was in the basic SELECT that powered our MERGE wound up being inserted into our table. Let's move on, however, to the second day of the month:

MERGE Sales.MonthlyRollup AS smr
USING
(
SELECT soh.OrderDate, sod.ProductID, SUM(sod.OrderQty) AS QtySold
FROM Sales.SalesOrderHeader soh
JOIN Sales.SalesOrderDetail sod
ON soh.SalesOrderID = sod.SalesOrderID
WHERE soh.OrderDate >= '2003-08-02' AND soh.OrderDate < '2003-08-03'
GROUP BY soh.OrderDate, sod.ProductID
) AS s
ON (s.ProductID = smr.ProductID)
WHEN MATCHED THEN
UPDATE SET smr.QtySold = smr.QtySold + s.QtySold
WHEN NOT MATCHED THEN
INSERT (Year, Month, ProductID, QtySold)
VALUES (DATEPART(yy, s.OrderDate),
DATEPART(m, s.OrderDate),
s.ProductID,
s.QtySold);

We update the date range (simulating running it on the second day of the month), and running it should get us 38 rows:

(38 row(s) affected)

But something is different this time; we already had rows in the table that our new batch of sales may have matched up with. We know we affected 38 rows, but how did we affect them? Re-run the SELECT on our table:

SELECT *
FROM Sales.MonthlyRollup;

Instead of 230 rows (the 192 plus the 38), we get only 194 rows. Indeed, 36 of our 38 rows were repeat sales and were therefore treated as updates rather than insertions. Two rows (ProductIDs 882 and 928) were sales of products that had not previously sold in that month, and thus needed to be inserted as new rows—one pass over the data, but the equivalent of two statements ran.

We could perform similar actions that decide to delete rows based on matched or not matched conditions.

Using External Calls to Perform Complex Actions

We have always had the need, on occasion, to get information that is sourced outside of SQL Server. For the vast, vast majority of installations, actually getting that information from within SQL Server was out of reach. Instead, there was typically a client or middle-tier component that sorted out what was needed from SQL Server and what was needed from the external source.

In many ways, this was just fine—after all, having your database server hung up waiting on an external call seems risky at best, and deadly at worst. Who knows how long it will be before that call returns (if ever)? The risk of hung processes within your database server winds up being fairly high.

Now, I said for the majority of installations, and that implies that a few got around it—and they did. There were a few different methods available.

First, there was the idea of an extended stored procedure. These are DLLs that you can create in C using special SQL Server libraries. They run in process with SQL Server and can be (assuming you have a smart DLL writer) very fast, save for one problem—an external call. That means that we are beholden to the external process we are calling to return to us in a timely fashion. The additional issue was one of general safety. Since you're running in process to SQL Server, if your DLL crashes, then SQL Server is going to crash (if you're distributing software, I'm sure you can guess how your customers would react if your product was taking down their SQL Server installation). Last, but not least, very few had the knack for figuring out how to get these written.

Another solution was added to SQL Server in the OLE/COM era.
The sp_OACreate family of system stored procedures (sp_OACreate, sp_OAMethod, sp_OAGetProperty, and so on) allowed you to instantiate a COM object and make calls to it. These passed data back and forth using variants, and were always run out of process. They were safer, but they were clumsy at best and painfully slow.

With the advent of .NET and SQL Server becoming CLR language aware, we live in a new world. You can write your code using any .NET language and can instantiate the objects you need to get the job done. You can create user-defined functions to call external processes—such as cross-communicating with some other online system that you cannot directly link to. Imagine, for a moment, allowing SQL Server to apply information gleaned from a Web service and merge that data into the end query. Heady stuff.

The possibilities are endless; however, you need to keep your head about this. External calls are still external calls! Any time you rely on something external to your system, you are at the mercy of that external system. Be very, very careful with such calls.

External calls should be considered an extreme measure. You are taking risks in terms of security (what is the risk of someone spoofing your external source?) and also taking an extreme performance risk. Tread lightly in this area.

Performance Considerations

We've already touched on some of the macro-level "what's the best thing to do" stuff as we've gone through the chapter, but, like most things in life, it's not as easy as all that. What I want to do here is provide something of a quick reference for performance issues for your queries. I'll try to steer you toward the right kind of query for the right kind of situation.

Yes, it's time again, folks, for one of my now famous soapbox diatribes. At issue this time is the concept of blanket use of blanket rules.

What I'm going to be talking about in this section is the way that things usually work. The word usually is extremely operative here. There are very few rules in SQL that will be true 100 percent of the time. In a world full of exceptions, SQL has to be at the pinnacle of that—exceptions are a dime a dozen when you try to describe the performance world in SQL Server.

In short, you need to gauge just how important the performance of a given query is. If performance is critical, then don't take these rules too seriously—instead, use them as a starting point, and then TEST, TEST, TEST!!!

JOINs vs. Subqueries vs. ?

Deciding between joins and subqueries (and, for that matter, other options) is that area I mentioned earlier in the chapter that I had a heated debate with a coworker over. And, as you might expect when two people have such conviction in their point of view, both of us were correct up to a point (and, it follows, wrong up to a point).

The long-standing, traditional viewpoint about subqueries has always been that you are much better off using joins instead if you can. This is absolutely correct—sometimes. In reality, it depends on a large number of factors. The following table discusses some of the issues that the performance balance will depend on, and which side of the equation they favor.

Situation | Favors
---|---
The value returned from a subquery is going to be the same for every row in the outer query. | Prequery. Declaring a variable and then selecting the needed value into that variable will allow the would-be subquery to be executed just once rather than once for every record in the outer table. The Optimizer in SQL Server is actually pretty smart about this and will do the prequery for you if it detects the scenario, but do not rely on it. If you know this is the scenario, perform your own prequery just to be sure.
Both tables are relatively small (say 10,000 records or less). | Subqueries. I don't know the exact reasons, but I've run several tests on this, and it held up pretty much every time. I suspect that the issue is the lower overhead of a lookup vs. a join when all the lookup data fits on just a data page or two. The Optimizer continues to get smarter about this with every release, so you may find some scenarios where the two options return exactly the same query plan.
The match, after considering all criteria, is going to return only one value. | Subqueries. Again, there is much less overhead in going and finding just one record and substituting it than in having to join the entire table.
The match, after considering all criteria, is going to return only a relatively few values, and there is no index on the lookup column. | Subqueries. A single lookup or even a few lookups will usually take less overhead than a hash join.
The lookup table is relatively small, but the base table is large. | Nested subqueries, if applicable; joins over correlated subqueries. With a nested subquery, the lookup will happen only once and has relatively low overhead. With a correlated subquery, however, you will be cycling the lookup many times—in this case, the join would be a better choice in most cases.
Correlated subquery vs. join | Join. Internally, a correlated subquery is going to create a nested-loop situation. This can create quite a bit of overhead. It is substantially faster than cursors in most instances, but slower than other options that might be available.
Derived tables vs. whatever | Derived tables typically carry a fair amount of overhead, so proceed with caution. The thing to remember is that they are run (derived, if you will) once, and then they are in memory, so most of the overhead is in the initial creation and the lack of indexes in larger result sets. They can be fast or slow—it just depends. Think before coding on these.
EXISTS vs. whatever | EXISTS. It does not have to deal with multiple lookups for the same match—once it finds one match for that particular row, it is free to move on to the next lookup—this can seriously cut down on overhead.
Use of a CTE | A CTE is merged into the query plan of the calling query. In general, this means that a basic CTE will have no significant effect on the end performance.
MERGE vs. Multiple Statements | MERGE allows the separate action statements to be accomplished in the same pass over the data, utilizing the same locks where applicable. The result will generally be improved performance. Keep in mind, however, that, for many users, it may make for code that is more difficult to read.

These are just the highlights. The possibilities of different mixes and additional situations are positively endless.

I can't stress enough how important it is, when in doubt—heck, even when you're not in doubt but performance is everything—to make reasonable tests of competing solutions to the problem. By reasonable, I mean that your tests should cover most of the typical scenarios in which your users will execute the code. In addition, your tests should be conducted against a database and load that is somewhat equivalent to what you expect to see in production. Most of the time the blanket rules will be fine, but not always.
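As one concrete illustration of what such a test might look like, here is a hedged sketch comparing two candidate forms of the same question (which products have ever sold?) in AdventureWorks2008; STATISTICS IO and STATISTICS TIME report the reads and elapsed time for each candidate so you can judge for yourself:

USE AdventureWorks2008;

SET STATISTICS IO ON;
SET STATISTICS TIME ON;

-- Candidate 1: a join (DISTINCT collapses the duplicate product rows)
SELECT DISTINCT p.ProductID, p.Name
FROM Production.Product p
JOIN Sales.SalesOrderDetail sod
ON p.ProductID = sod.ProductID;

-- Candidate 2: an EXISTS-based subquery
SELECT p.ProductID, p.Name
FROM Production.Product p
WHERE EXISTS (SELECT 1
FROM Sales.SalesOrderDetail sod
WHERE sod.ProductID = p.ProductID);

SET STATISTICS TIME OFF;
SET STATISTICS IO OFF;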
By performing reasonable tests, you can be certain you've made the right choice.

Summary

The query basics you've learned in your experience with SQL up to this point will cover perhaps 80 percent or more of the query situations that you run into, but it's that other 20 percent that can kill you. Sometimes the issue is whether you can even find a query that will give you the answers you need. Sometimes it's that you have a particular query or sproc that has unacceptable performance. Whatever the case, you'll run across plenty of situations where simple queries and joins just won't fit the bill. You need something more, and, hopefully, the options covered in this chapter have given you a little more of an arsenal to deal with those tough situations.

4

XML Integration

Extensible Markup Language (XML)—looking back at its history is something of a funny thing to me. Part of its strength lies in its simplicity, so it would seem like it wouldn't change much. Indeed, the basic rules of it haven't changed at all—but all the things surrounding XML (such as how to access data stored in XML) have gone through many changes. Likewise, the way that SQL Server supports XML has seen some fairly big changes from the time it was first introduced.

So, to continue my "it's a funny thing" observation, I realized some time back that I used to refer to XML support as being an "extra"—what a truly silly thing for me to say. Yeah, yeah, yeah—I always tempered that "extra" comment with the notion that it's only because XML support isn't really required to have a working SQL Server, but I've come to realize in today's world that it isn't much of a working SQL Server without support for XML. It is with this in mind, and looking back at how integral XML integration has become to the product, that I've moved my coverage of XML much further forward in the book versus where I had it in prior editions (where it was more of an afterthought).

XML has, over the decade or so that it has grown into widespread use, become a fundamental consideration in the vast majority of data designs. Sure, there are many well-thought-out and well-designed systems out there that do not use so much as one line of XML code, but there are very, very few that haven't had at least a moment of "should we use XML?" consideration in them. XML is used in websites, for data exchange, and for simple storage of things such as hierarchies—if you aren't at least considering XML in your data applications, then you probably aren't giving your data applications full consideration.

So, with all that said, in this chapter we'll look at:

* The XML data type
* XML schema collections
* Methods of representing your relational data as XML
* Methods of querying data that we have stored natively in XML (XQuery; Microsoft's XDL language, a variant on XQuery; and other methods)

Some of these are actually embedded within each other, so let's get to taking a look so we can see how they mix.

This chapter assumes that you have an existing knowledge of at least basic XML rules and constructs. If you do not have that foundation knowledge, I strongly recommend picking up a copy of the latest edition of a good XML book such as Beginning XML (also available from Wrox) or another XML-specific book before getting too far into this chapter. Keep in mind, though, that other chapters may occasionally reference material introduced in this chapter.

The XML Data Type

The XML data type was first introduced in SQL Server 2005.
It was a watershed moment in the history of mixing relational and XML data. With the xml data type, SQL Server takes data that is in XML format and recognizes it as truly being XML data. In previous versions, there were an increasing number of ways to address XML data, but all of it was done from the foundation of basic character data. The XML data type recognizes XML as XML, and that opens up a host of new possibilities, from indexing to data validation.

The number of different things going on here is massive. The various things that we need to talk about when discussing the XML data type include:

* Schema Collections—A core concept of XML is the notion of allowing XML to be associated with schema documents. XML schemas define the rules that allow us to determine whether our XML is "valid" (that is, whether it meets the rules that this particular kind of XML document is supposed to meet). XML schema collections in SQL Server are a way of storing schemas and allowing SQL Server to know that is what they are—validation documents. You can associate instances of XML data (column data or variables, for example) with XML schema collections, and SQL Server will apply the schemas to each instance of that XML to determine whether it is valid XML or not.
* Enforcing Constraints—Relational data systems have always had the notion of requiring a column to meet certain criteria before we'll let it into our table, but what about XML? XML allows for multiple pieces of discrete data to be stored within just one column—how do we validate those individual pieces of data? The XML data type understands XML, and, while direct definition of constraints is not allowed, we can utilize wrapper functions (in the form of stored procedures or triggers) to define constraints for specific nodes within our XML.
* XML Data Type Methods—When referring to a column or variable that is typed XML, you can utilize several methods that are intrinsic to that data type. For example, you can test for the existence of a certain node or attribute, execute XDL (a Microsoft-defined extension to XQuery that allows for data modification), or query the value of a specific node or attribute.

Let's get more specific.

Defining a Column as Being of XML Type

We've already seen the most basic definition of an XML column. For example, if we examined the most basic definition of the Production.ProductModel table in the AdventureWorks2008 database, it would look something like this:

CREATE TABLE Production.ProductModel (
ProductModelID int IDENTITY(1,1) NOT NULL,
Name dbo.Name NOT NULL,
CatalogDescription xml NULL,
Instructions xml NULL,
rowguid uniqueidentifier ROWGUIDCOL NOT NULL,
ModifiedDate datetime NOT NULL,
CONSTRAINT PK_ProductModel_ProductModelID PRIMARY KEY CLUSTERED
(
ProductModelID ASC
)
);

So, let's ask ourselves what we have here in terms of our two XML columns:

1. We have defined them as XML, so we will have our XML data type methods available to us (more on those coming up soon).

2. We have allowed nulls, but could have just as easily chosen NOT NULL as a constraint. Note, however, that the NOT NULL would be enforced on whether the row had any data for that column, not whether that data was valid.

3. Our XML is considered "non-typed XML." That is, since we have not associated any schema with it, SQL Server doesn't really know anything about how this XML is supposed to behave to be considered "valid."
The first of these is implied in any column that is defined with the data type XML rather than just plain text. We will see much more about this in our next XML data type section.

The second goes with any data type in SQL Server—we can specify whether we allow NULL data or not for that column.

So, the real meat in terms of changes we can make at definition time has to do with whether we specify our XML column as being typed or non-typed XML. The non-typed definition we used in the preceding example means that SQL Server knows very little about any XML stored in the column and, therefore, can do little to police its validity. If we set the column up as being typed XML, then we are providing much more definition about what is considered "valid" for any XML that goes in our column.

The AdventureWorks2008 database already has schema collections that match the validation we want to place on our two XML columns, so let's look at how we would change our CREATE statement to adjust to typed XML:

CREATE TABLE Production.ProductModel (
ProductModelID int IDENTITY(1,1) NOT NULL,
Name dbo.Name NOT NULL,
CatalogDescription xml
(CONTENT Production.ProductDescriptionSchemaCollection) NULL,
Instructions xml
(CONTENT Production.ManuInstructionsSchemaCollection) NULL,
rowguid uniqueidentifier ROWGUIDCOL NOT NULL,
ModifiedDate datetime NOT NULL,
CONSTRAINT PK_ProductModel_ProductModelID PRIMARY KEY CLUSTERED
(
ProductModelID ASC
)
);

This represents the way it is defined in the actual AdventureWorks2008 sample. In order to insert a record into the Production.ProductModel table, you must either leave the CatalogDescription and Instructions fields blank or supply XML that is valid when tested against their respective schema collections.

XML Schema Collections

XML schema collections are really nothing more than named persistence of one or more schema documents into the database. The name amounts to a handle to your set of schemas. By referring to that collection, you are indicating that the XML typed column or variable must be valid when matched against all of the schemas in that collection.

We can view existing schema collections. To do this, we utilize the built-in XML_SCHEMA_NAMESPACE() function. The syntax looks like this:

XML_SCHEMA_NAMESPACE( <SQL Server schema> , <xml schema collection> [, <namespace>] )

This is just a little confusing, so let's touch on these parameters just a bit:

Parameter | Description
---|---
SQL Server schema | This is your relational database schema (not to be confused with the XML schema). For example, for the table Production.ProductModel, Production is the relational schema. For Sales.SalesOrderHeader, Sales is the relational schema.
xml schema collection | The name used when the XML schema collection was created. In our CREATE TABLE example previously, we referred to the ProductDescriptionSchemaCollection and ManuInstructionsSchemaCollection XML schema collections.
namespace | Optional name for a specific namespace within the XML schema collection. Remember that XML schema collections can contain multiple schema documents—this would return anything that fell within the specified namespace.
So, to use this for the Production.ManuInstructionsSchemaCollection schema collection, we would make a query like this:

SELECT XML_SCHEMA_NAMESPACE('Production','ManuInstructionsSchemaCollection');

This spews forth a ton of unformatted XML.

SQL Server strips out any whitespace between tags, so if you create a schema collection with all sorts of pretty indentations for readability, SQL Server will remove them for the sake of efficient storage.

Note that the default number of characters returned for text results in Management Studio is only 256 characters. If you're using text view, you will want to go to Tools⇒Options⇒Query Results⇒SQL Server⇒Results to Text and change the maximum number of characters displayed.

Creating, Altering, and Dropping XML Schema Collections

The CREATE, ALTER, and DROP notions for XML schema collections work in a manner that is mostly consistent with how other such statements have worked thus far in SQL Server. We'll run through them here, but pay particular attention to the ALTER statement, as it is the one that has a few quirks versus other ALTER statements we've worked with.

CREATE XML SCHEMA COLLECTION

Again, the CREATE is your typical CREATE syntax that we've seen throughout the book, and uses the AS keyword we've seen with stored procedures, views, and other less structured objects:

CREATE XML SCHEMA COLLECTION [<relational schema>.]<collection name>
AS { '<schema document>' | <variable containing a schema document> }

So if, for example, we wanted to create an XML schema collection that is similar to the Production.ProductDescriptionSchemaCollection collection in AdventureWorks2008, we might execute something like the following (the schema document itself is far too long to reproduce here, so only the skeleton of the statement is shown):

CREATE XML SCHEMA COLLECTION ProductDescriptionSchemaCollectionSummaryRequired
AS
'...';

Note that the URL portion of each namespace declaration in the schema document must be entered on a single line. Such URLs are often shown word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

This one happens to be just like the Production.ProductDescriptionSchemaCollection schema collection, but I've altered the schema to require the summary element rather than having it optional. Since the basic structure is the same, I utilized the same namespaces.

ALTER XML SCHEMA COLLECTION

This one is just slightly different from other ALTER statements in the sense that it is limited to just adding new pieces to the collection. The syntax looks like this:

ALTER XML SCHEMA COLLECTION [<relational schema>.]<collection name>
ADD { '<schema document>' | <variable containing a schema document> }

I would not be at all surprised if the functionality of this is boosted a bit in a later service pack, but, in the meantime, let me stress again that this is a tool for adding to your schema collection rather than changing or removing what's there.

DROP XML SCHEMA COLLECTION

This is one of those classic "does what it says" things and works just like any other DROP:

DROP XML SCHEMA COLLECTION [<relational schema>.]<collection name>

So, to get rid of our ProductDescriptionSchemaCollectionSummaryRequired schema collection we created earlier, we could execute:

DROP XML SCHEMA COLLECTION ProductDescriptionSchemaCollectionSummaryRequired;

And it's gone.
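Before we move on, a small hedged sketch of a schema collection earning its keep. Typing a variable with the built-in Production.ManuInstructionsSchemaCollection (which ships with AdventureWorks2008) means every assignment to that variable is validated:

USE AdventureWorks2008;

-- Typing the variable with a schema collection; assignments are now validated
DECLARE @instructions xml (CONTENT Production.ManuInstructionsSchemaCollection);

-- This works: the Instructions column is typed with the same collection,
-- so the XML is known to conform
SELECT @instructions = Instructions
FROM Production.ProductModel
WHERE ProductModelID = 66;

-- Something like this, by contrast, should raise a validation error,
-- since the document does not conform to the schemas in the collection:
-- SET @instructions = '<root>not valid here</root>';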
XML Data Type Methods

The XML data type carries several intrinsic methods with it. These methods are unique to the XML data type, and no other data type has anything that is at all similar. The syntax within these methods varies a bit because they are based on different, but mostly industry-standard, XML access methods. The basic syntax for calling the methods is standardized, though:

<instance of xml data>.<method>

There are a total of five methods available:

* .query—An implementation of the industry-standard XQuery language. This allows you to access your XML by running XQuery-formatted queries. XQuery allows for the prospect that you may be returning multiple pieces of data rather than a discrete value.
* .value—This one allows you to access a discrete value within a specific element or attribute.
* .modify—This is Microsoft's own extension to XQuery. Whereas XQuery is limited to requesting data (no modification language), the modify method extends XQuery to allow for data modification.
* .nodes—Used to break up XML data into individual, more relational-style rows.
* .exist—Much like the IF EXISTS clause we use extensively in standard SQL, the exist() XML data type method tests to see whether a specific kind of data exists. In the case of exist(), the test is to see whether a particular node or attribute has an entry in the instance of XML you're testing.

.query (SQL Server's Implementation of XQuery)

.query is an implementation of the industry-standard XQuery language. The result works much like a SQL query, except that the results are for matching XML data nodes rather than relational rows and columns.

.query requires a parameter that is a valid XQuery to be run against your instance of XML data. For example, if we wanted the steps out of the product documentation for ProductModelID 66, we could run the following:

SELECT ProductModelID, Instructions.query('declare namespace
PI="http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions";
/PI:root/PI:Location/PI:step') AS Steps
FROM Production.ProductModel
WHERE ProductModelID = 66;

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

The result is rather verbose, so I've truncated the right side of it, but you can see that we've trimmed things down such that we're getting only those nodes at the step level or lower in the XML hierarchy:

ProductModelID Steps
-------------- --------------------------------------------------
66             Pinch Bolt (Product N...
               FI-620

(1 row(s) affected)

It's also worth pointing out that all the XML still came in one column in one row per data row in the database.

It bears repeating that .query cannot modify data—it is a read-only operation.

Notice, by the way, my need to declare the namespace in this. Since a namespace is declared as part of the referenced schema collection, you can see how it really expands and virtually destroys the readability of our query. We can fix that by using the WITH XMLNAMESPACES() declaration:

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
SELECT ProductModelID, Instructions.query('/PI:root/PI:Location/PI:step') AS Steps
FROM Production.ProductModel
WHERE ProductModelID = 66;

Note that the URL portion of the namespace declaration must be entered on a single line.
It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

This gives you a somewhat more readable query, but yields the same result set.

.value

The .value method is all about querying out discrete data. It uses an XPath syntax to locate a specific node and extract a scalar value. The syntax looks like this:

<instance of xml data>.value(<XQuery path expression>, <SQL Server data type>)

The trick here is making certain that the XPath specified really will return a discrete value.

If, for example, we wanted to know the value of the LaborHours attribute in the first Location element for ProductModelID 66, we might write something like:

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
SELECT ProductModelID,
Instructions.value('(/PI:root/PI:Location/@LaborHours)[1]',
'decimal (5,2)') AS Location
FROM Production.ProductModel
WHERE ProductModelID = 66;

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

Check the results:

ProductModelID Location
-------------- ---------------------------------------
66             1.50

(1 row(s) affected)

Note that SQL Server has extracted just the specified attribute value (in this case, the LaborHours attribute of the first Location node) as a discrete piece of data. The value must be castable into a non-XML data type in SQL Server, and the XQuery must return a scalar value—that is, you cannot have multiple rows.

.modify

Ah, here things get just a little interesting.

XQuery, left in its standard W3C form, is a read-only kind of thing—that is, it is great for selecting out data but offers no equivalents to INSERT, UPDATE, or DELETE. Bummer deal! Well, Microsoft is apparently having none of that and has done its own extension to XQuery to provide data manipulation. This extension to XQuery is called XML Data Manipulation Language, or XML DML. XML DML adds three new commands to XQuery:

* insert
* delete
* replace value of

Note that these commands, like all XML keywords, are case sensitive.

Each of these does what it implies, with replace value of taking the place of SQL's UPDATE statement.

If, for example, we wanted to increase the original 1.5 labor hours in our .value example, we might write something like:

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
UPDATE Production.ProductModel
SET Instructions.modify('replace value of
(/PI:root/PI:Location/@LaborHours)[1] with 1.75')
WHERE ProductModelID = 66;

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.
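For completeness, the other two XML DML commands follow the same .modify pattern. A minimal sketch on a throwaway variable (the steps and step element names here are invented purely for illustration):

-- DECLARE with an initializer is new to SQL Server 2008
DECLARE @x xml = '<steps><step>Attach the wheel</step></steps>';

-- insert: add a new node as the last child of the steps element
SET @x.modify('insert <step>Check the brakes</step> as last into (/steps)[1]');

-- delete: remove the first step node
SET @x.modify('delete (/steps/step)[1]');

SELECT @x;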
Now if we re-run our .value command:

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
SELECT ProductModelID, Instructions.value('(/PI:root/PI:Location/@LaborHours)[1]',
'decimal (5,2)') AS Location
FROM Production.ProductModel
WHERE ProductModelID = 66;

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

We get a new value:

ProductModelID Location
-------------- ---------------------------------------
66             1.75

(1 row(s) affected)

Note the way that this is essentially an UPDATE within an UPDATE. We are modifying the SQL Server row, so we must use an UPDATE statement to tell SQL Server that our row of relational data (which just happens to have XML within it) is to be updated. We must also use the replace value of keyword to specify the XML portion of the update.

.nodes

.nodes is used to take blocks of XML and separate out what would have been, were it stored in a relational form, multiple rows of data. Taking one XML document and breaking it into individual parts in this way is referred to as shredding the document.

What we are doing with .nodes is essentially breaking the instances of XML data out into their own table (with as many rows as there are instances of data meeting the XQuery criteria). As you might expect, this means we need to treat the .nodes results as a table rather than a column. The primary difference between .nodes and a typical table is that we must cross apply our .nodes results back to the specific table that we are sourcing our XML data from. So, .nodes really involves more syntax than just .nodes—think of it somewhat like a join, but using the special CROSS APPLY keyword in place of the JOIN and .nodes instead of the ON clause. It looks like this:

SELECT <column list>
FROM <source table>
CROSS APPLY <source table>.<xml column>.nodes(<XQuery>) AS <table alias>(<column alias>)
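Before we apply this to AdventureWorks data, a tiny self-contained sketch may help; the XML document and the names in it are invented purely for illustration (note that a variable needs no CROSS APPLY, since there is no table involved):

DECLARE @x xml = '<root><item id="1"/><item id="2"/><item id="3"/></root>';

-- Each item element becomes its own row; .value then extracts the attribute
SELECT i.item.value('@id', 'int') AS id
FROM @x.nodes('/root/item') AS i(item);

This returns three rows with the values 1, 2, and 3, one per item element.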
This is fairly confusing stuff, so let's look back at our .value example from earlier. We saw a query that looked for a specific entry and, therefore, got back exactly one result:

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
SELECT ProductModelID,
Instructions.value('(/PI:root/PI:Location/@LaborHours)[1]',
'decimal (5,2)') AS Location
FROM Production.ProductModel
WHERE ProductModelID = 66;

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

.value expects a scalar result, so we needed to make certain our XQuery would return just that single value per individual row of XML. .nodes tells SQL Server to use XQuery to map to a specific location and treat each entry found by that XQuery as an individual row instead.

Let's modify our .value example to return all LocationIDs and their respective labor hours. We want to be able to perform queries against the data in our XML as though it were relational data, so we need to break up our LocationID and LaborHours information into columns just as if they were in a relational table.

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
SELECT pm.ProductModelID,
pmi.Location.value('./@LocationID', 'int') AS LocationID,
pmi.Location.value('./@LaborHours', 'decimal(5,2)') AS LaborHours
FROM Production.ProductModel pm
CROSS APPLY pm.Instructions.nodes('/PI:root/PI:Location') AS pmi(Location);

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

Notice that, through the use of our .nodes method, we are essentially turning one table (ProductModel) into two tables (the source table and the .nodes results from the Instructions column within the ProductModel table). Take a look at the results:

ProductModelID LocationID  LaborHours
-------------- ----------- ---------------------------------------
7              10          2.50
7              20          1.75
7              30          1.00
7              45          0.50
7              50          3.00
7              60          4.00
10             10          2.00
10             20          1.50
10             30          1.00
10             45          1.50
10             50          3.00
10             60          4.00
43             50          3.00
44             50          3.00
47             10          1.00
47             20          1.00
47             50          3.50
48             10          1.00
48             20          1.00
48             50          3.50
53             50          0.50
66             50          1.75
67             50          1.00

(23 row(s) affected)

As you can see, we are getting back multiple rows for many of what were originally single rows in the ProductModel table. For example, ProductModelID 7 had six different instances of the Location element, so we received six rows instead of just the single row that existed in the ProductModel table.

While this is, perhaps, the most complex of the various XML data type methods, the power that it gives to transform XML data for relational use is virtually limitless.
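Because the shredded output behaves like any other result set, we can filter or aggregate it with ordinary SQL. As a quick hedged extension of the preceding query, this totals the labor hours per product model (the namespace URL must, as always, be on a single line):

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelManuInstructions' AS PI)
SELECT pm.ProductModelID,
SUM(pmi.Location.value('./@LaborHours', 'decimal(5,2)')) AS TotalLaborHours
FROM Production.ProductModel pm
CROSS APPLY pm.Instructions.nodes('/PI:root/PI:Location') AS pmi(Location)
GROUP BY pm.ProductModelID;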
.exist

.exist works something like the EXISTS statement in SQL. It accepts an expression (in this case, an XQuery expression rather than a SQL expression) and will return a Boolean indication of whether the expression was true or not. (NULL is also a possible outcome.)

If, continuing with our product documentation examples, we wanted to show rows that contain steps that have specs elements, we could use .exist:

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
SELECT ProductModelID, Instructions
FROM Production.ProductModel
WHERE Instructions.exist('/PI:root/PI:Location/PI:step/PI:specs') = 1;

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

Pay particular attention to the point at which the test condition is being applied! For example, this code shows us rows where at least one step has a specs element in it—it does not necessarily require that every step have a specs element. If we wanted every element to be tested, we would either need to pull the elements out as individual rows (using .nodes) or place the test condition in the XQuery.

Enforcing Constraints beyond the Schema Collection

By the time you got to this book, you should have already become somewhat familiar with the basics of constraints in a relational database. Well, if our relational database needs constraints, it follows that our XML data does too. Indeed, we've already implemented much of that idea through the use of schema collections. But what if we want to enforce requirements that go beyond the base schema?

Retrieving Relational Data in XML Format

This is an area that SQL Server already had largely figured out prior to the 2005 release. We had a couple of different options, and we had still more options within those options—between them all, things have been pretty flexible for quite some time. Let's take a look.

The FOR XML Clause

This clause is at the root of most of the different integration models available. With the exception of XML mapping schemas (fairly advanced, but we'll touch on them briefly later in the chapter) and the use of XPath, FOR XML will serve as the way of telling SQL Server that it's XML that you want back, not the more typical result set. It is essentially just an option added onto the end of the existing T-SQL SELECT statement.

Let's look at the SELECT statement syntax:

SELECT <column list>
[FROM <source table(s)>]
[WHERE <restrictive condition>]
[GROUP BY <column name(s)>]
[HAVING <restrictive condition based on the GROUP BY results>]
[ORDER BY <column list>]
[FOR XML {RAW|AUTO|EXPLICIT|PATH}
[, XMLDATA][, ELEMENTS][, BINARY base64]]
[OPTION (<query hint>, [,...n])]

Most of this should seem pretty trivial by now—after all, this is a Professional-level title—but it's time to focus in on that FOR XML line.

FOR XML provides four different initial options for how you want your XML formatted in the results:

* RAW—This sends each row of data in your result set back as a single data element, with the element name of "row" and with each column listed as an attribute of the row element. Even if you join multiple tables, RAW outputs the results with the same number of elements as you would have rows in a standard SQL query.
* AUTO—This option labels each element with either the table name or the table name alias that the data is sourced from. If there is data output from more than one table in the query, the data from each table is split into separate, nested elements. If AUTO is used, then an additional option, ELEMENTS, is also supported if you would like column data presented as elements rather than as attributes.
* EXPLICIT—This one is certainly the most complex to format your query with, but the end result is that you have a high degree of control over what the final XML looks like. With this option, you define something of a hierarchy for the data that's being returned, and then format your query such that each piece of data belongs to a specific hierarchy level (and gets assigned a tag accordingly) as desired. This choice has largely been supplanted by the PATH option and is here for backward compatibility.
* PATH—This was added in SQL Server 2005 to try to provide the level of flexibility of EXPLICIT in a more usable format—this is generally going to be what you want to use when you need a high degree of control over the format of the output.

Note that none of these options provide the required root element. If you want the XML document to be considered "well formed," then you will need to wrap the results with a proper opening and closing tag for your root element. While this is in some ways a hassle, it is also a benefit—it means that you can build more complex XML by stringing multiple XML queries together and wrapping the different results into one XML file.

In addition to the major formatting options, there are other optional parameters that further modify the output that SQL Server provides in an XML query:

* XMLDATA—This tells SQL Server that you would like to apply an XML schema onto the front of the results. The schema will define the structure (including data types) and rules of the XML data that follows.
* ELEMENTS—This option is available when you are using the RAW or AUTO formatting options. It tells SQL Server that you want the columns in your data returned as nested elements rather than as attributes.
* BINARY BASE64—This tells SQL Server to encode any binary columns (binary, varbinary, image) in base64 format. This option is implied (SQL Server will use it even if you don't state it) if you are also using the AUTO option. It is not implied but is currently the only effective option for EXPLICIT and RAW queries—eventually, the plan is to have these two options automatically provide a URL link to the binary data (unless you say to do the base64 encoding), but this is not yet implemented.
* TYPE—Tells SQL Server to return the results as the xml data type instead of the default Unicode character type.
* ROOT—This option will have SQL Server add the root node for you so you don't have to. You can either supply a name for your root or use the default (root).

Let's explore all these options in a little more detail.

RAW

This is something of the "no fuss, no muss" option. The idea here is to just get it done—no fanfare, no special formatting at all—just the absolute minimum to translate a row of relational data into an element of XML data. The element is named "row" (creative, huh?), and each column in the select list is added as an attribute using whatever name the column would have appeared with if you had been running a more traditional SELECT statement.

One downside to the way attributes are named is that you need to make certain that every column has a name. Normally, SQL Server will just show no column heading if you perform an aggregation or other calculated column and don't provide an alias—but when doing XML queries, everything MUST have a name, so don't forget to alias calculated columns, as the quick sketch that follows shows.
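A quick sketch of that naming requirement in action; the OrderCount alias is the piece that matters here:

SELECT soh.CustomerID,
COUNT(*) AS OrderCount -- without this alias, FOR XML has no name for the attribute
FROM Sales.SalesOrderHeader soh
GROUP BY soh.CustomerID
FOR XML RAW;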
So, let's start things out with something relatively simple. Imagine that our manager has asked us to provide a query that lists a few customers' orders—say, CustomerIDs 29484 and 29485. After cruising through just the first five or so chapters of the book, you would probably say "No problem!" and supply something like this:

SELECT sc.CustomerID,
pp.LastName,
pp.FirstName,
soh.SalesOrderID,
soh.OrderDate
FROM Person.Person pp
JOIN Sales.Customer sc
ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485;

So, you go hand your boss the results:

29484 Achong Gustavo 44132 2001-09-01 00:00:00.000
29484 Achong Gustavo 45579 2002-03-01 00:00:00.000
...
...
29485 Abel Catherine 65157 2004-03-01 00:00:00.000
29485 Abel Catherine 71782 2004-06-01 00:00:00.000

Easy, right? Well, now the boss comes back and says, "Great—now I'll just have Billy Bob write something to turn this into XML—too bad that will probably take a day or two." This is your cue to step in and say, "Oh, why didn't you say so?" and simply add three keywords:

SELECT sc.CustomerID,
pp.LastName,
pp.FirstName,
soh.SalesOrderID,
soh.OrderDate
FROM Person.Person pp
JOIN Sales.Customer sc
ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
FOR XML RAW;

You have just made the boss very happy. The output is a one-to-one match versus what we would have seen in the result set had we run just a standard SQL query:

<row CustomerID="29484" LastName="Achong" FirstName="Gustavo" SalesOrderID="44132"
OrderDate="2001-09-01T00:00:00"/>
<row CustomerID="29484" LastName="Achong" FirstName="Gustavo" SalesOrderID="45579"
OrderDate="2002-03-01T00:00:00"/>
...
...
<row CustomerID="29485" LastName="Abel" FirstName="Catherine" SalesOrderID="65157"
OrderDate="2004-03-01T00:00:00"/>
<row CustomerID="29485" LastName="Abel" FirstName="Catherine" SalesOrderID="71782"
OrderDate="2004-06-01T00:00:00"/>

Let me just issue a reminder that Management Studio will truncate any column where the length exceeds the number set in the Tools⇒Options⇒Query Results⇒SQL Server⇒Results to Text node (the maximum is 8192). This issue exists in the results window (grid or text, though grid will allow larger numbers if the data is XML) and if you output directly to a file. This is an issue with the tool—not SQL Server itself. If you use another method to retrieve results (ADO.NET, for example), you shouldn't encounter an issue with this.

Also, be aware that I added carriage returns in the preceding results for clarity's sake—SQL Server just runs all the elements together to make them more compact.

We have one element in XML for each row of data our query produced. All column information, regardless of what table was the source of the data, is represented as an attribute of the row element. The downside of this is that we haven't represented the true hierarchical nature of our data—orders are placed only by customers. The upside, however, is that the XML Document Object Model (DOM)—if that's the model you're using—is going to be much less deep and, hence, will have a slightly smaller footprint in memory and perform better, depending on what you're doing.
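Incidentally, several of the optional parameters described earlier can dress RAW output up considerably. A hedged sketch (the Order and Orders names are just labels we are choosing for the row and root elements):

SELECT sc.CustomerID, soh.SalesOrderID
FROM Sales.Customer sc
JOIN Sales.SalesOrderHeader soh
ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484
FOR XML RAW('Order'), ELEMENTS, ROOT('Orders');

Here RAW('Order') renames each row element, ELEMENTS returns the columns as nested elements rather than attributes, and ROOT('Orders') wraps the whole thing in a root node so the result is a well-formed document.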
AUTO

AUTO takes a somewhat different approach to our data than RAW does. AUTO tries to format things a little better for you—naming elements based on the table (or the table alias if you use one). In addition, AUTO recognizes the notion that our data probably has some underlying hierarchical nature to it that is supposed to be represented in the XML.

Let's go back to our customer orders example from the last section. This time, we'll make use of the AUTO option, so we can see the difference versus the rather plain output we got with RAW:

SELECT sc.CustomerID,
pp.LastName,
pp.FirstName,
soh.SalesOrderID,
soh.OrderDate
FROM Person.Person pp
JOIN Sales.Customer sc
ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
FOR XML AUTO;

The first apparent difference is that the element name has changed to be the name or alias of the table that is the source of the data—you'll want to consider this when choosing the aliases for your tables in a FOR XML AUTO query. Perhaps an even more significant difference appears when we look at the XML more thoroughly. I have again cleaned up the output a bit for clarity:

<sc CustomerID="29484">
  <pp LastName="Achong" FirstName="Gustavo">
    <soh SalesOrderID="44132" OrderDate="2001-09-01T00:00:00"/>
    <soh SalesOrderID="45579" OrderDate="2002-03-01T00:00:00"/>
    ...
  </pp>
</sc>
<sc CustomerID="29485">
  <pp LastName="Abel" FirstName="Catherine">
    <soh SalesOrderID="65157" OrderDate="2004-03-01T00:00:00"/>
    <soh SalesOrderID="71782" OrderDate="2004-06-01T00:00:00"/>
    ...
  </pp>
</sc>

Data that is sourced from our second table (as determined by the SELECT list) is nested inside the data sourced from the first table. In this case, our soh elements are nested inside our pp elements, which are in turn nested inside of our sc elements. If a column from the SalesOrderHeader table were listed first in our select list, then Person and Customer would both be nested inside SalesOrderHeader.

Pay attention to this business of the ordering of your SELECT list! Think about the primary question your XML query is meant to answer. Arrange your SELECT list such that the style it produces is fitting for the goal of your XML. Sure, you could always restyle it into a different form—but why do that if SQL Server could have just produced it for you that way in the first place?

The downside to using AUTO is that the resulting XML data model ends up being slightly more complex. The upside is that the data is more explicitly broken up into a hierarchical model. This makes life easier when the elements are more significant breaking points—such as when you have a doubly sorted report (for example, SalesOrderHeader rows sorted within Contact rows).

EXPLICIT

The word explicit is an interesting choice for this option—it loosely describes the kind of language you're likely to use while trying to create your query. The EXPLICIT option takes much more effort to prepare, but it also rewards that effort with very fine granularity of control over what's an element and what's an attribute, as well as what elements are nested in what other elements.

Much of what you can do with EXPLICIT can now be replicated using PATH. EXPLICIT does, however, give you a very fine and, as the keyword name implies, explicit level of control over your output. In general, I would point you at PATH and tell you to look at EXPLICIT when PATH doesn't seem to be meeting your needs.

EXPLICIT enables you to define each level of the hierarchy and how each level is going to look. To define the hierarchy, you create what is internally called the universal table. The universal table is, in many respects, just like any other result set you might produce in SQL Server. It is usually produced by making use of UNION statements to piece it together one level at a time, but you could, for example, build much of the data in a UDF and then make a SELECT against that to produce the final XML. The big difference between the universal table and a more traditional result set is that you must provide sufficient metadata right within your result set such that SQL Server can then transform that result set into an XML document in the schema you desire.

What do I mean by sufficient metadata?
EXPLICIT

The word explicit is an interesting choice for this option—it loosely describes the kind of language you're likely to use while trying to create your query. The EXPLICIT option takes much more effort to prepare, but it also rewards that effort with very fine granularity of control over what's an element and what's an attribute, as well as what elements are nested in what other elements.

Much of what you can do with EXPLICIT can now be replicated using PATH. EXPLICIT does, however, give you a very fine and, as the keyword name implies, explicit level of control over your output. In general, I would point you at PATH and tell you to look at EXPLICIT when PATH doesn't seem to be meeting your needs.

EXPLICIT enables you to define each level of the hierarchy and how each level is going to look. To define the hierarchy, you create what is internally called the universal table. The universal table is, in many respects, just like any other result set you might produce in SQL Server. It is usually produced by making use of UNION statements to piece it together one level at a time, but you could, for example, build much of the data in a UDF and then make a SELECT against that to produce the final XML. The big difference between the universal table and a more traditional result set is that you must provide sufficient metadata right within your result set such that SQL Server can then transform that result set into an XML document in the schema you desire.

What do I mean by sufficient metadata? Well, to give you an idea of just how complex this can be, let's look at a real universal table—one used by a code example we'll examine a little later in the section (trimmed here to a representative sample of rows):

Tag | Parent | sc!1!CustomerID | pp!2!LastName | pp!2!FirstName | soh!3!SalesOrderID | soh!3!OrderDate
---|---|---|---|---|---|---
1 | NULL | 29484 | NULL | NULL | NULL | NULL
2 | 1 | 29484 | Achong | Gustavo | NULL | NULL
3 | 2 | 29484 | Achong | Gustavo | 44132 | 2001-09-01 00:00:00.000
3 | 2 | 29484 | Achong | Gustavo | 45579 | 2002-03-01 00:00:00.000
... | ... | ... | ... | ... | ... | ...
1 | NULL | 29485 | NULL | NULL | NULL | NULL
2 | 1 | 29485 | Abel | Catherine | NULL | NULL
3 | 2 | 29485 | Abel | Catherine | 65157 | 2004-03-01 00:00:00.000

This is what the universal table we would need to build would look like in order to make our EXPLICIT query return exactly the same results that we received with our AUTO query in the last example.

Your first inclination might be to say, "Hey, if this is just producing the same thing as AUTO, why use it?" Well, this particular example happens to be producible using AUTO—I'm using this one on purpose to illustrate some functional differences compared to something you've already seen. We will, however, see later in this section that EXPLICIT will allow us to do the formatting extras that aren't possible with AUTO or RAW (but are with PATH)—so please bear with me on this one.

You should note several things about this result set:

 * It has two special metadata columns—Tag and Parent—added to it that do not, otherwise, relate to the data (they didn't come from table columns).
 * The actual column names adhere to a special format (which happens to supply additional metadata).
 * The data has been ordered based on the hierarchy.

Each of these items is critical to our end result, so, before we start working a complete example, let's look at what we need to know to build it.

Tag and Parent

XML is naturally hierarchical in nature (elements are contained within other elements, which essentially creates a parent-child relationship). Tag and Parent are columns that define the relationship of each row to the element hierarchy. Each row is assigned to a certain tag level (which will later have an element name assigned to it)—that level, as you might expect, goes in the Tag column. Parent then supplies reference information that indicates what the next highest level in the hierarchy is. When you do this, SQL Server knows at what level this row needs to be nested or assigned as an attribute (what it's going to be—element or attribute—will be figured out based on the column name—but we'll get to that in our next section). If Parent is NULL, then SQL Server knows that this row must be a top-level element or an attribute of that element.

So, if we had data that looked like this:

Tag | Parent
---|---
1 | NULL
2 | 1

then the first row would be related to a top-level element (an attribute of the outer element or the element itself), and the second would be related to an element that was nested inside the top-level element (its Parent value of 1 matches the Tag value of the first).

Column Naming

Frankly, this was the most confusing part of all when I first started looking at EXPLICIT. While Tag and Parent have nice, neat demarcation points (they are each their own column), the name takes several pieces of metadata and crams them together as one thing—the only way to tell where one stops and the next begins is that they're separated by an exclamation mark (!).

The naming format looks like this:

<ElementName>!<TagNumber>[!<AttributeName>][!{element|hide|ID|IDREF|IDREFS|xml|xmltext|cdata}]

The element name is, of course, just that—what you want to be the name of the element in the XML. For any given tag level, once you define a column with one name, any other column with that same tag must have the same name as the previous column(s) with that tag number. So, if you have a column already defined as [MyElement!2!MyCol], then another column could be named [MyElement!2!MyOtherCol], but [SomeOtherName!2!MyOtherCol] could not be.
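A quick, self-contained sketch of just the naming rules (the MyElement/MyCol names are my own, and a single tag level is legal, so this actually runs against AdventureWorks2008):

SELECT 1 AS Tag,
       NULL AS Parent,
       sc.CustomerID AS [MyElement!1!MyCol],
       sc.AccountNumber AS [MyElement!1!MyOtherCol]
       -- sc.StoreID AS [SomeOtherName!1!StoreID] would fail: tag 1 is already named MyElement
FROM Sales.Customer sc
WHERE sc.CustomerID = 29484
FOR XML EXPLICIT;
-- Produces <MyElement MyCol="29484" MyOtherCol="..." />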
The tag relates the column to rows with a matching tag number. When SQL Server looks at the universal table, it reads the tag number and then analyzes the columns with the same tag number. So, when SQL Server sees a row like the first one in our universal table:

Tag | Parent | sc!1!CustomerID | pp!2!LastName | pp!2!FirstName | soh!3!SalesOrderID | soh!3!OrderDate
---|---|---|---|---|---|---
1 | NULL | 29484 | NULL | NULL | NULL | NULL

it can look at the tag number, see that it is 1, and know that it should process sc!1!CustomerID, but that it doesn't have to process pp!2!LastName or soh!3!SalesOrderID, for example. Likewise, it can look at the tag number in the next row, see that it is 2, and know that it should process sc!1!CustomerID, pp!2!LastName, and pp!2!FirstName, but that it doesn't have to process soh!3!SalesOrderID.

That takes us to the attribute name, which is where things start getting more complex (hey, we still have one more to go after this!). If you do not specify a directive (which comes next), then the attribute name is required and is the name of the XML attribute that this column will supply a value for. The attribute will be in the XML as part of the element specified in the column name.

If you do specify a directive, then the attribute name falls into three different camps:

 * It's Prohibited—That is, you must leave the attribute name blank (you do still use a bang (!) to mark its place, though). This is the case if you use a cdata directive.
 * It's Optional—That is, you can supply the attribute name but don't have to. What happens in this case varies depending on the directive that you've chosen.
 * It's Still Required—This is true for the element and xml directives. In this case, the name of the attribute will become the name of a totally new element that will be created as a result of the element or xml directive.

So, now that we have enough of the naming down to meet the minimum requirements for a query, let's go ahead and look at an example of what kind of query produces what kind of results.

We will start with the query to produce the same basic data that we used in our RAW and AUTO examples. You will notice that EXPLICIT has a much bigger impact on the code than we saw with RAW and AUTO. With both RAW and AUTO, we added the FOR XML clause at the end, and we were largely done. With EXPLICIT, we will quickly see that we need to entirely rethink the way our query comes together.
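Before wading into the full three-level query, it may help to see a stripped-down, two-level sketch of the pattern (my own trimmed variant, not the book's worked example): one SELECT per tag level, UNIONed together, with only the first SELECT's column aliases mattering, and an ORDER BY that floats each parent row above its children (the NULL in the child-only column sorts first):

SELECT 1 AS Tag,
       NULL AS Parent,
       sc.CustomerID AS [Customer!1!CustomerID],
       NULL AS [Order!2!SalesOrderID]
FROM Sales.Customer sc
WHERE sc.CustomerID = 29484
UNION ALL
SELECT 2,
       1,
       sc.CustomerID,
       soh.SalesOrderID
FROM Sales.Customer sc
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484
ORDER BY [Customer!1!CustomerID], [Order!2!SalesOrderID]
FOR XML EXPLICIT;
-- Produces <Customer CustomerID="29484"><Order SalesOrderID="..." /> ... </Customer>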
The full three-level version looks like this (yuck):

USE AdventureWorks2008

SELECT 1 AS Tag,
       NULL AS Parent,
       sc.CustomerID AS [sc!1!CustomerID],
       NULL AS [pp!2!LastName],
       NULL AS [pp!2!FirstName],
       NULL AS [soh!3!SalesOrderID],
       NULL AS [soh!3!OrderDate]
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
UNION
SELECT 2,
       1,
       sc.CustomerID AS [sc!1!CustomerID],
       pp.LastName AS [pp!2!LastName],
       pp.FirstName AS [pp!2!FirstName],
       NULL AS [soh!3!SalesOrderID],
       NULL AS [soh!3!OrderDate]
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
UNION ALL
SELECT 3,
       2,
       sc.CustomerID AS [sc!1!CustomerID],
       pp.LastName AS [pp!2!LastName],
       pp.FirstName AS [pp!2!FirstName],
       soh.SalesOrderID,
       soh.OrderDate
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
ORDER BY [sc!1!CustomerID], [pp!2!LastName], [pp!2!FirstName], [soh!3!SalesOrderID]
FOR XML EXPLICIT

Notice that we use the FOR XML clause only once—after the last query in the UNION.

I reiterate—yuck! But, ugly as it is, with just a few changes, I could shape my XML into forms that AUTO wouldn't give me.

As a fairly simple illustration, let's make a couple of small alterations to our requirements for this query. What if we decided that we wanted the LastName information to be an attribute of the soh element rather than (or, as it happens, in addition to) the pp element? With AUTO, we would need some trickery to get this (for every row, we would need to look up the Customer again using a correlated subquery—AUTO won't let you use the same value in two places). If you had multiple lookups, your code could get very complex—indeed, you might not be able to get what you're after at all. With EXPLICIT, this is all relatively easy (at least, by EXPLICIT's definition of easy).
To do this with EXPLICIT, we just need to reference the LastName in our SELECT list again, but associate the new instance of it with soh instead of pp:

USE AdventureWorks2008

SELECT 1 AS Tag,
       NULL AS Parent,
       sc.CustomerID AS [sc!1!CustomerID],
       NULL AS [pp!2!LastName],
       NULL AS [pp!2!FirstName],
       NULL AS [soh!3!SalesOrderID],
       NULL AS [soh!3!OrderDate],
       NULL AS [soh!3!LastName]
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
UNION
SELECT 2,
       1,
       sc.CustomerID AS [sc!1!CustomerID],
       pp.LastName AS [pp!2!LastName],
       pp.FirstName AS [pp!2!FirstName],
       NULL AS [soh!3!SalesOrderID],
       NULL AS [soh!3!OrderDate],
       NULL AS [soh!3!LastName]
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
UNION ALL
SELECT 3,
       2,
       sc.CustomerID AS [sc!1!CustomerID],
       pp.LastName AS [pp!2!LastName],
       pp.FirstName AS [pp!2!FirstName],
       soh.SalesOrderID,
       soh.OrderDate,
       pp.LastName
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
ORDER BY [sc!1!CustomerID], [pp!2!LastName], [pp!2!FirstName], [soh!3!SalesOrderID]
FOR XML EXPLICIT

Execute this, and you get pretty much the same results as before, only this time you receive the additional attribute you were looking for in your soh element:

<sc CustomerID="29484">
  <pp LastName="Achong" FirstName="Gustavo">
    <soh SalesOrderID="44132" OrderDate="2001-09-01T00:00:00" LastName="Achong" />
    <soh SalesOrderID="45579" OrderDate="2002-03-01T00:00:00" LastName="Achong" />
    ...
  </pp>
</sc>
<sc CustomerID="29485">
  <pp LastName="Abel" FirstName="Catherine">
    ...
    <soh SalesOrderID="71782" OrderDate="2004-06-01T00:00:00" LastName="Abel" />
  </pp>
</sc>

This example is really just for starters. You can utilize directives to achieve far more flexibility—shaping and controlling both your data and your schema output (if you use the XMLDATA option).

Directives are a real pain to understand. Once you do understand them, they aren't all that bad to deal with, though they can still be confusing at times (some of them work pretty counterintuitively and behave differently in different situations). My personal opinion (and the members of the dev team I know are going to shoot me for saying this) is that someone at Microsoft had a really bad day and decided to make something that would inflict as much pain as he/she was feeling but would be so cool that people wouldn't be able to help but use it.

Altogether, there are eight possible directives you can use. Some can be used in the same level of the hierarchy—others are mutually exclusive within a given hierarchy level.

The purpose behind directives is to allow you to tweak your results. Without directives, the EXPLICIT option would have little or no value (AUTO would take care of most real things that you can do with EXPLICIT if you don't use directives, even though, as I indicated earlier, you sometimes have to get a little tricky). So, with this in mind, let's look at what directives are available.

element

This is probably the easiest of all the directives to understand. All it does is indicate that you want the column in question to be added as an element rather than an attribute. The element will be added as a child of the current tag. For example, let's say that our manager from the previous examples has indicated that he or she needs the OrderDate to be represented as its own element.
This can be accomplished as easily as adding the element directive to the end of our OrderDate field:

SELECT 1 AS Tag,
       NULL AS Parent,
       sc.CustomerID AS [sc!1!CustomerID],
       NULL AS [pp!2!LastName],
       NULL AS [pp!2!FirstName],
       NULL AS [soh!3!SalesOrderID],
       NULL AS [soh!3!OrderDate!element]
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
UNION
SELECT 2,
       1,
       sc.CustomerID AS [sc!1!CustomerID],
       pp.LastName AS [pp!2!LastName],
       pp.FirstName AS [pp!2!FirstName],
       NULL,
       NULL
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
UNION ALL
SELECT 3,
       2,
       sc.CustomerID AS [sc!1!CustomerID],
       pp.LastName AS [pp!2!LastName],
       pp.FirstName AS [pp!2!FirstName],
       soh.SalesOrderID,
       soh.OrderDate
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
ORDER BY [sc!1!CustomerID], [pp!2!LastName], [pp!2!FirstName], [soh!3!SalesOrderID]
FOR XML EXPLICIT

Suddenly, we have an extra element instead of an attribute (results trimmed—customer 29484 has seven orders running from 2001-09-01 through 2003-06-01, and customer 29485 has four running from 2003-09-01 through 2004-06-01):

<sc CustomerID="29484">
  <pp LastName="Achong" FirstName="Gustavo">
    <soh SalesOrderID="44132">
      <OrderDate>2001-09-01T00:00:00</OrderDate>
    </soh>
    <soh SalesOrderID="45579">
      <OrderDate>2002-03-01T00:00:00</OrderDate>
    </soh>
    ...
  </pp>
</sc>
<sc CustomerID="29485">
  <pp LastName="Abel" FirstName="Catherine">
    ...
    <soh SalesOrderID="71782">
      <OrderDate>2004-06-01T00:00:00</OrderDate>
    </soh>
  </pp>
</sc>

xml

This directive is essentially just like the element directive. It causes the column in question to be generated as an element rather than an attribute. The difference between the xml and element directives shows up only if you have special characters that require encoding—for example, the < sign is reserved in XML. If you need to represent a literal <, then you need to encode it (for <, the encoding is &lt;). With the element directive, the content of the element is automatically encoded. With xml, the content is passed straight into the resulting XML without encoding. If you use the xml directive, no other item at this level (the tag number) can have a directive other than hide.

hide

hide is another simple one that does exactly what it says it does—it hides the results of that column.

Why in the world would you want to do that? Well, sometimes we include columns for reasons other than output. For example, in a normal query, we can perform an ORDER BY based on columns that do not appear in the SELECT list. For UNION queries, however, we can't do that—we have to specify a column in the SELECT list because it's the one thing that unites all the queries that we are performing the UNION on.

Let's use a little example of tracking some product sales. We'll say that we want a list of all of our products as well as the SalesOrderIDs of the orders they shipped on and the date that they shipped. We only want the ProductID, but we want the ProductID to be sorted such that any given product is near similar products—that means we need to sort based on the ProductSubcategoryID, but we do not want the ProductSubcategoryID to be included in the end results.
We can start out by building the query without the directive—that way, we can see that our sort is working:

SELECT 1 AS Tag,
       NULL AS Parent,
       p.ProductID AS [Product!1!ProductID],
       p.ProductSubcategoryID AS [Product!1!ProductSubcategoryID],
       NULL AS [Order!2!OrderID],
       NULL AS [Order!2!OrderDate]
FROM Production.Product p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-27'
UNION ALL
SELECT 2,
       1,
       p.ProductID,
       p.ProductSubcategoryID,
       soh.SalesOrderID,
       soh.OrderDate
FROM Production.Product AS p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-27'
ORDER BY [Product!1!ProductSubcategoryID], [Product!1!ProductID], [Order!2!OrderID]
FOR XML EXPLICIT

Be sure to check out the way we dealt with the OrderDate on this one. Even though I needed to fetch that information out of the SalesOrderHeader table, it was easy (since we're using EXPLICIT anyway) to combine that information with the SalesOrderID from the SalesOrderDetail table. As it happens, I could have also just grabbed the SalesOrderID from the SalesOrderHeader table, too, but sometimes you need to mix data from multiple tables in one element, and this query is yet another demonstration of how we can do just that.

We can see from the results that we are indeed getting the sort we expected (trimmed—each Product element is sorted by its subcategory, with its Order elements nested inside):

...
<Product ProductID="766" ProductSubcategoryID="2">
  <Order OrderID="..." OrderDate="2003-03-27T00:00:00" />
</Product>
...

Now we'll add our hide directive and get rid of the subcategory information:

SELECT 1 AS Tag,
       NULL AS Parent,
       p.ProductID AS [Product!1!ProductID],
       p.ProductSubcategoryID AS [Product!1!ProductSubcategoryID!hide],
       NULL AS [Order!2!OrderID],
       NULL AS [Order!2!OrderDate]
FROM Production.Product p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-27'
UNION ALL
SELECT 2,
       1,
       p.ProductID,
       p.ProductSubcategoryID,
       soh.SalesOrderID,
       soh.OrderDate
FROM Production.Product AS p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-27'
ORDER BY [Product!1!ProductSubcategoryID!hide], [Product!1!ProductID], [Order!2!OrderID]
FOR XML EXPLICIT

And we get the same results; only this time, our subcategory information is indeed hidden:

...
<Product ProductID="766">
  <Order OrderID="..." OrderDate="2003-03-27T00:00:00" />
</Product>
...

id, idref, and idrefs

None of these three has any effect whatsoever unless you also make use of the XMLDATA option (it goes after EXPLICIT in the FOR clause) or validate against some other schema that has the appropriate declarations. This makes perfect sense when you think about what they do—they add things to the schema to enforce behavior, but, without a schema, what would you modify?

You see, XML has the concept of an id. An id in XML works much the same as a primary key does in relational data—it designates a unique identifier for that element name in your XML document. For any element name, there can be no more than one attribute designated as the id.
What attribute is to serve as the id is defined in the schema for the XML. Once you have one element with a given value for your id attribute, no other element of the same name is allowed to carry the same value.

Unlike primary keys in SQL, you cannot have multiple attributes make up your id in XML (there is no concept of a composite key).

Since XML has a concept that is similar to a primary key, it probably comes as no surprise that XML also has a concept that is similar to a foreign key—that's where idref and idrefs come in. Both are used to create a reference from an attribute in one element to an id attribute in another element.

What does this do for us? Well, if we didn't have these, there would be only one way to create a relationship between two elements—nest them. By giving a certain element an id and then making reference to it from an attribute declared as being an idref or idrefs attribute, we gain the ability to link the two elements, regardless of their position in the document.

This should bring on the question, "OK—so why are there two of them?" The answer is implied in their names: idref provides for a single value that must match an existing element's id value. idrefs provides a multivalued, whitespace-separated list—again, the values must each match an existing element's id value. The result is that you use idref if you are trying to establish a one-to-many relationship (there will be only one of each id value, but potentially many elements with that value in an idref attribute). Use idrefs when you are trying to establish a many-to-many relationship (each element with an idrefs attribute can refer to many ids, and each id can be referred to by many elements).

To illustrate this one, we'll go with a slight modification of our last query. We'll start with the idref directive:

SELECT 1 AS Tag,
       NULL AS Parent,
       p.ProductID AS [Product!1!ProductID!ID],
       p.ProductSubcategoryID AS [Product!1!ProductSubCategoryID!hide],
       NULL AS [Order!2!OrderID],
       NULL AS [Order!2!ProductID!idref],
       NULL AS [Order!2!OrderDate]
FROM Production.Product AS p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-27'
UNION ALL
SELECT 2,
       1,
       p.ProductID,
       p.ProductSubcategoryID,
       sod.SalesOrderID,
       sod.ProductID,
       soh.OrderDate
FROM Production.Product AS p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-27'
ORDER BY [Product!1!ProductSubCategoryID!hide], [Product!1!ProductID!ID], [Order!2!OrderID]
FOR XML EXPLICIT, XMLDATA

When we look at the results, there are really just two pieces that we are interested in—the schema and our Product element. (The schema output is trimmed here: it is an inline XDR schema that declares a Product element type whose ProductID attribute is typed as an id, and an Order element type whose ProductID attribute is typed as an idref.)

In the schema, you can see some fairly specific type information. Our Product is declared as a type of element, and you can also see that ProductID has been declared as being the id for this element type. Likewise, we have an Order element with the ProductID declared as an idref.

The next piece that we're interested in is a Product element, which now carries a reference to that inline schema—something like:

<Product xmlns="x-schema:#Schema1" ProductID="766">
  ...
</Product>

In this case, notice that SQL Server has referenced our inline schema in the Product element.
This declares that the Product element and everything within it must comply with our schema—thus ensuring that our id and idref values will be enforced.

When we try to use the idrefs directive, we have to get a little trickier. SQL Server requires that the query we use to build our idrefs list be separate from the query that builds the elements with the ids. This means we must add another query to our UNION to supply the idrefs (the list of possible ids has to be known before we can build the idrefs list—but the actual ids will come after the id list). The query to generate the idrefs must immediately precede the query that generates the ids. This makes the query look pretty convoluted:

SELECT 1 AS Tag,
       NULL AS Parent,
       p.ProductID AS [Product!1!ProductID],
       NULL AS [Product!1!OrderList!idrefs],
       NULL AS [Order!2!OrderID!id],
       NULL AS [Order!2!OrderDate]
FROM Production.Product p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-31'
UNION ALL
SELECT 1,
       NULL,
       p.ProductID,
       soh.SalesOrderID,
       NULL,
       NULL
FROM Production.Product AS p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-31'
UNION ALL
SELECT 2,
       1,
       p.ProductID,
       soh.SalesOrderID,
       soh.SalesOrderID,
       soh.OrderDate
FROM Production.Product AS p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-31'
ORDER BY [Product!1!ProductID], [Order!2!OrderID!id], [Product!1!OrderList!idrefs]
FOR XML EXPLICIT, XMLDATA

Note that I've expanded the date range a bit to make sure that there are multiple product IDs for a given range so you see the proper many-to-many relationship.

The schema winds up looking an awful lot like the one we got for idref (again trimmed here)—the significant changes are that Product now has an OrderList attribute typed as idrefs, and Order's OrderID is typed as the id.

But the elements couldn't be much more different (trimmed to the shape of things): each Product element now carries an OrderList attribute containing a whitespace-separated list of the SalesOrderID values it appeared on, and each Order element follows along carrying its OrderID as its id.

Using id, idref, and idrefs is very complex. Still, they allow you to make your output strongly typed. For most situations, this level of control and the hassles that go with it simply aren't necessary, but, when they are, these three can be lifesavers.

xmltext

xmltext expects the content of the column to be XML and attempts to insert it as an integral part of the XML document you are creating.

While, on the surface, that may sound simple enough (okay, so they're inserting some text in the middle—big deal!), the rules of where, when, and how it inserts the data are a little strange:

 * As long as the XML you're trying to insert is well formed, the root element will be stripped out—but the attributes of that element will be retained and applied depending on the following few rules.
 * If you did not specify an attribute name when using the xmltext directive, then the retained attributes from the stripped element will be added to the element that contains the xmltext directive. The names of the retained attributes will be used in the combined element. If any attribute names from the retained attribute data conflict with other attribute information in the combined element, then the conflicting attribute is left out of the retained data.
 * Any elements nested inside the stripped element will become nested elements of the combined element.
 * If an attribute name is provided with the xmltext directive, then the retained data is placed in an element of the supplied name. The new element becomes a child of the element that issued the directive.
 * If any of the resulting XML is not well formed, there is no defined behavior. Basically, the behavior will depend on how the end result looks, but I would figure that you're going to get an error (I haven't seen an instance where you can refer to data that is not well formed and escape without an error).

cdata

The term cdata is a holdover from DTDs and SGML. (SGML is an old markup language, used in the graphics industry, that is the ancestor of both HTML and XML. DTDs are type definition documents that outline rules that your SGML [and, later, HTML and XML] documents had to live up to.) Basically, cdata stands for character data. XML acknowledges a cdata section as something of a no man's land—it completely and in all ways ignores whatever is included inside a properly marked cdata section. Since there is no validation of the data in a cdata section, no encoding of the data is necessary. You would use cdata anytime you need your data completely untouched (you can't have encoding altering the data) or, frankly, when you want to move the data but have no idea what the data is (so you can't know whether it's going to cause you problems or not).

For this one, we'll just take a simple example—the AdventureWorks2008 Production.Document table. This table has a field with an nvarchar(max) data type. The contents are basically unknown. A query to generate the document summaries into XML might look something like this:

SELECT 1 AS Tag,
       NULL AS Parent,
       DocumentNode AS [Document!1!DocumentNode],
       DocumentSummary AS [Document!1!!cdata]
FROM Production.Document Document
WHERE DocumentSummary IS NOT NULL
ORDER BY [Document!1!DocumentNode]
FOR XML EXPLICIT

The output is pretty straightforward (trimmed—each Document element wraps its DocumentSummary in a CDATA section, untouched by any encoding):

<Document DocumentNode="...">
  <![CDATA[...summary text...]]>
</Document>
...

Basically, this was a pretty easy one.

PATH

Now let's switch gears just a little bit and get down to a more "real" XML approach to getting at data.

While EXPLICIT has not been deprecated as yet, make no mistake—PATH is really meant to be a better way of doing what EXPLICIT originally was the only way of doing. PATH makes a lot of sense in a lot of ways, and it is how I recommend that you do complex XML output in most cases.

This is a more complex recommendation than it might seem. The Microsoft party line on this is that PATH is easier. Well, PATH is easier in many ways, but, as we're going to see, it has its own set of "except for this, and except for that, and except for this other thing" that can twist your brain into knots trying to understand exactly what to do. In short, in some cases, EXPLICIT is actually easier if you don't know XPath. The thing is, if you're dealing with XML, then XPath should be on your learn list anyway, so, if you're going to know it, you should find the XPath-based approach more usable.

Note, however, that if you need backward compatibility with SQL Server 2000, then you're going to need to stick with EXPLICIT.

In its most straightforward sense, the PATH option isn't that bad at all.
So, let's start by getting our feet wet by focusing on just the basics of using PATH. From there, we'll get a bit more complex and show off some of what PATH has to offer.

PATH 101

With PATH, you have a model that molds an existing standard to get at your data—XPath. XPath is an accepted standard and provides a way of pointing at specific points in your XML schema. For PATH, we're just utilizing a lot of the same rules and ideas in order to say how data should be treated in a native XML sort of way.

How PATH treats the data you refer to depends on a number of rules, including whether the column is named or unnamed (like EXPLICIT, the alias is the name if you use an alias). If the column does have a name, then a number of additional rules are applied as appropriate.

Let's look at some of the possibilities.

Unnamed Columns

Data from a column that is not named will be treated as raw text within the row's element. To demonstrate this, let's take a modified version of the example we used for XML RAW. What we're doing here is listing the two customers we're interested in and the number of orders they have placed:

SELECT sc.CustomerID,
       COUNT(soh.SalesOrderID)
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

Check out the output from this:

<row><CustomerID>29484</CustomerID>7</row>
<row><CustomerID>29485</CustomerID>4</row>

What it created is a row element for each row in the query—much as you had with RAW—but notice the difference in how it treated our column data.

Since the CustomerID column was named, it was placed in its own element (we'll explore this more in the next section)—notice, however, the loose number 7 in the results. This is just loose embedded text for the row element—it isn't even associated directly with the CustomerID, since it sits outside the CustomerID element.

Remember that the exact counts (7 and 4 in this case) that come back may vary on your system depending on how much you have been playing with the data. The key thing is to see how the counts are not associated with the CustomerID but are instead just raw text associated with the row.

My personal slant on this is that the number of situations where loose text at the level of the top element is a valid way of doing things is pretty limited. The rules do say you can do it, but I believe it makes for data that is not very clear. Still, this is how it works—use it as it seems to fit the needs of your particular system.

Named Columns

This is where things get considerably more complex rather quickly. In its most simple form, named columns are just as easy as unnamed ones were—indeed, we saw one of them in our previous example. If a column is a simple named column using PATH, then it is merely added as an additional element of the row:

<row><CustomerID>29484</CustomerID>7</row>

Our CustomerID column was a simple named column.

We can, however, add special characters to our column name to indicate that we want special behaviors for this column. Let's look at a few of the most important.

@

No, that's not a typo—the @ symbol really is the heading for this section. If we add an @ sign to our column name, then SQL Server will treat that column as an attribute of its parent element (with no deeper path given, that's the row element).
Let's move the CustomerID to be an attribute of the top element for the row:

SELECT sc.CustomerID AS '@CustomerID',
       COUNT(soh.SalesOrderID)
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

Yields:

<row CustomerID="29484">7</row>
<row CustomerID="29485">4</row>

Notice that our order count remained a text element of the row—only the column that we identified as an attribute moved in. We could take this to the next step by naming our count and prefixing it to make it an attribute also:

SELECT sc.CustomerID AS '@CustomerID',
       COUNT(soh.SalesOrderID) AS '@OrderCount'
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

With this, we no longer have our loose text for the element:

<row CustomerID="29484" OrderCount="7" />
<row CustomerID="29485" OrderCount="4" />

Also notice that SQL Server was smart enough to realize that everything was contained in attributes—with no lower-level elements or simple text, it chose to make it a self-closing tag (see the / at the end of the element).

So, why did I indicate that this stuff was tricky? Well, there are a lot of different "it only works if..." kinds of rules here. To demonstrate this, let's make a simple modification to our original query. This one seems like it should work, but SQL Server will throw a hissy fit if you try to run it:

SELECT sc.CustomerID,
       COUNT(soh.SalesOrderID) AS '@OrderCount'
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

What I've done here is to go back to CustomerID as its own element. What, at first glance, you would expect to happen is to get a CustomerID element with OrderCount as an attribute, but it doesn't quite work that way:

Msg 6852, Level 16, State 1, Line 1
Attribute-centric column '@OrderCount' must not come after a non-attribute-centric sibling in XML hierarchy in FOR XML PATH.

The short rendition of the answer to "What's wrong?" is that SQL Server doesn't really know what OrderCount is supposed to be an attribute of—is it an attribute of the row, or an attribute of the CustomerID?

/

Yes, a forward slash. Much like @, this special character indicates special things you want done. Essentially, you use it to define something of a path—a hierarchy that relates an element to those things that belong to it. It can exist anywhere in the column name except as the first character. To demonstrate this, we're going to take our last (failed) example and build toward what we were looking for when we got the error.

First, we need to alter the OrderCount to carry information on what element it belongs to:

SELECT sc.CustomerID,
       COUNT(soh.SalesOrderID) AS 'CustomerID/OrderCount'
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

By adding the / and then placing CustomerID before the slash, we are telling SQL Server that OrderCount is below CustomerID in a hierarchy.
Now, there are many ways an XML hierarchy can be structured, so let's see what SQL Server does with this:

<row><CustomerID>29484<OrderCount>7</OrderCount></CustomerID></row>
<row><CustomerID>29485<OrderCount>4</OrderCount></CustomerID></row>

Now, if you recall, we wanted to make OrderCount an attribute of CustomerID, so, while we have OrderCount below CustomerID in the hierarchy, it's still not quite in the place we wanted it. To do that, we can combine / and @, but we need to fully define the hierarchy. Now, since I suspect this is a bit confusing, let's take it in two steps—first, the way we might be tempted to do it, but that will yield a similar error to the earlier example:

SELECT sc.CustomerID,
       COUNT(soh.SalesOrderID) AS 'CustomerID/@OrderCount'
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

Error time:

Msg 6852, Level 16, State 1, Line 1
Attribute-centric column 'CustomerID/@OrderCount' must not come after a non-attribute-centric sibling in XML hierarchy in FOR XML PATH.

To fix this, we need to understand a bit about how things are constructed when building the XML tags. The key is that the tags are essentially built in the order you list them. So, if you want to add attributes to an element, you need to keep in mind that they are part of the element tag—that means you need to define any attributes before you define any other content of that element (subelements or raw text).

In our case, we are putting the CustomerID in as raw text, but the OrderCount as an attribute (okay, backward from what would be likely in real life, but hang with me here). This means we are telling SQL Server things backward. By the time it sees the OrderCount information, it is already done with attributes for CustomerID and can't go back.

So, to fix things, we simply need to tell it about the attributes before we tell it about any more elements or raw text:

SELECT COUNT(soh.SalesOrderID) AS 'CustomerID/@OrderCount',
       sc.CustomerID
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

This probably seems counterintuitive, but, again, think of the order things are being written in. The attributes are written first, and then, and only then, can we write the lower-level information for the CustomerID element. Run it, and you'll see we get what we were after:

<row><CustomerID OrderCount="7">29484</CustomerID></row>
<row><CustomerID OrderCount="4">29485</CustomerID></row>

The OrderCount has now been moved into the attribute position, just as we desired, and the actual CustomerID is still raw text embedded in the element.

Follow the logic of the ordering of what you ask for a bit, because it works for most everything. So, if we wanted CustomerID to also be an attribute rather than raw text, but wanted it to come after OrderCount, we could do that—we just need to make sure that it comes after the OrderCount definition, as the sketch that follows shows.
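Here is a minimal sketch of that variation (the ID attribute name is my own choice): both values become attributes of the CustomerID element, and, because the attribute definitions come before any element content, there is no error this time:

SELECT COUNT(soh.SalesOrderID) AS 'CustomerID/@OrderCount',
       sc.CustomerID AS 'CustomerID/@ID'
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;
-- Yields <row><CustomerID OrderCount="7" ID="29484" /></row> and so on.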
But Wait, There's More...

As I said earlier, XPath has its own complexity and is a book's worth of material to itself, but I don't want to leave you with just what I said in the preceding sections and claim that's all there is.

@ and / will give you a great deal of flexibility in building the XML output just the way you want it, and they will probably meet the need well for most simple applications. If, however, you need something more, then there is still more out there waiting for you. For example, you can:

 * "Wildcard" data such that it's all run together as text data without being treated as separate columns
 * Embed native XML data from XML data type columns
 * Use XPath node tests—these are special XPath directives that change the behavior of your data
 * Use the data() directive to allow multiple values to be run together as one data point in the XML
 * Utilize namespaces
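One closely related technique deserves a quick sketch here (it is standard FOR XML PATH usage, though not one of this chapter's worked examples): you can nest a whole correlated subquery with the TYPE directive, which embeds each customer's orders as real XML rather than as encoded text:

SELECT sc.CustomerID AS '@CustomerID',
       (SELECT soh.SalesOrderID AS '@SalesOrderID',
               soh.OrderDate AS '@OrderDate'
        FROM Sales.SalesOrderHeader soh
        WHERE soh.CustomerID = sc.CustomerID
        FOR XML PATH('Order'), TYPE)
FROM Sales.Customer sc
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
FOR XML PATH('Customer'), ROOT('Customers');
-- Produces <Customers><Customer CustomerID="29484"><Order ... /> ... </Customer> ... </Customers>
-- Without TYPE, the inner XML would be entity-encoded into plain text.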
OPENXML

We've spent pages and pages dealing with how to turn our relational data into XML. It seems reasonably intuitive, then, that SQL Server must also allow you to open a string of XML and represent it in the tabular format that is expected in SQL.

OPENXML is a rowset function that opens your string much as other rowset functions (such as OPENQUERY and OPENROWSET) work. This means that you can join to an XML document, or even use it as the source of input data by using an INSERT..SELECT or a SELECT INTO. The major difference is that it requires you to use a couple of system stored procedures to prepare your document and clear the memory after you're done using it.

To set up your document, you use sp_xml_preparedocument. This moves the string into memory and pre-parses it for optimal query performance. The XML document will stay in memory until you explicitly say to remove it or you terminate the connection that sp_xml_preparedocument was called on.

Let me digress a moment and say that I'm not at all a fan of letting a system clean up for you. If you instantiate something, then you should proactively clean it up when you're done (if only I could teach my youngest child this when she pulls out her toys!).

Much like Visual Basic, C#, and most other languages are supposed to clean up your objects when they go out of scope, SQL Server is supposed to clean up your prepared documents. Please do not take the lazy approach of relying on this—clean up after yourself! By explicitly deallocating the document (using sp_xml_removedocument), you are making certain the cleanup happens, clearing it from memory slightly sooner, and also making it very clear in your code that you're done with it.

The syntax is pretty simple:

sp_xml_preparedocument @hdoc = <integer variable> OUTPUT
    [, @xmltext = <XML document text>]
    [, @xpath_namespaces = <namespace declaration>]

Note that, if you are going to provide a namespace declaration, you need to wrap it in the < and > symbols at both ends (for example, <root xmlns:myns="urn:MyNamespace">).

The OPENXML function itself then looks like this:

OPENXML(<document handle>, <XPath to base node> [, <mapping flags>])
[WITH (<schema declaration>|<table name>)]

We have pretty much already discussed the handle—this is going to be an integer value that you received as an output parameter from your sp_xml_preparedocument call.

When you make your call to OPENXML, you must supply the XPath to a node that will serve as a starting point for all your queries. The schema declaration can refer to all parts of the XML document by navigating relative to the base node you set here.

Next up are the mapping flags. These assist us in deciding whether we want to favor elements or attributes in our OPENXML results. The options are:

Byte Value | Description
---|---
0 | Same as 1, except that you can't combine it with 2 or 8 (2 + 0 is still 2). This is the default.
1 | Unless combined with 2 (described next), only attributes will be used. If there is no attribute with the name specified, then a NULL is returned. This can also be added to either 2 or 8 (or both) to combine behavior, but this option takes precedence over option 2. If XPath finds both an attribute and an element with the same name, the attribute wins.
2 | Unless combined with 1 (described previously), only elements will be used. If there is no element with the name specified, then a NULL is returned. This can also be added to either 1 or 8 (or both) to combine behavior. If combined with 1, then the attribute will be mapped if it exists. If no attribute exists, then the element will be used. If no element exists, then a NULL is returned.
8 | Can be combined with 1 or 2 (described previously). Consumed data should not be copied to the overflow property @mp:xmltext (you would have to use the metaproperty schema item to retrieve this). If you're not going to use the metaproperties—and most of the time you won't be—I recommend this option. It cuts a small (okay, very small) amount of overhead out of the operation.

Finally comes the schema declaration or table. If you're defining a schema and are not familiar with XPath, this part can be a bit tricky. Fortunately, this particular use of XPath isn't very complex and should become second nature fairly quickly (it works a lot like directories do in Windows, only with a lot more power).

The schema declaration can vary somewhat in the way you declare it. The definition is declared as:

WITH (
    <column name> <data type> [<column XPath>|<metaproperty>]
    [, <column name> <data type> [<column XPath>|<metaproperty>]]
    [, ...]
)

 * The column name is just that—the name of the attribute or element you are retrieving. This will also serve as the name you refer to when you build your SELECT list, perform JOINs, and the like.
 * The data type is any valid SQL Server data type. Since XML can have data types that are not equivalents of those in SQL Server, an automatic coercion will take place if necessary, but this is usually predictable.
 * The column XPath is the XPath pattern (relative to the node you established as the starting point for your OPENXML function) that gets you to the node you want for your column—whether an element or attribute gets used is dependent on the flags parameter, as described previously. If this is left off, then SQL Server assumes you want the current node as defined as the starting point for your OPENXML statement.
 * Metaproperties are a set of special variables that you can refer to in your OPENXML queries. They describe various aspects of whatever part of the XML DOM you're interested in. To use them, just enclose them in single quotes and put them in the place of the column XPath. Available metaproperties include:
   * @mp:id—Don't confuse this with the XML id that we looked at with EXPLICIT.
While this property serves a similar function, it is a unique identifier (within the scope of the document) of the DOM node. The difference is that this value is system generated—as such, you can be sure it is there. It is guaranteed to refer to the same XML node as long as the document remains in memory. If the id is zero, it is the root node (its @mp:parentid property, as referred to next, will be NULL).
   * @mp:parentid—This is the same as the preceding, only for the parent.
   * @mp:localname—Provides the non-fully-qualified name of the node. It is used with a prefix and namespace URI (Uniform Resource Identifier—you'll usually see it starting with URN) to name element or attribute nodes.
   * @mp:parentlocalname—This is the same as the preceding, only for the parent.
   * @mp:namespaceuri—Provides the namespace URI of the current element. If the value of this attribute is NULL, no namespace is present.
   * @mp:parentnamespaceuri—This is the same as the preceding, only for the parent.
   * @mp:prefix—Stores the namespace prefix of the current element name.
   * @mp:parentprefix—This is the same as the preceding, only for the parent.
   * @mp:prev—Stores the @mp:id of the previous sibling relative to a node. Using this, you can tell something about the ordering of the elements at the current level of the hierarchy. For example, if the value of @mp:prev is NULL, then you are at the first node for this level of the tree.
   * @mp:xmltext—This metaproperty is used for processing purposes and contains the actual XML for the current element.

Of course, you can always save yourself a ton of work by bypassing all these parameters. You get to do this if you have a table that directly relates (names and data types) to the XPath starting point that you've specified in your XML. If you do have such a table, you can just name it, and SQL Server will make the translation for you!

Okay, that's a lot to handle, but we're not quite finished yet. You see, when you're all done with your XML, you need to call sp_xml_removedocument to clean up the memory where your XML document was stored. Thankfully, the syntax is incredibly easy:

sp_xml_removedocument [@hdoc =] <document handle>

Again, I can't stress enough how important it is to get in the habit of always cleaning up after yourself. I know that, in saying that, I probably sound like your mother. Well, like your mother, SQL Server will clean up after you some, but, like your mother, you can't count on SQL Server to clean up after you every time. SQL Server will clean things up when you terminate the connection, but what if you are using connection pooling? Some connections may never go away if your system is under load. It's an easy sproc to implement, so do it—every time!
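As a minimal sketch of making that cleanup hard to forget, you can wrap the work in TRY/CATCH so that sp_xml_removedocument runs even when the query in the middle fails. The document here is a made-up stand-in, not data from this chapter's examples:

DECLARE @idoc int;
EXEC sp_xml_preparedocument @idoc OUTPUT,
     N'<ROOT><Item ID="1"/><Item ID="2"/></ROOT>';  -- hypothetical stand-in document
BEGIN TRY
    SELECT * FROM OPENXML(@idoc, '/ROOT/Item', 0) WITH (ID int);
    EXEC sp_xml_removedocument @idoc;  -- normal cleanup
END TRY
BEGIN CATCH
    EXEC sp_xml_removedocument @idoc;  -- cleanup still happens on a failure
END CATCH;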
Okay, I'm sure you've been waiting for me to get to how you really make use of this—so now it's time for the all-important example.

Imagine that you are merging with another company and need to import some of their data into your system. For this example, we'll say that we're working on importing a few shipping providers that they have and our company doesn't. A sample of what our script might look like to import these from an XML document might be:

USE AdventureWorks2008;

DECLARE @idoc int;
DECLARE @xmldoc nvarchar(4000);

-- define the XML document
SET @xmldoc = '
<ROOT>
  <Shipper CompanyName="Billy Bob''s Pretty Good Shipping" Base="4.50" Rate="1.05"/>
  <Shipper CompanyName="Fred''s Freight" Base="3.95" Rate="1.29"/>
</ROOT>';

PRINT @xmldoc;

-- Load and parse the XML document in memory
EXEC sp_xml_preparedocument @idoc OUTPUT, @xmldoc;

-- List out what our shippers table looks like before the insert
SELECT * FROM Purchasing.ShipMethod;

-- See our XML data in a tabular format
SELECT * FROM OPENXML (@idoc, '/ROOT/Shipper', 0) WITH (
    CompanyName nvarchar(40),
    Base decimal(5,2),
    Rate decimal(5,2));

-- Perform an insert based on that data
INSERT INTO Purchasing.ShipMethod
    (Name, ShipBase, ShipRate)
SELECT * FROM OPENXML (@idoc, '/ROOT/Shipper', 0) WITH (
    CompanyName nvarchar(40),
    Base decimal(5,2),
    Rate decimal(5,2));

-- Now look at the Shippers table after our insert
SELECT * FROM Purchasing.ShipMethod;

-- Now clear the XML document from memory
EXEC sp_xml_removedocument @idoc;

The final result set from this looks just like what we wanted. (Note that I've snipped off the final two columns for brevity.)

ShipMethodID  Name                               ShipBase  ShipRate
------------  ---------------------------------  --------  --------
1             XRQ - TRUCK GROUND                 3.95      0.99
2             ZY - EXPRESS                       9.95      1.99
3             OVERSEAS - DELUXE                  29.95     2.99
4             OVERNIGHT J-FAST                   21.95     1.29
5             CARGO TRANSPORT 5                  8.99      1.49
6             Billy Bob's Pretty Good Shipping   4.50      1.05
7             Fred's Freight                     3.95      1.29

It isn't pretty, but it works—XML turned into relational data.

A Quick Heads-Up Regarding XML Indexes

We're going to defer discussion of XML indexes until we discuss some of the other indexing constructs in SQL Server, but I wanted to take a moment and make sure that you realized that indexes can be built over XML data. We will discuss them more fully in Chapter 7, but, for now, I want to make sure that you are taking XML indexes into consideration in your design efforts and performance expectations.

A Brief Word on Hierarchical Data

XML is naturally hierarchical. The concept of a root and then branching levels of elements and attributes pretty much says everything; one is higher in lineage than another. While XML has been index capable since SQL Server 2005, there is nothing inherent in the XML data type that allows for the handling of XML in a truly hierarchical fashion.

Beginning with SQL Server 2008, we have a new data type that is explicitly created for the purpose of dealing with hierarchical data—the HierarchyID data type. I want to make sure that you're aware of this new data type as a tool for keeping track of hierarchical data in a relational format. This has significant implications in terms of when you might want to store data in XML versus a more traditional data format.

We will defer full discussion of HierarchyID and other hierarchy design issues until Chapter 7, but keep the correlation in mind. You may well find that you want to store information on how deep your XML data is within the tree hierarchy to facilitate fast response to hierarchy questions.

Summary

The size of the XML portion of SQL Server has grown considerably since its original introduction as a "Web release" prior to SQL Server 2000, and it continues to grow. XML is one of the most important technologies to hit the industry in the last 20 or more years.
It provides a flexible, very transportable way of describing data, and SQL Server now has more and more ways of meeting your XML needs.

In this chapter, we've taken a look at how to get relational data into XML format, and how to get XML data into a relational structure. We've also seen how SQL Server can supply Web service data directly using XML-based methods.

5

Daring to Design

And so I come to another one of those things where I have to ponder how much to assume you already know. "To normalize, or not to normalize—THAT is the question!" Okay, the real question is whether or not you already understand the most basic tenets of relational database design. Since you come to this book with a degree of experience already, I'm going to take an approach that assumes you've heard of it, know it's important, and even grasp the basics of it. I'm going to assume you need the information filled in for you rather than that you are starting from scratch.

With the exception of perhaps three or four chapters, this book has an Online Transaction Processing, or OLTP, flair to its examples. Don't get me wrong; I will point out, from time to time, some of the differences between OLTP and its more analysis-oriented cousin, Online Analytical Processing (OLAP). My point is that you will, in most of the examples, be seeing a table design that is optimized for the most common kind of database—OLTP. Thus, the table examples will typically have a database layout that is, for the most part, normalized to what is called the third normal form.

What is "normal form"? We'll start off by taking a very short look at that and then move quickly on to more advanced concepts. For the moment, though, just say that it means your data has been broken out into a logical, nonrepetitive format that can easily be reassembled into the whole. In addition to normalization (which is the process of putting your database into normal form), we'll also be examining the characteristics of OLTP and OLAP databases. And, as if we didn't have enough to do between those two topics, we'll also be looking at many examples of how the constraints we've already seen are implemented in the overall solution.

Normalization 201

If you've read Beginning SQL Server 2008 Programming, then you can probably safely skip this section and move on to the more advanced concepts.

I want to start off by saying that there are six normal forms (plus or minus one or two, depending on which academician you listen to). We'll leave several of those to the academicians, though. Those in the real world usually deal with only three normal forms. Indeed, a fully normalized database is generally considered to be one that is normalized to the third normal form.

The concept of normalization has to be one of the most over-referenced yet misunderstood concepts in programming. Everyone thinks they understand it, and many do in at least its academic form. Unfortunately, it also tends to be one of those things that many database designers wear like a cross—it is somehow their symbol that they are "real" database architects. What it really is, however, is a symbol that they know what the normal forms are—and that's all. Normalization is really just one piece of a larger database design picture. Sometimes you need to normalize your data—then again, sometimes you need to deliberately de-normalize your data. Even within the normalization process, there are often many ways to achieve what is technically a normalized database.
My point is that normalization is a theory, and that's all it is. Once you choose whether or not to implement a normalized strategy, what you have is a database—hopefully the best one you could possibly design. Don't get stuck on what the books (including this one) say you're supposed to do—do what's right for the situation that you're in. As the author of this book, all I can do is relate concepts to you—I can't implement them for you, and neither can any other author (at least not with the written word). You need to pick and choose between these concepts in order to achieve the best fit and the best solution.

By this point in your database development background, I would expect that you already understand how to create a primary key and some of the reasons for using one in your tables—if we want to be able to act on just one row, then we need to be able to uniquely identify that row. The concepts of normalization are highly dependent on issues surrounding the definition of the primary key and what columns are dependent on it. One phrase you might hear frequently in normalization is:

The key, the whole key, and nothing but the key.

The somewhat fun addition to this is:

The key, the whole key, and nothing but the key, so help me Codd!

This is a super-brief summarization of what normalization is about, out to the third normal form (for those who don't know, Codd is considered the father of relational design). When you can say that all your columns are dependent only on the whole key and nothing more or less, then you are at third normal form.

Now let's review the various normal forms and what each does for you.

Where to Begin

The concepts of relational database design are founded on the notion of entities and relations. If you're familiar with object-oriented programming, then you can liken most top-level entities to objects in an object model. Much as a parent object might contain other objects that further describe it, tables may have a child or other table that further describes the rows in the original table.

An entity will generally tie to one "parent" table. That table will usually have one and only one row per instance of the entity you're describing (for example, a table that is the top table for tracking orders in a system will have only one row per individual order). The one entity may, however, require multiple tables to provide additional descriptive information (for example, a details or line-item table to carry a list of all the things that were purchased on that particular order).

A relation is a representation of how two entities relate to each other logically. For example, a customer is a different entity from an order, but they are related. You cannot have so much as one order without at least one customer. Furthermore, your order relates to only one customer.

As you start the process of "normalizing" these entities and relations into tables, some things about your data are assumed even before you get to the first of the normal forms:

 * The table should describe one and only one entity. (No trying to shortcut and combine things!)
 * All rows must be unique, and there must be a primary key.
 * The column and row order must not matter.

As you gain experience, this will become less of a "process" and more of the natural starting point for your tables. You will find that creating a normalized set of tables will be the way things flow from your mind to start with rather than anything special that you have to do.
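As a grounding sketch of that parent/child shape (hypothetical table names, not part of AdventureWorks), the order entity described above might break into a parent table and a line-item child table like this:

CREATE TABLE Orders (
    OrderID    int IDENTITY PRIMARY KEY,  -- one row per order (the entity)
    CustomerID int NOT NULL,
    OrderDate  datetime NOT NULL
);

CREATE TABLE OrderDetails (
    OrderID    int NOT NULL REFERENCES Orders (OrderID),  -- ties each line to its parent order
    LineNumber int NOT NULL,
    ProductID  int NOT NULL,
    Quantity   int NOT NULL,
    PRIMARY KEY (OrderID, LineNumber)
);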
+
+Getting to Third Normal Form
+
+As I indicated earlier, there are, from a practical point of view, three normal forms:
+
+ * The First Normal Form (1NF) is all about eliminating repeating groups of data and guaranteeing atomicity (each column holds a single, self-contained piece of data). At a high level, it works by creating a primary key (which you already have), then moving any repeating data groups into new tables, creating new keys for those tables, and so on. In addition, you break out any columns that combine data into separate rows for each piece of data.
+ * Second Normal Form (2NF) further reduces the incidence of repeated data (not necessarily groups). Second normal form has two rules to it:
+ * The table must meet the rules for first normal form. (Normalization is a building block kind of process—you can't stack the third block on if you don't have the first two there already.)
+ * Each column must depend on the whole key.
+ * Third Normal Form (3NF) deals with the issue of having all the columns in your table not just be dependent on something—but the right thing. Third normal form has just three rules to it:
+ * The table must be in 2NF (I told you this was a building block thing).
+ * No column can have any dependency on any other non-key column.
+ * You cannot have derived data (that is, data that can be inferred from other data in your tables).
+
+Other Normal Forms
+
+There are a few other forms out there that are considered, at least by academics, to be part of the normalization model. These include:
+
+ * Boyce-Codd (considered to really just be a variation on third normal form)—This one tries to address situations where you have multiple overlapping candidate keys. This can only happen if:
+
+a. All the candidate keys are composite keys (that is, it takes more than one column to make up the key).
+
+b. There is more than one candidate key.
+
+c. The candidate keys each have at least one column that is in common with another candidate key.
+
+This is typically a situation where any number of solutions works, and almost never gets thought of outside the academic community (and I think I'll stop thinking about it right now....).
+
+ * Fourth Normal Form—This one tries to deal with issues surrounding multi-valued dependence. This is the situation where every column depends on the whole primary key and nothing else (that is, the table already meets third normal form), yet there can be rather odd situations where one column in the primary key can depend separately on other columns in the primary key. These are rare and don't usually cause any real problem. Thus, they are largely ignored in the database world, and we will not address them any further here.
+ * Fifth Normal Form—Deals with non-loss and loss decompositions. Essentially, there are certain situations where you can decompose a relationship such that you cannot logically recompose it into its original form. Again, these are rare, largely academic, and, again, we won't deal with them any further here.
+
+This is, of course, just a really quick look at these—and that's deliberate on my part. The main reason you need to know these in the real world is either to impress your friends (or prove to them you're a "know it all") or to avoid sounding like an idiot when some database guru comes to town and starts talking about them. However you choose to use this knowledge, I do recommend against using it to get dates.
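+
+Before moving on, it's worth seeing what a third normal form violation actually looks like in DDL. The following is a minimal sketch—the table and column names are invented for illustration—showing a line item table that breaks 3NF twice (a column dependent on a non-key column, plus derived data), followed by a normalized version:
+
+-- Breaks 3NF: ProductName depends on ProductNo (a non-key column),
+-- and TotalPrice is derived data (Qty * UnitPrice).
+CREATE TABLE LineItemsDenormalized (
+    OrderNo     int         NOT NULL,
+    LineNo      int         NOT NULL,
+    ProductNo   int         NOT NULL,
+    ProductName varchar(50) NOT NULL,
+    Qty         int         NOT NULL,
+    UnitPrice   money       NOT NULL,
+    TotalPrice  money       NOT NULL,
+    CONSTRAINT PKLineItemsDenormalized PRIMARY KEY (OrderNo, LineNo)
+);
+
+-- The 3NF version: product attributes move to their own table, and the
+-- derived column simply goes away (compute it at query time instead).
+CREATE TABLE ProductList (
+    ProductNo   int         NOT NULL PRIMARY KEY,
+    ProductName varchar(50) NOT NULL
+);
+
+CREATE TABLE LineItems (
+    OrderNo   int   NOT NULL,
+    LineNo    int   NOT NULL,
+    ProductNo int   NOT NULL REFERENCES ProductList (ProductNo),
+    Qty       int   NOT NULL,
+    UnitPrice money NOT NULL,
+    CONSTRAINT PKLineItems PRIMARY KEY (OrderNo, LineNo)
+);
+
+Every non-key column in LineItems now depends on the key, the whole key, and nothing but the key.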
+
+Relationships
+
+Well, I've always heard from women that men immediately leave the room if you even mention the word "relationship." With that in mind, I hope that I didn't just lose about half my readers.
+
+I am, of course, kidding—but not by as much as you might think. Experts say the key to successful relationships is that you know the role of both parties and that everyone understands the boundaries and rules of the relationship that they are in. I could be talking about database relationships with that statement every bit as much as people relationships.
+
+There are three major kinds of relationships:
+
+ * One-to-one—This is exactly what it says it is. A one-to-one relationship is one where the fact that you have a record in one table means that you have exactly one matching record in another table.
+ * One-to-many—This is one form of your run-of-the-mill, average, everyday foreign key kind of relationship. Usually, this is found in some form of header/detail relationship, and generally implements some idea of a parent to child hierarchy. For example, for every one customer, you might have several orders.
+ * Many-to-many—In this type of relationship, both sides of the relationship may have several records that match. An example of this would be the relationship of products to orders—an order may contain several products, and, likewise, a product will appear on many orders. SQL Server has no way of physically establishing a direct many-to-many relationship, so you cheat by having an intermediate table to organize the relationship (there's a DDL sketch of this coming up shortly).
+
+Each of these has some variations depending on whether one side of the relationship is nullable or not. For example, instead of a one-to-one relationship, you might have a zero- or one-to-one relationship.
+
+Diagramming
+
+Entity-relationship diagrams (ERDs) are an important tool in good database design. Small databases can usually be easily created from a few scripts and implemented directly without drawing things out at all. The larger your database gets, however, the more problematic it becomes to just do things "in your head." ERDs solve a ton of problems because they allow you to quickly visualize and understand both the entities and their relationships.
+
+For this book, I've decided to do things somewhat in reverse of how I've done things before. SQL Server includes a very basic diagramming tool that you can use as a starting point for building rudimentary ERDs. Unfortunately, it employs a proprietary diagramming methodology that does not look remotely like any standard I'm aware of out there. In addition, it does not allow for the use of logical modeling—something I consider a rather important concept. Therefore, I'm going to start off talking about the more standard diagramming methodologies first—later in the chapter we'll look at SQL Server's built-in tools and how to use them.
+
+There are two reasonably common diagramming paradigms—IE and IDEF1X. You'll find both of these in widespread use, but I'm going to limit things here to a once-over of the basics of IE (also called Information Engineering). For the record, IDEF1X is a perfectly good diagramming paradigm, and was first put forth by the U.S. Air Force. IE (again, Information Engineering—not Internet Explorer) is, however, the method I use personally, and I do so for just one reason—it is far more intuitive for the inexperienced reviewer of your diagrams. I also find it to be the far more common of the two.
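+
+Before we head into diagram-land, it may help to pin down the relationship types themselves in actual DDL. Here is a minimal sketch (the table and column names are invented for illustration, though we'll meet some very similar ones later in the chapter):
+
+-- One-to-many: one customer, many orders (an everyday foreign key).
+CREATE TABLE Customers (
+    CustomerNo int NOT NULL PRIMARY KEY
+);
+
+CREATE TABLE Orders (
+    OrderID    int NOT NULL PRIMARY KEY,
+    CustomerNo int NOT NULL REFERENCES Customers (CustomerNo)
+);
+
+-- Many-to-many: orders to products, "cheated" via an intermediate table.
+-- Each side has a simple one-to-many relationship with OrderDetails.
+CREATE TABLE Products (
+    PartNo int NOT NULL PRIMARY KEY
+);
+
+CREATE TABLE OrderDetails (
+    OrderID int NOT NULL REFERENCES Orders (OrderID),   -- part of this table's key
+    LineNo  int NOT NULL,
+    PartNo  int NOT NULL REFERENCES Products (PartNo),  -- just an ordinary column
+    Qty     int NOT NULL,
+    CONSTRAINT PKOrderDetails PRIMARY KEY (OrderID, LineNo)
+);
+
+Notice that OrderDetails borrows OrderID into its own primary key, while PartNo stays an ordinary column. File that distinction away—it is exactly what the upcoming discussion of identifying versus non-identifying relationships is about.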
+
+I can't say enough about the importance of having the right tools. While the built-in tools at least give you "something," they are a long way from "what you need."
+
+ER tools are anything but cheap—running from somewhere over $1,000 to just under $5,000 (that's per seat!). They are also something of a language unto themselves. Don't plan on just sitting down and going to work with any of the major ER tools—you had better figure on some spin-up time to get the tool to do what you expect.
+
+Don't let the high price of these tools keep you from building a logical model. While Visio continues to fall somewhat short in terms of answering the world's database design problems, it does do okay in a pinch for light logical modeling and can do some degree of synchronization and physical modeling. That said, if you're serious about database design, and going to be doing a lot of it, you really need to find the budget for a real ER tool.
+
+Expense aside, there is no comparison between the productivity possible in the third-party tools out there and the built-in tools. Depending on the ER tool you select, they give you the capability to do things like:
+
+ * Create logical models, and then switch back and forth between the logical and physical model.
+ * Work on the diagram offline—then propagate all your changes to the physical database at one time (when you're ready, as opposed to when you need to log off).
+ * Reverse engineer your database from any one of a number of mainstream RDBMS systems (even some ISAM databases), and then forward engineer them to a completely different RDBMS.
+ * Create your physical model on numerous different systems.
+
+This really just scratches the surface.
+
+A Couple of Relationship Types
+
+Before you get going too far into more diagramming concepts, I want to explore two types of relationships: identifying and non-identifying.
+
+Identifying Relationships
+
+For some of you, I'm sure the term identifying relationship brings back memories of some boyfriend or girlfriend you've had in the past who got just a little too possessive—this is not that kind of relationship. Instead, you're dealing with the relationships that are defined by foreign keys.
+
+An identifying relationship is one where the column or columns (remember, there can be more than one) being referenced (in the parent table) are used as all or part of the referencing (child) table's primary key. Since a primary key serves as the identity for the rows in a table, and all or part of the primary key for the child table is dependent on the parent table, the child table can be said to, at least in part, be "identified" by the parent table.
+
+Non-Identifying Relationships
+
+Non-identifying relationships are those that are created when you establish a foreign key that does not serve as part of the referencing (child) table's primary key. This is extremely common in situations where you are referencing a domain table—where essentially the sole purpose of the referenced table is to limit the referencing field to a set list of possible choices.
+
+The Entity Box
+
+One of the many big differences you'll see in both IE and IDEF1X versus SQL Server's own brand of diagramming comes in the entity box. The entity box, depending on whether you're dealing with logical or physical models, equates roughly to a table. By looking over the entity box, you should be able to easily identify the entity's name, primary key, and any attributes (effectively columns) that entity has.
In addition, the diagram may expose other information such as the attribute's data type or whether it has a foreign key defined for it. As an example, consider the entity box in Figure 5.1.
+
+Figure 5.1
+
+The name of our entity is kept on the top outside the box. Then, in the top area of the overall box, but in a separate box of its own, you have the primary key (you'll look at an example with more than one column in the primary key shortly), and last, but not least, come the attributes of the entity.
+
+Take a look at a slightly different entity (Figure 5.2).
+
+Figure 5.2
+
+Several new things appear:
+
+ * The data types (I've turned on the appropriate option).
+ * Foreign keys (if any—again I've turned on the option to make this show).
+ * You have multiple columns in the primary key (everything above the line is part of the primary key).
+ * This time, the entity is rounded on the corners. This tells you that this table is identified (remember identifying relationships?) by at least one other table.
+
+Depending on the ER tool, the data types can be defined right within the ER diagram. Also, as you draw the lines that form your relationships (you'll look at those shortly), you are able to define foreign keys, which can also be shown. For most available ER tools, you can even tell the tool to automatically define the referenced field(s) in the foreign key relationship as being part (or possibly all) of the primary key in the referencing table.
+
+The Relationship Line
+
+There are two kinds of relationship lines, and they match 100 percent with our relationship types:
+
+A solid line indicates an identifying relationship:
+
+______________________________
+
+A broken or dashed line indicates a non-identifying relationship:
+
+------------------------------
+
+Again, an identifying relationship is one where the column that is referencing another table serves as all or part of the primary key of the referencing table. In a non-identifying relationship, the foreign key column has nothing to do with the primary key in the referencing table.
+
+Terminators
+
+Ahh, this is where things become slightly more interesting. The terminators we're talking about here are, of course, not the kind you'd see Arnold Schwarzenegger play in a movie—they are the end caps that we put on our relationship lines.
+
+The terminators on our lines will communicate as much about the nature of our database as the entities themselves will—maybe more. They are the thing that will tell you the most information about the true nature of the relationship, including the cardinality of the relationship.
+
+Cardinality is, in its most basic form, the number of records on both sides of the relationship. When you say it is a one-to-many relationship, then you are indicating cardinality. Cardinality can, however, be much more specific than the zero, one, or many naming convention that you use more generically. Cardinality can address specifics, and is often augmented in a diagram with two numbers and a colon, such as:
+
+ * 1:M
+ * 1:6 (which, while meeting a one-to-many criterion, is more specific and says there is a maximum of 6 records on that side of the relationship).
+
+Let's walk through a couple of the parts of a terminator and examine what they mean.
+
+Just as a reminder, the terminators that follow are the ones from the IE diagramming methodology. As I have indicated, there is another diagramming standard that is in widespread use (though I see it much less than IE) called IDEF1X.
While its entity boxes are much like IE's, its terminators on the relationship lines are entirely different.
+
+The top half of the terminator shown in Figure 5.3 indicates the first half of our relationship. In this case, we have a zero. The bottom half indicates the second half of our relationship—in this case, a many. In this example, then, we have a zero, one, or many side of a relationship.
+
+Figure 5.3
+
+In Figure 5.4, you're not allowing a zero at this end of the relationship—this is a one or many end to a relationship.
+
+Figure 5.4
+
+In Figure 5.5, you're back to allowing a zero at this end of the relationship, but you are now allowing a maximum of one. This is a zero or one side of a relationship.
+
+Figure 5.5
+
+And last, but not least, you have Figure 5.6. This one is pretty restrictive—it's simply a "one" (no more, no less) side of a relationship.
+
+Figure 5.6
+
+Since it's probably pretty confusing to look at these just by themselves, take a look at a couple of example tables and relationships (Figure 5.7).
+
+Figure 5.7
+
+Figure 5.7 is a diagram that shows two tables that support the notion of just one logical entity—an order. You have an Orders table to keep track of information that is global to the order (this has just a CustomerNo, but it might also contain things like a shipping address, a date of the order, a due date, and so on). You also have an OrderDetails table to track the individual line items on this order. The diagram depicts not only your Orders and OrderDetails tables but also the one (the Orders side) to zero, one, or many (the OrderDetails side) relationship between the two tables. The relationship is an identifying relationship (solid, rather than dashed line), and the relationship is called OrderHasDetails.
+
+In Figure 5.8, you add in a Products table.
+
+Figure 5.8
+
+This new relationship is very similar to the relationship that you already looked at. It is again a one (Products this time) to zero, one, or many (OrderDetails again) relationship, but this one is non-identifying (as represented by the broken line). The IE diagram indicates that, for this table, PartNo is an Inversion Entry, or an index that is not associated with anything other than a foreign key. The Inversion Entry has been added because it usually makes sense to have an index on a field that is a foreign key (since it is a frequent target of lookups).
+
+By looking at all three together, you can see that there is a many-to-many relationship between Orders and Products by virtue of their relationship through the OrderDetails table.
+
+Note that an Inversion Entry does not have to be associated with anything at all—it just happened to be associated with a foreign key in this particular case. An Inversion Entry is essentially any index that is not unique or associated with a primary key.
+
+As I've indicated before, you are still really only scratching the surface of the different information that your ER diagrams can convey. Still, as you look later in the chapter at the SQL Server diagramming tools, you will be able to see that the more accepted methodologies out there have an awful lot more information to convey than the included tools do. In addition, just the nature of how tables are displayed makes information such as keys more visible and easier to read.
+
+Logical versus Physical Design
+
+In your database work, you may have already heard about the concepts of logical versus physical models.
In this section, we'll be exploring the differences between the two.
+
+The physical model is one that's probably pretty easy to grasp. It is essentially what you have been working with up to this point in the book. You can think of anything that you can perform a CREATE statement on as being part of the physical model. Indeed, if you can run any statement in SQL Server against it at all, then it must be part of the physical model.
+
+That being said, a logical model is a means to a number of different things—the physical model in particular. This means that, as you work on the logical model, you are working your way toward being able to generate DDL (Data Definition Language—or things like CREATE, ALTER, and DROP statements). Think of the logical model as being like the planning stages for an artist. The artist figures out what to paint, gets out the paints and brushes, and picks out an appropriately sized canvas, but he hasn't painted anything yet. The physical model is the actual painting. The painting is, of course, what everyone sees and notices, but the painting couldn't exist without the decision of what to paint and the gathering of the paints and other supplies needed. Likewise, the best physical models are generally put together as a progression from a solid logical model.
+
+Purpose of a Logical Model
+
+The first thing to understand about logical models is that they have somewhat different goals than physical models do. A logical model does several things for you:
+
+ * Allows you to begin to build abstracts of complex, data-related business issues as well as provide a high-level effort at identifying your entities
+ * Allows you to use these abstracts to effectively communicate business rules and content as they relate to data
+ * Represents the purest form of the data (before you start introducing the realities of what will really work)
+ * Serves as a major piece of documentation in the data requirements portion of your project
+
+Because logical models aren't strictly rooted in the exact syntax to create the database, they give you a flexibility that you can't obtain from a physical model. You can attach dialog and rules to the logical model regardless of whether your particular RDBMS will support those rules or not. In short, it allows you to squeeze in all the facts before you start paring down your design to a specific implementation.
+
+What's nice about this is that logical models allow you to capture all of your data rules in one place regardless of where each rule will actually be implemented. You will frequently run into situations where you cannot sensibly implement your rules in the database. The rules in question may be data related, but due to some constraint or requirement, you need to implement them using more procedural code in your client or in some form of middle tier. With logical models, you go ahead and include the data-related rules anyway.
+
+Regardless of its source, you include all data-related information in a logical design to create one or more abstracts of the data in your system. These abstracts can then be used as a representation to your customer of what you really intend to store and what rules you believe you have captured. Using such a representation early (and often) can save valuable time and money in your projects by opening extra doors of communication. Even a customer who is not very data savvy can often look at the highest level diagrams and say things like "Where are the purchase requisitions?"
Usually, you have some handy-dandy explanation of why you called them something else and you can point to them on the diagram—other times, however, you find yourself uttering that most fearsome of words—"Oops!" I don't know about you, but I'd rather utter that word in the first weeks of a project than in the first weeks of deployment. Logical modeling, when properly shared with the customer, can help avoid those deployment-time Oops statements.
+
+I can't stress enough the importance of sharing your logical design (there had better be one!) with your customer both early and often. With a little education of the customer in how to read your logical model (this should also include good documentation on the cause and purpose of the entities and relationships of the model), you can save a fortune in both time and money.
+
+I haven't met a developer with any real experience who hasn't, at least once (and probably far more often than that), learned the hard way about the cost of late changes to your system. Changing code is very expensive, but that typically doesn't even begin to touch what happens when you need to change your database late in a project. If you haven't done a good job of abstracting your database, then every change you make to your database is going to cascade through tons of code. In other words, one little change in your database can potentially cost several hundred or even thousands of changes (depending on the size of the system) in the code that accesses the database.
+
+In short, communication is everything, and logical modeling should be a huge part of your tool set for communicating with your customer.
+
+Parts of a Logical Model
+
+A logical model contains three major parts:
+
+ * Structure
+ * Constraints
+ * Rules
+
+The combination of these three should completely describe the requirements of the data in your system, but they may not translate entirely to the physical model. Some of the issues identified in the logical model may need to be implemented in some procedural form (such as in a middle-tier component). Other times, the entire logical model can be implemented through the various features of your RDBMS.
+
+This is a really important point, and I want to stress it again—just because it's in your logical model doesn't mean that it will be in your physical database. A logical model should take into account all of your data requirements—even those that are not possible to implement in your RDBMS (for example, data that you might be retrieving from a third-party source—perhaps in an XML document or some other storage medium). Having everything in your logical model allows you to plan the physical design in such a way that you can be sure that you have addressed all data issues—not just those that will physically reside in the database.
+
+Structure
+
+Structure is that part of the logical design that deals with the concept of actually storing the data. When you deal with the structure of the database, you're talking about entities—most of which will translate to tables that will store your data—and the particular columns you are going to need to maintain the atomicity of your data.
+
+Constraints
+
+Constraints, from a logical model standpoint, are a bit broader than the way that you've used the word constraint up until now. Prior to now, when you used the word constraint, you were talking about a specific set of features to limit data to certain values.
From a logical standpoint, a constraint is anything that defines the "what" question for our data—that is, what data is valid. A logical model includes constraints, which is to say that it includes things like:
+
+ * Data types (notice that this is really a separate thought from the notion that a column needs to exist or what the name of that column should be).
+ * Constraints in the form you're used to up until now—that is, CHECK constraints, foreign keys, or even primary keys and UNIQUE constraints (alternate keys). Each of these provides a logical definition of what data can exist in our database. This area would also include things like domain tables (which you would reference using foreign keys)—which restrict the values in a column to a particular "domain" list.
+
+Rules
+
+If constraints were the "what" in our data, then rules are the "when and how much" in our data.
+
+When we define logical rules, we're defining things like "Do we require a value on this one?" (which equates to "Do we allow nulls?") and "How many of these do we allow?" (which defines the cardinality of our data—do we accept one or many?).
+
+It's worth noting yet again that any of these parts may not be implemented in the physical part of your database—we may decide that the restrictions we want to place on things will be handled entirely at the client. Regardless of where the requirement is implemented, it should still be part of our comprehensive logical data model. It is only when we achieve this complete modeling of our data that we can really know that we have addressed all the issues (regardless of where we addressed them).
+
+Dealing with File-Based Information Via Classic BLOBs
+
+BLOBs. You probably haven't seen enough of them to hate them yet. Whether that's a "yet" or not largely depends on whether or not you need to support backward compatibility.
+
+Back in SQL Server 2005, Microsoft added some new data types (varchar(max), nvarchar(max), and varbinary(max)) that greatly simplify dealing with binary large objects—or BLOBs. SQL Server 2008 adds yet another option to the mix in the form of supporting a special file-level storage option called filestreams (these are a lot more complex, and require a very cohesive design effort with your client-side coders as well as special network considerations). Other than a quick glance at them, we'll largely defer the discussion of filestreams to the next chapter (advanced data structures) and our more advanced performance design chapter (Chapter 21).
+
+When used with a compatible data access model (ADO.NET 2.0 or higher), you can access BLOB data through the more standard methods (that is, without using filestreams) as though it were the same as its smaller base data type (varchar, nvarchar, or varbinary). For those of you still needing to deal with backward compatibility issues, you'll have to use the older (and even slower) "chunking" method to access your data. Regardless of which access method you're using, BLOBs are slow—very slow and big. Using the new access methods can really help BLOB handling performance though, so let me encourage you to migrate as soon as possible to at least SQL Server 2005 as your bottom level of support.
+
+The oldest version of SQL Server you're supporting is the critical factor—not the data access method—when using the newer BLOB data types. SQL Server will automatically translate the newer data types to appear like the old ones when dealing with the older connectivity methods.
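+
+To see just how unexceptional the newer types look in practice, here's a minimal sketch (the table and column names are invented for illustration). The max types declare like their smaller cousins but hold up to roughly 2GB, and ordinary INSERT and SELECT statements work against them directly:
+
+CREATE TABLE DocumentStore (
+    DocumentID int            NOT NULL PRIMARY KEY,
+    DocText    nvarchar(max)  NULL,   -- large Unicode text
+    DocImage   varbinary(max) NULL    -- arbitrary binary data
+);
+
+INSERT INTO DocumentStore (DocumentID, DocText)
+VALUES (1, N'A document body of essentially any length goes here...');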
+
+Note, however, that use of filestreams does require very specific client-side code.
+
+BLOBs are nice in the sense that they let you break the 8K barrier on row size (BLOBs can be up to about 2GB in size). The first problem is that they can be clumsy to use under the old data types and access methods. Perhaps the larger problem, however, is that they are painfully slow (I know, I'm repeating myself, but I suspect I'm also making a point here). In the race between the BLOB and the tortoise (the sequel to the tortoise and the hare), the BLOB won only after the tortoise stopped for a nap.
+
+Okay, okay, so I've beaten the slow thing into the ground. Indeed, there have been substantial performance improvements in BLOB handling over the years, and the difference is not what it used to be, but at the risk of mentioning it one too many times, BLOBs are still relatively slow.
+
+All right, so now you've heard me say BLOBs are slow and you still need to store large blocks of text or binary information. Normally, you'd do that using a BLOB—and, with the recent performance improvements in BLOB handling, that's probably best—but you do have the option of doing it another way. You can go around the problem by storing things as files instead.
+
+Okay, so by now some of you have to be asking the question of "isn't a database going to be a faster way of accessing data than the file system?" My answer is quite simply—"Usually not."
+
+There are two ways to do this without going to filestreams. We'll start with the method that has traditionally been implemented, and then we'll talk about another potential way to do it in the .NET era (it requires very application-specific design on your part).
+
+I'm going to warn you right up front that, in order to pull off the typical way of doing this, you need to be planning for it in your client—this isn't a database server–only kind of thing to do. Indeed, you'll be removing most of the work from the database server and putting it into your middle tier and file system. You can start by looking at what you need to do on the server's file system side. The only thing that you need to do is make sure that you have at least one directory to store the information in. Depending on the nature of your application, you may also need to have logic in a middle-tier object that will allow it to create additional directories as needed.
+
+All Windows operating systems have limits on the number of files they can store in one directory. With the 64-bit operating systems out, the maximum number of files per directory has increased such that the maximum isn't so much the issue as raw performance is. (Windows still tends to get very slow in file access as the number of files in a particular directory rises.) As such, you still need to think about how many files you're going to be storing. If it will be many (say, over 500), then you'll want to create a mechanism in the object that stores your BLOB so that it can create new directories either on an as-needed basis, or based on some other logical criteria.
+
+Your business component will be in charge of copying the BLOB information to the file you're going to store it in. If it is already in some defined file format, you're on easy street—just run your language's equivalent to a copy command (with a twist we'll go over shortly), and you're in business. If it is streamed data, then you'll need to put the logic in your component to store the information in a logical format for later retrieval.
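+
+The database side of this design can stay quite simple—the real work lives in the middle tier. As a rough sketch (the table and column names are invented, and the sample path is purely illustrative), the row holds everything relational about the item plus the path the component stored the file under:
+
+CREATE TABLE FileBasedDocuments (
+    DocumentID int          NOT NULL PRIMARY KEY,
+    CustomerNo int          NOT NULL,
+    FilePath   varchar(260) NOT NULL  -- e.g. '\\fileserver\docs\00001.dat'
+);
+
+-- The application retrieves the path, then reads the file itself
+-- from the file system rather than from SQL Server.
+SELECT FilePath
+FROM FileBasedDocuments
+WHERE DocumentID = 1;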
+
+One big issue with this implementation is that of security. Since you're storing the information in a file that's outside of SQL Server's realm, it is also outside of SQL Server's security protection. Instead, you have to rely on your network security.
+
+There are several "Wow, that's scary!" things that should come to mind for you here. First, if someone's going to read data out of the directory that you're storing all this in, doesn't that mean they can see other files that are stored in there? Yes, it does (if you wanted to get really tricky, you could get around this by changing the Windows security for each file, but it would be very tedious indeed—in the case of a Web application, you would need to do something like implementing a DLL on your Web server). Second, since you'd have to give people rights to copy the file into the directory, wouldn't there be a risk of someone altering the file directly rather than using the database (potentially causing your database to be out of sync with the file)? Absolutely.
+
+The answer to these and the many other questions that you could probably come up with lies in your data access layer. (I'm assuming an n-tier approach here.) You can, for example, have the access component run under a different security context than the end user. This means that you can create a situation where the users can access their data—but only when they are using the data access component to do it (they don't have any rights to the directory themselves—indeed, they probably don't even know where the files are stored).
+
+So then, where does SQL Server come into play in all this? It keeps track of where you stored the information in question. Theoretically, the reason why you were trying to store this information in the database in the first place is that it relates to some other information in the row you were going to store it as part of. But instead of saving the actual data in the row in the form of a BLOB, you will now store a path to the file that you saved. The process will look something like this:
+
+1. Determine the name you're going to store it as.
+
+2. Copy the file to the location where you're going to store it.
+
+3. Save the full name and path in a varchar along with the rest of the data for that row.
+
+4. To retrieve the data, run your query much as you would have if you were going to retrieve the data directly from the table, only this time, retrieve the path to where the actual BLOB data is stored.
+
+5. Retrieve the data from the file system.
+
+In general, this approach will run somewhat faster than if you were using BLOBs. There are, however, some exceptions to the rule when using this approach:
+
+ * The BLOBs you are saving are consistently small (less than 8K) in size.
+ * The data is text or some format that MS Search has a filter for, and you want to be able to perform full-text searches against it.
+
+If the size of your BLOBs is consistently less than 8K, then the data may be able to fit entirely on one data page. This significantly minimizes the overhead in dealing with your BLOB. While the file system approach may still be faster, the benefits will be sharply reduced such that it doesn't make as much sense. If you're in this scenario, and speed is everything, then all I can suggest is to experiment.
+
+If you want to perform full-text searches, you're probably going to be better off going ahead and storing the large blocks of text as a TEXT data type (which is a BLOB) in SQL Server.
If the text is stored in a binary format that has an MS Search filter available (or you could write your own if you're desperate enough), then you can store the file in an image data type and MS Search will automatically use the filter to build the full-text index. Don't get me wrong; it's still very possible to do full-text searches against the text in the file, but you're going to have to do substantially more coding to keep your relationships intact if you want non-BLOB data from the same functional row. In addition, you're most likely going to wind up having to program your middle tier to make use of Index Server.
+
+If push comes to shove, and you need to make a full-text search against file system–based information, you could take a look at accessing Index Server via a query directly. SQL Server can issue remote queries such that you can potentially access any OLE DB (a Microsoft-originated data access API—we'll see a bit more about it in Chapter 25) data source. The MS Search service has an OLE DB provider and can be used as the target of a linked server or in an OPENQUERY. The bad news, however, is that performing an Index Server query against an Index Server that is not on the same physical box as your SQL Server really doesn't work. (Feel free to e-mail me if you've found a workaround to this.) The only workaround is to have an Index Server on the system local to SQL Server, but have it catalog files stored on another system. The problem with this is the network chatter during the cataloging process and the fact that it doesn't let you offload the cataloging work (which hurts scalability).
+
+Okay, so that was way #1 (you may recall I said there were two). The second leverages the .NET assembly architecture that was added back in SQL Server 2005. We haven't really gotten to a discussion of .NET integration yet, so we'll keep this fairly high level.
+
+This approach actually leverages many of the same concepts that were used in the middle-tier file access approach. The only real change is in what server or component takes charge of the file access.
+
+With the advent of Common Language Runtime (CLR) integration, we have the ability to create user-defined functions far more complex than those previously possible. As part of that, we have the ability to define table-valued functions that can retrieve data from nearly any base source. Indeed, in Chapter 10 we will take a look at how we can enumerate files in a directory and return them as a table-valued function, but we could just as easily return a varbinary(max) column that contains the file. Under this model, all file access would be performed under whatever network security context we establish for that assembly to run under, but it would only be performed as part of the table-valued function.
+
+It is important to note that the file system–based method mentioned earlier can be considered something of a predecessor to the filestream feature introduced with this release. Filestreams implement a somewhat advanced version of this approach—one that includes coordinated backups among other things. That said, filestreams also add substantial complexity over even this approach—which is why I have deferred detailed discussion of them to the advanced data structures and performance chapters.
+
+Subcategories
+
+Subcategories are a logical construct that provides you another type of relationship (sometimes called a "Supertype" or "Subtype" relationship) to work with.
On the physical side of the model, a subcategory is implemented using a mix of the types of relationships that I've already talked about (you'll see the specifics of that before you're done).
+
+A subcategory deals with the situation where you have a number of what may first seem like different entities but which share some, although not all, things in common.
+
+I think the best way to get across the concept of a subcategory is to show you one. To do this, we'll take the example of a document in a company.
+
+A document has a number of attributes that are common to any kind of document. For example:
+
+ * Title
+ * Author
+ * Date created
+ * Date last modified
+ * Storage location
+
+I'm sure there are more. Note that I'm not saying that every document has the same title, rather that every document has a title. Every document has an author (possibly more than one actually, but, for this example, we'll assume a limit of one). Every document was created on some date. You get the picture—you're dealing with the attributes of the concept of a document, not any particular instance of a document.
+
+But there are lots of different kinds of documents. From things like legal forms (say your mortgage documents) to office memos, to report cards—there are lots of document types. Still, each of these can be considered to be a document—or a subcategory of a document. Consider a few examples.
+
+For our first example, we'll look at a lease. A lease has all the characteristics that we expect to find in our documents category, but it also has information that is particular to a lease. A lease has things like:
+
+ * Lessor
+ * Lessee
+ * Term (how long the lease is for)
+ * Rate (how much per month or week)
+ * Security deposit
+ * Start date
+ * Expiration date
+ * Option (which usually offers an extension at a set price for a set additional term)
+
+The fact that a lease has all of these attributes does not preclude the fact that it is still a document.
+
+We can come up with a few more examples, and I'll stay with my legal document trend—start with a divorce document. It has attributes such as:
+
+ * Petitioner (the person suing for a divorce)
+ * Respondent (the petitioner's spouse)
+ * Separation date
+ * Date the petitioner files for the divorce
+ * Date the divorce was considered "final"
+ * Alimony (if any)
+ * Child support (if any)
+
+We could also have a bill of sale—our bill of sale might include attributes such as:
+
+ * Date of sale
+ * Amount of the sale
+ * Seller
+ * Purchaser
+ * Warranty period (if any)
+
+Again, the fact that divorces and bills of sale both have their own attributes does not change the fact that they are documents.
+
+In each case—leases, divorces, and bills of sale—we have what is really a subcategory of the category of "documents." A document really has little or no meaning without also belonging to a subcategory. Likewise, any instance of a subcategory has little meaning without the parent information that is found only in the supercategory—documents.
+
+Types of Subcategories
+
+Subcategories fall into two separate classifications of their own—exclusive and non-exclusive.
+
+When you refer to a subcategory as simply a "subcategory," then you are usually referring to a subcategory arrangement where you have a record in a table that represents the supercategory (a document in our previous example), and a matching record in at least one of the subcategories.
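+
+In DDL terms, the supertype/subtype arrangement might be sketched like this (a minimal sketch—the names and column choices are invented, and only one subcategory table is shown):
+
+-- Every document, regardless of kind, gets exactly one row here.
+CREATE TABLE Documents (
+    DocumentID  int          NOT NULL PRIMARY KEY,
+    Title       varchar(100) NOT NULL,
+    Author      varchar(50)  NOT NULL,
+    DateCreated datetime     NOT NULL
+);
+
+-- A subcategory table carries only what is particular to its type, and it
+-- borrows its primary key from the supercategory (an identifying relationship).
+CREATE TABLE Leases (
+    DocumentID int         NOT NULL PRIMARY KEY
+        REFERENCES Documents (DocumentID),
+    Lessor     varchar(50) NOT NULL,
+    Lessee     varchar(50) NOT NULL,
+    Rate       money       NOT NULL
+);
+
+-- Divorces and Sales would follow the same pattern as Leases.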
+
+This kind of subcategory is represented with a symbol that appears rather odd as compared to those you've seen thus far (Figure 5.9).
+
+Figure 5.9
+
+Even though there are three subcategories depicted both here and in the document example, don't misconstrue this as being any kind of official limit to the number of subcategories—there isn't one. You could have a single subcategory or 10 of them—it doesn't really make any difference.
+
+Far more common is the situation where you have an exclusive subcategory. An exclusive subcategory works exactly as a plain subcategory does, with only one exception—for every record in the supercategory, there is only one matching record in any of the subcategories. Each subcategory is deemed to be mutually exclusive, so a record to match the supercategory exists as exactly one row in exactly one of the subcategory tables.
+
+The diagramming for an exclusive subtype looks even a little odder yet (Figure 5.10).
+
+Figure 5.10
+
+Keeping Track of What's What—Implementing Subcategories
+
+The thing that's really cool about subcategories is that they allow you to store all of a similar construct in one place. Before learning this concept, you would have taken one of two approaches to implement our document model:
+
+ * Add all of the attributes into one table and just leave the columns null for the information that doesn't fit the specific type of document you're interested in for a given record.
+ * Have separate tables for each type of document. The columns that are essentially the same between document types would be repeated for each table (each table stores its own copy of the document information as it applies to the records in that particular table).
+
+Using the notion of a subcategory, you can now store all documents, regardless of type, such that they all begin in one place. Any query that you have that is looking for information about all the documents in your system can now run against just one table instead of having to do something like using the UNION operator on three (maybe more, maybe fewer) different tables. It probably goes without saying, then, that implementing this kind of situation using a subcategory can provide a serious performance enhancement over the other options.
+
+There is a catch though (you knew there would be, right?)—you need to provide some mechanism to point to the rest of the information for that document. Your query of all documents may provide the base information on the specific document that you're looking for, but when you want the rest of the information for that document (the things that are unique to that document type), then how does your application know which of the subcategory tables to search for the matching record in? To do this, just add a field to your supercategory that indicates what the subcategory is for that record. In our example, you would probably implement another column in our Documents table called "DocumentType." From that type, you would know which of your other tables to look through for the matching record with more information. Furthermore, you might implement this using a domain table—a table to limit the values in your DocumentType column to just those types that you have subcategories for—and a foreign key to that table.
+
+Keep in mind that while what I'm talking about here is the physical storage and retrieval of the data, there is no reason why you couldn't abstract this using either a sproc or a series of views (or both).
For example, you could have a stored procedure call that would pull together the information from the Documents table and then join to the appropriate subcategory.
+
+Oh—for those of you who are thinking, "Wait, didn't that other text that I read about n-tier architecture say to never use sprocs?" Well, that's a garbage recommendation in my not so humble opinion (you'll look more at sprocs in Chapter 10). It's foolish not to use the performance tools available—just remember to access them only through your data access layer—don't allow middle-tier or client components to even know your sprocs exist. Follow this advice, and you'll get better performance, improved overall encapsulation, shorter dev times, and, even with all that, still live within the real theory of a separate data access layer that is so fundamental to n-tier design.
+
+In addition to establishing a pointer to the type of document, you also need to determine whether you're dealing with a plain subcategory or an exclusive subcategory. In our document example, you have what should be designed as an exclusive subcategory. You may have lots of documents, but you do not have documents that are both a lease and a divorce (a non-exclusive subcategory would allow any mix of our subcategories). Even if you had a lease with a purchase option, the bill of sale would be a separate document created at the time the lease option was exercised.
+
+Figure 5.11 shows an implementation of our logical model.
+
+Okay, so you have an entity called Documents. These documents are of a specific type, and that type is limited to a domain—the boundaries of that domain are set by DocumentType. In addition, each of the types is represented by its own entity—or subcategory. The symbol in the middle of it all (the half-circle with an "X" through it) tells you that the three subcategories are exclusive in nature (you have one, and only one, for each instance of a document).
+
+Figure 5.11
+
+This is an excellent place to step back and reflect on what your logical model can do for you. As I discussed earlier in the chapter, our logical model, among other things, provides you with a way to communicate the business rules and requirements of our data. In this case, with a little explanation, someone (a customer perhaps?) can look at this and recognize the concept that you are saying that Leases, Divorces, and Sales are all variations on a theme—that they are really the same thing. This gives the viewer the chance to say, "Wait—no, those aren't really the same thing." Or perhaps something like, "Oh, I see—you know, you also have will and power-of-attorney documents—they are pretty much the same, aren't they?" These are little pieces of information that can save you a bundle of time and money later.
+
+Getting Physical—The Physical Implementation of Subcategories
+
+On the physical side of things, there's nothing quite as neat and clean as it looks in the logical model. Indeed, all you do for the physical side is implement a series of one-to-zero or -one relationships. You do, however, draw them out as being part of a single, multi-table relationship (Figure 5.12).
+
+Figure 5.12
+
+The only real trick in the game occurs if you have an exclusive subcategory (which is actually the case much more often than not). In this case, you also need to put some logic into the subcategory tables (in the form of triggers) to ensure that, if any row is to be inserted, there is not already another matching row in one of the other subcategories.
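+
+As a rough sketch (building on the invented tables from earlier, and assuming Divorces and Sales have been created along the same lines as Leases), such a trigger might look like this:
+
+CREATE TRIGGER trLeasesExclusive
+ON Leases
+FOR INSERT
+AS
+BEGIN
+    -- Reject the insert if the document already lives in another subcategory.
+    IF EXISTS (SELECT 1 FROM Divorces d
+                   JOIN inserted i ON d.DocumentID = i.DocumentID)
+       OR EXISTS (SELECT 1 FROM Sales s
+                   JOIN inserted i ON s.DocumentID = i.DocumentID)
+    BEGIN
+        RAISERROR('Document already exists in another subcategory.', 16, 1);
+        ROLLBACK TRANSACTION;
+    END;
+END;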
+
+That is exactly what the sketch above does for Leases: it queries the Divorces and Sales tables for records with the same DocumentID and, if one is found, rejects the inserted record with an appropriate error message and a ROLLBACK. Divorces and Sales would each need an equivalent trigger of their own.
+
+Adding to Extensibility with Subcategories
+
+Subcategories are one of those concepts that can make a huge difference in the success of your database design. Used when appropriate, they can cut significant time off your queries and significantly simplify pulling together aggregate information for related but different pieces of information. Yet these aren't the only benefits to subcategories.
+
+Subcategories can provide a pathway to making your database more extensible. If you need to add another subcategory, the only queries you need to deal with are those that are specific to your new subcategory. Any of your queries that worked only with the parent table will still work fine—what's more, they'll pick up the information on your new subcategory without any changes!
+
+In short, you're picking up two major scalability benefits:
+
+ * The information for your supercategory (documents in the example) can be scanned from just one table rather than using a UNION operator. This means fewer joins and faster relative query performance—especially as your tables grow larger or you have more and more subcategories.
+ * Adding new subcategories often does not take as much development time as it would have if you were developing the framework for those categories from scratch.
+
+Now, just as with most things, you do need to keep in mind one downside—subcategories can create a bottleneck at the parent table. Every query that you run against all the tables and data involved in the overall set of categories is probably going to need to access the parent table. Think about the locking implications there. (If you're new to locking considerations, they are discussed in full in Chapter 12.) If you are not careful about your index and query strategies, this can lead to some very bad blocking and/or deadlocking problems. That said, with intelligent planning and query writing, this is usually not a problem. Also, if the sheer size of the parent table becomes a problem, SQL Server now gives us the option of using partitioned tables to scale to larger sizes.
+
+Database Reuse
+
+This is almost never thought of, but you can create databases that facilitate reusability. Why do I say that it's almost never thought of? Well, just trust me on this—developers think of things like reusable components. Things such as objects to validate credit cards, distribute mail, and stream binary information in and out are all things that you would immediately think about placing in a repository and using over and over again. For whatever reason, however, databases just don't seem to get thought of in that way.
+
+Perhaps one reason for this is that databases, by definition, store data. Data is normally thought of as being unique to one company or industry and, most of all, as being private. I'm guessing that you then automatically think of the storage container for that data as also being personal—who knows?
+
+Contrary to popular belief, however, databases can be built to be reusable. Surprisingly, to do this you apply a lot of the same concepts that make code components reusable—most of all compartmentalization and the use of common interfaces.
+
+Just remember to make sure you have a really good fit before you try to reuse an existing database structure.
As with most reuse I've seen in programming, it's very possible for your reuse to turn into a situation where you're using the wrong tool for the job, and things can actually become even more expensive than they would have been if you had written things from scratch to begin with.
+
+Candidates for Reusable Databases
+
+The databases that have the best chance at being reusable are those that can be broken up into separate subject areas (much as components are usually broken up into functional groupings). Each subject area is kept as generic as is feasible. An example would be something like an accounting database. You could have separate subject areas that match up with the functional areas in accounting:
+
+ * Purchasing
+ * Accounts receivable (which in turn may be broken up into invoicing and cash receipts)
+ * Inventory
+ * Accounts payable
+ * General ledger
+ * Cash management
+
+The list could go on. You can also take the approach down to a more granular level and create many, many databases, down to the level of things like persons, commercial entities (ever noticed how similar customers are to vendors?), orders—there are lots of things that have base constructs that are used repeatedly. You can roll these up into their own "mini-database," and then plug them into a larger logical model (tied together using sprocs, views, or other components of your data access layer).
+
+How to Break Things Up
+
+This is where the logical versus physical modeling really starts to show its stuff. When you're dealing with databases that you're trying to make reusable, you often have one logical database (containing all the different subject areas) that spans many physical databases. Sometimes you'll choose to implement your logical design by referencing each of the physical implementations directly. Other times you may choose an approach that does a better job of hiding the way that you've implemented the database—you can create what amounts to a "virtual" database in that it holds nothing but views that reference the data from the appropriate physical database.
+
+Let me digress long enough to point out that this process is essentially just like encapsulation in object-oriented programming. By using the views, you are hiding the actual implementation of your database from the users of the view. This means that you can remove one subject area in your database and replace it with an entirely different design—the only trick in doing this is to map the new design to your views—from that point on, the client application and users are oblivious to the change in implementation.
+
+Breaking things up into separate physical databases and/or virtualizing the database places certain restrictions on you, and many of these restrictions contribute to the idea of being able to separate one subject area from the whole, and reuse it in another environment.
+
+Some of the things to do include:
+
+ * Minimize or eliminate direct references to other functional areas. If you've implemented the view approach, connect each physically separate piece of the database to the logical whole only through the views.
+ * Don't use foreign key constraints—where necessary, use triggers instead. Triggers can span databases; foreign key constraints can't.
+
+The High Price of Reusability
+
+All this reuse comes at a price. Many of the adjustments that you make to your design in order to facilitate reuse have negative performance impacts.
Some of these include:
+
+ * Foreign key constraints are faster than triggers overall, but triggers are the only way to enforce referential integrity that crosses database boundaries.
+ * Using views means two levels of optimization run on all your queries (one to get at the underlying query and mesh that into your original query, another to sort out the best way to provide the end result)—that's more overhead, and it slows things down.
+ * If you're not using the virtual database approach (one database that has views that map to all the other databases), maintaining user rights across many databases can be problematic.
+
+In short, don't expect things to run as fast unless the payoff comes from splitting the data across more servers than you could use with the single-database model.
+
+Reusing your database can make lots of sense in terms of reduced development time and cost, but you need to balance those benefits against the fact that you may suffer to some degree in the performance category.
+
+De-Normalization
+
+I'm going to keep this relatively short, since this tends to get into fairly advanced concepts, but remember not to get carried away with the normalization of your data.
+
+As I stated early in this chapter, normalization is one of those things that database designers sometimes wear like a cross. It's somehow turned into a religion for them, and they begin normalizing data for the sake of normalization rather than for the good things it does to their database. Here are a couple of things to think about in this regard:
+
+ * If declaring a computed column or storing some derived data is going to allow you to run a report more effectively, then by all means put it in. Just remember to take into account the benefit versus the risk. (For example, what if your "summary" data gets out of sync with the data it can be derived from? How will you determine that it happened, and how will you fix it if it does happen?)
+ * Sometimes, by including just one (or more) de-normalized column in a table, you can eliminate or significantly cut down the number of joins necessary to retrieve information. Watch for these scenarios—they actually come up reasonably frequently. I've dealt with situations where adding one column to one commonly used base table cut a nine-table join down to just three, and cut the query time by about 90 percent in the process.
+ * If you are keeping historical data—data that will largely go unchanged and is just used for reporting—then the integrity issue becomes a much smaller consideration. Once the data is written to a read-only area and verified, you can be reasonably certain that you won't have the kind of "out of sync" problems that are one of the major things that data normalization addresses. At that point, it may be much nicer (and faster) to just "flatten" (de-normalize) the data out into a few tables, and speed things up.
+ * The fewer tables that have to be joined, the happier your users who do their own reports are going to be. The user base out there continues to get more and more savvy with the tools they are using. Increasingly, users are coming to their DBA and asking for direct access to the database to be able to do their own custom reporting. For these users, a highly normalized database can look like a maze and become virtually useless. De-normalizing your data can make life much easier for these users.
+
+All that said, if in doubt, normalize things. There is a reason why that is the way relational systems are typically designed.
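+
+As for that first bullet, a persisted computed column is often the safest way to get the reporting win without the out-of-sync risk, since SQL Server maintains the value for you. A minimal sketch (the names are invented for illustration):
+
+CREATE TABLE OrderTotals (
+    OrderID    int          NOT NULL PRIMARY KEY,
+    SubTotal   money        NOT NULL,
+    TaxRate    decimal(4,3) NOT NULL,
+    -- Derived data, but the engine keeps it in sync automatically.
+    GrandTotal AS (SubTotal * (1 + TaxRate)) PERSISTED
+);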
When you err on the side of normalizing, you are erring on the side of better data integrity, and on the side of better performance in a transactional environment.
+
Partitioning for Scalability
+
Beginning with SQL Server 2000, SQL Server picked up the marvelous ability to create one logical table from multiple physical tables—partitioned views. That is, the data from one logical table is partitioned such that it is stored in a separate, well-defined set of physical tables. But the notion of partitioning your data has been around a lot longer than partitioned views have been. Indeed, keeping your main accounting system on one server and your order entry and inventory systems on another is a form of partitioning—you are making sure that the load of handling the two activities is spread across multiple servers. SQL Server 2005 took an additional step by adding what are called partitioned tables.
+
Partitioned tables are a bit different from partitioned views in a way that is implied in their name—they truly remain a table throughout. Whereas a partitioned view could not support some of the functionality found in tables (constraints, defaults, identity columns, and so on), a partitioned table supports all of these.
+
There is, of course, a catch—partitioned tables are limited to just one server (it is a means of separating a table across multiple filegroups and, therefore, drive volumes). Note that the limitation to one server doesn't mean you're limited to one physical storage device—there is nothing stopping you from linking multiple storage devices (including multiple SANs) to the one SQL Server.
+
Partitioned tables do not allow unique indexes on columns that are not part of the partitioning key—this can be critical when the column you want to partition on is not the one you want to use as a primary key, or when you have other columns that need unique constraint enforcement.
+
Partitioned views are still an option when the load is such that you need to span multiple servers. For purposes of this chapter, we're going to stick with the basic notions of partitioning that apply to both the view and table models.
+
Regardless of which partitioning method you're using, the concepts are pretty much the same. You utilize one or more columns in the logical table as a divider to physically separate your data. This allows you to use multiple I/O pathways and even multiple servers to process your query for you. The question of just how to partition your data should be a very big one. The tendency is going to be to take the hyper-simplistic approach and just divide things up equally based on the possible values in a partitioning column. This approach may work fine, but it is also a little shortsighted for two big reasons:
+
 * Data rarely falls into nice, evenly distributed piles. Often, predicting the distribution requires a lot of research and sampling up front.
+
 * It fails to take into account the way the data will actually be used once stored.
+
The way that you partition your data does a lot more than determine the volume of data that each partition will receive—much more importantly, it makes a positively huge difference in how well your overall system is going to perform. Keep in mind:
+
 * Tables rarely live in a bubble. Most of the time you are going to be joining data from any given table with other data in the system—is the way the "other" data is partitioned compatible (from a performance perspective)?
+
 * Network bandwidth tends to be a huge bottleneck in overall system performance—how are you taking that into account when designing your partitions? This is not that big of a deal if you're dealing with a partitioned table scheme (which will be local to just one server) but can be huge for a partitioned view model.
+
So, with all this in mind, here are a couple of rules for you:
+
 * If using partitioned views to spread data across servers, keep data that will be used together stored together. That is, if certain tables are going to be used together frequently in queries, then try to partition those tables such that data that is likely to be returned as part of a query will most likely reside on the same server. Obviously, you won't be able to make that happen 100 percent of the time, but, with careful thought and recognition of how your data gets used, you should find that you can arrange things so that most queries will happen local to just one server. For example, for a given order, all the related order detail rows will be on the same server.
+
 * When you design your application, you should ideally make it partition aware—that is, you should code the routines that execute the queries such that they know which server most likely has their data. The data may be broken out across multiple machines—wouldn't it be nice if the database server your application made the request to was the right one from the start, and there was no need for the request to be forwarded to another server?
+
If you've gotten as far as deciding that you need to go with a partitioned system, then you must really have one heck of a load you're planning on dealing with. How you partition your data is going to have a huge impact on how well your system deals with that load. Remember to take the time to fully plan out your partitioning scheme. After you think you've decided what you're going to do—Test! Test! Test!
+
The SQL Server Diagramming Tools
+
You can open up SQL Server's built-in tools by navigating to the Diagrams node of the database you want to build a diagram for (expand your server first, then the database). Some of what you are going to see you'll find familiar—some of the dialogs are the same as you saw in Chapter 4 when you were creating tables. The SQL Server diagramming tools don't give you all that many options, so you'll find that you'll get to know them fairly quickly.
+
You can start by creating your first diagram: right-click the Diagrams node underneath the AdventureWorks database and choose the New Database Diagram option.
+
You may (if it's the first time you've tried to create a diagram) see a dialog come up warning you that some of the objects needed to support diagramming aren't in the database and asking if you want to create them—choose Yes.
+
SQL Server starts you out with an Add Table dialog (see Figure 5.13) that lists the available tables you can add to your diagram.
+
Figure 5.13
+
Select the following tables (remember to hold down the control key to select more than one table):
+
 * Address
+
 * Customer
+
 * CustomerAddress
+
 * SalesOrderHeader
+
 * SalesOrderDetail
+
Then click Add. After a brief pause while SQL Server draws all the tables you selected, click the Close button. SQL Server has added our tables to the diagram, as shown in Figure 5.14.
+
I've rearranged my layout slightly from what SQL Server came up with by default to make more of it fit into this book.
Depending on your screen resolution, it may be difficult to see the entire diagram at once due to the zoom. To pull more of the tables into view, change the zoom setting in the toolbar. + +SQL Server enumerates through each table you have said you want to add and analyzes what other objects are associated with those tables. The various other items you see beyond the table itself are some of the many other objects that tie into tables—primary keys, foreign keys. + +So, having gotten a start, I'll use this diagram as a launching point for explaining how the diagramming tool works and building a few tables here and there. + +Figure 5.14 + +Tables + +Each table has its own window you can move around. The primary key is shown with the little symbol of a key in the column to the left of the name like the one next to the CustomerID. This is just the default view for the table; you can select from several others that allow you to edit the very make-up of the table. To check out your options for views of a table, right-click the table that you're interested in. The default is column names only, but you should also take an interest in the choice of Custom; this or "standard" is what you would use when you want to edit the table from right within the diagram (very nice!). + +Adding Tables + +You can add a new table to the diagram in one of two ways: + + * If you have a table that already exists in the database (but not in the diagram), but now you want to add it to your diagram, you simply click the Add Table button on the diagramming window's toolbar, or right-click anywhere in the diagram and choose Add Table. You'll be presented with a list of all the tables in the database; just choose the one that you want to add, and it will appear along with any relationships it has to other tables in the diagram. + * If you want to add a completely new table, click the New Table button on the diagramming window's toolbar or right-click in the diagram and choose New Table. You'll be asked for a name for the new table, and the table will be added to the diagram in Column Properties view. Simply edit the properties to have the column names, data types, and so on that you want, and you have a new table in the database. + +Let me take a moment to point out a couple of gotchas in this process. + +First, don't forget to add a primary key to your table. SQL Server does not automatically do this, nor does it even prompt you. This is a somewhat less than intuitive process. To add a primary key, you must select the columns that you want to have in the key. Then right-click and choose Set Primary Key. + +Next, be aware that your new table is not actually added to the database until you choose to save—this is also true of any edits that you make along the way. + +Go ahead and quickly add a table to see how this works and set you up for some later examples. + +First, right-click anywhere in the diagramming pane, and choose New Table. You'll be prompted for a table name—call this one CustomerNotes. Now add just three columns as shown in Figure 5.15. + +Figure 5.15 + +Notice the asterisk in the title bar for the table—that means there are unsaved changes to this table (specifically, the table has yet to be saved at all). Go ahead and save the diagram, and that will also create the table in the physical database. You now have a table with three NOT NULL columns. There is not, as yet, any primary key for this table. (We'll deal with that in our section on adding constraints.) 
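+
Incidentally, if you'd rather see what that save just did under the covers, the equivalent T-SQL looks something like the following sketch. The table name and the NOT NULL settings come straight from the example; the column names and data types are my own guesses, since the actual ones appear only in Figure 5.15:
+
CREATE TABLE CustomerNotes
(
   CustomerID   int            NOT NULL,   -- matches the CustomerID used elsewhere in AdventureWorks
   NoteDate     datetime       NOT NULL,   -- assumed name and type
   Note         nvarchar(255)  NOT NULL    -- assumed name and length
);
+
The point is simply that the diagram's save generates ordinary DDL like this for you behind the scenes.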
+
Dropping Tables from Either the Database or Diagram
+
Dropping tables is a bit confusing since there is a subtle distinction between deleting them from the diagram versus deleting them from the database. You can drop a table from the diagram in either of two ways:
+
 * Select the table and press your Delete key.
+
 * Select the table and choose the Remove from Diagram button on the toolbar.
+
To entirely drop the table from the database, you have three choices:
+
 * Select the table, and choose Edit⇒Delete Tables from Database
+
 * Select the table, and click the Delete Tables from Database icon on the toolbar
+
 * Right-click the table header, and choose Delete Tables from Database
+
Note that, while deleting a table from the diagram does not become permanent until you save the diagram, deleting it from the database happens immediately after you confirm the deletion.
+
Dealing with Constraints
+
If you're using the diagram tools at all, you'll want to do more than create just the basic table—you'll want to be able to establish constraints as well. The diagramming tools make these relatively easy.
+
Primary Keys
+
This really couldn't be much simpler. To create a primary key, just select the column(s) you want to participate in the key (again, hold down the control key if you need to select multiple columns), right-click and select Set Primary Key, as shown in Figure 5.16.
+
Figure 5.16
+
I'm adding a primary key to the CustomerNotes table we created in the previous section. As you choose the Set Primary Key option, you'll see a key icon added to each of the columns that participate in your key. To change the primary key, just select a new set of columns and again choose Set Primary Key. To remove it, just choose Remove Primary Key from the same menu. (It does not show in my figure, because no primary key had been set yet.)
+
Foreign Keys
+
Foreign keys are nearly as easy as primary keys were—they use a simple drag-and-drop model.
+
In our CustomerNotes example, you'll notice I used CustomerID—this is intended to be the same CustomerID that is used elsewhere in the AdventureWorks database, so it makes sense that you would want a foreign key to the base table for CustomerIDs (Customer). To do this, simply click the CustomerID column in the Customer table, and drag it onto the CustomerNotes table. Management Studio will then give you the dialog in Figure 5.17 to confirm the foreign key you're after.
+
Figure 5.17
+
From here, you can change what the columns are in both the referenced and referencing tables, and even add additional columns if you need to. Click OK, and you move on to the dialog in Figure 5.18, which allows you to set the other properties that go with a foreign key definition, including such things as cascading actions and whether this foreign key should be propagated to any replicated databases you have out there.
+
Figure 5.18
+
To edit the foreign key after you've created it, select it (by clicking it), and you will see its properties in the pane on the right-hand side of the screen.
+
Note that the properties pane is a dockable window, so it's possible you have moved it away from the default right-hand side.
+
To delete a foreign key, simply right-click the relationship and choose Delete Relationships from Database.
+
CHECK Constraints
+
To work on the CHECK constraints for your table, simply right-click the table and choose Check Constraints.
This brings up a dialog that allows you to either create a new constraint or to select from those already defined for the table. After you create a new one or select one of the existing ones, Management Studio brings up a dialog that is not all that different from that used for foreign keys. + +As when you created tables, you can see the asterisk next to the CK_CustomerNotes name—this lets you know that there are unsaved changes. The primary thing you want to focus on in this dialog is the Expression field; this is where you would enter in the conditions of your constraint. + +Do not confuse the Identity box in this dialog with an IDENTITY column—this section of the dialog is only there for providing the name and, optionally, a description of the constraint. + +To edit an existing constraint, just change the properties as desired. To remove it, just select the constraint you're interested in and click the Delete button. + +Regarding Date Columns + +Normally I wouldn't spend much time on specific data types, but with SQL Server 2008 the new data types require some special attention. Of particular issue is how the new Date and Time data types alter things. We'll hold off on the performance and space ramifications for our designing for performance chapter (Chapter 21), but the new Date data type in particular deserves a brief moment of discussion. + +Previous versions of SQL Server supported only the notions of date and time as one combined data type. The datetime data type takes up a whopping 8 bytes, and the combination often creates hassles in development—among these are: + + * Wasted space when there is no need to track a specific time (or when time of day is all you need). + * Hassles in comparing dates when there is time also attached. (You want to see if it's on the same day, but they don't compare equally due to different times of day; you can get around this, but it's a hassle and muddles your code). + * Occasional compatibility hassles when interacting with client data types that expect just the date or just the time. + +The new date and time data types address these issues by making date and time data discrete and adding flexibility to each type (you can even set precision). Dates are now easily compared to other dates, and times are not only easily compared to other times, but also precision settable to either save space or capture time down to the nanosecond (we were limited to roughly 3 milliseconds previously). + +In addition, we have new data types that are meant to deal with the increasing need to standardize time. Allowances have been made to keep track of time offsets versus Coordinated Universal Time, or UTC, which is an abbreviation for the French, Temps Universel Coordonné. This means you can accept times submitted from all around the world and easily reconcile them for more genuine time comparisons. + +We will touch on these new data types more as we continue through the book, but given the legacy of the datetime data type, it is important to recognize these new data types and plan for how they will affect your applications moving forward. + +Summary + +Database design is a huge concept, and one that has many excellent books dedicated to it as their sole subject. It is essentially impossible to get across every database design notion in just a chapter or two. + +In this chapter, you have, however, gotten off to a solid start. You've gotten a bit of review of normalization. 
You have, however, also seen that normalization is not always the right answer—strategic de-normalization of our data can simplify the database for users and speed reporting performance. Finally, you've looked at some non-normalization-related concepts in database design, plus how to make use of the diagramming tools to design our database.
+
In the next chapter, you will be taking a very close look at how SQL Server stores information and how to make the best use of indexes.
+
6
+
Core Storage and Index Structure
+
Indexes. They may well be the most important part of your database planning and system maintenance, second only to the tables themselves. Why is it then that they are, all too often, an afterthought in many designs?
+
Think about it for a minute. Most database systems are based on the notion of fast and efficient data retrieval and maintenance. Indexes provide your database system with additional ways to look up data and take shortcuts to that data's physical location. The right index can cut huge percentages of time off your query executions. So, if efficient data retrieval and maintenance are why we build databases, and indexes are critical to the efficient access and maintenance of the data in databases, why is it that so many software architects move straight from determining a table layout to stored procedures or client code? Silly.
+
Now, don't get me wrong: thinking about stored procedures, client code, and other non-table elements is important, and most developers aren't going to leave a database with zero indexes. Indeed, at least a few indexes will show up in your database without you having to specify them. (Creating a primary key or unique constraint creates an implied index required to enforce those constraints.) It is, however, amazing just how often indexes are applied based on only a few minutes' worth of guesses or purely to address a specific performance bug that showed up in QA (or worse, as a patch to a released product). In still other scenarios, developers will take an "index everything" approach, failing to realize the additional storage required or how too many poorly planned indexes can actually increase the time it takes for your query to run.
+
In this chapter, we will be focusing on the core index structures in SQL Server from both a developer's and an administrator's point of view. We will also look at how data is stored in SQL Server so that we may better understand how SQL Server makes optimization choices, and, from that, what indexes make sense in what situations.
+
SQL Server Storage
+
Storage is an area that has undergone some minor changes in SQL Server 2008. (Well, technically they showed up in a service pack for SQL Server 2005.) These changes, primarily centered around the compression of fixed-length storage types, are discussed in the next chapter.
+
Data in SQL Server can be thought of as existing in something of a hierarchy of structures. The hierarchy is pretty simple. Some of the objects within the hierarchy are things that you will deal with directly and will therefore know easily. A few others exist under the covers, and while they can be directly addressed in some cases, they usually are not. Take a look at them one by one.
+
The Database
+
Okay—this one is easy. I can just hear people out there saying, "Duh! I knew that." Yes, you probably did, but I point it out as a unique entity here because it is the highest level of the definition of storage (for a given server).
This is the highest level at which a lock can be established, although you cannot explicitly create a database-level lock.
+
A lock is something of both a hold and a place marker that is used by the system. We will be looking into locking extensively in Chapter 11, but we will see the lockability of objects within SQL Server discussed in passing as we look at storage.
+
The File
+
By default, your database has two files associated with it:
+
 * The first is the primary physical database file—that's where your data is ultimately stored. This file should be named with an *.mdf extension (this is a recommendation, not a requirement—but I think you'll find doing it in other ways will become confusing over time). "Secondary" files can be added (and should use an *.ndf extension), and do not need to be on the same physical drive as the primary (which means you can use them to distribute I/O load—we will explore these further in Chapter 21).
+
 * The second is something of an offshoot to the database file—the log. We'll dive into the log quite a bit when we deal with transactions and locks in Chapter 11, but you should be aware that it resides in its own file (which should end with an *.ldf extension), and that your database will not operate without it. The log is the serial recording of what's happened to your database since the last time that data was "committed" to the database. The database isn't really your complete set of data. The log isn't your complete set of data. Instead, if you start with the database and "apply" (add in all the activities from the last point the two synched up) the log, you have your complete set of data.
+
There is no restriction about where these files are located relative to each other. It is possible (actually, it's even quite desirable) to place each file on a separate physical device. This not only allows for the activity in one file not to interfere with that in the other file, but it also creates a situation where losing the file with the database does not cause you to lose your work—you can restore a backup and then reapply the log (that was safe on the other drive). Likewise, if you lose the drive with the log, you'll still have a valid database up through the time of the last checkpoint (checkpoints are fully covered in Chapter 11).
+
The Extent
+
An extent is the basic unit of storage used to allocate space for tables and indexes within a given file. It is 64KB in size, made up of eight contiguous 8KB data pages.
+
The concept of allocating space based on extents, rather than actual space used, can be somewhat difficult to understand for people used to operating system storage principles. The important points about an extent include:
+
 * Once an extent is full, the next record will take up not just the size of the record but the size of a whole new extent. Many people who are new to SQL Server get tripped up in their space estimations in part due to the allocation of an extent at a time rather than a record at a time.
+
 * By pre-allocating this space, SQL Server saves the time of allocating new space with each record.
+
It may seem like a waste that a whole extent is taken up just because one too many rows were added to fit on the currently allocated extent(s), but the amount of space wasted this way is typically not that much as a percentage of the entire database. Still, it can add up—particularly in a highly fragmented environment—so it's definitely something you should keep in mind.
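+
If you're curious to see this allocation at work, the sp_spaceused system stored procedure reports both the space a table has reserved and the space it is actually using—the gap between the two is largely this extent-level pre-allocation. A quick sketch (any table will do; here I'm assuming the AdventureWorks order detail table we use elsewhere in this book):
+
-- "reserved" is the space allocated (in extents); "data" plus "index_size"
-- is what's actually in use; "unused" is the allocated-but-empty remainder.
EXEC sp_spaceused N'Sales.SalesOrderDetail';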
+ +The good news in taking up all this space is that SQL Server skips some of the allocation time overhead. Instead of worrying about allocation issues every time it writes a row, SQL Server deals with additional space allocation only when a new extent is needed. + +Don't confuse the space that an extent is taking up with the space that a database takes up. Whatever space is allocated to the database is what you'll see disappear from your disk drive's available space number. An extent is merely how things are, in turn, allocated within the total space reserved by an individual database file. + +The Page + +Much like an extent is a unit of allocation within the database, a page is the unit of allocation within a specific extent. There are eight pages to every extent. + +A page is the last level you reach before you are at the actual data row. Whereas the number of pages per extent is fixed, the number of rows per page is not—that depends entirely on the size of the row, which can vary. You can think of a page as being something of a container for both table and index row data. A row is not allowed to be split between pages. + +Figure 6.1 illustrates how data gets put into a page. Notice how, for every row you insert, you have to place the row offset down at the end of the page to indicate where in the page that particular row's data begins. + +Figure 6.1 + +There are a number of different page types. For purposes of this book, the types we care about are: + + * Data + * Index + * Binary Large Object (BLOB) (for Image, most Text and Ntext data, and varchar(max)/nvarchar(max) data that is larger than about 8k) + * Global and Shared Global Allocation Map (GAM, or SGAM) + * Page Free Space (PFS) + * Index Allocation Map (IAM) + * Bulk Changed Map + * Differential Changed Map + +Data Pages + +Data pages are pretty self-explanatory—they are the actual data in your table, with the exception of any BLOB data that is not stored "in row" (more on this in the BLOB pages section). In the case of a row that has a column that contains BLOB data, the regular data is stored in a data page, and the BLOB data may be stored in page (if small enough to fit). If the BLOB data can't fit on the page, then a 16-byte pointer is used to show where to find the BLOB page that contains the start of the BLOB. + +Index Pages + +Index pages are also pretty straightforward: They hold both the non-leaf and leaf level pages (we'll examine what these are later in the chapter) of a non-clustered index, as well as the non-leaf level pages of a clustered index. These index types will become much clearer as we continue through this chapter. + +BLOB Pages + +BLOB pages are for storing Binary Large Objects. For SQL Server, these amount to data stored in varbinary(max), varchar(max), or nvarchar(max) columns. BLOB pages are special as far as data storage pages go, in that they don't have any rows as such. Since a BLOB can be as large as 2GB, they have to be able to go on more than one page—for this portion of things it doesn't matter what the version is. SQL Server will allocate as many pages as it needs in order to store the entire BLOB, but there is no guarantee that these pages will be contiguous—the pages could be located anywhere within the database file(s). + +As mentioned before, the connection between the non-BLOB data for a row and any BLOB-related to that row comes in the form of a pointer. The nature of that pointer and how SQL Server navigates to the BLOB data was changed for version 7.0 of SQL Server. 
In version 6.5 and before, the BLOB pages were put together in a chain—similar to a linked list. In order to find any page that was part of the BLOB, you needed to start at the beginning and navigate through the BLOB page by page. If you were trying to perform some form of text or binary search, this kind of arrangement would be deadly, given that you were forced into a serial scan of the data. Beginning with version 7.0, however, the pages were changed to be organized into a B-Tree structure (which I will discuss fully a little later in the chapter). B-Trees provide more of a branching structure, and, therefore, a more direct path for larger BLOBs. This has made quite a difference in how quickly text operations can be performed. + +Even with the significant improvements made across several versions over the years, BLOBs are very slow performance-wise, so we will talk about alternative storage methods when we look at advanced design issues later on. + +Global Allocation Map, Shared Global Allocation Map, and Page Free Space Pages + +Global Allocation Map (GAM), Shared Global Allocation Map (SGAM), and Page Free Space (PFS) page types are involved with figuring out which extents and pages are in use, and which are not. Essentially, these pages store records that indicate where there is space available. Understanding these page types is not really necessary to do high-quality development or systems administration, and is beyond the scope of this book. If, however, you're just dying to know about them (or you're having problems with insomnia), then you can find more information on them in the Books Online—just look up GAM in the index. + +Bulk Changed Map + +Hmmm. How to address this one, since we haven't addressed bulk operations yet.... + +SQL Server has the concept of "bulk operations." Bulk operations are very high-speed changes to the database (usually a mass import of data or a truncation of a table). Part of this speed comes from the idea that they don't "log" every single thing they do. The log is a critical part of the backup and recovery system, and bulk operations mean that unlogged activity (well, it logs that it did an operation, but not the specifics, and so the log cannot reconstruct what you did) has occurred in your database. + +The Bulk Changed Map—or BCM—is a set of pages that track what extents have been altered via bulk operations. It cares nothing about the specifics of the changes—merely that you messed with that particular extent. Since it knows you altered that extent, it provides more options when you back up your database. More specifically, when backing up the log, you can supplement the log backup with backing up of the physical data in the extents that were affected by bulk operations. + +Differential Changed Map + +This is nearly the same thing as the Bulk Changed Map, but, instead of tracking only those extents changed by bulk operations, it instead tracks any extents that were changed since the last full backup of your database. + +When you request a differential backup, the Differential Changed Map—or DCM—supplies information about what extents need to be backed up. You wind up with a much smaller and faster running (albeit only partial) backup as only those extents that have changed since the prior backups are included. + +Page Splits + +When a page becomes full, it splits. This means more than just a new page being allocated—it also means that approximately half the data from the existing page is moved to the new page. 
+
The exception to this process is when a clustered index is in use. If there is a clustered index, and the next inserted row would be physically located as the last record in the table, then a new page is created, and the new row is added to the new page without relocating any of the existing data. We will see much more on page splits as we investigate indexes.
+
Rows
+
You will hear much about "Row Level Locking," so it shouldn't be a surprise to hear this term. A row can typically be up to 8KB.
+
In addition to the limit of 8,060 bytes, there is also a maximum of 1,024 columns. In practice, you'll find it very unusual to run into a situation where you run out of columns before you run into the 8,060-byte limit. 1,024 columns gives you an average column width of just under 8 bytes. For most uses, you'll easily exceed that. The exception to this tends to be in measurement and statistical information—where you have a large number of different things that you are storing numeric samples of. Still, even those applications will find it a rare day when they bump into the 1,024-column limit.
+
I did, as you may have noted, use the term typically when I mentioned the 8KB limit. This limit is based on a row being limited to a single page, and the page having an 8KB size, but it can be exceeded in a few circumstances—specifically, with varchar(max) or varbinary(max) as well as traditional BLOB data types like image and text. If a row contains too much data in one of these types to fit within the single page, then these special data types know how to make your data span multiple pages (up to 2GB in a single row). In this case, the original row is used to keep track of where the actual data for that column is stored (all other columns are still stored in the original row).
+
Full-Text Catalogs
+
Prior to SQL Server 2008, these were a separate storage mechanism outside of your normal database. While you could associate a full-text catalog as being the default for a given database, and even back up your full-text catalogs together with your database (in 2005—prior to that even the backups were decoupled), they were stored completely separately. With SQL Server 2008, the Full-Text Catalog no longer has relevance as a storage unit—instead, it is merely a logical grouping of full-text indexes. I mention them here solely for historical reference. (We discuss full text in Chapter 18.)
+
Coordinated backups between full-text index files and the core database did not exist prior to SQL Server 2005. Keep this in mind if you have backward compatibility concerns with prior versions.
+
File Streams
+
File streams are a special storage method meant to address the performance issues with the storage of very large BLOBs. Instead of storing the file's stream in a set of BLOB pages, the file is stored in an NT File System (NTFS) directory that is created explicitly for use by the particular SQL Server database you're storing data in. Unlike client-controlled systems that store binary data in the file system and a pointer in the database, SQL Server coordinates file versioning for you—allowing the file stream to participate in transactions and backups.
+
File streams are something of a niche area, but a rather important one. We will explore their structure more fully in the next chapter, and further still in our chapter on designing for performance (Chapter 21).
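+
Just to give you a feel for where this is headed, a file stream column is declared right in an otherwise ordinary table definition. Here's a minimal sketch—assuming FILESTREAM has already been enabled on the instance and a FILESTREAM filegroup has been added to the database (we'll walk through those prerequisites in the next chapter); the table and column names are my own, for illustration only:
+
CREATE TABLE Documents
(
   DocumentID   uniqueidentifier ROWGUIDCOL NOT NULL
                DEFAULT NEWID() UNIQUE,     -- FILESTREAM tables require a ROWGUIDCOL column
   DocumentBody varbinary(max) FILESTREAM   -- stored as a file in the NTFS directory, not in BLOB pages
);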
+ +Understanding Indexes + +Webster's dictionary defines an index as: + +A list (as of bibliographical information or citations to a body of literature) arranged usually in alphabetical order of some specified datum (as author, subject, or keyword). + +I'll take a simpler approach in the context of databases and say it's a way of potentially getting to data a heck of a lot quicker. Still, the Webster's definition isn't too bad—even for our specific purposes. + +Perhaps the key thing to point out in the Webster's definition is the word usually that's in there. The definition of "alphabetical order" changes depending on a number of rules. For example, in SQL Server, we have a number of different collation options available to us. Among these options are: + + * Binary: Sorts by the numeric representation of the character (for example, in ASCII, a space is represented by the number 32, the letter D is 68, but the letter d is 100). Because everything is numeric, this is the fastest option; unfortunately, it's also not at all the way in which people think, and can also really wreak havoc with comparisons in your WHERE clause. + * Dictionary Order: This sorts things just as you would expect to see in a dictionary, with a twist; you can set a number of different additional options to determine sensitivity to case, accent, and character set. Keep in mind that every language can add its own notion of what constitutes dictionary order, so, if you choose a collation that's oriented around a non-English language, you may see sort order altered somewhat. + +It's fairly easy to understand that, if we tell SQL Server to pay attention to case, then A is not going to be equal to a. Likewise, if we tell it to be case insensitive, then A will be equal to a. Things get a bit more confusing when we add accent sensitivity—that is, SQL Server pays attention to diacritical marks, and therefore a is different from á, which is different from à. Where many people get even more confused is in how collation order affects not only the equality of data but also the sort order (and, therefore, the way it is stored in indexes). + +By way of example, let's look at the equality of a couple of collation options in the following table, and what they do to our sort order and equality information: + +Collation Order | Comparison Values | Index Storage Order +---|---|--- +Dictionary order, case insensitive, accent insensitive (the default) | A = a = à = á = â = Ä = ä = Å = å | a, A, à, â, á, Ä, ä, Å, å +Dictionary order, case insensitive, accent insensitive, uppercase preference | A = a = à = á = â = Ä = ä = Å = å | A, a, à, â, á, Ä, ä, Å, å +Dictionary order, case sensitive | A ≠ a, Ä ≠ ä, Å ≠ å, a ≠ à ≠ á ≠ â ≠ ä ≠ å, A ≠ Ä ≠ Å | A, a, à, á, â, Ä, ä, Å, å + +The point here is that what happens in your indexes depends on the collation information you have established for your data. Collation can be set at the database and column level, so you have a fairly fine granularity in your level of control. If you're going to assume that your server is case insensitive, then you need to be sure that the documentation for your system deals with this or you had better plan on a lot of tech support calls—particularly if you're selling outside of the United States. 
Imagine you're an independent software vendor (ISV) and you sell your product to a customer who installs it on an existing server (which is going to seem like an entirely reasonable thing to the customer), but that existing server happens to be an older server that's set up as case sensitive. You're going to get a support call from one very unhappy customer. + +Once the collation order has been set, changing it is very non-trivial (but possible), so be certain of the collation order you want before you set it. + +To "B," or Not to "B": B-Trees + +The concept of a Balanced Tree, or B-Tree, is certainly not one that was created with SQL Server. Indeed, B-Trees are used in a very large number of indexing systems both in and out of the database world. + +A B-Tree simply attempts to provide a consistent and relatively low-cost method of finding your way to a particular piece of information. The Balanced in the name is pretty much self-descriptive—a B-Tree is, with the odd exception, self-balancing, meaning that every time the tree branches, approximately half the data is on one side, and half on the other side. The Tree in the name is also probably pretty obvious at this point. (Hint: tree, branch—see a trend here?) It's there because, when you draw the structure, then turn it upside down, it has the general form of a tree. + +A B-Tree starts at the root node (another stab at the tree analogy there, but not the last). This root node can, if there is a small amount of data, point directly to the actual location of the data. In such a case, you would end up with a structure that looked something like Figure 6.2. + +Figure 6.2 + +So, we start at the root and look through the records until we find the last page that starts with a value less than what we're looking for. We then obtain a pointer to that node, and look through it until we find the row that we want. + +In most situations though, there is too much data to reference from the root node, so the root node points at intermediate nodes—or what are called non-leaf level nodes. Non-leaf level nodes are nodes that are somewhere in between the root and the node that tells you where the data is physically stored. Non-leaf level nodes can then point to other non-leaf level nodes, or to leaf level nodes (last tree analogy reference—I promise). Leaf level nodes are the nodes where you obtain the real reference to the actual physical data. Much like the leaf is the end of the line for navigating the tree, the node we get to at the leaf level is the end of the line for our index—from here, we can go straight to the actual data node that has our data on it. + +As you can see in Figure 6.3, we start with the root node just as before, then move to the node that starts with the highest value that is equal to or less than what we're looking for and is also in the next level down. We then repeat the process—look for the node that has the highest starting value at or below the value for which we're looking. We keep doing this, level by level down the tree, until we get to the leaf level—from there we know the physical location of the data and can quickly navigate to it. + +Figure 6.3 + +Page Splits—A Deeper Look + +All of this works quite nicely on the read side of the equation; it's the insert that gets a little tricky. Recall that the B in B-Tree stands for balanced. You may also recall that I mentioned that a B-Tree is balanced because about half the data is on either side every time you run into a branch in the tree. 
B-Trees are sometimes referred to as self-balancing because the way new data is added to the tree generally prevents them from becoming lopsided.
+
When data is added to the tree, a node will eventually become full and will need to split. Because, in SQL Server, a node equates to a page, this is called a page split, illustrated in Figure 6.4.
+
When a page split occurs, data is automatically moved around to keep things balanced. The first half of the data is left on the old page, and the rest of the data is added to a new page—thus you have about a 50–50 split, and your tree remains balanced.
+
Figure 6.4
+
If you think about this splitting process a bit, you'll realize that it adds a substantial amount of overhead at the time of the split. Instead of a simple insert of one row, you are:
+
 * Creating a new page
+
 * Migrating rows from the existing page to the new page
+
 * Adding your new row to one of the pages
+
 * Adding another entry in the parent node
+
But the overhead doesn't stop there. Since you're in a tree arrangement, you have the possibility for something of a cascading action. When you create the new page (because of the split), you need to make another entry in the parent node. This entry in the parent node also has the potential to cause a page split at that level, and the process starts all over again. Indeed, this possibility extends all the way up to and can even affect the root node.
+
If the root node splits, then you actually end up creating two additional pages. Because there can be only one root node, the page that was formerly the root node is split into two pages, and becomes a new intermediate level of the tree. An entirely new root node is then created, and will have two entries (one to the old root node, one to the split page).
+
Needless to say, page splits can have a very negative impact on system performance and are characterized by behavior where your process on the server seems to just pause for a few seconds (while the pages are being split and rewritten).
+
We will talk about page-split prevention before we're done with this chapter.
+
While page splits at the leaf level are a common fact of life, page splits at intermediate nodes happen far less frequently. As your table grows, every layer of the index will experience page splits, but, because the intermediate nodes have only one entry for several entries on the next lower node, the number of page splits gets less and less frequent as you move further up the tree. Still, for a split to occur above the leaf level, there must have already been a split at the next lowest level—this means that page splits up the tree are cumulative (and expensive performance-wise) in nature.
+
SQL Server has a number of different types of indexes (which we will discuss shortly), but they all make use of this B-Tree approach in some way or another. Indeed, they are all very similar in structure thanks to the flexible nature of a B-Tree. Still, we shall see that there are indeed some significant differences, and these can have an impact on the performance of our system.
+
For a SQL Server index, the nodes of the tree come in the form of pages, but you can actually apply this concept of a root node, the non-leaf level, the leaf level, and the tree structure to more than just SQL Server or even just databases.
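+
As a small preview of that page-split-prevention discussion (the full CREATE INDEX syntax comes later in this chapter), the most common mitigation is the FILLFACTOR option—deliberately leaving free space in each leaf page when an index is built or rebuilt, so new rows have room to land without forcing a split. A sketch, using a hypothetical Customers table of my own invention:
+
-- Build the index with leaf pages only 80 percent full, leaving roughly
-- 20 percent free space to absorb new rows before a page split is needed.
CREATE NONCLUSTERED INDEX IX_Customers_LastName
   ON Customers (LastName)
   WITH (FILLFACTOR = 80);
+
The trade-off is more pages (and therefore more reads) for the same data, so a low fill factor is something you choose deliberately for write-heavy tables, not as a blanket default.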
+ +How Data Is Accessed in SQL Server + +In the broadest sense, there are only two ways in which SQL Server retrieves the data you request: + + * Using a table scan + * Using an index + +Which method SQL Server will use to run your particular query will depend on what indexes are available, what columns you are asking about, what kind of joins you are doing, and the size of your tables. + +Use of Table Scans + +A table scan is a pretty straightforward process. When a table scan is performed, SQL Server starts at the physical beginning of the table looking through every row in the table. As it finds rows that match the criteria of your query, it includes them in the result set. + +You may hear lots of bad things about table scans, and in general, they will be true. However, table scans can actually be the fastest method of access in some instances. Typically, this is the case when retrieving data from rather small tables. The exact size where this becomes the case will vary widely according to the width of your table and what the specific nature of the query is. + +See if you can spot why the use of EXISTS in the WHERE clause of your queries has so much to offer performance-wise where it fits the problem. When you use the EXISTS operator, SQL Server stops as soon as it finds one record that matches the criteria. If you had a million record table, and it found a matching record on the third record, then use of the EXISTS option would have saved you the reading of 999,997 records! NOT EXISTS works in much the same way. + +Use of Indexes + +When SQL Server decides to use an index, the process actually works somewhat similarly to a table scan, but with a few shortcuts. + +During the query optimization process, the Optimizer takes a look at all the available indexes and chooses the best one (this is primarily based on the information you specify in your joins and WHERE clause, combined with statistical information SQL Server keeps on index make-up). Once that index is chosen, SQL Server navigates the tree structure to the point of data that matches your criteria and again extracts only the records it needs. The difference is that, since the data is sorted, the query engine knows when it has reached the end of the current range it is looking for. It can then end the query, or move on to the next range of data as necessary. + +If you ponder the query topics you've worked with and studied thus far, you may notice some striking resemblances to how the EXISTS option works. The EXISTS keyword allows a query to quit running the instant that it finds a match. The performance gains of using an index are similar or even better since the process of searching for data can work in a similar fashion—that is, the server is able to know when there is nothing left that's relevant, and can stop things right there. Even better, however, is that by using an index, you don't have to limit yourself to Boolean situations (does the piece of data I was after exist—yes or no?). You can apply this same notion to both the beginning and end of a range—you are able to gather ranges of data with essentially the same benefits that using an index gives to finding data. What's more, you can do a very fast lookup (called a SEEK) of your data rather than hunting through the entire table. + +Don't get the impression from my comparing what indexes do for you to the EXISTS operator that indexes replace the EXISTS operator altogether (or vice versa). The two are not mutually exclusive; they can be used together, and often are. 
I mention them here together only because they have the similarity of being able to tell when their work is done, and quit before getting to the physical end of the table. + +Index Types and Index Navigation + +Although there are nominally two types of indexes in SQL Server (clustered and non-clustered), there are actually, internally speaking, three different types: + + * Clustered indexes + * Non-clustered indexes—which comprise: + * Non-clustered indexes on a heap + * Non-clustered indexes on a clustered index + +The way the physical data is stored varies between clustered and non-clustered indexes. The way SQL Server traverses the B-Tree to get to the end data varies between all three index types. + +All SQL Server indexes have leaf level and non-leaf level pages. As I mentioned when I discussed B-Trees, the leaf level is the level that holds the "key" to identifying the record, and the non-leaf level pages are guides to the leaf level. + +The indexes are built over either a clustered table (if the table has a clustered index) or what is called a heap (what's used for a table without a clustered index). + + * A clustered table is any table that has a clustered index on it. Clustered indexes are discussed in detail shortly, but what they mean to the table is that the data is physically stored in a designated order. Individual rows are uniquely identified through the use of the cluster key—the columns that define the clustered index. + +This should bring to mind the question, "What if the clustered index is not unique?" That is, how can a clustered index be used to uniquely identify a row if the index is not a unique index? The answer lies under the covers—SQL Server forces any clustered indexes to be unique—even if you don't define it that way. Fortunately, it does this in a way that doesn't change how you use the index. You can still insert duplicate rows if you wish, but SQL Server will add a suffix to the key internally to ensure that the row has a unique identifier. + + * A heap is any table that does not have a clustered index on it. In this case, a unique identifier, or row ID (RID) is created based on a combination of the extent, pages, and row offset (places from the top of the page) for that row. A RID is necessary only if there is no cluster key available (no clustered index). + +Clustered Indexes + +A clustered index is unique for any given table—you can have only one per table. You don't have to have a clustered index, but you'll find it to be one of the most commonly chosen types as the first index, for a variety of reasons that will become apparent as you look at your index types. + +What makes a clustered index special is that the leaf level of a clustered index is the actual data—that is, the data is resorted to be stored in the physical order defined in the index or related key command. This means that once you get to the leaf level of the index, you're done—you're at the data. Any new record is inserted according to its correct physical order in the clustered index. How new pages are created changes depending on where the record needs to be inserted. + +In the case of a new record that needs to be inserted into the middle of the index structure, a normal page split occurs. The last half of the records from the old page are moved to the new page and the new record is inserted into the new or old page as appropriate. 
+
In the case of a new record that is logically at the end of the index structure, a new page is created, but only the new record is added to the new page, as shown in Figure 6.5.
+
Navigating the Tree
+
As I've indicated previously, even the indexes in SQL Server are stored in a B-Tree. Theoretically, a B-Tree always has half of the remaining information in each possible direction as the tree branches. Take a look at a visualization of what a B-Tree looks like for a clustered index (Figure 6.6).
+
Figure 6.5
+
Figure 6.6
+
As you can see, it looks essentially identical to the more generic B-Trees we discussed earlier in the chapter. In this case, we're doing a range search (something clustered indexes are particularly good at) for numbers 158–400. All we have to do is the following:
+
Navigate to the first record, and include all remaining records on that page—we know we need the rest of that page because the information from one node up lets us know that we'll also need data from a few other pages. Because this is an ordered list, we can be sure it's continuous—that means if the next page has records that should be included, then the rest of this page must be included. We can just start spewing out data from those pages without having to do the verification side of things.
+
We start by navigating to the root node. SQL Server is able to locate the root node based on an entry that is kept in a system table. You can look at the logical content of that table by querying sys.indexes.
+
Every index in your database has an entry in sys.indexes. This system view is part of your database (as opposed to being in the master database) and shows the stored location information for all the indexes in your database as well as which columns they are based on.
+
In older versions of SQL Server, you could query against the underlying table (technically you still can, but I highly recommend against such direct queries at this point), which is called sysindexes.
+
By looking through the page that serves as the root node, we can figure out what the next page we need to examine is (the second page on the second level as we have it drawn here). We then continue the process. With each step we take down the tree, we are getting to smaller and smaller subsets of data.
+
Eventually, we will get to the leaf level of the index. In the case of our clustered index, getting to the leaf level of the index means that we are also at our desired row(s) and our desired data.
+
I can't stress enough the importance of the distinction that, with a clustered index, when you've fully navigated the index, you've fully navigated to your data. How much of a performance difference this can make will really show its head as you look at non-clustered indexes—particularly when the non-clustered index is built over a clustered index.
+
Non-Clustered Indexes on a Heap
+
Non-clustered indexes on a heap work very similarly to clustered indexes in most ways. They do, however, have a few notable differences:
+
The leaf level is not the data—instead, it is the level at which you are able to obtain a pointer to that data. This pointer comes in the form of the RID, which, as described earlier in the chapter, is made up of the extent, page, and row offset for the particular row being pointed to by the index.
Even though the leaf level is not the actual data (instead, it has the RID), you have only one more step than with a clustered index—because the RID has the full information on the location of the row, you can go directly to the data. + +Don't, however, misunderstand this "one more step" to mean that there's only a small amount of overhead difference, and that non-clustered indexes on a heap will run close to as fast as a clustered index. With a clustered index, the data is physically in the order of the index. That means, for a range of data, when you find the row that has the beginning of your data on it, there's a good chance that the other rows are on that page with it (that is, you're already physically almost to the next record since they are stored together). With a heap, the data is not linked together in any way other than through the index. From a physical standpoint, there is absolutely no sorting of any kind. This means that, from a physical read standpoint, your system may have to retrieve records from all over the file. Indeed, it's quite possible (possibly even probable) that you will wind up fetching data from the same page several separate times. SQL Server has no way of knowing it will have to come back to that physical location because there was no link between the data. With the clustered index, it knows that's the physical sort, and can therefore grab it all in just one visit to the page. + +Just to be fair to the non-clustered index on a heap here versus the clustered index, the odds are extremely high that any page that was already read once will still be in the memory cache, and, thus, will be retrieved extremely quickly. Still, it does add some additional logical operations to retrieve the data. + +Figure 6.7 shows the same search you did with the clustered index, only with a non-clustered index on a heap this time. + +Figure 6.7 + +Through most of the index navigation, things work exactly as they did before. You start out at the same root node, and you traverse the tree dealing with more and more focused pages until you get to the leaf level of your index. This is where you run into the difference. With a clustered index, you could have stopped right here, but, with a non-clustered index, you have more work to do. If the non-clustered index is on a heap, then you have just one more level to go. You take the Row ID from the leaf-level page, and navigate to it—it is not until that point that you are at your actual data. + +Non-Clustered Indexes on a Clustered Table + +With non-clustered indexes on a clustered table, the similarities continue—but so do the differences. Just as with non-clustered indexes on a heap, the non-leaf level of the index looks pretty much as it did for a clustered index. The difference does not come until you get to the leaf level. + +At the leaf level, you have a rather sharp difference from what you've seen with the other two index structures—you have yet another index to look over. With clustered indexes, when you got to the leaf level, you found the actual data. With non-clustered indexes on a heap, you didn't find the actual data, but did find an identifier that let you go right to the data (you were just one step away). With non-clustered indexes on a clustered table, you find the cluster key. That is, you find enough information to go and make use of the clustered index. + +You end up with something that looks like Figure 6.8. + +What you end up with is two entirely different kinds of lookups. 
In the example from your diagram, you start off with a ranged search—you do one single lookup in your index and are able to look through the non-clustered index to find a continuous range of data that meets your criterion (LIKE 'T%'). This kind of lookup, where you can go right to a particular spot in the index, is called a seek.

The second kind of lookup then starts—the lookup using the clustered index. This second lookup is very fast; the problem lies in the fact that it must happen multiple times. You see, SQL Server retrieved a list from the first index lookup (a list of all the names that start with "T"), but that list doesn't logically match up with the cluster key in any continuous fashion—each record needs to be looked up individually, as shown in Figure 6.9.

Figure 6.8

Figure 6.9

Needless to say, this multiple lookup situation introduces more overhead than if you had just been able to use the clustered index from the beginning. The first index search—the one through your non-clustered index—is going to require very few logical reads.

For example, if I have a table with 1,000 bytes per row and I do a lookup similar to the one in our drawing (say, something that would return 5 or 6 rows), it would take only something on the order of 8–10 logical reads to get the information from the non-clustered index. However, that gets me only as far as being ready to look up the rows in the clustered index. Those lookups would cost approximately 3–4 logical reads each, or 15–24 additional reads. That probably doesn't seem like that big a deal at first, but look at it this way:

Logical reads went from 3 minimum to 24 maximum—that's roughly eight times the amount of work that had to be done.

Now expand this thought out to something where the range of values from the non-clustered index wasn't just five or six rows, but five or six thousand, or five or six hundred thousand rows—that's going to be a huge impact.

Don't let the extra overhead versus a clustered index scare you. The point isn't meant to scare you away from using indexes, but rather to point out that a non-clustered index is not going to be as efficient as a clustered index from a read perspective (it can, in some instances, actually be a better choice at insertion time). An index of any kind is usually (there are exceptions) the fastest way to do a lookup. I'll explain what index to use and why later in the chapter.

Creating, Altering, and Dropping Indexes

These work much as they do on other objects such as tables. Take a look at each, starting with CREATE.

Indexes can be created in two ways:

 * Through an explicit CREATE INDEX command
 * As an implied object when a constraint is created

Each of these has its own quirks about what it can and can't do, so take a look at each of them individually.

The CREATE INDEX Statement

The CREATE INDEX statement does exactly what it sounds like; it creates an index on the specified table or view based on the stated columns.

The syntax to create an index is somewhat drawn out and introduces several items that I haven't really talked about up to this point:

CREATE [UNIQUE] [CLUSTERED|NONCLUSTERED]
    INDEX <index name> ON <table or view name>
        (<column name> [ASC|DESC] [,...n])
    INCLUDE (<column name> [,...n])
    [WITH
        [PAD_INDEX = { ON | OFF }]
        [[,] FILLFACTOR = <fillfactor>]
        [[,] IGNORE_DUP_KEY = { ON | OFF }]
        [[,] DROP_EXISTING = { ON | OFF }]
        [[,] STATISTICS_NORECOMPUTE = { ON | OFF }]
        [[,] SORT_IN_TEMPDB = { ON | OFF }]
        [[,] ONLINE = { ON | OFF }]
        [[,] ALLOW_ROW_LOCKS = { ON | OFF }]
        [[,] ALLOW_PAGE_LOCKS = { ON | OFF }]
        [[,] MAXDOP = <maximum degree of parallelism>]
    ]
    [ON {<filegroup> | <partition scheme name> | DEFAULT }]

There is legacy syntax available for many of these options, and so you may see that syntax put into use to support prior versions of SQL Server. That syntax is, however, considered deprecated and will be removed at some point. I highly recommend that you stay with the newer syntax where possible.

There is a similar but sufficiently different syntax for creating XML and spatial indexes. These will be handled separately in the next chapter.

Loosely speaking, this statement follows the same CREATE syntax that you've seen plenty of already (and will see even more of). The primary hitch in things is that you have a few intervening parameters that you haven't seen elsewhere.

Just as you'll see with views in Chapter 8, you do have to add an extra clause onto your CREATE statement to deal with the fact that an index isn't really a standalone kind of object. It has to go together with a table or view, and you need to state the table that your column(s) are "ON."

After the ON <table or view name>
(<column name(s)>) clause, everything is optional. You can mix and match these options. Many of them are seldom used, but some (such as FILLFACTOR) can have a significant impact on system performance and behavior, so take a look at them one by one.

ASC/DESC

These two allow you to choose between an ascending and a descending sort order for your index. The default is ASC, which is, as you might guess, ascending order.

A question that might come to mind is why ascending versus descending matters; after all, SQL Server can just read an index backward if it needs the reverse sort order. Life is not, however, always quite so simple. Looking at the index in reverse order works just fine if you're dealing with only one column, or if your sort is always the same for all columns, but what if you needed to mix sort orders within an index? That is, what if you need one column to be sorted ascending, but the other descending? Since the indexed columns are stored together, reversing the way you look at the index for one column would also reverse the order for the additional columns. If you explicitly state that one column is ascending and the other is descending, then you invert the second column right within the physical data. There is suddenly no reason to change the way that you access your data.

As a quick example, imagine a reporting scenario where you want to order your employee list by the hire date, beginning with the most recent (a descending order), but you also want to order by their last name (an ascending order). In previous versions, SQL Server would have to do two operations: one for the first column and one for the second. By having control over the physical sort order of your data, you gain flexibility in the way you combine columns.

Generally speaking, you'll want to leave this one alone (again, remember backward compatibility). Some likely exceptions are:

 * You need to mix ascending and descending order across multiple columns.
 * Backward compatibility is not an issue.

INCLUDE

This was first added with SQL Server 2005. Its purpose is to provide better support for what are called covered queries. A query is considered to be "covered" when all of the data the query needs is contained in the index that is being used. If all the data needed is already in the index, then there is no need to go to the actual data page; as soon as SQL Server has gotten to the leaf level of the index, it has all it needs and can stop there (saving a bunch of I/O operations).

When you INCLUDE columns as opposed to placing them in the ON list, SQL Server adds them only at the leaf level of the index. Because each row at the leaf level of an index corresponds to a data row, what you're doing is essentially including more of just the raw data in the leaf level of your index. If you think about this, you can probably guess that INCLUDE really applies only to non-clustered indexes. (Clustered indexes already are the data at the leaf level, so there would be no point.)

Why does this matter? Well, as we'll discuss further as the book goes on, SQL Server stops working as soon as it has what it actually needs. So, if while traversing the index it can find all the data that it needs without continuing on to the actual data row, then it won't bother going to the data row (what would be the point?). By including a particular column in the index, you may "cover" a query that utilizes that particular index at the leaf level and save the I/O associated with using that index pointer to go to the data page.
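To make this concrete, here is a minimal sketch of a covering index that also uses the mixed ASC/DESC sorting discussed a moment ago. (The table and column names are hypothetical; they aren't from our sample database.)

-- Hypothetical example: a query such as
--   SELECT FirstName, HireDate FROM dbo.Employee WHERE LastName = 'Blake'
-- can be answered entirely from the leaf level of this index,
-- never touching the actual data pages.
CREATE NONCLUSTERED INDEX ix_Employee_LastName_HireDate
    ON dbo.Employee (LastName ASC, HireDate DESC)
    INCLUDE (FirstName);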
Careful not to abuse this one! When you INCLUDE columns, you are enlarging the size of the leaf level of your index pages. That means fewer rows will fit per page, and, therefore, more I/O may be required to see the same number of rows. The result may be that your effort to speed up one query may slow down others. To quote an old film from the eighties, "Balance, Daniel-san—balance!" Think about the effects on all parts of your system, not just the particular query you're working on that moment.

WITH

WITH is an easy one—it just tells SQL Server that you will indeed be supplying one or more of the options that follow.

PAD_INDEX

In the syntax list, this one comes first—but that will seem odd when you understand what PAD_INDEX does. In short, it determines just how full the non-leaf level pages of your index are going to be (as a percentage) when the index is first created. You don't state a percentage on PAD_INDEX because it will use whatever percentage is specified in the FILLFACTOR option that follows. Setting PAD_INDEX = ON would be meaningless without a FILLFACTOR (which is why it seems odd that it comes first).

FILLFACTOR

When SQL Server first creates an index, the pages are, by default, filled as full as they can be, minus two records. You can set the FILLFACTOR to be any value between 1 and 100. This number will be how full your pages are, as a percentage, after index construction is completed. Keep in mind, however, that as your pages split, your data will still be distributed 50–50 between the two pages. You cannot control the fill percentage on an ongoing basis other than regularly rebuilding the indexes (something you should do—setting up a maintenance schedule for this is covered in Chapter 23).

You use a FILLFACTOR when you need to adjust the page densities. Think about things this way:

 * If it's an OLTP system, you want the FILLFACTOR to be low.
 * If it's an OLAP or other very stable (in terms of changes—very few additions and deletions) system, you want the FILLFACTOR to be as high as possible.
 * If you have something that has a medium transaction rate and a lot of report-type queries against it, then you probably want something in the middle (not too low, not too high).

If you don't provide a value, then SQL Server will fill your pages to two rows short of full, with a minimum of one row per page. (For example, if your row is 8,000 characters wide, you can fit only one row per page, so leaving things two rows short wouldn't work.)

IGNORE_DUP_KEY

The IGNORE_DUP_KEY option is a way of doing little more than circumventing the system. In short, it causes a UNIQUE constraint to have a slightly different action from that which it would otherwise have.

Normally, a unique constraint, or unique index, does not allow duplicates of any kind. If a transaction tried to create a duplicate based on a column that is defined as unique, then that transaction would be rolled back and rejected. Once you set the IGNORE_DUP_KEY option, however, you'll get mixed behavior. You will still receive an error message, but the error will be only of a warning level. The record is still not inserted.

This last line—the record is still not inserted—is a critical concept from an IGNORE_DUP_KEY standpoint. A rollback isn't issued for the transaction (the error is a warning error rather than a critical error), but the duplicate row will have been rejected.

Why would you do this?
Well, it's a way of storing unique values, but not disturbing a transaction that tries to insert a duplicate. For whatever process is inserting the would-be duplicate, it may not matter at all that it's a duplicate row (no logical error from it). Instead, that process may have an attitude that's more along the lines of, "Well, as long as I know there's one row like that in there, I'm happy. I don't care whether it's the specific row that I tried to insert or not."

DROP_EXISTING

If you specify the DROP_EXISTING option, any existing index with the name in question will be dropped prior to construction of the new index. This option is much more efficient than simply dropping and re-creating an existing index when you use it with a clustered index. If you rebuild an exact match of the existing index, SQL Server knows that it need not touch the non-clustered indexes, while an explicit drop and create would involve rebuilding all of the non-clustered indexes twice in order to accommodate the different row locations. If you change the structure of the index using DROP_EXISTING, the non-clustered indexes are rebuilt only once instead of twice. Furthermore, you cannot simply drop and re-create an index created by a constraint, for example, to implement a certain fill factor. DROP_EXISTING is a workaround to this.

STATISTICS_NORECOMPUTE

By default, SQL Server attempts to automate the process of updating the statistics on your tables and indexes. By selecting the STATISTICS_NORECOMPUTE option, you are saying that you will take responsibility for the updating of the statistics. To turn this option off, you need to run the UPDATE STATISTICS command, but not use the NORECOMPUTE option.

I strongly recommend against using this option. Why? Well, the statistics on your index are what the Query Optimizer uses to figure out just how helpful your index is going to be for a given query. The statistics on an index are changing constantly as the data in your table goes up and down in volume and as the specific values in a column change. When you combine these two facts, you should be able to see that not updating your statistics means that the Query Optimizer is going to be running your queries based on out-of-date information. Leaving the automatic statistics feature on means that the statistics will be updated regularly. (Just how often depends on the nature and frequency of your updates to the table.) Conversely, turning automatic statistics off means that you will either be out of date or you will need to set up a schedule to manually run the UPDATE STATISTICS command.

SORT_IN_TEMPDB

This option makes sense only when your tempdb is stored on a physically separate drive from the database that is to contain the new index. This is largely an administrative function, so I'm not going to linger on this topic for more than a brief overview of what it is and why it makes sense only when tempdb is on a separate physical device.

When SQL Server builds an index, it has to perform multiple reads to take care of the various index construction steps:

1. Read through all the data, constructing a leaf row corresponding to each row of actual data. Just like the actual data and final index, these go into pages for interim storage. These intermediate pages are not the final index pages, but rather a holding place to temporarily store things every time the sort buffers fill up.

2. A separate run is made through these intermediate pages to merge them into the final leaf pages of the index.
3. Non-leaf pages are built as the leaf pages are being populated.

If the SORT_IN_TEMPDB option is not used, then the intermediate pages are written out to the same physical files that the database is stored in. This means that the reads of the actual data have to compete with the writes of the build process. The two cause the disk heads to move to different places from those the other (read versus write) needs. The result is that the disk heads are constantly moving back and forth; this takes time.

If, on the other hand, SORT_IN_TEMPDB is used, then the intermediate pages will be written to tempdb rather than the database's own file. If they are on separate physical drives, this means that there is no competition between the read and write operations of the index build. Keep in mind, however, that this works only if tempdb is on a separate physical drive from your database file; otherwise, the change is only in name, and the competition for I/O is still a factor.

If you're going to use SORT_IN_TEMPDB, make sure that there is enough space in tempdb for large operations.

ONLINE

If you set this to ON, it forces the table to remain available for general access and does not create any locks that block users from the index and/or table. By default, a full index operation will grab the locks (eventually a table lock) it needs to have full and efficient access to the table. The side effect, however, is that your users are blocked out. (Yeah, it's a paradox; you're likely building an index to make the database more usable, but you essentially make the table unusable while you do it.)

Now, you're probably thinking something like: "Oh, that sounds like a good idea. I'll do that every time so my users are unaffected." Poor thinking. Keep in mind that any index construction like that is probably a very highly I/O-intensive operation, so it is affecting your users one way or the other. Now, add that there is a lot of additional overhead required in the index build for it to make sure that it doesn't step on the toes of any of your users. If you let SQL Server have free rein over the table while it's building the index, then the index will be built much faster, and the overall time that the build is affecting your system will be much smaller.

ONLINE index operations are supported only in the Enterprise Edition of SQL Server. You can execute the index command with the ONLINE directive in other editions, but it will be ignored, so don't be surprised if you use ONLINE and find your users still being blocked out by the index operation if you're using a lesser edition of SQL Server.

ALLOW ROW/PAGE LOCKS

This is a longer-term directive than ONLINE is, and is a very, very advanced topic. For purposes of this book, and given how much we've introduced so far on locking, I want to stick with a pretty simple explanation.

Through much of the book thus far, I have repeatedly used the term lock. As explained early on, this is something of a placeholder to avoid conflicts in data integrity. The ALLOW settings you're looking at here are setting directives regarding whether this index will allow those styles of locks or not. This falls under the heading of extreme performance tweak.

MAXDOP

This is overriding the system setting for the maximum degree of parallelism for purposes of building this index. Parallelism is not something I talk about in this book, so I'll give you a mini-dose of it here.
In short, the degree of parallelism is how many processes are put to use for one database operation (in this case, the construction of an index). There is a system setting called the max degree of parallelism that allows you to set a limit on how many processes can run in parallel per logical operation. The MAXDOP option in the index creation options allows you to set the degree of parallelism to be either higher or lower than the base system setting as you deem appropriate.

ON

SQL Server gives you the option of storing your indexes separately from the data by using the ON option. This can be nice from a couple of perspectives:

 * The space that is required for the indexes can be spread across other drives.
 * The I/O for index operations does not burden the physical data retrieval.

There's more to this, but this is highly advanced stuff. It is very data- and use-dependent, and so we'll consider it out of the scope of this book.

Implied Indexes Created with Constraints

I guess I call this one "index by accident." It's not that the index shouldn't be there. It has to be there if you want the constraint that created the index. It's just that I've seen an awful lot of situations where the only indexes on the system were those created in this fashion. Usually, this implies that the administrators and/or designers of the system are virtually oblivious to the concept of indexes.

However, you'll also find another bizarre twist on this one—the situation where the administrator or designer knows how to create indexes but doesn't really know how to tell what indexes are already on the system and what they are doing. This kind of situation is typified by duplicate indexes. As long as they have different names, SQL Server will be more than happy to create them for you.

Implied indexes are created when one of two constraints is added to a table:

 * A PRIMARY KEY
 * A UNIQUE constraint (a.k.a. an alternate key)

You've seen plenty of the CREATE syntax up to this point, so I won't belabor it; however, it should be noted that, other than {CLUSTERED|NONCLUSTERED} and FILLFACTOR, none of the options are available when an index is created implicitly to support a constraint.

ALTER INDEX

The command ALTER INDEX is somewhat deceptive. Up until now, ALTER commands have always been about changing the definition of your object. You ALTER tables to add or disable constraints and columns, for example. ALTER INDEX is different. It is all about maintenance and zero about structure. If you need to change the makeup of your index, you still need either to DROP and CREATE it or to CREATE and use the index with the DROP_EXISTING=ON option.

As you saw earlier in the chapter, SQL Server gives you an option for controlling just how full your leaf-level pages are and, if you choose, another option to deal with non-leaf-level pages. Unfortunately, these are proactive options. They are applied once, and then you need to reapply them as necessary by rebuilding your indexes and reapplying the options.

In the upcoming section on maintenance, you'll learn more on the wheres and whys of utilizing this command, but for now take it on faith that you'll use maintenance commands like ALTER INDEX as part of your regular maintenance routine.

The ALTER INDEX syntax looks like this:

ALTER INDEX { <index name> | ALL }
    ON <table or view name>
    { REBUILD
        [ [ WITH (
            [ PAD_INDEX = { ON | OFF } ]
            | [[,] FILLFACTOR = <fillfactor> ]
            | [[,] SORT_IN_TEMPDB = { ON | OFF } ]
            | [[,] IGNORE_DUP_KEY = { ON | OFF } ]
            | [[,] STATISTICS_NORECOMPUTE = { ON | OFF } ]
            | [[,] ONLINE = { ON | OFF } ]
            | [[,] ALLOW_ROW_LOCKS = { ON | OFF } ]
            | [[,] ALLOW_PAGE_LOCKS = { ON | OFF } ]
            | [[,] MAXDOP = <max degree of parallelism> ]
        ) ]
        | [ PARTITION = <partition number>
            [ WITH ( <partition rebuild index option> [ ,...n ] ) ] ] ]
    | DISABLE
    | REORGANIZE
        [ PARTITION = <partition number> ]
        [ WITH ( LOB_COMPACTION = { ON | OFF } ) ]
    | SET ( [ ALLOW_ROW_LOCKS = { ON | OFF } ]
        | [[,] ALLOW_PAGE_LOCKS = { ON | OFF } ]
        | [[,] IGNORE_DUP_KEY = { ON | OFF } ]
        | [[,] STATISTICS_NORECOMPUTE = { ON | OFF } ]
        )
    } [ ; ]

Several of the options are common to the CREATE INDEX command, so I will skip redefining those particular ones here. Beyond that, a fair amount of the ALTER-specific options are fairly detailed and relate to dealing with things like fragmentation (you'll get to fragmentation and maintenance shortly) or are more DBA oriented and usually used on an ad hoc basis to deal with very specific problems. The core elements here should, however, be part of your regular maintenance planning.

You'll start by looking at a couple of top parameters and then look at the options that are part of your larger maintenance planning needs.

Index Name

You can name a specific index if you want to maintain one specific index, or use ALL to indicate that you want to perform this maintenance on every index associated with the named table.

Table or View Name

Pretty much just what it sounds like—the name of the specific object (table or view) that you want to perform the maintenance on. Note that it needs to be one specific table. (You can't feed it a list and say, "do all of these please!")

REBUILD

This is the "industrial-strength" approach to fixing an index. If you run ALTER INDEX with this option, the old index is completely thrown away and a new one reconstructed from scratch. The result is a truly optimized index, where every page in both the leaf and non-leaf levels of the index has been reconstructed as you have defined it (either with the defaults, or using switches to change things like the fill factor). If the index in question is a clustered index, then the physical data is also reorganized.

By default, the pages will be reconstituted to be full minus two records. Just as with the CREATE INDEX syntax, you can set the FILLFACTOR to be any value between 0 and 100. This number will be the percent full that your pages are once the database reorganization is complete. Remember, though, that as your pages split, your data will still be distributed 50–50 between the two pages. You cannot control the fill percentage on an ongoing basis other than regularly rebuilding the indexes.

Careful on this one. As soon as you kick off a REBUILD, the index you are working on is essentially gone until the rebuild is complete. Any queries that relied on that index may become exceptionally slow (potentially by orders of magnitude). This is the sort of thing you want to test on an offline system first to have an idea how long it's going to take, and then schedule to run in off hours (preferably with someone monitoring it to be sure it's back online when peak hours come along).

This one can have major side effects while it runs, and thus it falls squarely in the domain of the database administrator, in my not-so-humble opinion.

DISABLE

This one does what it says, only in somewhat drastic fashion.
It would be nice if all this command did was take your index offline until you decided further what you want to do, but instead it essentially marks the index as unusable. Once an index has been disabled, it must be rebuilt (not reorganized, but rebuilt) before it will be active again.

This is one you're very, very rarely going to do yourself (you would more likely just drop the index)—it is far more likely to happen during a SQL Server upgrade or some other oddball situation.

Yet another BE CAREFUL!!! warning on this one. If you disable the clustered index for your table, it has the effect of disabling the table. The data will remain, but will be inaccessible by all indexes (since they all depend on the clustered index) until you rebuild the clustered index.

REORGANIZE

BINGO!!! from the developer perspective. With REORGANIZE, you hit much more of a happy medium in life. When you reorganize your index, you get a slightly less complete optimization than you get with a full rebuild, but one that occurs online. (Users can still utilize the index.)

This should, if you're paying attention, bring about the question "What exactly do you mean by 'slightly less complete'?" Well, REORGANIZE works only on the leaf level of your index; non-leaf levels of the index go untouched. This means that you're not quite getting a full optimization, but, for the lion's share of indexes, that is not where your real cost of fragmentation is (though it can happen, and your mileage may vary).

Given its much lower impact on users, this is usually the tool you'll want to use as part of your regular maintenance plan. We'll look into this a bit more later when talking about fragmentation.

DROP INDEX

This one returns to most of the simplicity of prior DROP statements. The only real trick to it is that, since an index is not a standalone object (it is essentially contained within the definition of a table), you must name not only the index but also the table it belongs to. The syntax looks like this:

DROP INDEX <table or view name>.<index name>
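For example, dropping a hypothetical index (the index name here is made up purely for illustration) would look like this:

-- ix_Example is a made-up index name, standing in for one you own
DROP INDEX Sales.SalesOrderDetail.ix_Example;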
As you can see, there's not really much to it. You can use full four-part naming (I guess it turns into five-part if you include the index) if you need to.

Choosing Wisely: Deciding What Index Goes Where and When

By now, you're probably thinking to yourself, "Gee, I'm always going to create clustered indexes!" There are plenty of good reasons to think that way. Just keep in mind that there are also some reasons not to.

Choosing which indexes to include and which not to include can be a tough process, and, in case that wasn't enough, you have to make some decisions about what type you want them to be. The latter decision is made simultaneously easier and harder by the fact that you can have only one clustered index. It means that you have to choose wisely to get the most out of it.

Selectivity

Indexes, particularly non-clustered indexes, are primarily beneficial when there is a reasonably high level of selectivity within the index. By selectivity, I'm referring to the percentage of values in the column that are unique. The higher the percentage of unique values within a column, the higher the selectivity is said to be, and the greater the benefit of indexing.

If you think back to the sections on non-clustered indexes—particularly the section on non-clustered indexes over a clustered index—you will recall that the lookup in the non-clustered index is really only the beginning. You still need to make another loop through the clustered index to find the real data. Even with the non-clustered index on a heap, you still end up with multiple physically separate reads to perform.

If one lookup in your non-clustered index is going to generate multiple additional lookups in a clustered index, then you are probably better off with the table scan. The exponential effect that's possible here is actually quite amazing. Consider that the looping process created by the non-clustered index is not worth it if you don't have somewhere in the area of 90–95 percent uniqueness in the indexed column.

Clustered indexes are substantially less affected by this because, once you're at the start of your range of data—unique or not—you're there. There are no additional index pages to read. Still, more than likely, there are other columns your clustered index could be put to greater use on.

One other exception to the rule of selectivity has to do with foreign keys. If your table has a column that is a foreign key, then, in all likelihood, you're going to benefit from having an index on that column. Why foreign keys and not other columns? Well, foreign keys are frequently the target of joins with the table they reference. Indexes, regardless of selectivity, can be very instrumental in join performance because they allow what is called a merge join. A merge join obtains a row from each table and compares them to see if they match the join criteria (what you're joining on). Since there are indexes on the related columns in both tables, the seek for both rows is very fast.

The point here is that selectivity is not everything, but it is a big issue to consider. If the column in question is not in a foreign key situation, then it is almost certainly second only to the "How often will this be used?" question in terms of issues you need to consider.

Watching Costs: When Less Is More

Remember that, while indexes speed up performance when reading data, they are actually very costly when modifying data. Indexes are not maintained by magic.
Every time that you make a modification to your data, any indexes related to that data also need to be updated.

When you insert a new row, a new entry must be made into every index on your table. Remember, too, that when you update a row, this is handled as a delete and insert—again, your indexes have to be updated. But wait! There's more! (Feeling like a late-night infomercial here.) When you delete records, again, you must update all the indexes, not just the data. For every index that you create, you are creating one more block of entries that has to be updated.

Notice, by the way, that I said entries plural—not just one. Remember that a B-Tree has multiple levels to it. Every time that you make a modification to the leaf level, there is a chance that a page split will occur, and that one or more non-leaf level pages must also be modified to have the reference to the proper leaf page.

Sometimes—quite often actually—not creating that extra index is the thing to do. Sometimes, the best thing to do is choose your indexes based on the transactions that are critical to your system and use the table in question. Does the code for the transaction have a WHERE clause in it? What column(s) does it use? Is there a sorting required?

Choosing That Clustered Index

Remember that you can have only one, so you need to choose it wisely.

By default, your primary key is created with a clustered index. This is often the best place to have it, but not always (indeed, it can seriously hurt you in some situations), and if you leave things this way, you won't be able to use a clustered index anywhere else. The point here is don't just accept the default. Think about it when you are defining your primary key: Do you really want it to be a clustered index?

If you decide that you indeed want to change things—that is, you don't want to declare things as being clustered—just add the NONCLUSTERED keyword when you create your table. For example:

CREATE TABLE MyTableKeyExample
(
    Column1 int IDENTITY
        PRIMARY KEY NONCLUSTERED,
    Column2 int
)

Once the index is created, the only way to change it is to drop and rebuild it, so you want to get it set correctly up front.

Keep in mind that, if you change which column(s) your clustered index is on, SQL Server will need to do a complete resorting of your entire table. (Remember, for a clustered index, the table sort order and the index order are the same.) Now, consider a table you have that is 5,000 characters wide and has a million rows in it. That is an awful lot of data that has to be reordered. Several questions should come to mind from this:

 * How long will it take? It could be a long time, and there really isn't a good way to estimate that time.
 * Do I have enough space? Figure that, in order to do a resort on a clustered index, you will, on average, need an additional 1.2 times (the working space plus the new index) the amount of space your table is already taking up. This can turn out to be a very significant amount of space if you're dealing with a large table. Make sure you have the room to do it in. All this activity will, by the way, happen in the database itself, so this will also be affected by how you have your maximum size and growth options set for your database.
 * Should I use the SORT_IN_TEMPDB option? If tempdb is on a separate physical array from your main database and it has enough room, then the answer is probably yes (there's a short sketch of what such a move looks like right after this list).
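Purely as an illustration, here is roughly what moving the clustered index might look like, reusing the MyTableKeyExample table from above (whose primary key was deliberately declared NONCLUSTERED, leaving the one allowed clustered index free):

-- Building this will physically re-sort the table by Column2.
-- SORT_IN_TEMPDB pushes the intermediate sort pages into tempdb,
-- a win only if tempdb is on a separate physical drive.
CREATE CLUSTERED INDEX CIX_MyTableKeyExample
    ON MyTableKeyExample (Column2)
    WITH (SORT_IN_TEMPDB = ON);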
The Pros

Clustered indexes are best for queries when the column(s) in question will frequently be the subject of a ranged query. This kind of query is typified by use of the BETWEEN statement or the < or > symbols. Queries that use a GROUP BY and make use of the MAX, MIN, and COUNT aggregators are also great examples of queries that use ranges and love clustered indexes. Clustering works well here because the search can go straight to a particular point in the physical data, keep reading until it gets to the end of the range, and then stop. It is extremely efficient.

Clusters can also be excellent when you want your data sorted (using ORDER BY) based on the cluster key.

The Cons

There are two situations where you don't want to create that clustered index. The first is fairly obvious—when there's a better place to use it. I know I'm sounding repetitive here, but don't use a clustered index on a column just because it seems like the thing to do. (Primary keys are the common culprit here.) Be sure that you don't have another column that it's better suited to first.

Perhaps the much bigger no-no use for clustered indexes, however, is when you are going to be doing a lot of inserts in a non-sequential order. Remember that concept of page splits? Well, here's where it can come back and haunt you big time.

Imagine this scenario: You are creating an accounting system. You would like to make use of the concept of a transaction number for your primary key in your transaction files, but you would also like those transaction numbers to be somewhat indicative of what kind of transaction it is. (It really helps your accountants with troubleshooting.) So, you come up with something of a scheme: You'll place a prefix on all the transactions indicating what sub-system they come out of. They will look something like this:

ARXXXXXX Accounts Receivable Transactions
GLXXXXXX General Ledger Transactions
APXXXXXX Accounts Payable Transactions

where XXXXXX will be a sequential numeric value.

This seems like a great idea, so you implement it, leaving the default of the clustered index going on the primary key.

At first look, everything about this setup looks fine. You're going to have unique values, and the accountants will love the fact that they can infer where something came from based on the transaction number. The clustered index seems to make sense since they will often be querying for ranges of transaction IDs.

Ah, if only it were that simple. Think about your inserts for a bit. With a clustered index, you originally had a nice mechanism to avoid much of the overhead of page splits. When a new record was inserted that was to go after the last record in the table, then, even if there was a page split, only that record would go to the new page; SQL Server wouldn't try to move around any of the old data. Now you've messed things up, though.

New records inserted from the General Ledger will wind up going on the end of the file just fine. (GL is last alphabetically, and the numbers will be sequential.) The AR and AP transactions have a major problem, though; they are going to be doing non-sequential inserts. When AP000025 gets inserted and there isn't room on the page, SQL Server is going to see AR000001 in the table and know that it's not a sequential insert. Half the records from the old page will be copied to a new page before AP000025 is inserted.

The overhead of this can be staggering.
Remember that you're dealing with a clustered index, and that the clustered index is the data. The data is in index order. This means that, when you move the index to a new page, you are also moving the data. Now imagine that you're running this accounting system in a typical OLTP environment (you don't get much more OLTP-like than an accounting system) with a bunch of data-entry people keying in vendor invoices or customer orders as fast as they can. You're going to have page splits occurring constantly, and every time you do, you're going to see a brief hesitation for users of that table while the system moves data around.

Fortunately, there are a couple of ways to avoid this scenario:

 * Choose a cluster key that is going to be sequential in its inserting. You can either create an identity column for this or you may have another column that logically is sequential to any transaction entered regardless of system.
 * Choose not to use a clustered index on this table. This is often the best option in a situation like that in this example, since an insert into a non-clustered index on a heap is usually faster than one on a cluster key.

Even though I've told you to lean toward sequential cluster keys to avoid page splits, you also have to realize that there's a cost there. Among the downsides of sequential cluster keys are concurrency issues (two or more people trying to get to the same object at the same time). It's all about balancing what you want, what you're doing, and what it's going to cost you elsewhere.

This is perhaps one of the best examples of why I have gone into so much depth as to how things work. You need to think through how things are actually going to get done before you have a good feel for what the right index to use (or not to use) is.

Column Order Matters

Just because an index has two columns, it doesn't mean that the index is useful for any query that refers to either column.

An index is considered for use only if the first column listed in the index is used in the query. The bright side is that there doesn't have to be an exact one-for-one match to every column—just the first. Naturally, the more columns that match (in order), the better, but only the first creates a definite do-not-use situation.

Think about things this way. Imagine that you are using a phone book. Everything is indexed by last name and then first name. Does this sorting do you any good if all you know is that the person you want to call is named Fred? On the other hand, if all you know is that his last name is Blake, the index will still serve to narrow the field for you.

One of the more common mistakes that I see in index construction is to think that one index that includes all the columns is going to be helpful for all situations. Indeed, what you're really doing is storing all the data a second time. The index will be totally ignored if the first column of the index isn't mentioned in the JOIN, ORDER BY, or WHERE clauses of the query.

Dropping Indexes

If you're constantly re-analyzing the situation and adding indexes, don't forget to drop indexes, too. Remember the overhead on inserts. It doesn't make much sense to look at the indexes that you need and not also think about which indexes you do not need. Always ask yourself: "Can I get rid of any of these?"

The syntax to drop an index is pretty much the same as that for dropping a table. The only hitch is that you need to qualify the index name with the table or view it is attached to:

DROP INDEX <table or view name>.<index name>
And it's gone.

Use the Database Engine Tuning Advisor

It would be my hope that you'll learn enough about indexes not to need the Database Engine Tuning Advisor, but it still can be quite handy. It works by taking a workload file, which you generate using the SQL Server Profiler (discussed in Chapter 22), and looking over that information for what indexes will work best on your system.

The Database Engine Tuning Advisor is found as part of the Tools menu of the SQL Server Management Studio. It can also be reached as a separate program item in the Start Menu of Windows (under Microsoft SQL Server 2008 ⇒ Performance Tools). As with most tuning tools, I don't recommend using this tool as the sole way you decide what indexes to build, but it can be quite handy in terms of making some suggestions that you may not have thought of.

Maintaining Your Indexes

As developers, we often tend to forget about our product after it goes out the door. For many kinds of software, that's something you can get away with just fine. You ship it and then you move on to the next product or next release. However, with database-driven projects, it's virtually impossible to get away with. You need to take responsibility for the product well beyond the delivery date.

Please don't take me to mean that you have to go serve a stint in the tech support department. I'm actually talking about something even more important: maintenance planning.

There are really two issues to be dealt with in terms of the maintenance of indexes:

 * Page splits
 * Fragmentation

Both are related to page density and, while the symptoms are substantially different, the troubleshooting tool is the same, as is the cure.

Fragmentation

We've already talked about page splits quite a bit, but we haven't really touched on fragmentation. I'm not talking about the fragmentation that you may have heard of with your O/S files and the defrag tool you use, because that won't help with database fragmentation.

Fragmentation happens when your database grows, pages split, and then data is eventually deleted. While the B-Tree mechanism is really not that bad at keeping things balanced from a growth point of view, it doesn't really have a whole lot to offer as you delete data. Eventually, you may get down to a situation where you have one record on this page, a few records on that page—a situation where many of your data pages are holding only a small fraction of the amount of data that they could hold.

The first problem with this is probably the first you would think about—wasted space. Remember that SQL Server allocates an extent of space at a time. If only one page has one record on it, then that extent is still allocated. In the case of the empty pages in the extent, SQL Server will see those pages as available for reuse in the same table or index, but if, for example, that table or index is decreasing in size, the free pages in the extent will remain unused.

The second problem is the one that is more likely to cause you grief: Records that are spread all over the place cause additional overhead in data retrieval. Instead of just loading up one page and grabbing the 10 rows it requires, SQL Server may have to load 10 separate pages in order to get that same information. It isn't just reading the row that causes effort; SQL Server has to read that page in first. More pages equals more work on reads.

That being said, database fragmentation does have its good side. OLTP systems positively love fragmentation.
Any guesses as to why? Page splits. Pages that don't have much data in them can have data inserted with little or no fear of page splits.

So, high fragmentation equates to poor read performance, but it also equates to excellent insert performance. As you might expect, this means that OLAP systems really don't like fragmentation, but OLTP systems do.

Identifying Fragmentation

SQL Server has always had commands to help you identify just how full the pages and extents in your database are. In SQL Server 2005, Microsoft greatly expanded the options and, in particular, the usability of management tools for indexes, and those increased options continue to become more mainstream as we move into the SQL Server 2008 era and slowly become less concerned about compatibility with SQL Server 2000 and earlier. We can use the information provided by these commands and tools to make some decisions about what we want to do to maintain our database.

sys.dm_db_index_physical_stats

The sys.dm_db_index_physical_stats function is one of several metadata functions that were added back in SQL Server 2005. (There is a discussion of these in Appendix [CHECK].) The idea behind these and similar metadata functions is to allow developers and administrators alike more flexible access to data on the condition of our server, the database, and the tables and indexes within. Whereas before we were stuck with different functions within the Database Consistency Checker (DBCC), which gave free-form output that was difficult to use programmatically (you were pretty much stuck parsing the results to find what you need), we now have both scalar and table-valued functions, as appropriate, that return usable data to us that we can build conditions around, grab values to use in variables, and otherwise manipulate as discrete pieces of data. When talking about indexes, the metadata function we're most likely interested in is sys.dm_db_index_physical_stats. It is a table-valued function that requires several parameters, and the syntax looks like this:

sys.dm_db_index_physical_stats (
    { <database id> | NULL | 0 | DEFAULT }
    , { <object id> | NULL | 0 | DEFAULT }
    , { <index id> | NULL | 0 | -1 | DEFAULT }
    , { <partition number> | NULL | 0 | DEFAULT }
    , { LIMITED | SAMPLED | DETAILED | NULL | DEFAULT }
)

Again, this is a table-valued function, so you need to use it in conjunction with a SELECT statement. Let's look at the input parameters individually.

Parameter | Description
---|---
Database ID | SQL Server's internal identifier for the database containing the tables and indexes you want physical statistics for. Use the DB_ID() function to easily retrieve the database id for your database. The default for this parameter is NULL (technically the same as 0 in use), which means supply information for all databases.
Object ID | The internal identifier for the particular object you want physical statistical information on. Use the OBJECT_ID() function to easily retrieve the object id for the table or view you're interested in. The default is NULL (again, functionally the equivalent of 0) and implies that you want data for all objects in the database(s) you've indicated.
Index ID | The internal identifier for the particular index you're interested in physical statistics for. Fetching a particular index identifier is more of a challenge, as there is no system function to retrieve it. (You would need to query it from sys.indexes using the name and the object id it belongs to.) As with the other parameters so far, this one has a default—in this case, -1.
Unlike previous parameters, this is not functionally equivalent to 0 (which is only valid if the table is built on a heap, and then indicates you want data on the heap itself).
Partition number | For the vast majority of tables, there is only going to be one partition (and its number will be 1). The default is NULL, which returns all partitions and is functionally equivalent to 0.
Mode | Determines the level of scanning performed to establish the statistics returned. Scan modes include LIMITED, SAMPLED, and DETAILED, in order of increasing accuracy but also increasing overhead (and slower response).

What is returned is a very wide table with an array of different physical statistics on your index or table. We won't address every one of them here, but let's take a look at some of the highlights:

Column | Description
---|---
index_type_desc | Indicates the nature of the index this row relates to. If the result is HEAP or CLUSTERED INDEX, then it relates to the physical data for the table. Other possible results include NONCLUSTERED INDEX, PRIMARY XML INDEX, XML INDEX, and SPATIAL INDEX.
index_depth | Number of levels to the index. If it's a heap or a set of LOB pages, then this will always be 1; otherwise, it will represent how many levels there are in the index. (For example, back in Figure 6.7, there are three levels to the non-clustered index.)
index_level | This one is somewhat counterintuitive in that it counts from the bottom of the index up. The leaf level of the index will be zero (also zero for a heap or LOB), and the number will go up as one navigates backwards up the tree. This value is only supplied when the mode is DETAILED.
avg_fragmentation_in_percent | This indicates the degree of fragmentation in the index tree based on pages or extents that are out of order (the pointer to the logical next page is not the same as the physical next page). You're usually looking for a low number here, though how low depends on the specifics of your row makeup and the purpose of the index.
avg_record_size_in_bytes | Just what it says it is. The average size of a record in the index. This can be a highly useful number when doing space planning (if I add another 100,000 rows, how much space will it take up?).
record_count | Another somewhat tricky one. This value will generally match what you would get from a SELECT COUNT(*). The exception is when dealing with a heap that has forwarding records. Forwarding records occur when a record is written onto a page and is later updated such that the row no longer fits on that page (so SQL Server stores a pointer to where the data now is instead).

Let's take a look at a quick example of using this system function. Imagine for a moment that we want to see the fragmentation on the clustered index for the Sales.SalesOrderDetail table. We could get key pieces of information with the following query:

SELECT index_type_desc AS Type,
    index_id,
    avg_fragmentation_in_percent,
    forwarded_record_count
FROM sys.dm_db_index_physical_stats(
    DB_ID(),
    OBJECT_ID('Sales.SalesOrderDetail'),
    DEFAULT,
    DEFAULT,
    'DETAILED' )
WHERE index_id = 1
    AND index_level = 0;

Which yields a fairly straightforward result set:

Type             index_id    avg_fragmentation_in_percent  forwarded_record_count
---------------- ----------- ----------------------------- ----------------------
CLUSTERED INDEX  1           0.0810372771474878            NULL

(1 row(s) affected)

Note that I've used index_id = 1 in my WHERE clause to force it to be the clustered index.
(I would choose zero had this been on a heap.) I've chosen index_level = 0 to force it to give me information on just the leaf level of the index.

By placing an additional WHERE condition on the fragmentation percentage, I could use the information provided here to build a list of indexes that I thought required maintenance (more on that in Chapter 23).

Backward Compatibility

So we've now seen the metadata way of getting information, but what about when we're working with older releases (prior to SQL Server 2005)? The "old standby" command is actually an option for the DBCC. This is the command you're likely to find utilized in some fashion in virtually every installation today and for years to come. This is the pre-2005 way of doing things, and any pre-2005 database installation that had any maintenance going at all utilized it. What's more, there continue to be tons and tons of articles and "how-tos" on the Web that show you how to use this tool.

Before I get too far into extolling the virtues of DBCC SHOWCONTIG, let me remind you that this is the "old," and, dare I say, "inflexible" way of doing things. The system views give us many more possibilities in terms of being able to more specifically query data and manage indexes on a more global level. We explore much more of that functionality in Appendix [CHECK] at the end of this book. With that said, DBCC has done the job for years, it is the thing to use if you are monitoring indexes in a server environment that contains pre–SQL Server 2005 installations, and it is what you will likely find in much of the existing management code out there.

The syntax is pretty simple:
DBCC SHOWCONTIG
    [({<table name> | <table id> | <view name> | <view id>}
        [, <index name> | <index id>])]
    [WITH { [ ALL_INDEXES ]
        | [, FAST ]
        | [, TABLERESULTS ]
        | [, ALL_LEVELS ] }
        | [, NO_INFOMSGS ]
    ]

Some of this is self-describing (such as the table name), but I want to address the items beyond the names:

table id/view id/index id | This is the internal object id for the table, view, or index. In prior versions of SQL Server, DBCC SHOWCONTIG operated solely off this identifier, so you had to look it up using the OBJECT_ID() function prior to making your DBCC call.
---|---
ALL_INDEXES | This is one of those "what it sounds like" things. If you specify this option, you can skip providing a specific index, as all indexes will be analyzed and data returned.
FAST | This is about getting a return as fast as possible, and it therefore skips analyzing the actual pages of the index and will output only minimal information.
TABLERESULTS | A very cool feature—this one returns the results as a table rather than text. This means it's much easier to parse the results and take automated actions.
ALL_LEVELS | This really has only one relevance in SQL Server 2005: backward compatibility. What it used to do is now ignored. Basically, you can include this option and the command will still run, but it won't be any different.
NO_INFOMSGS | This just trims out informational-only messages. Basically, if you have any significant errors in your table (error level 11 or higher), then messages will still come through, but error level 10 and lower will be excluded.

As an example, to again get the information from the PK_SalesOrderDetail_SalesOrderID_SalesOrderDetailID index in the Sales.SalesOrderDetail table, we could run:

USE AdventureWorks2008;
GO

DBCC SHOWCONTIG ('Sales.SalesOrderDetail',
    PK_SalesOrderDetail_SalesOrderID_SalesOrderDetailID);

Notice the single quotation marks around the table name. These are only required because I'm using two-part naming; if I had only specified the name of the table (SalesOrderDetail), then the quotation marks would not have been required. The problem here is that, depending on how your user is set up for use of different schemas or the existence of other tables with the same name in a different schema, leaving out the schema name may generate an error or perform the operation on a different table than you expected.

The output is not really all that self-describing:

DBCC SHOWCONTIG scanning 'SalesOrderDetail' table...
Table: 'SalesOrderDetail' (898102240); index ID: 1, database ID: 7
TABLE level scan performed.
- Pages Scanned................................: 1234
- Extents Scanned..............................: 155
- Extent Switches..............................: 154
- Avg. Pages per Extent........................: 8.0
- Scan Density [Best Count:Actual Count].......: 100.00% [155:155]
- Logical Scan Fragmentation ..................: 0.08%
- Extent Scan Fragmentation ...................: 3.23%
- Avg. Bytes Free per Page.....................: 28.5
- Avg. Page Density (full).....................: 99.65%
DBCC execution completed. If DBCC printed error messages, contact your
system administrator.

Some of this is probably pretty straightforward, but the following table will walk you through what everything means:

Stat | What It Means
---|---
Pages Scanned | The number of pages in the table (for a clustered index) or index.
Extents Scanned | The number of extents in the table or index.
As an example, to again get the information from the PK_SalesOrderDetail_SalesOrderID_SalesOrderDetailID index in the Sales.SalesOrderDetail table, we could run:

USE AdventureWorks2008;
GO

DBCC SHOWCONTIG ('Sales.SalesOrderDetail',
PK_SalesOrderDetail_SalesOrderID_SalesOrderDetailID);

Notice the single quotation marks around the table name. These are only required because I'm using two-part naming; if I had specified only the name of the table (SalesOrderDetail), the quotation marks would not have been required. The problem is that, depending on how your user is set up for use of different schemas, or on the existence of other tables with the same name in a different schema, leaving out the schema name may generate an error or perform the operation on a different table than you expected.

The output is not really all that self-describing:

DBCC SHOWCONTIG scanning 'SalesOrderDetail' table...
Table: 'SalesOrderDetail' (898102240); index ID: 1, database ID: 7
TABLE level scan performed.
- Pages Scanned................................: 1234
- Extents Scanned..............................: 155
- Extent Switches..............................: 154
- Avg. Pages per Extent........................: 8.0
- Scan Density [Best Count:Actual Count].......: 100.00% [155:155]
- Logical Scan Fragmentation ..................: 0.08%
- Extent Scan Fragmentation ...................: 3.23%
- Avg. Bytes Free per Page.....................: 28.5
- Avg. Page Density (full).....................: 99.65%
DBCC execution completed. If DBCC printed error messages, contact your
system administrator.

Some of this is probably pretty straightforward, but the following table walks you through what everything means:

Stat | What It Means
---|---
Pages Scanned | The number of pages in the table (for a clustered index) or index.
Extents Scanned | The number of extents in the table or index. This will be a minimum of the number of pages divided by 8 and then rounded up. The more extents for the same number of pages, the higher the fragmentation.
Extent Switches | The number of times DBCC moved from one extent to another as it traversed the pages of the table or index. This is another fragmentation indicator—the more switches it has to make to see the same number of pages, the more fragmented you are.
Avg. Pages per Extent | The average number of pages per extent. A fully populated extent would have eight.
Scan Density [Best Count: Actual Count] | The best count is the ideal number of extent changes if everything is perfectly linked. The actual count is the actual number of extent changes. Scan density is the percentage found by dividing the best count by the actual count.
Logical Scan Fragmentation | The percentage of pages that are out of order as checked by scanning the leaf pages of an index. Only relevant to scans related to a clustered table. An out-of-order page is one for which the next page indicated in the index allocation map (IAM) is different from that pointed to by the next-page pointer in the leaf page.
Extent Scan Fragmentation | This one tells you whether an extent is physically located next to the extent that it is logically adjacent to. If not, it means that the leaf pages of your index are not physically in order (though they can still be logically in order), and this stat reports what percentage of the extents the problem pertains to.
Avg. Bytes Free per Page | The average number of free bytes on the pages scanned. This number can get artificially high if you have large row sizes. For example, if your row size was 4,040 bytes, then every page could hold only one row, and you would always have an average of about 4,020 free bytes per page. That would seem like a lot, but, given your row size, it can't be any less than that.
Avg. Page Density (full) | Average page density (as a percentage). This value takes row size into account and is, therefore, a more accurate indication of how full your pages are. The higher the percentage, the better.

Now, the question is how we use this information once we have it. The answer is, of course, that it depends.

Using the output from our SHOWCONTIG, we have a decent idea of whether our database is full, fragmented, or somewhere in between (the latter is, most likely, what we want to see). If we're running an OLAP system, then seeing our pages full would be great; fragmentation would bring on depression. For an OLTP system, we would want much the opposite (although only to a point).
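If you're on SQL Server 2005 or later, the same decision can be scripted against the metadata view we used earlier. A minimal sketch of a fragmentation "to do" list (the 30 percent threshold is a common rule of thumb, not a requirement; run this from the AdventureWorks2008 database):

SELECT OBJECT_NAME(ps.object_id) AS TableName,
       i.name AS IndexName,
       ps.avg_fragmentation_in_percent
FROM sys.dm_db_index_physical_stats
         (DB_ID('AdventureWorks2008'), NULL, NULL, NULL, 'LIMITED') ps
JOIN sys.indexes i
    ON i.object_id = ps.object_id
    AND i.index_id = ps.index_id
WHERE ps.avg_fragmentation_in_percent > 30.0
  AND ps.index_id > 0;  -- skip heaps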
So, how do we take care of the problem? To answer that, we need to look into the concept of index rebuilding and fillfactors.

DBREINDEX—That Other Way of Maintaining Your Indexes

Earlier in the chapter, we looked at the ALTER INDEX command. This should be your first-line command for performing index reorganization and managing your fragmentation levels. While I highly recommend the use of ALTER INDEX moving forward, DBREINDEX is the way things have been done in the past, and, much like DBCC SHOWCONTIG, there is far, far too much code and use out there already for me to just skip it.

DBREINDEX is another DBCC command, and the syntax looks like this:

DBCC DBREINDEX ( <'database.owner.table_name'>
[ , <index name> [ , <fillfactor> ] ] ) [ WITH NO_INFOMSGS ]

Executing this command completely rebuilds the requested index. If you supply a table name with no index name, then it rebuilds all the indexes for the requested table. There is no single command to rebuild all the indexes in a database.

Rebuilding your indexes restructures all the information in those indexes and reestablishes a base percentage to which your pages are full. If the index in question is a clustered index, then the physical data is also reorganized.

As with ALTER INDEX, the pages will, by default, be reconstituted to be full minus two records. Just as with the CREATE TABLE syntax, you can set the FILLFACTOR to be any value between 0 and 100. This number will be the percent full that your pages are once the reorganization is complete. Remember, though, that as your pages split, your data will still be distributed 50–50 between the two pages. You cannot control the fill percentage on an ongoing basis other than by regularly rebuilding the indexes.

There is something of an exception to the number matching the percent full if you use zero as your percentage: It will go to full minus two rows (it's a little deceiving—don't you think?).

We use a FILLFACTOR when we need to adjust the page densities. As we've already discussed, lower page densities (and therefore lower FILLFACTORs) are ideal for OLTP systems where there are a lot of insertions; this helps prevent page splits. Higher page densities are desirable for OLAP systems (fewer pages to read, but no real risk of page splitting due to few to no inserts).

If we wanted to rebuild the index that serves as the primary key for the Sales.SalesOrderDetail table we were looking at earlier with a fill factor of 65, we would issue a DBCC command as follows:

DBCC DBREINDEX ('Sales.SalesOrderDetail',
PK_SalesOrderDetail_SalesOrderID_SalesOrderDetailID, 65);

We can then re-run DBCC SHOWCONTIG to see the effect:

DBCC SHOWCONTIG scanning 'SalesOrderDetail' table...
Table: 'SalesOrderDetail' (898102240); index ID: 1, database ID: 7
TABLE level scan performed.
- Pages Scanned................................: 1883
- Extents Scanned..............................: 236
- Extent Switches..............................: 235
- Avg. Pages per Extent........................: 8.0
- Scan Density [Best Count:Actual Count].......: 100.00% [236:236]
- Logical Scan Fragmentation ..................: 0.05%
- Extent Scan Fragmentation ...................: 1.27%
- Avg. Bytes Free per Page.....................: 2809.1
- Avg. Page Density (full).....................: 65.29%
DBCC execution completed. If DBCC printed error messages, contact your
system administrator.

The big one to notice here is the change in Avg. Page Density. The number didn't quite reach 65 percent because SQL Server has to deal with page and row sizing, but it gets as close as it can.

Several things to note about DBREINDEX and FILLFACTOR:

 * If a FILLFACTOR isn't provided, then DBREINDEX will use whatever setting was used to build the index previously. If one has never been specified, then the fill factor will make the page full less two records (which is too full for most situations).
 * If a FILLFACTOR is provided, then that value becomes the default FILLFACTOR for that index.
 * While DBREINDEX can be run live, I strongly recommend against it. It locks resources and can cause a host of problems. At the very least, look at doing it at non-peak hours. Better still, if you're going to do it online, use ALTER INDEX instead and just do a REORGANIZE rather than a rebuild.
 * I've said it before, but it bears repeating: DBREINDEX is now considered deprecated, and you should avoid it in situations where you do not need that backward compatibility. (Use ALTER INDEX instead.)

Summary

Indexes are sort of a cornerstone topic in SQL Server or any other database environment and are not something to be taken lightly. They can drive your performance successes, but they can also drive your performance failures.

Top-level things to think about with indexes:

 * Clustered indexes are usually faster than non-clustered indexes (one could come very close to saying always, but there are exceptions).
 * Only place non-clustered indexes on columns where you are going to get a high level of selectivity (that is, 95 percent or more of the rows are unique).
 * All Data Manipulation Language (DML: INSERT, UPDATE, DELETE, SELECT) statements can benefit from indexes, but inserts, deletes, and updates (remember, they use a delete and insert approach) are slowed by indexes. The lookup part of a query is helped by the index, but anything that modifies data will have extra work to do (to maintain the index in addition to the actual data).
 * Indexes take up space.
 * Indexes are used only if the first column in the index is relevant to your query.
 * Indexes can hurt as much as they help—know why you're building the index, and don't build indexes you don't need.
 * Indexes can provide structured data performance to your unstructured XML data, but keep in mind that, like other indexes, there is overhead involved.

When you're thinking about indexes, ask yourself these questions:

Question | Response
---|---
Are there a lot of inserts or modifications to this table? | If yes, keep indexes to a minimum. This kind of table usually has modifications done through single-record lookups of the primary key—usually, this is the only index you want on the table. If the inserts are non-sequential, think about not having a clustered index.
Is this a reporting table? That is, relatively few inserts, but reports that run many different ways? | More indexes are fine. Target the clustered index to frequently used information that is likely to be extracted in ranges. OLAP installations will often have many times the number of indexes seen in an OLTP environment.
Is there a high level of selectivity on the data? | If yes, and it is frequently the target of a WHERE clause, then add that index.
Have I dropped the indexes I no longer need? | If not, why not?
Do I have a maintenance strategy established? | If not, why not?

7

More Advanced Index Structures

Alright, so we've walked through the basics of design. Heck, we've even walked through the advanced stages of traditional indexing. There are, however, some even more advanced things to think about in indexing and other storage. Among these are some of the atypical index and storage structures, including:

 * XML indexes
 * Spatial data and their associated indexes
 * User-defined data types
 * Filestreams
 * Table compression
 * Hierarchical data

In this chapter, we'll take a look at each of these. Some of it will build on things you already know (like the XML data type and methods we've already talked about extensively), and some will likely be totally new. (Indeed, the remaining items are new with SQL Server 2008.)

The choice to group these particular items into one chapter may seem a bit crazy (even to me), but the thing they have in common is pretty simple.
They are all somewhat out of the mainstream and require a bit of extra thinking to see how they work.

XML Indexes

XML indexes first appeared in SQL Server 2005, and I have to admit that I continue to be mildly amazed that Microsoft pulled it off. I've known some of that team for a very long time now, and I have a lot of confidence in them, but the indexing of something as unstructured as XML is a problem that many have tried to address, and few have done so with any real success. Kudos to the SQL Server team for pulling this one off. Enough gushing, though. I want to get down to the business of what XML indexes are all about.

Perhaps the most amazing thing about XML indexes is that they are really not all that different from indexes of more typical relational data. Indeed, the XML CREATE syntax supports all the same options you saw in the previous chapter for the CREATE INDEX statement, with the exception of IGNORE_DUP_KEY and ONLINE. Why is this such a big deal? Well, while an index would seem to be a basic structure that could support anything, the nature of what's being indexed can have a significant impact on how well traditional indexes support the underlying data. Unlike the relational data that you may be more accustomed to, XML tends to be very unstructured. It utilizes tags to identify data and can be far more variable in nature than typical relational data. The unstructured nature of XML requires the notion of "navigating" or "path" information to find a data "node" in an XML document. Indexes, on the other hand, try to provide very specific structure and order to data. This poses something of a conflict.

You can create indexes on columns in SQL Server that are of type XML. The requirements for doing this are:

 * The table containing the XML you want to index must have a clustered index on it, and that clustered index must be on the table's primary key; furthermore, the primary key cannot include more than 15 columns.
 * A "primary" XML index must exist on the XML data column before you can create "secondary" indexes (more on this in a moment).
 * XML indexes can be created only on columns of XML type (and an XML index is the only kind of index you can create on columns of that type).
 * The XML column must be part of a base table. You cannot create the index on a view, table variable, or table user-defined data type.

Creating one or more XML indexes on a table also implies an important restriction on your table: You cannot modify the primary key or (as a result) the clustered index while any XML indexes exist on the table. If you need to modify the primary key, you must first drop all the XML indexes. (You can rebuild them after the modification to the primary key is complete.)

The Primary XML Index

The first index you create on an XML column must be declared as a "primary" index. When you create a primary index, SQL Server "shreds" the XML (converting it to tabular form) and creates a new clustered index that combines the clustered index of the base table with data from whatever XML node you specify. In addition to the cluster key information, the primary XML index will also store:

 * The tag name of the node being indexed (its element or attribute name)
 * The value of the node
 * The type of the node (element, attribute, or text)
 * An internal node identifier (order information)
 * The path from the node to the document root

All this is the result of shredding the XML out into an internal table.
This internal table is how the XML data is persisted in a form that allows the traditional index model to work. You can get a look at what internal tables are being stored in your system by querying sys.internal_tables (which also shows other types of internal tables) or sys.xml_indexes. For example, we can check out the XML indexes in the AdventureWorks2008 database:

SELECT * FROM sys.xml_indexes;

This yields us several primary XML indexes and a few secondary XML indexes. (We'll look at secondary XML indexes shortly.)

object_id name index_id type
----------- ------------------------------------------------- ----------- ----
162099618 PXML_ProductModel_CatalogDescription 256000 3...
162099618 PXML_ProductModel_Instructions 256001 3...
270624007 PXML_Store_Demographics 256000 3...
1509580416 PXML_Person_AddContact 256000 3...
1509580416 PXML_Person_Demographics 256001 3...
1509580416 XMLPATH_Person_Demographics 256002 3...
1509580416 XMLPROPERTY_Person_Demographics 256003 3...
1509580416 XMLVALUE_Person_Demographics 256004 3...

(8 row(s) affected)

The result here has been truncated on the right side to allow it to fit in the book, but if you run the query for yourself, you'll see a wealth of additional information about the nature of each XML index listed.

We'll defer discussion of the shredding process for a bit and move, for the moment, to secondary XML indexes and how they differ from primary indexes.

Secondary XML Indexes

Much as non-clustered indexes point to the cluster key of the clustered index, secondary XML indexes point at the various columns that are part of the internal table of the primary XML index. Secondary XML indexes are separate, far more specialized indexes than the primary XML index they depend on (or any other index, for that matter). You can have up to 248 secondary XML indexes against a given column.

Secondary XML indexes are special in the sense that they come in three different sub-types:

 * PATH: This secondary index type focuses on providing fast access based on path-based search criteria. This index is based on the reverse path of the internal table, plus the value.
 * VALUE: As the name suggests, this index type provides an index oriented around searching for a specific node value. It can be considered the inverse of the PATH secondary index type, indexing first on the value, and then on the reverse path.
 * PROPERTY: Similar to VALUE, but oriented around multivalued scenarios.

It follows, then, that the key thing to understand with secondary XML indexes is that your index choice is targeted not just at what data you're indexing, but also at the specific types of queries you'll be issuing against that data.

Let's take a look at each of the three types.

PATH XML Indexes

The first of the secondary XML index types is targeted toward queries that search based on a specific path. If most of your queries will include a specific path in your WHERE clause, then the PATH-style secondary index is for you. While the primary XML index will greatly aid in the search for a specific path (likely via .exist()), it carries with it the overhead of the identifying information for the blob (the node information we discussed earlier). As a secondary index, the PATH-based index focuses solely on the path information and is, therefore, more compact (and therefore more efficient when simply searching).
The key to using a PATH index's efficiency is making sure that a particular path is specified. Your XPath designation of the path can also include a value (if you so choose), but including a path is what will cause this kind of index to be used.

For example, let's look at the Person.Person table in the AdventureWorks2008 database. We can issue a relatively straightforward XPath-oriented query against the table's Demographics column:

WITH XMLNAMESPACES
('http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey' AS "IS")
SELECT Demographics.query('
/IS:IndividualSurvey/IS:TotalPurchaseYTD
') AS Result
FROM Person.Person
WHERE Demographics.exist ('/IS:IndividualSurvey/IS:TotalPurchaseYTD') = 1;

The search for a specific path creates a situation that is optimal for the PATH secondary index type. To see that SQL Server is indeed using it, check out the query plan, as shown in Figure 7.1.

Figure 7.1

Indeed, we can see that the XMLPATH_Person_Demographics index is in use.

VALUE XML Indexes

This one is all about ordering by—wait for it...—value. You knew it was coming, right?

The columns used for this index are based on the primary node value and the path. The type of the value is not important. The important thing to remember when considering this index is that you may not know the entire path. In particular, you may know only the element and/or attribute that actually contains the value.

Since the index is primarily focused on the value, it finds a match there first, and then concerns itself with whether or not the path matches. The path is actually stored in reverse order, which allows you to find a match to the leaf portion of a path regardless of what is the parent to the partial path you supply.

PROPERTY XML Indexes

PROPERTY indexes are meant to combine values from two different kinds of columns—whatever the primary key is and, of course, the XML column. PROPERTY indexes are first oriented around the primary key of the row, and then around the path (again, stored in reverse) and value of individual XML nodes. As you might surmise from the first value being the primary key for the row, this index is useful only for situations where the primary key is known. After that, it acts somewhat like the PATH secondary index type.

Creating XML Indexes

So, now that we have all the different types of XML indexes figured out, we're probably set to see how to create them. Much of the syntax isn't that different from creating standard indexes, but there are a few twists. The overall syntax looks like this:

CREATE [ PRIMARY ] XML INDEX <index name>
ON <object> ( <xml column name> )
[ USING XML INDEX <primary xml index name>
[ FOR { PATH | VALUE | PROPERTY } ] ]
[ WITH ( PAD_INDEX = { ON | OFF }
| FILLFACTOR = <fillfactor>
| SORT_IN_TEMPDB = { ON | OFF }
| IGNORE_DUP_KEY = OFF
| STATISTICS_NORECOMPUTE = { ON | OFF }
| DROP_EXISTING = { ON | OFF }
| ONLINE = OFF
| ALLOW_ROW_LOCKS = { ON | OFF }
| ALLOW_PAGE_LOCKS = { ON | OFF }
| MAXDOP = <max degree of parallelism>
[ ,...n ]
) ][ ; ]

Notice that both the IGNORE_DUP_KEY and ONLINE options have only one valid setting. I honestly can't tell you why Microsoft decided to keep them in there at all (I suspect just to keep it more in line with the basic CREATE INDEX statement, but it still seems odd), but they are there for now. (Perhaps they will have additional options later.) As you can see, most of the other options are the same, so let's focus on the main syntax items.

First, XML indexes must be explicitly called out in the CREATE INDEX line via the XML keyword. The PRIMARY keyword is necessary only for primary XML indexes; the XML index is otherwise assumed to be a secondary index.

Moving on, notice that we do not have the option of supplying multiple columns. Instead, we just name the one column of type xml we plan on indexing.

The USING clause is mutually exclusive with the PRIMARY keyword and applies only to (and, in that case, is required for) secondary indexes. Use this clause along with the FOR keyword to indicate the type of secondary index you want to create (PATH, VALUE, or PROPERTY).

So, were we to put this to use, we might create a primary XML index on the Production.ProductModel table:

CREATE PRIMARY XML INDEX PXProductModelInstructions
ON Production.ProductModel (Instructions)
WITH (PAD_INDEX = OFF,
SORT_IN_TEMPDB = OFF,
DROP_EXISTING = OFF,
ALLOW_ROW_LOCKS = ON,
ALLOW_PAGE_LOCKS = ON
);

Note that, if you want to actually run the previous script, you would need to drop the existing XML index that came with the AdventureWorks2008 sample.

Or, to create a secondary index utilizing the primary we just created, we would do something like:

CREATE XML INDEX SXProductModelInstructionsPATH
ON Production.ProductModel (Instructions)
USING XML INDEX PXProductModelInstructions
FOR PATH
WITH (PAD_INDEX = OFF,
SORT_IN_TEMPDB = OFF,
DROP_EXISTING = OFF,
ALLOW_ROW_LOCKS = ON,
ALLOW_PAGE_LOCKS = ON
);

Note that either of the preceding CREATE XML INDEX statements will fail in AdventureWorks2008 because the sample already has a default primary XML index. The second of the two examples will run if you change the USING clause to reference the existing primary XML index (PXML_ProductModel_Instructions).

Again, the syntax differences versus standard indexes are relatively subtle in nature.
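As a quick sanity check after creating secondary indexes, sys.xml_indexes will show each index along with its sub-type. A small sketch (assuming the example index names above were actually created):

SELECT xi.name,
       xi.secondary_type_desc  -- PATH, VALUE, or PROPERTY; NULL for the primary XML index
FROM sys.xml_indexes xi
WHERE xi.object_id = OBJECT_ID('Production.ProductModel');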
User-Defined Data Types

Ah, the awesome potential of user-defined data types, or UDTs.

This is a little bit of the classic "What came first, the chicken or the egg?" thing. You see, part of what has made UDTs interesting since SQL Server 2005 is the addition of .NET objects as a possible source for a UDT. For various reasons, however, I'd prefer to hold off on the .NET side of things until we're talking about the procedural side in Chapter 10.

With this in mind, I've decided to compromise a bit. We'll start our discussion of UDTs here and finish the .NET side of it in Chapter 10 (call it a little bit of both worlds...).

So, with the organizational stuff disposed of, let's address the issue of what exactly a user-defined data type is. If you're a true SQL Server "Pro," then the fundamentals of a UDT may well be old news. After all, UDTs have been part of SQL Server since long before anyone had even thought of .NET. Then again, until the .NET era, they had only minimal value. Even with the advent of .NET, the flexibility of .NET data types brings an exorbitant amount of complexity and requires you to turn on some things in the server configuration that may violate some security policies. (Many DBAs see a great deal of risk in turning on SQLCLR and .NET for SQL Server.) You may well have just ignored UDTs altogether, but given the changes in UDTs for SQL Server 2008, it's worth a look at UDTs as they are today.

Classic UDTs

The classic UDT is founded on existing data types. Indeed, it can be considered nothing more than an alias for the base types already found in SQL Server. Historically, it has been used primarily to aid consistency in a frequently used attribute, or in conjunction with rules and defaults (which can be bound directly to a UDT and apply anywhere the classic UDT is used).

Let's start with a fairly basic example—an account number. The AdventureWorks2008 database makes use of a user-defined data type called AccountNumber that is created from the base type of nvarchar—in this case, an nvarchar(15). Using the AccountNumber UDT rather than directly using the nvarchar(15) base type ensures consistency across all instances where you want to make use of the account number concept.

The syntax for creating classic UDTs (UDTs that source from built-in data types and are not tabular in nature) is pretty simple:

CREATE TYPE [<schema name>.]<type name>
FROM <base type>
[ ( precision [ , scale ] ) ]
[ NULL | NOT NULL ]

So, the AccountNumber data type used in AdventureWorks2008 would look like this:

CREATE TYPE dbo.AccountNumber
FROM nvarchar(15) NULL;

As you can see, there isn't a whole lot to creating what amounts to a simple alias to an existing type. You may see these relatively simple types extended via the use of rules and defaults, but I recommend against this, as Microsoft has said for the last four releases that rules and defaults are considered deprecated and will be removed from the product at some point.

It's probably worth noting that I've been told by members on the team that Microsoft is getting more serious about "truly" following up on removing deprecated features. When SQL Server 11 (currently code-named Kilimanjaro) eventually ships, expect to see some long-deprecated features finally disappear from the product.
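Back to our AccountNumber example for a moment: once created, the alias type can be used anywhere the built-in type could be. A minimal sketch (the table itself is hypothetical, not part of AdventureWorks2008):

CREATE TABLE dbo.CustomerAccount
(
    CustomerID    int NOT NULL PRIMARY KEY,
    AccountNumber dbo.AccountNumber  -- resolves to nvarchar(15) NULL
);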
.NET UDTs

I'm going to defer most of our discussion of .NET-based UDTs until Chapter 10, when we will fully explore .NET-based development, but at least a cursory look at .NET-based UDTs has to be included here, if for no other reason than context.

As you might expect, .NET UDTs make use of a .NET assembly to implement a custom data type. These have been around since .NET first appeared in SQL Server as part of SQL Server 2005, and they can implement some very complex custom types. How complex? Well, complex enough that the new geospatial data types discussed a bit later in this chapter were implemented using .NET. Indeed, they are essentially the same as any .NET data type you might develop and deploy yourself, save that they are flagged as a system type and do not require explicitly enabling .NET in order to be used.

Just to make sure we have a copy of our syntax examples in the same place, here's the syntax for .NET UDTs:

CREATE TYPE [<schema name>.]<type name>
EXTERNAL NAME <assembly name>[.<class name>]

So, as a preview of the example we'll use in Chapter 10 (don't actually execute this code—we'll get to it in due time!), we could add a .NET assembly called ComplexNumber with code such as:

CREATE TYPE ComplexNumber
EXTERNAL NAME [ComplexNumber].[Microsoft.Samples.SqlServer.ComplexNumber];

Again, we will more fully explore .NET-based UDTs in Chapter 10, including creating our own type.

Tabular UDTs

These are new in SQL Server 2008, and they are the start of something big that I suspect will evolve over the next few releases.

What are they? Well, largely what they sound like: a user-defined data type that accepts tabular data. You create them with a syntax that mostly matches the syntax used for table-valued variables or in the CREATE TABLE command. After creation, you can then utilize them in scripts or, perhaps more importantly, as a table-valued parameter in a stored procedure.

Note that I did not mention using them as a type you can use within a table. Unlike other user-defined data types, tabular UDTs cannot be embedded within other tabular objects such as a table variable or table object.

As of this writing, Microsoft has not yet made a commitment about how far they are going to take tabular UDTs. Right now, it would appear that we are on a path taking us closer and closer to a more fully functioning tabular UDT similar to that found in competitive products—such as Oracle—where you have long been able to embed a table within a table.

For now, we're going to focus on how exactly we create tabular UDTs. In Chapter 10, we'll examine the most likely use for tabular UDTs: table-valued parameters for stored procedures.
Creating a Table User-Defined Data Type

Creating a table user-defined data type works as something of a combination of the classic CREATE TYPE and table variable syntax. The tabular CREATE TYPE syntax looks like this:

CREATE TYPE [<schema name>.]<type name>
AS TABLE
( { <column name> <data type>
[ COLLATE <collation name> ]
[ NULL | NOT NULL ]
[ DEFAULT <constant expression> ]
[ IDENTITY [ ( <seed>, <increment> ) ] ]
[ ROWGUIDCOL ]
[ <column constraint> [ ...n ] ]
| <table constraint> }
[ ,...n ]
)[;]

As an example, we're going to create a user-defined table type that will represent addresses. Later in the book (in Chapter 10), we'll see how we can pass an instance of this data type into a stored procedure or function for further processing.

Long ago, it seemed one address was generally enough for most people. The majority of systems out there stored a single address for most business entities they worked with. Today, however, one doesn't seem to be enough. Between dealing with companies that have multiple locations, and even individuals deciding to receive bills at one location but ship to a different location, many business entities we work with have multiple addresses. The AdventureWorks2008 database represents this by separating addresses out into their own table (Person.Address). We've decided that we want to represent this notion of an address in a consistent way across our systems, so we create our custom type:

USE AdventureWorks2008;
GO

CREATE TYPE Person.Address
AS TABLE(
AddressID int NULL,
AddressLine1 nvarchar(60) NOT NULL,
AddressLine2 nvarchar(60) NULL,
City nvarchar(30) NOT NULL,
StateProvinceID int NOT NULL,
PostalCode nvarchar(15) NOT NULL,
SpatialLocation geography NULL
);

There are a host of items to notice about this script:

 * I used the exact name of an existing object in the database (there is a table called Person.Address). The type can be considered much like the difference between a class and an object—that is, a type is a definition, and a table is an actual instance of something (though the table is not an instance of the type definition the way an object is an instance of a class).
 * The syntax for creating the actual definition is very similar to the CREATE TABLE syntax.
 * The layout maps very closely to the Person.Address table in order to support moving data between the two relatively easily.

Note that I created my user-defined type with the same name as a table just to prove the point. I would not recommend duplicating names in practice, as it is likely to lead to far more confusion than it is worth.

With my type now created, I can reference it as a valid data type for variable declarations, or for function or sproc parameters (more on the latter two in Chapter 10).

Let's further our example just a bit by utilizing our new type:

DECLARE @Address Person.Address;

INSERT INTO @Address
(AddressID,
AddressLine1,
City,
StateProvinceID,
PostalCode
)
VALUES
(
1,
'My first address',
'MyTown',
1,
'21212'
),
(
1,
'My second address',
'OtherTown',
5,
'43434'
),
(
1,
'My third address',
'MyTown',
1,
'21214'
);

SELECT *
FROM @Address;

Notice that, with a simple declaration of our Person.Address user-defined type, we gained access to all the columns for that tabular type. We're able to insert rows and select them back out:

(3 row(s) affected)

AddressID AddressLine1
----------- --------------------
1 My first address...
1 My second address...
1 My third address...

(3 row(s) affected)

Again, we'll take a further look at uses for this in Chapter 10 as part of our table-valued parameter discussion.
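Incidentally, table types get their own catalog view, so you can confirm what you've created. A quick sketch:

SELECT SCHEMA_NAME(schema_id) AS SchemaName,
       name,
       type_table_object_id  -- the internal object backing the table type
FROM sys.table_types;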
Dropping a User-Defined Type

It's fairly intuitive, but just to make sure we've addressed the point: All varieties of UDTs are dropped using the very common DROP syntax:

DROP TYPE [<schema name>.]<type name>[;]

Hierarchical Data

This area is somewhat more revolutionary than most of the other items we discuss in this chapter. While spatial data has been around in other products for some time now (and the lack of spatial support had been something SQL Server was often derided for), the addition of the new HierarchyID data type and its embedded functions brings to the database a new realm of functionality that was somewhat unexpected.

So what is HierarchyID? Simply put, it is a special data type that is optimized for representing a single node in a hierarchical structure (usually a tree). The real horsepower here is in the idea that it is able to analyze the concepts of hierarchical ancestry (parent/child relationships) as well as understand the notion of depth and siblings (for example, all departmental managers versus operational staff or executives).

A given instance of HierarchyID data does not represent a tree. Instead, it is merely information about the properties of a single node of a tree, including that node's ancestry. Only by making use of a collection of related nodes can one represent a true hierarchy tree.

The need for hierarchical representation of data is not new. Indeed, the version of AdventureWorks that shipped back in SQL Server 2005 included a fairly typical modeling of one of the more common hierarchical problems: employee reporting structures. (Indeed, we created a similar mapping when we created the Employee2 table to show off CTEs back in Chapter 3.) The typical solution was what is called a unary relationship—that is, a table that has a foreign key to itself. "Kits" are another common hierarchical problem. (For example, a part that is nothing more than a collection of other parts, with some of those, perhaps, being kits with other parts.) XML is naturally hierarchical and has also been a frequent solution for storing hierarchies, even for non-XML applications.

Let's take a look at how it works, and then we'll explore some of the functionality that comes with the data type and its associated methods.

Understanding Depth Versus Fanout

Before getting too much into the structure of the HierarchyID type and the methods and index type that support it, it is important to understand the concept of depth (or level) versus the idea of what is called fanout (for the moment, think of fanout as being horizontal).

The depth—or levels deep—of a hierarchy node is based on the number of direct and indirect ancestor nodes. Note that this yields us a zero-based set—that is, the root node of a hierarchy has a level of zero, its direct descendants have a level of 1, and so on. So, for example, the node labeled E in Figure 7.1 has a level of 2, while the root node labeled A has a level of zero.

The HierarchyID type gives us a special method call (not surprisingly called GetLevel) to tell us what level a given node is at within a hierarchy. Levels are used primarily for comparisons with siblings, and they will become important later in the chapter when we discuss breadth-first indexes on HierarchyID columns.

The fanout of a hierarchy refers to how many children a given parent node has. In a tree representation such as the one in Figure 7.1, you can think of the fanout as governing the width of the hierarchy. In Figure 7.1, the E node has a fanout of 3, the B node has a fanout of 4, and the A node has a fanout of 11. In our next section, we'll take a look at how we store all this information.
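First, though, since fanout is just "the number of direct children," it is easy to compute with the methods we'll meet shortly. A small sketch against the employee hierarchy (this query is my own, not from the text; GetAncestor(1) returns a node's parent):

SELECT e.OrganizationNode.ToString() AS Node,
       (SELECT COUNT(*)
        FROM HumanResources.Employee c
        WHERE c.OrganizationNode.GetAncestor(1) = e.OrganizationNode) AS Fanout
FROM HumanResources.Employee e
WHERE e.OrganizationLevel <= 1;  -- keep the output manageable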
HierarchyID Type Structure

The HierarchyID data type is stored internally as a variable-length binary representation of a node. Indeed, if we retrieve an instance of HierarchyID type data, it will come back in a hexadecimal representation. So, for example, if we execute:

SELECT e.OrganizationNode
FROM HumanResources.Employee e;

it gives us back some numbers in hex:

OrganizationNode
---------------------------
0x
0x58
0x68
...
...
0x85EBA6
0x85EBAA
0x85EBAE

(290 row(s) affected)

You can use the ToString method (we'll explore the various method calls in the next section) to render it a bit more human-readable:

SELECT e.OrganizationNode.ToString() AS OrganizationNode
FROM HumanResources.Employee e;

This gets us back something that, at first blush, probably doesn't seem all that much more readable:

OrganizationNode
/
/1/
/2/
/3/
/4/
/5/
/6/
/1/1/
/2/1/
/2/2/
...
...
/4/3/1/9/
/4/3/1/10/
/4/3/1/11/

(290 row(s) affected)

There are several items of note in this string representation:

 * Each forward slash (/) separates a representation of a node in the current node's lineage.
 * The numbers are largely arbitrary. You can assign them yourself, or SQL Server can find a place to insert them for you. If you have SQL Server generate the number for you, then the number will be the next available whole number unless you explicitly state that you want the new value to fall between existing nodes, in which case you need to supply the points you want the value to lie between. (More on this when we look at the GetDescendant method in the next section.) Only by explicitly managing the values for a given level can you provide any form of ranking within that level.
 * The numeric order does not matter within a specific node; only the position in the series matters. Each set of numbers matters only within that particular level. (Notice that the first 1 in /1/1/ is a different item than the second 1 is. The number sequencing is maintained separately within each level of the hierarchy.)
 * The solitary forward slash (/) represents what is being seen as the root node. (The equivalent in hex was 0x.) This is, however, also arbitrary, as nothing prevents you from having multiple root nodes.

There is nothing inherent in the HierarchyID data type that ensures you have only a single root node. Indeed, there is no guarantee of uniqueness at all unless you explicitly enforce one (via a primary key or unique constraint).

As indicated earlier, the inner workings of the HierarchyID type represent the node in a variable-length bit field (thus all the hex output). Unlike other variable-length data types, you do not explicitly define the length. Instead, SQL Server adjusts the length as required to address the depth and fanout found in the various nodes. Microsoft hasn't really said much about the specifics of how each bit is manipulated, but based on what Microsoft has said publicly, you can figure that most installations are going to average 5–6 bytes per node.
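You can see both representations (and check the storage size claim) for yourself with a quick sketch; DATALENGTH() reports the actual bytes used:

DECLARE @node hierarchyid = hierarchyid::Parse('/4/3/1/11/');

SELECT @node              AS BinaryForm,  -- hex representation
       @node.ToString()   AS StringForm,  -- '/4/3/1/11/'
       DATALENGTH(@node)  AS BytesUsed;   -- varies with depth and fanout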
Working with HierarchyID Values—HierarchyID Methods

So, all these concepts and theories are great, but anyone who has read much from my books knows that I'm more of a fan of showing specific examples. With that in mind, we're going to use this next section to cover the various methods that are supported by the type. These work much like the methods we used with the XML data. We'll address each method based on the function it helps us perform (inserts, positions, grafting, and the like), but just to get them all into one place for reference, here is the quick list of methods and what they are used for:

 * GetAncestor(n): This fetches the node value for the ancestor node that is n levels up the tree. So, for example, GetAncestor(1) would fetch the immediate parent.
 * GetDescendant(<child1>, <child2>): This varies in behavior depending on the specific values provided for the child arguments, but the name is a bit misleading. Contrary to what you might expect, GetDescendant() is not used to fetch a specific child node, which would be hard since there may be multiple children at any level of the hierarchy. Instead, it is used to calculate a value to use in inserting a new node into the hierarchy.
 * GetLevel(): Returns the level of the current node, where the root node is considered level 0 and each child level below the root adds one to the reported level.
 * GetReparentedValue(<old root>, <new root>): This is another deceptively named one. Despite using the Get moniker in the name, GetReparentedValue() actually performs a task—that is, pruning a given node or set of child nodes from a given parent (old root) and grafting them to a new parent (new root). Do not let the use of the term "root" confuse you here. This does not need to be the primary root for the entire hierarchy, but rather the common parent that all the grafted nodes share.
 * GetRoot(): Supplies the constant value of the root node of a hierarchy (which is always 0x). Unlike most of the other methods discussed here, GetRoot() is a static method and thus callable only against the base type, not an individual node instance. We'll explore the specifics of this a little later when we discuss fetching the root.
 * IsDescendantOf(): Provides a true/false indication as to whether or not the current node is a descendant of the provided node. Note that a node is considered a descendant of itself (so if you perform an IsDescendantOf() on a given node referencing itself, the result will be true).
 * Parse(): Loosely speaking, this can be considered the opposite of the ToString() method. It receives a string-based representation of a node and converts it to the internal binary representation. Like GetRoot(), this is a static method and can be called only against the base type (for example, HierarchyID::Parse()).
 * Read(): This is a CLR or client-language-only function (it is not callable from within T-SQL) and is used to receive a stream of a HierarchyID instance in its native binary representation. In general, the database developer would utilize this only while doing extremely complex CLR programming or manipulating the hierarchy in a client language.
 * ToString(): This does what it says—that is, it converts the binary representation into a more human-readable string.
 * Write(): This is the functional opposite of Read(). Like Read(), it is CLR/.NET only and cannot be called from T-SQL. It is used to take a client-side binary representation of a HierarchyID instance and write it directly back to SQL Server without the need for a string conversion.

So, with the introductions done, let's take a look at things from a more functional standpoint and discuss the many things we might want to do with an instance of the HierarchyID data type.
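Before we work through them in earnest, here's a tiny sketch of the two static methods alongside a couple of instance methods (the values are purely illustrative):

SELECT hierarchyid::GetRoot().ToString()                     AS RootNode,  -- '/'
       hierarchyid::Parse('/1/2/').GetLevel()                AS NodeLevel, -- 2
       hierarchyid::Parse('/1/2/').GetAncestor(1).ToString() AS Parent;    -- '/1/'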
Methods Related to Retrieving a Given Level of Hierarchy Data

There are a few examples of the HierarchyID data type in the AdventureWorks2008 database (one each in the Address, Document, Employee, and ProductDocument tables). We'll focus on the Employee table here, as it is the easiest to grasp, but each of the other HierarchyID usages provides a further example of a potential hierarchy.

Let's start out with retrieving a simple user-readable selection of the Employee table. For this, we use the ToString() method we saw earlier in the section. ToString() takes no arguments and is used relative to an instance of data (usually a row or variable) of type HierarchyID. So, to formalize the syntax, it would look like this:

<instance of HierarchyID>.ToString()

To keep things manageable, we're going to limit the results using the OrganizationLevel column:

SELECT e.BusinessEntityID,
p.LastName + ', ' + p.FirstName AS Name,
e.OrganizationNode.ToString() AS Hierarchy
FROM HumanResources.Employee e
JOIN Person.Person p
ON e.BusinessEntityID = p.BusinessEntityID
WHERE e.OrganizationLevel BETWEEN 1 AND 2;

This gives us a list of a set of parents and their respective children:

BusinessEntityID Name Hierarchy
---------------- ------------------------------ ----------
2 Duffy, Terri /1/
16 Bradley, David /2/
25 Hamilton, James /3/
234 Norman, Laura /4/
263 Trenary, Jean /5/
273 Welcker, Brian /6/
3 Tamburello, Roberto /1/1/
17 Brown, Kevin /2/1/
18 Wood, John /2/2/
19 Dempsey, Mary /2/3/
20 Benshoof, Wanida /2/4/
21 Eminhizer, Terry /2/5/
22 Harnpadoungsataya, Sariya /2/6/
23 Gibson, Mary /2/7/
24 Williams, Jill /2/8/
26 Krebs, Peter /3/1/
211 Abolrous, Hazem /3/2/
222 Wright, A. Scott /3/3/
227 Altman, Gary /3/4/
235 Barreto de Mattos, Paula /4/1/
241 Liu, David /4/2/
249 Kahn, Wendy /4/3/
262 Barber, David /4/4/
264 Conroy, Stephanie /5/1/
267 Berg, Karen /5/2/
268 Meyyappan, Ramesh /5/3/
269 Bacon, Dan /5/4/
270 Ajenstat, François /5/5/
271 Wilson, Dan /5/6/
272 Bueno, Janaina /5/7/
274 Jiang, Stephen /6/1/
285 Abbas, Syed /6/2/
287 Alberts, Amy /6/3/

(33 row(s) affected)

Notice that nothing about the numbers used in the HierarchyID column has anything to do with the other columns. BusinessEntityID is the primary key for the table, but it is not utilized in the hierarchy representation at all. Taking a look at Roberto Tamburello, we can see that he reports to Terri Duffy. The number "1" is reused at each level of the hierarchy and implies no relationship to how it might be used in other levels of the hierarchy. The number sequences we see here happen to be sequential at each level, but that is an arbitrary fact of this particular data set. There is no requirement that it be this way. (Decimals can and will occur, as can negative numbers.)

Next, take note of the OrganizationLevel column that we used in the previous query. If you look at the definition of this column in the database, you'll see that it is a computed column. Indeed, it utilizes the next method we want to look at: GetLevel().

GetLevel() takes no arguments. (It is assumed to be operating on the instance of hierarchy data you used the method with, and it passes back just how deep that node is in the hierarchy, with the root node considered to be zero, the first level of children of the root being level 1, their children being 2, and so on.)
So, the syntax looks like this:

<instance of HierarchyID>.GetLevel()

So, if we wanted to compare the OrganizationLevel we used in our previous query to what we would see using GetLevel() directly, we could rewrite it as:

SELECT e.OrganizationNode.ToString() AS Hierarchy,
OrganizationLevel,
e.OrganizationNode.GetLevel() AS ComputedLevel
FROM HumanResources.Employee e
WHERE e.OrganizationLevel BETWEEN 1 AND 2;

This would, as expected, yield identical values for OrganizationLevel and our use of GetLevel():

Hierarchy OrganizationLevel ComputedLevel
---------- ----------------- -------------
/1/ 1 1
/2/ 1 1
/3/ 1 1
...
...
/6/1/ 2 2
/6/2/ 2 2
/6/3/ 2 2

(33 row(s) affected)

We can use this in a wide variety of ways, but the most notable would be:

 * Returning all rows of data related to a certain level in a hierarchy. For example, all CxO-level employees might be found by looking for level 1 or 2, or a regional manager might be at level 3. It just depends on how you set up your hierarchy.
 * Indexing for horizontal comparisons.

Methods Related to Retrieving Parent or Child Hierarchy Data

Looking at the information for a specific level or node of a hierarchy is all well and good, but it doesn't really show off the horsepower of the HierarchyID data type. For that, you need to expand more fully out to the parent/child relationships that are the cornerstone of what hierarchical data is all about. The real centerpiece of this functionality is the pair of methods GetAncestor() and IsDescendantOf().

Let's start with the syntax for GetAncestor(), which takes a single argument:

<instance of HierarchyID>.GetAncestor(n)

The method is assumed to be operating against the instance of hierarchical data it was called as a method of, and it uses the single argument to indicate how many levels up the tree you want to go.

The value returned by GetAncestor() is of type HierarchyID, which means you can further extend the GetAncestor() call with other HierarchyID methods.

Let's see what we get if we fetch a few different ancestor levels for the employee named Roberto Tamburello, whom we saw in one of our first hierarchy example queries. You may recall his hierarchy node looked like this:

/1/1/

So let's run a few instances of the GetAncestor() method to see what gets returned:

SELECT e.BusinessEntityID,
p.LastName + ', ' + p.FirstName AS Name,
e.OrganizationNode.ToString() AS Hierarchy,
e.OrganizationNode.GetAncestor(0).ToString() AS Self,
e.OrganizationNode.GetAncestor(1).ToString() AS OneUp,
e.OrganizationNode.GetAncestor(2).ToString() AS TwoUp,
e.OrganizationNode.GetAncestor(3).ToString() AS TooFar
FROM HumanResources.Employee e
JOIN Person.Person p
ON e.BusinessEntityID = p.BusinessEntityID
WHERE e.BusinessEntityID = 3;

If you look at this closely, you'll see that I'm fetching the same node several times but, with each separate column, I'm stepping further up the hierarchy until I've stepped beyond the level that I happen to know this particular piece of data lies at. Run this, and we get back a single row:

BusinessEntityID Name Hierarchy Self OneUp TwoUp TooFar
---------------- -------------------- --------- ----- ----- ----- ------
3 Tamburello, Roberto /1/1/ /1/1/ /1/ / NULL

(1 row(s) affected)

Several things are of note in this result:

 * Although it provides little value, zero was a valid argument (it returns the calling node).
 * Each increase in the argument to GetAncestor() moved us further up the hierarchy tree.
 * Using a value that goes beyond the root of the hierarchy returns a NULL.
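GetAncestor(1) is also the natural join condition for "who reports to whom" questions. A quick sketch (my own query, matching the tables above) that pairs each employee with his or her direct manager:

SELECT p.LastName + ', ' + p.FirstName  AS Employee,
       mp.LastName + ', ' + mp.FirstName AS Manager
FROM HumanResources.Employee e
JOIN HumanResources.Employee m
    ON m.OrganizationNode = e.OrganizationNode.GetAncestor(1)  -- manager's node = employee's parent
JOIN Person.Person p
    ON p.BusinessEntityID = e.BusinessEntityID
JOIN Person.Person mp
    ON mp.BusinessEntityID = m.BusinessEntityID;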
This is great for going up the hierarchy tree, but what if we want to return the children, or simply know whether a specific child has a given parent anywhere in its ancestry? For that, the right answer depends on whether we know how far down the chain we want to go (all reports or only direct reports). If it is all reports, we have IsDescendantOf(). This one takes a single node as an argument and returns a Boolean result that is, as you might expect, a simple true/false as to whether the node you pass into the method has the node you are calling the method from as a child (directly or indirectly). The syntax looks like this:

<instance of HierarchyID>.IsDescendantOf(<node>)

For this, let's look at how it can be used in either direction. For example, let's say we want to return all superiors of Mr. Tamburello. This translates to us wanting to return any row with a node that considers Mr. Tamburello's node to be a descendant. For example:

DECLARE @ChildNode HierarchyID;

SELECT @ChildNode = OrganizationNode
FROM HumanResources.Employee e
WHERE e.BusinessEntityID = 3;

SELECT e.BusinessEntityID,
p.LastName + ', ' + p.FirstName AS Name,
e.OrganizationNode.ToString() AS Hierarchy
FROM HumanResources.Employee e
JOIN Person.Person p
ON e.BusinessEntityID = p.BusinessEntityID
WHERE @ChildNode.IsDescendantOf(e.OrganizationNode) = 1;

First, note that we were able to move a node into a variable of type HierarchyID and still make a method call from that variable. Why use a query like this one instead of using GetAncestor()? If you think about it for a moment, I suspect you'll see that it has to do with how open-ended the question was. GetAncestor() really expects you to know how many ancestors you have. You could figure that out using GetLevel() or rig up some test for NULL values, but that is far more complicated than simply returning all rows where IsDescendantOf() is true.

BusinessEntityID Name Hierarchy
---------------- ------------------------------ ----------
1 Sánchez, Ken /
2 Duffy, Terri /1/
3 Tamburello, Roberto /1/1/

(3 row(s) affected)

Much as a node can be considered its own ancestor (with a level input of zero), a node is also considered its own descendant.

That showed us how to check what ancestors are above us, but what about the children below us? For that, we can ask an even more open-ended question. For example, listing all people who report directly or indirectly to Mr. Tamburello requires a simple reversal of the WHERE condition in our previous query:

DECLARE @ChildNode HierarchyID;

SELECT @ChildNode = OrganizationNode
FROM HumanResources.Employee e
WHERE e.BusinessEntityID = 3;

SELECT e.BusinessEntityID,
p.LastName + ', ' + p.FirstName AS Name,
e.OrganizationNode.ToString() AS Hierarchy
FROM HumanResources.Employee e
JOIN Person.Person p
ON e.BusinessEntityID = p.BusinessEntityID
WHERE e.OrganizationNode.IsDescendantOf(@ChildNode) = 1;

And just that quick, we have all of Mr. Tamburello's reports:

BusinessEntityID Name Hierarchy
---------------- ------------------------------ ----------
3 Tamburello, Roberto /1/1/
4 Walters, Rob /1/1/1/
5 Erickson, Gail /1/1/2/
6 Goldberg, Jossef /1/1/3/
7 Miller, Dylan /1/1/4/
8 Margheim, Diane /1/1/4/1/
9 Matthew, Gigi /1/1/4/2/
10 Raheem, Michael /1/1/4/3/
11 Cracium, Ovidiu /1/1/5/
12 D'Hers, Thierry /1/1/5/1/
13 Galvin, Janice /1/1/5/2/
14 Sullivan, Michael /1/1/6/
15 Salavaria, Sharon /1/1/7/

(13 row(s) affected)

To get his direct reports, we use pretty much the same query but return to the GetAncestor() method:

DECLARE @ChildNode HierarchyID;

SELECT @ChildNode = OrganizationNode
FROM HumanResources.Employee e
WHERE e.BusinessEntityID = 3;

SELECT e.BusinessEntityID,
LEFT((p.LastName + ', ' + p.FirstName), 30) AS Name,
LEFT(e.OrganizationNode.ToString(), 10) AS Hierarchy
FROM HumanResources.Employee e
JOIN Person.Person p
ON e.BusinessEntityID = p.BusinessEntityID
WHERE e.OrganizationNode.GetAncestor(1) = @ChildNode;

This limits us to just the specific level below us (or, as GetAncestor() looks at it, the level that we are currently 1 above).
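Because a node counts as its own descendant, aggregations built on IsDescendantOf() need a small adjustment. A sketch (mine, not from the text) that counts each manager's total direct and indirect reports:

SELECT m.BusinessEntityID AS ManagerID,
       COUNT(*) - 1 AS TotalReports  -- minus 1: a node is its own descendant
FROM HumanResources.Employee m
JOIN HumanResources.Employee e
    ON e.OrganizationNode.IsDescendantOf(m.OrganizationNode) = 1
GROUP BY m.BusinessEntityID;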
Inserting New Hierarchical Data

At its most basic level, inserting new hierarchical data isn't unlike inserting any other data in SQL Server. The real trick to inserting new hierarchy nodes lies in understanding what the representation should look like for the new row.

Remember that SQL Server has no preconceived notions about your hierarchy. Indeed, SQL Server doesn't necessarily even look at it as a tree, or insist that a given node be unique. So, while SQL Server can't build your hierarchy for you, it can help you generate values based on information you provide. The functionality for this is provided by the GetDescendant() method.

GetDescendant() would probably have been more accurately named had they called it something like "GenerateHierarchyNodeRepresentation()." Its purpose is to generate a valid representation of a hierarchy node that falls between two optionally set parameters. The syntax looks like this:

<instance of HierarchyID>.GetDescendant({ <low child node> | NULL }, { <high child node> | NULL })

The low and high child nodes specify a range that the generated value must fall between (it is non-inclusive). The generated value may contain decimals or even be a negative value, as long as it falls within the specified range. While both arguments are required, you can explicitly specify NULL as the value for either, effectively putting no bound on that side of the generation.

 * If the parent is NULL, returns NULL.
 * If the parent is not NULL, and both the low and high children are NULL, returns a child of the parent.
 * If the parent and the low child are not NULL, and the high child is NULL, returns a child of the parent greater than the low child.
 * If the parent and the high child are not NULL, and the low child is NULL, returns a child of the parent less than the high child.
 * If the parent, the low child, and the high child are all not NULL, returns a child of the parent greater than the low child and less than the high child.
 * If the low child is not NULL and not a child of the parent, an exception is raised.
 * If the high child is not NULL and not a child of the parent, an exception is raised.
 * If the low child is equal to or greater than the high child, an exception is raised.
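These rules are easy to see for yourself with a few literal values. A tiny sketch (the expected outputs in the comments follow directly from the rules above):

DECLARE @parent hierarchyid = hierarchyid::GetRoot();

SELECT @parent.GetDescendant(NULL, NULL).ToString() AS FirstChild,          -- '/1/'
       @parent.GetDescendant(hierarchyid::Parse('/1/'), NULL).ToString() AS AfterOne,  -- '/2/'
       @parent.GetDescendant(hierarchyid::Parse('/1/'),
                             hierarchyid::Parse('/2/')).ToString() AS BetweenOneAndTwo; -- '/1.1/'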
For this particular method call, we'll build a somewhat more custom example using the following script:

CREATE TABLE NodeTest
(
NodeID int NOT NULL IDENTITY PRIMARY KEY,
Node hierarchyid NOT NULL,
NodeLevel AS Node.GetLevel(),
Name varchar(50) NOT NULL
);

INSERT NodeTest
VALUES
('/', 'Manager');

DECLARE @Manager hierarchyid;

SELECT @Manager = Node
FROM NodeTest
WHERE NodeID = 1;

INSERT NodeTest
VALUES
(@Manager.GetDescendant(NULL, NULL), 'ReportAAA'),
(@Manager.GetDescendant(NULL, NULL), 'ReportBBB'),
(@Manager.GetDescendant(NULL, '/1000/'), 'ReportCCC'),
(@Manager.GetDescendant(NULL, '/1000/'), 'ReportDDD'),
(@Manager.GetDescendant('/1000/', NULL), 'ReportEEE'),
('/547/', 'ReportFFF'),
(@Manager.GetDescendant('/3/', '/547/'), 'ReportGGG'),
(@Manager.GetDescendant('/1/', '/2/'), 'ReportHHH'),
(@Manager.GetDescendant('/-10/', '/-1/'), 'ReportIII'),
('/547/345/', 'SecondLevelAA'),
('/547/346/', 'SecondLevelBB'),
('/547/345/1/', 'ThirdLevelAA'),
('/785/294/386/925/','RandomEntry');

SELECT NodeID,
       Node.ToString(),
       Name
FROM NodeTest;

With this script, we've stuck a wide variety of data in, but the output may surprise you in several places:

NodeID                           Name
----------- -------------------- ------------------------------
1           /                    Manager
2           /1/                  ReportAAA
3           /1/                  ReportBBB
4           /999/                ReportCCC
5           /999/                ReportDDD
6           /1001/               ReportEEE
7           /547/                ReportFFF
8           /4/                  ReportGGG
9           /1.1/                ReportHHH
10          /-9/                 ReportIII
11          /547/345/            SecondLevelAA
12          /547/346/            SecondLevelBB
13          /547/345/1/          ThirdLevelAA
14          /785/294/386/925/    RandomEntry

(14 row(s) affected)

Note that we were able to insert data randomly. For example, we have a fourth-level node called RandomEntry that is just that—random. It has no parent. SQL Server does nothing to enforce a tree representation or the validity of your hierarchy; it only provides the tools for making nodes work together in a way you are most likely to use to create hierarchy trees.

Next, note that we inserted decimal-based values. Our ninth entry was inserted between /1/ and /2/, so there was no way to squeeze it in there without going to decimals (and so that's exactly what SQL Server did).

Continuing on, we have negative values. Again, we provided SQL Server no real choice, as our low and high children were both negative.

Finally, we inserted duplicate rows. HierarchyID columns are not any more inherently unique than any other data type. If you want to avoid duplicate node values, you'll need to utilize a unique or primary key constraint. Note also that given a specific high and low child, GetDescendant() will generate the same value over and over again without regard to whether or not there is a duplicate (and regardless of whether there is a unique or primary key constraint). You need to plan for the values you're going to insert. For the vast majority of hierarchies, horizontal position is not important, so you can usually use whatever the max node is for the level you're inserting into (see the sketch that follows).
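Here's what that usually looks like in practice—a sketch of the "append after the current max child" pattern against our NodeTest table (the new name is mine; in production you would typically wrap this in a serializable transaction so two sessions can't generate the same node):

DECLARE @Parent HierarchyID, @MaxChild HierarchyID;

SELECT @Parent = Node FROM NodeTest WHERE NodeID = 1;

-- Find the current right-most child of the parent...
SELECT @MaxChild = MAX(Node)
FROM NodeTest
WHERE Node.GetAncestor(1) = @Parent;

-- ...and generate the next node after it (a NULL max just means "first child")
INSERT NodeTest
VALUES (@Parent.GetDescendant(@MaxChild, NULL), 'ReportJJJ');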
Moving Sub-Trees Between Parents

The HierarchyID data type also provides for the concept of a coordinated prune and graft of a node and its children to a new parent using the GetReparentedValue() method. Like GetDescendant(), the name of GetReparentedValue() seems to imply that its main function is getting data back. While returning data is indeed technically what it does, GetReparentedValue() is largely about moving data around. It requires two arguments: the old "root" and the new "root." So the basic syntax looks like this:

<node>.GetReparentedValue(<old root>, <new root>)

Note that "root" in this case doesn't mean the top-level root of the entire hierarchy. Instead, it is just the root of the particular sub-tree you're wanting to move.

On its own, GetReparentedValue() does not actually make the represented move; it is merely a way to show a "what if?" scenario. Only when used with an UPDATE statement does it perform the actual move.

Let's go back to the NodeTest table we created in the previous example. We want to see what things would look like if we took the children of node /547/ and moved them to node /1001/. We can do this by combining our GetReparentedValue() method with the IsDescendantOf() method:

SELECT NodeID,
       Node.GetReparentedValue('/547/', '/1001/').ToString() AS New,
       Node.ToString() AS Old,
       Name
FROM NodeTest
WHERE Node.IsDescendantOf('/547/') = 1;

This code shows what things would look like if we pruned the /547/ sub-tree (including the /547/ node itself) and grafted all related nodes to the /1001/ node. Let's take a look at the results:

NodeID      New                  Old                  Name
----------- -------------------- -------------------- ---------------------
7           /1001/               /547/                ReportFFF
11          /1001/345/           /547/345/            SecondLevelAA
12          /1001/346/           /547/346/            SecondLevelBB
13          /1001/345/1/         /547/345/1/          ThirdLevelAA

(4 row(s) affected)

At first blush, this looks perfect, but there is one potential problem: the actual /547/ node. In our original data, we already have a /1001/ node. If we are OK with duplicates (and thus the nodes appearing to have two parents), then there is no problem here. Most of the time, however, a node is going to have one and only one parent. To change things so that we only move the children of /547/, we just need to exclude it from the result set using the WHERE clause:

SELECT NodeID,
       Node.GetReparentedValue('/547/', '/1001/').ToString() AS New,
       Node.ToString() AS Old,
       Name
FROM NodeTest
WHERE Node.IsDescendantOf('/547/') = 1
  AND Node.ToString() != '/547/';

And we've quickly cleaned our errant node out of the results:

NodeID      New                  Old                  Name
----------- -------------------- -------------------- ---------------------
11          /1001/345/           /547/345/            SecondLevelAA
12          /1001/346/           /547/346/            SecondLevelBB
13          /1001/345/1/         /547/345/1/          ThirdLevelAA

(3 row(s) affected)

With that all figured out, we're ready to actually move our data around using an UPDATE statement:

UPDATE NodeTest
SET Node = Node.GetReparentedValue('/547/', '/1001/')
WHERE Node.IsDescendantOf('/547/') = 1
  AND Node.ToString() != '/547/';

Execute this, and then reselect all the data from our NodeTest table (using the same SELECT as before):

NodeID                           Name
----------- -------------------- ------------------------------
1           /                    Manager
2           /1/                  ReportAAA
3           /1/                  ReportBBB
4           /999/                ReportCCC
5           /999/                ReportDDD
6           /1001/               ReportEEE
7           /547/                ReportFFF
8           /4/                  ReportGGG
9           /1.1/                ReportHHH
10          /-9/                 ReportIII
11          /1001/345/           SecondLevelAA
12          /1001/346/           SecondLevelBB
13          /1001/345/1/         ThirdLevelAA
14          /785/294/386/925/    RandomEntry

(14 row(s) affected)

As planned, all of the nodes that were previously descendants of /547/ have been moved under /1001/, while /547/ itself has been left in its original state.

Getting the Root of a Hierarchy

Well, it deserves mentioning I guess, but it's probably going to be a bit anti-climactic.
The last method we're going to cover here (I'm limiting myself to those that are T-SQL addressable) is for retrieving the root of a hierarchy. The odd thing about this method is that it returns a constant. Since it is a static member of the HierarchyID type, you reference it using the HierarchyID type rather than a specific instance. You can, if you so choose, skip this, as the value will always be the same ("/" if you do a ToString() on it). The syntax is straightforward, and does not vary by specific implementation:

HierarchyID::GetRoot()

As I said, there is no real magic to this one. You can always select it to see:

SELECT HierarchyID::GetRoot().ToString();

which will yield you the now familiar simple forward slash:

--------------
/

(1 row(s) affected)

Indexing Hierarchy Data

There are two likely ways for you to want your hierarchical data indexed:

 * Vertically (also referred to as "Depth First"): This is what is inherent to the base indexing of a HierarchyID column. It starts at the highest node it can find (the root node, assuming you have one), and drills downward into the tree. As shown in Figure 7.2, when it reaches a bottom node it indexes everything at that level, and then returns to the lowest not-yet-indexed node of the same general branch and starts downward again. Creating the index uses the standard index syntax we covered in the previous chapter. If you index with a HierarchyID column as your first column, then you're sure to get a depth-first traversal index.

Figure 7.2

 * Horizontally (usually referred to as a "Breadth-First" index): Creating a breadth-first index requires a little extra effort, but, before we worry about that, let's focus on what exactly it does. A breadth-first index stores siblings close together (as shown in Figure 7.3). This arrangement suits comparisons that are oriented around things like the GetAncestor() method. To create an index with this traversal order, you need to create a computed column based on the GetLevel() method, just as the AdventureWorks2008 database has for the OrganizationLevel column of the Employee table (and as we created with NodeLevel in our NodeTest table). You can then index the level column followed by the HierarchyID column to have a breadth-first index, as sketched in code below.

Figure 7.3

Other than considering the difference in depth versus breadth on first traversal, HierarchyID indexes work much like any other index in SQL Server.
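To make that concrete, here is a sketch of both flavors against the NodeTest table from earlier (the index names are mine):

-- Depth-first: the HierarchyID column leads the key
CREATE INDEX ixNodeTestDepth ON NodeTest (Node);

-- Breadth-first: the computed level column leads, keeping siblings together
CREATE INDEX ixNodeTestBreadth ON NodeTest (NodeLevel, Node);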
Performance Considerations

In general, the HierarchyID approach is going to give you the best overall performance and functionality for hierarchical data. However, as is the case with so many things in software development, there are other approaches and exceptions to the best performance rule. We discussed some of the alternatives at the beginning of our hierarchy discussion, but let's quickly explore some of the performance ramifications of each choice in table form.

I'm told that people love it when I build the "Best performance by the numbers" and "If this, then that" tables I occasionally have in my books. While I do put these things forward based on experience or other research, keep in mind that they are "best guess" suggestions as to approach. In short, they are what works for the listed situations "most of the time." Your mileage may vary, and you really should, as I say all too often, test, test, test!

Don't confuse the number of squares indicating parent/child as a good fit with an endorsement of parent/child as the likely best solution. Treat each case individually, and realize that sub-tree and ancestry queries are generally very common in hierarchies, and such queries are where the HierarchyID data type excels.

Spatial Data

The addition of spatial data handling has been one of the most touted features of SQL Server 2008. Perhaps the most interesting thing to me is that such a feature can be touted to an audience who mostly has no basis for understanding what the feature is even about.

What am I trying to get at here? Well, the geospatial data types that are new to SQL Server have been a highly requested feature for perhaps the last ten years or so. (It is one of the things often focused on by the Oracle crowd, since Oracle has handled geospatial data for some time now.) While very powerful, the feature addresses an area that many database developers don't even realize they may need, let alone actually understand.

The geospatial data types require a grasp of a style of data that is much different from other forms of data we deal with. For example, when dealing with the new HierarchyID type that we looked at in the previous section, we were working with a style of data most developers already have some concept of. (We've dealt with hierarchies such as org charts for years.) So the new thing was simply the way we went about manipulating the data; we already understood the data's nature. With geospatial data, however, many developers will find themselves asking what geospatial data is all about. For example:

 * Is this just defining a specific location (for example, an address)?
 * Is it defining the boundaries of a property?
 * Is it mapping a road?
 * How many of my customers live within 5 miles of this point?
 * How many bridges are there in Madison County?

The reality is a bit larger than any one of these questions. Indeed, it encompasses all of the concepts just listed and more. How would we have designed for these kinds of questions in the past? For some of them we could have taken a relatively simple (and low-power) approach, such as including a simple address. We might even have passed the address to an external application that kept geospatial data and utilized feedback from that application to ask bigger questions. Today's end users, however, expect more. It is, for example, nearly impossible to find a retail or restaurant chain website that does not include a "find a store near you" feature. They use geospatial functionality to supply that.

With some of these needs in mind, let's explore the two types of geospatial data (planar and geodetic) and the functionality supporting each.

Spatial Concepts

To figure out the peculiarities of the specific type of geospatial data you need to work with, we are going to first get a bit of grounding in the more commonly accepted methods of representing spatial data. As you might imagine, there are standards surrounding how spatial data should be represented. Unfortunately, there isn't just one standard (indeed, SQL Server supports several "models").

To begin understanding geospatial data, we must first grasp that there are two major models of representing geospatial data: planar (flat earth) and geodetic (round earth). Both have the same basic goal: to represent space via a set of data points (points, lines, curves).
Planar representations are generally more simplistic and, therefore, easier to grasp and manage. Planar data is often used for relatively "local" data—that is, data that does not need to cover a particularly large area and does not need to have precision adjustments for the curvature of the earth's surface. Geodetic representations offer a more "real world" depiction, and are generally used when you need to represent a larger area that is more likely to be affected by the curvature of the earth.

Planar (Flat Earth) Data

Planar data is known by several names, such as geoplanar, geometric, or flat earth. You can think of this as mapping reasonably well to the Euclidean geometry that you likely studied in high school. With planar data, everything is represented on, as you might guess, a plane or series of planes. The space being represented is assumed to be flat. This is, for smaller areas, a very practical method of looking at spatial data, as it is easy to visualize and most functionality does not require particularly complex math (for example, distance is the same as a straight line). Planar data can be represented using the sort of x, y, z data points you might have used back in geometry class, mixing collections of point data into lines and polygonal shapes; basic geometry can then represent complex shapes and still handle things like overlapping objects.

No matter how well we draw our planar mapping though, we are often representing something that is not truly flat by using points on a flat surface. This can introduce some problems. There are a number of approaches to minimizing the effects of a flat representation of a round earth. Planar representations of the earth make use of the concept of a "projection"—that is, the round earth gets projected onto a flat surface. Figures 7.4 and 7.5 are examples of some common projections of the earth.

Figure 7.4

Figure 7.5

As it turns out, these projections are generally "good enough" for many applications of spatial data. Indeed, most local maps for government tracking of properties, roads, and other needs are done using planar models such as latitude and longitude.

Be careful with your assumptions regarding latitude and longitude. While these may seem like well understood and agreed-on concepts, there are actually multiple mappings of latitude and longitude used in the world today. For example, the longitude used in the Global Positioning System (GPS) is a noticeable distance (100 meters or more, depending on what part of the earth you're standing on when you measure) different from most other representations of longitude (which are generally based on the Royal Observatory's definition of zero longitude).

Planar data is supported in SQL Server by the GEOMETRY data type (which will serve as the core type for most of our upcoming examples).

There are multiple accepted models of the earth. Make certain when supplying or receiving spatial data that the models being used are compatible or that you know how to adjust for differences between the two.

Geodetic (Round Earth) Data

Geodetic data, as shown in Figure 7.5, represents the more realistic (far more complex) model based on a round earth. Geodetic representation of data is supported by the GEOGRAPHY data type.

Under the planar data model, it is assumed that the surface of the earth is flat.
This works just fine for areas measured in relatively small distances (say, as much as several miles), but begins to fall apart as the distances grow larger. For example, when measuring the distance between Portland, Oregon and Beijing, China, the straight line used in a planar model would understate the distance by many miles. Why? Well, under the flat model, the distance is a straight line rather than the more appropriate arc (which would follow the curvature of the earth's surface). Indeed, the issue can get even more complex, as the earth is not a perfect sphere (it bulges in places), with the circumference varying by dozens of miles depending on which direction you're measuring. Geodetic data models the curve of the earth, and is supported in SQL Server via the GEOGRAPHY data type.

It is important to note that SQL Server can only represent geographic data that resides within a single hemisphere. A hemisphere can be considered as any half of a sphere—regardless of what plane you cut the sphere along.

Representing Spatial Data

There are several key notions that are common to representing both planar and geodetic data and work together to allow you to represent a given type of data in different ways. The Open Geospatial Consortium (OGC)—an organization specializing in geometric data standards—defines several formats that you can utilize to represent spatial data. SQL Server 2008 implements three of these:

 * Well Known Text (WKT): This format looks like very plain text: it simply names a series of objects (such as a point or a line) in sequence, each followed by its coordinate information.
 * Well Known Binary (WKB): Implementing the same general notion as WKT, this representation encodes the same kind of information in a binary stream rather than plain text.
 * Geography Markup Language (GML): An XML schema designed to represent geometric data. GML leverages the self-defining nature of XML data to allow additional (non-coordinate) information to be encoded along with the coordinate data. Examples of the kind of extended information that might be included with GML data would be things like a description of what is found at the location or, perhaps, sensor information (say, an ozone measurement at a specific point in Los Angeles, CA versus a similar measurement taken in Lisbon, Portugal).

We will utilize WKT for the examples in this book, but this is largely a readability decision, and does not imply that WKT is a better choice in general use (the right choice will vary by situation).

Regardless of which data representation is being utilized, the general objects required will be the same. Each format recognizes a set of three base objects that can be used individually or as a collection to represent spatial data. The objects are:

 * Point: This is a specific point in space. It has no length, no width, and no height. It is the equivalent of the spot you mark with a thumb tack on a map to represent a place you are or have been. A point requires a simple X, Y notation.
 * Line: In each of the formats SQL Server recognizes, a line is represented using a LINESTRING object. Note the relevance of the embedded term STRING. This recognizes that a line is represented as a series of two or more points. The use of multiple points in the line definition allows for the idea that the line may not be straight.
Since each segment of the line string is the shortest path between the two points, increasing the number of points representing the same conceptual line will increase the accuracy of that line's representation.
 * A line is considered "simple" if it does not cross over itself, and is considered to be a one-dimensional object even if it is curved or forms a ring (a line that has the same ending and starting point).

Note that a ring does not mean that a polygon is round—only that it creates some form of enclosed space.

 * Polygon: A polygon is defined by one or more rings (again, a line with the same starting and ending point), but declaring what would individually be a ring-forming linestring as a polygon changes how it is treated. Unlike the base ring definition, which is one dimensional and has no area, a polygon does have area. In addition, the ring that defines the outer boundary of the polygon can contain additional polygons that define areas within the outer polygon that are hollow. The space defined by these inner hollow polygons is not considered to be part of the area of the parent polygon.
 * Collection: This is a collection of the other three objects (point, line, polygon).

Regardless of which spatial data type you're using within SQL Server—GEOMETRY or GEOGRAPHY—all three of these base objects (or a collection of them) are available and can be used in any mix within a given table. For example, a table of world landmarks might store a complex polygon to represent Yellowstone Park, a line to represent the equator, and a simple point to indicate the highest point on the earth. Each of these (or a collection of them) could be represented within the same column in the same table.

In addition to these base concepts, the OGC defines a set of methods that should be supported to work with our spatial data. We'll explore some of these that are supported by SQL Server as we go through the examples, but it's worth noting that many of the methods exist for both types of spatial data (using the same name or just a slight name change) and have the same general functionality between the types. The OGC functions all start with a prefix of ST followed by a verb that indicates what the function does. They are implemented as a method for each instance of spatial data. Key examples are discussed in the following table.

Note that, for each ST method call, the spatial reference id—or SRID—must match in order to perform a valid comparison. The SRID indicates what recognized (by the European Petroleum Survey Group) spatial model this particular spatial instance is referencing. If the SRIDs of two instances do not match, then any comparison will return NULL.

Method | Use
---|---
.STArea() | Calculates the area of a spatial instance that is a polygon and accounts for hollow spaces created by contained polygons.
.STContains() | Returns a bit indicating whether the supplied instance is entirely contained within the calling instance.
.STDistance() | Provides a numeric value indicating the distance between the supplied instance and the calling instance.
.STEquals() | Returns a bit indicating whether the supplied and calling instance are qualitatively equal. Note that this does not require them to be defined in exactly the same way, but, rather, to wind up with the same result (for example, defining a square with 8 line segments, and another with 4, but with the same resulting side lengths and position would return a 1).
.STIntersects() | Returns a bit indicating whether the supplied instance crosses the calling instance at any point.
.STOverlaps() | Returns a bit indicating whether the supplied instance overlaps the calling instance (for example, a line starting within a polygon, and then ending outside of it).
.STTouches() | Returns a bit indicating whether the supplied instance touches the calling instance in any way.
.STWithin() | Returns a bit indicating whether the supplied instance lies entirely within the calling instance; if any portion of the supplied instance falls outside of the calling instance, then STWithin will return a zero.

The OGC function list is actually much, much longer and does vary somewhat between the GEOMETRY and GEOGRAPHY types, but these provide a taste of what's available and, among other things, include those supported for indexes against spatial data. You can find a more complete list in the Books Online by looking under each spatial type (GEOMETRY and GEOGRAPHY).
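For a quick taste of how these read in practice, here is a minimal sketch using two ad hoc GEOMETRY instances (the shapes and values are mine; both instances share the GEOMETRY default SRID of 0):

DECLARE @Square GEOMETRY, @Inside GEOMETRY, @Outside GEOMETRY;

SET @Square  = 'POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))';
SET @Inside  = 'POINT(5 5)';
SET @Outside = 'POINT(5 15)';

SELECT @Square.STContains(@Inside);  -- 1: the point falls inside the square
SELECT @Square.STContains(@Outside); -- 0: this one does not
SELECT @Inside.STDistance(@Outside); -- 10: measured in the units of the coordinates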
Implementing Planar Data Representations—The GEOMETRY Data Type

As previously mentioned, the data type that implements the concept of planar, or flat earth, data is called GEOMETRY. Using the GEOMETRY data type not only provides a means to contain the types of geometric object definitions (point, linestring, polygon) we discussed earlier, but also a series of methods that can be utilized against that data. Like the HierarchyID data type we discussed earlier in the chapter (and the GEOGRAPHY type we'll discuss next), GEOMETRY is implemented via a CLR user-defined type (then flagged as system so it doesn't require the security considerations that true CLR UDTs require). Like other .NET classes, you can make use of a number of properties and static members of the class.

The GEOMETRY type can accept any of the geometric types we just discussed. Let's check this out with a quick example that not only instantiates a geometric data type, but loads it with data.

Note that SQL Server will attempt to render spatial data visually in Management Studio. The rendering will, however, be visible only when you are in the Results to Grid mode in the Query Editor window.

We'll start by examining the way to get our WKT data into our data type:

DECLARE @MyGeometry GEOMETRY;

SET @MyGeometry = Geometry::STGeomFromText('LINESTRING(-3 3, 3 3, 3 -3, -3 -3, -3 3)', 0)
SET @MyGeometry = Geometry::Parse('LINESTRING(-3 3, 3 3, 3 -3, -3 -3, -3 3)')
SET @MyGeometry = 'LINESTRING(-3 3, 3 3, 3 -3, -3 -3, -3 3)'

SELECT @MyGeometry;

SET @MyGeometry = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3))'

SELECT @MyGeometry;

In this code, we've declared an instance of geometric data in a variable called @MyGeometry. We then assign linestring data to our variable in three different ways. These are all functionally the same, with the final assignment using the Parse function implicitly.

We then select out our newly assigned line. When this is executed, Management Studio shows us not only a binary representation, but also the visual representation shown in Figure 7.6.

Figure 7.6

Note that, in order to see the spatial data tab, you must be using the "Results to Grid" mode in the Query Editor window.

We then go on to repeat the assignment and selection, but this time for a polygon instead of a linestring. This winds up yielding us slightly different results (shown in Figure 7.7).

Figure 7.7

Notice the slightly different representation of two objects based on the same series of points. Why are they different? Well, recall that a linestring is always considered one dimensional. Although linestrings can curve and even cross over themselves, they are still considered to lack area (which requires two dimensions). SQL Server represents the linestring—even though it forms a ring—as hollow to represent the lack of area. For the polygon, however, SQL Server fills in the square to represent the enclosure of two-dimensional space. SQL Server is aware that the two, though based on the same series of points, have a fundamental difference distinguishing them. This difference will become more apparent later on, as various methods of the GEOMETRY data type are only relevant to specific object types (for example, the method that calculates area only makes sense on polygons, not on lines).

Our polygons are, of course, not limited to squares or even rectangles. Indeed, they can be virtually any shape as long as they eventually are enclosed into a ring by ending at the same point they started at. (A linestring simply crossing itself is not enough to form a ring, and, therefore, a polygon. It must start and end at the same point.) In addition, we can use polygons embedded inside other polygons to represent hollow space. Let's check all these concepts out.

First, we need a few different instances of the GEOMETRY data type to compare against each other. We'll also go ahead and establish a simple square again, but this time we'll call the STArea() method of the GEOMETRY type to get the area of our square:

DECLARE @First GEOMETRY,
        @Second GEOMETRY,
        @Merged GEOMETRY;

SET @First = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3))';

SELECT 'First polygon area: ', @First.STArea();
SELECT @First;

The STArea() method is an example of a method that is part of the OGC list of spatial data methods. Execute this code, and we get a representation of our square (the same as we showed in Figure 7.7), but we also get a calculated area of 36.

Moving on, let's expand our script to add another polygon, but this time we'll add something that has a slightly more complex linestring:

DECLARE @First GEOMETRY,
        @Second GEOMETRY,
        @Merged GEOMETRY;

SET @First = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3))'

SELECT 'First polygon area:', @First.STArea();
SELECT @First;

SET @Second =
'POLYGON((-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))';

SELECT @Second;

SET @Second = @Second.MakeValid();

SELECT 'Second polygon area: ', @Second.STArea();

As the more complex linestring in @Second would imply, we are shown a more complex shape: an octagon (shown in Figure 7.8).

Figure 7.8

Note also, though, that we had to perform an additional action on our polygon to make it valid—that done, we are able to call the area calculation and receive our result (3.28000022888182).
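If you want to see that validity check for yourself, STIsValid() is the method to ask. A quick sketch (the result comments are my expectation based on the behavior just described):

DECLARE @Shape GEOMETRY;

SET @Shape = 'POLYGON((-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))';

SELECT @Shape.STIsValid(); -- 0: the ring crosses itself as written

SET @Shape = @Shape.MakeValid();

SELECT @Shape.STIsValid(); -- 1: now safe for STArea() and friends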
Continuing the example, we can build a polygon that utilizes both linestrings, with the second becoming a hollow area in the first:

DECLARE @First GEOMETRY,
        @Second GEOMETRY,
        @Merged GEOMETRY;

SET @First = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3))'

SELECT 'First polygon area: ', @First.STArea();
SELECT @First;

SET @Second =
'POLYGON((-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))';

SELECT @Second;

SET @Second = @Second.MakeValid();

SELECT 'Second polygon area: ', @Second.STArea();

SET @Merged = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3),
(-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))';

SELECT @Merged;

SET @Merged = @Merged.MakeValid();

SELECT 'Merged polygon area: ', @Merged.STArea();

This time SQL Server shows both polygons—inverting the color fill to show the hollow space (shown in Figure 7.9).

Figure 7.9

The calculated area for the merged polygon has properly taken into account the hollow area (that is, it subtracts it from the larger polygon) and gives us the correct area of 32.7200009155276.

Let's make one last addition to this script, this time adding yet another polygon into the mix to see how SQL Server handles overlapping areas. We'll add another octagon to the merged polygon:

DECLARE @First GEOMETRY,
        @Second GEOMETRY,
        @Merged GEOMETRY;

SET @First = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3))'

SELECT 'First polygon area: ', @First.STArea();
SELECT @First;

SET @Second =
'POLYGON((-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))';

SELECT @Second;

SET @Second = @Second.MakeValid();

SELECT 'Second polygon area: ', @Second.STArea();

SET @Merged = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3),
(-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))'

SET @Merged = @Merged.MakeValid();

SELECT 'Merged polygon area: ', @Merged.STArea();

SET @Merged = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3),
(-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4),
(-2.5 .4, -1.9 1, -1.1 1, -.5 .4, -.5 -.4, -1.1 -1, -2.5 -1, -1.9 -1, -2.5 -.4, -2.5 .4))'

SELECT @Merged;

SET @Merged = @Merged.MakeValid();

SELECT 'Second Merged polygon area: ', @Merged.STArea();

Pay attention to both the third figure (shown in Figure 7.10) and the area of 30.4900010681158. Note that both polygons are shown (including their overlap area), and that the area result subtracted the hollow area only once—that is, the area that overlaps between the two inner polygons was only removed once.

Figure 7.10

Last, but not least, let's take a quick look at the ToString() method.
For this, we'll use the same merged GEOMETRY, activate the MakeValid() method, and then output the slightly modified result:

DECLARE @Merged GEOMETRY;

SET @Merged = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3),
(-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))'

SET @Merged = @Merged.MakeValid();

SELECT 'Merged polygon area: ', @Merged.STArea();
SELECT @Merged;
SELECT @Merged.ToString()

Notice the changes to the output:

POLYGON ((-3 -3, 3 -3, 3 3, -3 3, -3 -3), (-0.39999961853027344 -1,
-1 -0.39999961853027344, -1 0.39999961853027344, -0.39999961853027344 1,
0.39999961853027344 1, 1 0.39999961853027344, 1 -0.39999961853027344,
0.39999961853027344 -1, -0.39999961853027344 -1))

The change away from our relatively round numbers is a byproduct of the MakeValid() call, but, other than that, we got back almost exactly the layout we put in.

Implementing Geodetic Representations—The GEOGRAPHY Type

The type that implements the concept of geodetic, or round earth, data is called GEOGRAPHY. The GEOGRAPHY data type works, in most ways, just like the GEOMETRY type did. (Indeed, they share many of the same functions.) Like the last two data types we've discussed, GEOGRAPHY is implemented via a CLR user-defined type.

The GEOGRAPHY type can also accept any of the geometric types we discussed earlier in the section, but it also applies the notion of a hemisphere.

While the geometric data type applies a default SRID to spatial instances (the default is zero), the GEOGRAPHY data type does not generally have a default value (some individual geography methods do assume a SRID of 4326), and an SRID must be supplied each time you redefine a geographic instance.

Let's start by utilizing a near duplicate of our first geometry example, only using the GEOGRAPHY type this time:

DECLARE @First GEOGRAPHY;

SET @First = GEOGRAPHY::STGeomFromText('LINESTRING(-3 3, 3 3, 3 -3, -3 -3, -3 3)', 4326)
SET @First = GEOGRAPHY::Parse('LINESTRING(-3 3, 3 3, 3 -3, -3 -3, -3 3)')
SET @First = 'LINESTRING(-3 3, 3 3, 3 -3, -3 -3, -3 3)'

SELECT @First;

This all works fine, with only the STGeomFromText() function working differently than its geometric counterpart (and, even then, the only difference is that it requires a recognized geodetic SRID—4326 here—rather than the geometric default of zero).

Things get a bit more interesting when we get to a polygon though, as we must fit within a given hemisphere. A hemisphere is, just as in the dictionary definition, half of a sphere. The starting and stopping points of each hemisphere vary depending on what SRID you're referencing, but, regardless of which you've chosen, all polygons, lines, and points referenced for a given spatial instance must fit within that hemisphere.

I would imagine this to provoke the question of "Why?" I know it did for me. The issue has to do with eliminating ambiguity on what is considered "inside" versus "outside" a polygon. There are functions that look to see if something is contained within a spatial instance, but how do you know if something is inside an object if you don't know which side of the defining ring is considered inside versus outside?

There is, of course, more than one way to address the inside versus outside problem with spatial data in general, but the SQL Server team had to pick one, and they went with an approach that requires you to stay within a single hemisphere.
If you need to map an object that crosses a hemisphere boundary, consider mapping it as two adjacent objects (sharing the hemisphere border), and utilizing them as a pair.

To check this out, we'll continue to run through what is largely the same example as we used for geometry, but mapped to the curve-aware data type (GEOGRAPHY):

DECLARE @First GEOGRAPHY;

SET @First = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3))';

SELECT @First;

But when you try to execute this, you run into trouble that you didn't have under the GEOMETRY data type:

Msg 6522, Level 16, State 1, Line 3
A .NET Framework error occurred during execution of user-defined routine or
aggregate "geography":
Microsoft.SqlServer.Types.GLArgumentException: 24205: The specified input does
not represent a valid geography instance because it exceeds a single
hemisphere. Each geography instance must fit inside a single hemisphere. A
common reason for this error is that a polygon has the wrong ring orientation.
Microsoft.SqlServer.Types.GLArgumentException:
at Microsoft.SqlServer.Types.GLNativeMethods.ThrowExceptionForHr(GL_HResult
errorCode)
at Microsoft.SqlServer.Types.GLNativeMethods.GeodeticIsValid(GeoData g)
at Microsoft.SqlServer.Types.SqlGeography.IsValidExpensive()
at Microsoft.SqlServer.Types.SqlGeography.ConstructGeographyFromUserInput(GeoData
g, Int32 srid)
at Microsoft.SqlServer.Types.SqlGeography.GeographyFromText(OpenGisType
type, SqlChars taggedText, Int32 srid)
at Microsoft.SqlServer.Types.SqlGeography.Parse(SqlString s)
.

(1 row(s) affected)

The extra stack of error lines is a result of the .NET implementation that is behind all of the new data types that are covered in this chapter. The key item, however, is the GLArgumentException line; we are in more than one hemisphere.

When I first started learning about the hemisphere issue, my assumption was that it must have to do with negative and positive numbers—not so. Instead, the issue is more of a simplistic test of whether the "inside" of our polygon fits inside a hemisphere. We've defined a box that seems fairly straightforward and small here, so it's easy to see why one might be confused at how it is in more than one hemisphere. The problem is, however, also fairly simple: our inside and outside are backwards. That is, what you likely perceive as being "outside" the square is considered to be inside as we've defined the box to SQL Server.

To address this issue, we have to think of the polygon in terms of the ring that draws it—that is, as a series of connected lines that eventually ends where it started. The "inside" is always deemed to be the side that is on the left of the line as you draw it. In general, this means that, when you draw an object, you'll want to lay out the lines that enclose it in a counterclockwise direction. In our example, we were going clockwise, so we created a situation where the "outside" was the area that was bounded by our line, and the inside was unbounded. We can fix our error by simply reversing the order we draw the polygon in:

DECLARE @First GEOGRAPHY;

SET @First = 'POLYGON((-3 3, -3 -3, 3 -3, 3 3, -3 3))';

SELECT @First;

Now if we execute it, things return and look pretty much as they did when we were working with the GEOMETRY type.

The sets of methods implemented by the GEOMETRY and GEOGRAPHY types overlap significantly, but are not identical.
(All the ones we've seen in this chapter, except for MakeValid(), are implemented in both types.) Spatial data is its own area of study, so I recommend exploring information well outside the SQL Server–specific community to understand what is expected in each implementation.

Filestreams

This is something of a "high-octane" feature that is new in SQL Server 2008. Indeed, it is relatively fringe in nature, and even requires you to take special steps to enable it. (It is not enabled in the default installation.) Still, while I consider this feature to still be solidly in its infancy, it has started a path to something that is potentially very special. So that should bring about the question: "OK then, what exactly do filestreams do?" Glad you asked.

There has long been a series of problems in the database realm regarding what to do with storage of unstructured data files (for example, images, documents, spreadsheets, movies, and so on). The files are often an integral part of a larger piece of data we are storing in a database (let's say something like photos of a crash and a scanned image of a claim form on an insurance claim).

With this in mind, we would like to:

 * Store all that data together and in a space-efficient manner
 * Read and write the data with maximum performance
 * Utilize transactions
 * Secure the data effectively and under one model
 * Have consistent state on the data when backing up and restoring

The methods of addressing these problems have varied depending on which of these issues were considered the priority for a particular installation. The balancing act has gone something like this:

 * Performance is key: The data was generally kept in individual files at the file system level.
 * Consistent state is key: The data was generally stored as binary large objects (blobs) in the database. Often the blobs were kept on a separate drive array through the use of filegroups.

The specifics vary by installation, but, while SQL Server's performance in blob handling has improved substantially over the years, it was still slow enough that the most common approach was to store files at the file system level and just store the path to the file in SQL Server. This has several risks, including:

 * Files can get moved without the database knowing, breaking the link to the data with no history that might allow recovery.
 * Updates to the files are made without the database being directly aware of the change, making auditing ineffective at best.
 * There was no means of co-enrolling data changes in the same transaction. This means you can overwrite a file, but have the associated database changes rolled back (or vice versa), destroying the proper state of your data.
 * The lack of coordinated transactions created a time latency between changes in the file system and backup/recovery work in the database.

Other installations did go the SQL Server blob route, fixing the preceding issues, but creating other problems:

 * Storage was inefficient, with space loss due to SQL Server's page storage model overhead as it relates to blob data.
 * Performance suffered. In general, this performance hit occurred in a manner that affected all the data being accessed, not just the blob.
 * Accessing blob data from the database required special handling versus other data in the database—adding complexity. What's more, the access model was generally seen as more complex than the relatively simple stream handling of files from the file system.
Filestreams in SQL Server address virtually all of these problems by coordinating storage between the database and the file system into one cohesive solution, with both systems doing what they do best (SQL Server coordinating the transactions and storing the structured data, and the file system storing the unstructured data).

Under a filestream model, SQL Server integrates with NTFS (the file system used in Windows). For tables and columns that are configured to do so, data for columns defined as type varbinary(max) is redirected to the file system. Access from within SQL Server is relatively transparent, and standard T-SQL statements will work against the data. Client languages, however, can utilize a special SqlFileStream object that is derived from the Stream class in .NET, making much of the functionality very familiar to client developers who are already used to the Stream object for file handling and other stream access. Through this integration of the best parts of SQL Server and NTFS, several key problems are solved:

 * Security is coordinated between SQL Server and NTFS: The directory used to store the SQL Server filestream data can only be accessed within a SQL Server granted context. This means that those who do not have appropriate access to the varbinary(max) column in SQL Server cannot gain access to the underlying file in NTFS.
 * Transactions are fully supported: Stream updates are fully enrolled in any active transaction (indeed, clients using a filestream are required to enroll in a transaction context in order to gain access to the data at all), and will honor commits and rollbacks as appropriate. This means updates to an existing file will be rolled back as appropriate, restoring the file to its original state if the transaction did not complete.
 * Backups are also coordinated: This means that backups of the database include the NTFS handled files in a state consistent with the rest of the backup data.
 * Access to the file information is handled through virtually identical means as it would have been had the file been stored within NTFS directly: Only minimal coordination overhead is incurred, so the performance difference versus direct NTFS storage is negligible.

The ramifications of this bode very well for the future of unstructured data in otherwise structured environments. Let's take a quick look at what's involved from a development perspective.

Enabling Filestreaming

By default, filestream access is turned off when you install SQL Server. There is an option to set this up during the installation process, and I recommend using that option if you remember it. If, however, you forget (or just didn't, at the time, think you needed it), you can enable filestreaming for the server by using the SQL Server Configuration Manager. Go to the SQL Server Services node, and right-click the SQL Server service for your instance (the default instance is labeled MSSQLSERVER). This should bring up the dialog shown in Figure 7.11. (Notice that I've changed to the FILESTREAM tab.)

Installation of the AdventureWorks2008 database requires that filestream be turned on for the server you install it on, so, if you've made it this far in the book, you're certain to have filestream access turned on for the server you've been working examples with. That said, you may want to play around with a system that doesn't have filestream turned on so you can understand what's involved in turning it on after the fact.

Figure 7.11

In this dialog, you can define the level of access you want the filestream exposed to. Be sure to note that what you are setting up here is for the server, and your database(s) will need additional configuration to be able to store stream data.
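The dialog covers the Windows service side of things; the instance-level access setting can also be flipped from T-SQL. A quick sketch (level 2 allows both T-SQL and Win32 streaming access; the Windows-level pieces still come from Configuration Manager):

EXEC sp_configure 'filestream access level', 2;
RECONFIGURE;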
Enabling a Database for Filestreams

To enable filestreaming for a database, you just need to create a filegroup using the CONTAINS FILESTREAM option. This will set the path that you want to place under SQL Server access control and enable tables to be configured for filestream access. Let's try this out by creating a database we'll use for examples in this section:

CREATE DATABASE FileStreamDB
ON
PRIMARY ( NAME = FSDBPrimary, FILENAME = 'C:\FSDB\DB\fsdb.mdf'),
FILEGROUP FSDBStream CONTAINS FILESTREAM
( NAME = FSDBStream, FILENAME = 'C:\FSDB\STREAM')
LOG ON ( NAME = FSDBLog, FILENAME = 'C:\FSDB\fsdb.ldf')
GO

Note that, unlike the data and log file paths, which must exist when you run the CREATE DATABASE statement, the directory you're pointing the filestream filegroup at must not yet exist. SQL Server creates the directory as part of the database creation, coordinating with NTFS regarding permissions and ownership of the directory.

Run this (changing the file paths to something that works on your particular system) and you should get a confirmation that your database has been created.

You can use the ALTER DATABASE command to add a filestream filegroup if you need to enable an existing database for filestream access.
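That path looks something like the following sketch (the database, filegroup, and directory names here are hypothetical):

ALTER DATABASE ExistingDB
ADD FILEGROUP ExistingDBStream CONTAINS FILESTREAM;

ALTER DATABASE ExistingDB
ADD FILE ( NAME = ExistingDBStream, FILENAME = 'C:\FSDB\EXISTINGSTREAM')
TO FILEGROUP ExistingDBStream;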
Creating a Filestream-Enabled Table

There are no special settings required to enable a table for filestream. Instead, you just need to make sure that your table has a uniquely constrained column flagged as the ROWGUIDCOL (a property applied to a uniqueidentifier column that marks it as the row identifier for SQL Server). After that, filestream access is defined on a per-column basis based on options for any varbinary(max) columns in the table.

Again, let's try this out by creating a table we'll use later to store an object on our SQL Server:

CREATE TABLE FSTable
(
FileKey int NOT NULL IDENTITY PRIMARY KEY,
rowguid uniqueidentifier ROWGUIDCOL NOT NULL UNIQUE,
filedata varbinary(max) FILESTREAM
);

Again, this should get you a simple confirmation that the command ran successfully, but, with this created, we should be ready to manipulate stream data.

Using T-SQL with Filestreams

Filestream data is relatively transparent to T-SQL access. We can, for example, run a simple INSERT statement just as we would any other row that had binary data:

DECLARE @Ident int

INSERT FSTable
VALUES
(NEWID(), 0x0A);

SET @Ident = @@IDENTITY;

SELECT FileKey, filedata
FROM FSTable
WHERE FileKey = @Ident;

UPDATE FSTable
SET filedata = 0x49276D206C6561726E696E672066696C6573747265616D73
WHERE FileKey = @Ident;

SELECT FileKey, filedata
FROM FSTable
WHERE FileKey = @Ident;

DELETE FSTable
WHERE FileKey = @Ident;

SELECT FileKey, filedata
FROM FSTable
WHERE FileKey = @Ident;

This exercises all the main data manipulation statements of SQL:

(1 row(s) affected)

FileKey     filedata
----------- ------------------------------------------------------------------------
1           0x0A

(1 row(s) affected)

(1 row(s) affected)

FileKey     filedata
----------- ------------------------------------------------------------------------
1           0x49276D206C6561726E696E672066696C6573747265616D73

(1 row(s) affected)

(1 row(s) affected)

FileKey     filedata
----------- ------------------------------------------------------------------------

(0 row(s) affected)

As you can see, there really isn't a lot to it from a T-SQL perspective. Indeed, all the major statements work pretty much as they would with non-filestream data. There is a small amount of additional functionality, such as the PathName() method that becomes available on a varbinary(max) column when filestream is enabled, for example:

DECLARE @Ident int;

INSERT FSTable
VALUES
(NEWID(), 0x0A);

SET @Ident = @@IDENTITY;

SELECT rowguid, fs.filedata.PathName() AS Path
FROM FSTable fs
WHERE FileKey = @Ident;

Run this, and you should see a single row back. (It is, unfortunately, too wide to fit gracefully in this book.) First, notice the rowguid column. Now compare it with the final portion of the Path column, and you should see a match.

As you can see, the column we identify as the ROWGUIDCOL is critically important in terms of setting a unique path for our stored filestreams.

Using Filestreams with .NET

I'm going to defer much of our discussion of .NET with filestreams until we discuss connectivity in Chapter 25 (which is a web-only release, so don't skip right to the back of the book!). However, I think it important to understand some key points early, as they have design ramifications that you may not otherwise think of before you get in the middle of some .NET code.

Any work with a filestream requires a transaction context. Even if you're just reading data, you need the transaction context from the SQL Server side to govern issues of concurrency and consistency of your data. Unfortunately, you cannot make use of the T-SQL keyword BEGIN TRANSACTION (there are some rules for multiple active result set—or MARS—enabled connections that BEGIN TRANSACTION does not live up to), so you must use your client's data access API's method of enlisting transactions prior to accessing data via a filestream.
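From the T-SQL side, the token the client needs comes from GET_FILESTREAM_TRANSACTION_CONTEXT(). Here's a sketch of the query a client API would issue from within its own enlisted transaction (an explicit transaction is shown here purely so the function returns a non-NULL token when you try it in Management Studio; a real client enlists through its API instead):

BEGIN TRANSACTION;

SELECT filedata.PathName() AS Path,
       GET_FILESTREAM_TRANSACTION_CONTEXT() AS TxContext
FROM FSTable;

COMMIT TRANSACTION;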
Other than that, the primary difference between handling a SQL Server related filestream and the more generic Stream object in .NET is mostly one of what you instantiate. (For SQL Server filestreams, a SqlFileStream object takes care of most of the differences transparently.)

Again, we will look at an example filestream connection in Chapter 25.

Again, as an important reminder, Chapter 25 is a web release chapter, and one I hope to occasionally update during the life of this book to keep it somewhat in line with the ever-changing world of connectivity.

Table Compression

It is important to note that, as of this writing, the data compression features in SQL Server 2008 are limited to the Enterprise edition.

This one is, again, new with SQL Server 2008, but some early indications of what was to come first appeared in a SQL Server 2005 service pack. From a programming standpoint, there is actually relatively little to be done here. (It's largely about table settings.) But it's worth a visit in this "advanced" data structures chapter for three simple reasons:

 * Planning: The compression feature fundamentally alters the page/row storage format of data on disk, and can significantly reduce the footprint of your data. This is done on a table-by-table basis (again, it is a table-level setting), and therefore requires an adjustment to how you plan for the required storage volume and growth in your database.
 * Performance: There is a performance trade-off when you deal with table compression that can work for or against you. It depends on the particular scenario. There is extra overhead to managing the compression, but the compression may also sharply reduce I/O requirements, and thus gain back any performance lost to the compression overhead.
 * Structure Knowledge: I went so far as to tell you about the traditional page/row storage methods, so anything that fundamentally alters those default storage methods probably deserves something of a look.

Enabling Compression

In the previous chapter, we took a look at the CREATE INDEX syntax. This, along with CREATE TABLE, is where the DATA_COMPRESSION option is available. The CREATE INDEX version is shown in the following code (it works the same in the CREATE TABLE statement):

CREATE [UNIQUE] [CLUSTERED|NONCLUSTERED]
INDEX <index name> ON <table or view name>
    (<column name> [ASC|DESC] [, ...n])
INCLUDE (<column name> [, ...n])
[WITH
    [PAD_INDEX = { ON | OFF }]
    [[,] FILLFACTOR = <fillfactor>]
    [[,] IGNORE_DUP_KEY = { ON | OFF }]
    [[,] DROP_EXISTING = { ON | OFF }]
    [[,] STATISTICS_NORECOMPUTE = { ON | OFF }]
    [[,] SORT_IN_TEMPDB = { ON | OFF }]
    [[,] ONLINE = { ON | OFF }]
    [[,] ALLOW_ROW_LOCKS = { ON | OFF }]
    [[,] ALLOW_PAGE_LOCKS = { ON | OFF }]
    [[,] DATA_COMPRESSION = { NONE | ROW | PAGE }
        [ ON PARTITIONS ( { <partition number> | <range> } [, ...n] ) ]]
    [[,] MAXDOP = <maximum degree of parallelism>]
]
[ON { <filegroup> | <partition scheme> | DEFAULT }]

As mentioned before, you can turn on data compression as part of the CREATE TABLE statement by adding an identical line to that used in the CREATE INDEX statement.
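To make that concrete, here is a sketch (the table and column names are mine). The sp_estimate_data_compression_savings procedure can tell you whether the trade-off looks worthwhile before you commit to it:

-- Estimate the savings first (schema, table, index id, partition, setting)
EXEC sp_estimate_data_compression_savings 'Sales', 'SalesOrderDetail', NULL, NULL, 'PAGE';

-- Compress a new table at creation time...
CREATE TABLE CompressedOrders
(
OrderID int NOT NULL,
OrderDate datetime NOT NULL
)
WITH (DATA_COMPRESSION = PAGE);

-- ...or rebuild an existing table with a different compression setting
ALTER TABLE CompressedOrders REBUILD WITH (DATA_COMPRESSION = ROW);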
Summary

Virtually everything seen in this chapter is new with SQL Server 2008 (XML indexes being the notable exception). Most of it is highly specialized, but each does what it does very well, with data structures that have been optimized for that specific task.

If you're dealing with XML data, choose your indexes carefully, experiment, and realize that they can greatly speed XML queries. For hierarchical data, consider the new HierarchyID data type. Not only does it include hierarchy-specific methods, but, for many developers, the notion that a given node knows its entire lineage is going to be much easier to grasp than the recursive calls that are generally required for the parent-child approach to hierarchies.

Spatial data is finally here, but brings SQL Server developers into a realm that they have likely not been in before. There is support for both flat and round earth models, and the ability to recognize proximity, irregular shapes, intersections, and similar spatial-specific concepts is a huge boon for many who didn't realize they had a spatial need—let alone conceived of a way to address that need.

Filestreams address a long-standing need in SQL Server. Most of the functionality supported by filestreams has been supported in some other fashion for a long time, but filestreams integrate that functionality in a manner that allows for more coordinated backup processes and, perhaps more important, transaction-based handling of large binary files. While filestream access is largely a client application–only process, it requires substantial design and security consideration by the database architect.

Data compression is finally here at the database level. While the compression is largely transparent to the application, compression can affect performance in both good and bad ways, and needs to be weighed carefully before you activate the feature.

In our next chapter, we'll explore an old mainstay of SQL Server—views.

8

Views

Since we're assuming, in this book, that you already know something about SQL Server, I am going to minimize the discussion of the basics and focus primarily on the more meaty uses of views. That said, we'll touch ever so briefly on view basics before moving on.

Views have a tendency to be used either too much, or not enough—rarely just right. When we're done with this chapter, you should:

 * Be more comfortable with view basics
 * Be able to add additional indexing to your database to speed query performance—even when you're not using the view the index is based on
 * Understand and utilize the notion of partitioned views and federated servers

A view is, at its core, really nothing more than a stored query.
+
+Summary
+
+Virtually everything seen in this chapter is new with SQL Server 2008 (XML indexes being the notable exception). Most of it is highly specialized, but each does what it does very well with data structures that have been optimized for that specific task.
+
+If you're dealing with XML data, consider your indexing carefully, but experiment with indexes and realize that they can greatly speed XML queries. For hierarchical data, consider the new HierarchyID data type. Not only does it include hierarchy-specific methods, but, for many developers, the notion that a given node knows its entire lineage is going to be much easier to grasp than the recursive calls that are generally required for the parent-child approach to hierarchies.
+
+Spatial data is finally here, but brings SQL Server developers into a realm that they have likely not been in before. There is support for both flat and round earth models, and the ability to recognize proximity, irregular shapes, intersections, and similar spatial-specific concepts is a huge boon for many who didn't realize they had a special need—let alone conceive of a way to address that need.
+
+Filestreams address a long-standing need in SQL Server. Most of the functionality supported by filestreams has been supported in some other fashion for a long time, but filestreams integrate that functionality in a manner that allows for more coordinated backup processes and, perhaps more important, transaction-based handling of large binary files. While filestream access is largely a client application–only process, it requires substantial design and security consideration by the database architect.
+
+Data compression is finally here at the database level. While the compression is largely transparent to the application, compression can affect performance in both good and bad ways, and needs to be carefully considered prior to activating the compression feature.
+
+In our next chapter, we'll explore an old mainstay of SQL Server—views.
+8
+
+Views
+
+Since we're assuming, in this book, that you already know something about SQL Server, I am going to minimize the discussion of the basics and focus primarily on the more meaty uses of views. That said, we'll touch ever so briefly on view basics before moving on.
+
+Views have a tendency to be used either too much, or not enough—rarely just right. When we're done with this chapter, you should be able to:
+
+ * Be more comfortable with view basics
+ * Add additional indexing to your database to speed query performance—even when you're not using the view the index is based on
+ * Understand and utilize the notion of partitioned views and federated servers
+
+A view is, at its core, really nothing more than a stored query. You can create a simple query that selects from only one table and leaves some columns out, or you can create a complex query that joins several tables and makes them appear as one.
+
+Reviewing View Syntax
+
+The most basic syntax for a view looks something like this:
+
+CREATE VIEW <view name>
+
+AS
+
+<SELECT statement>
+
+[WITH CHECK OPTION]
+
+So, an extremely simple view on the Person.Person table in the AdventureWorks2008 database might look something like:
+
+USE AdventureWorks2008;
+
+GO
+
+CREATE VIEW Person.PersonView
+
+AS
+
+SELECT FirstName, MiddleName, LastName
+
+FROM Person.Person;
+
+So, when you run:
+
+SELECT * FROM Person.PersonView;
+
+You get back exactly the same thing as:
+
+SELECT FirstName, MiddleName, LastName
+
+FROM Person.Person;
+
+You are essentially saying to SQL Server: "Give me all of the rows and columns you get when you run the statement SELECT FirstName, MiddleName, LastName FROM Person.Person."
+
+We've created something of a pass-through situation—that is, our view hasn't really changed anything, but rather just "passed through" a filtered version of the data it was accessing. Think about the uses for this a bit, and you should be able to see how this concept can be utilized to do things like simplify the data for inexperienced users (show them only the columns they care about to keep from confusing them) or to proactively hide sensitive data (such as profit or salary numbers) by granting the user rights to a view that doesn't include that data, but not giving them rights to the underlying table.
+
+Be aware that, by default, there is nothing special done for a view. The view runs just as if it were a query run from the command line—there is no pre-optimization of any kind. This means that you are adding one more layer of overhead between the request for data and the data being delivered, and that a view is never going to run as fast as if you had just run the underlying SELECT statement directly. That said, views exist for a reason—be it security or simplification for the user—so balance your need against the overhead as fits your particular situation.
+
+How much overhead? Well, it depends both on how complex the view is and on the calling code. It can range from milliseconds to much longer impacts (though usually the former) depending on the specifics.
+
+Let's take this one step further.
+
+You've already seen how to create a simple view—you just use any SELECT statement. How do you filter the results of your queries? With a WHERE clause. Views are no different.
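+
+By way of a minimal sketch (the view name here is my own invention, not part of AdventureWorks2008), adding a WHERE clause to the underlying SELECT is all it takes:
+
+USE AdventureWorks2008;
+
+GO
+
+-- A pass-through view that also filters: only people with a middle name
+CREATE VIEW Person.PersonsWithMiddleNames_vw
+
+AS
+
+SELECT FirstName, MiddleName, LastName
+
+FROM Person.Person
+
+WHERE MiddleName IS NOT NULL;
+
+Anyone querying this view sees only the filtered rows, exactly as if the WHERE clause had been part of his or her own query.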
+
+More Complex Views
+
+Perhaps one of the most common uses of views is to flatten data—that is, the removal of complexity that we outlined at the beginning of the chapter. Imagine that we are providing a view for management to make it easier to check on sales information. No offense to managers who are reading this book, but managers who write their own complex queries are still a rather rare breed—even in the information age.
+
+For an example, our manager would like to be able to do simple queries that will tell him or her what orders have been placed for what items, how many were sold on each order, and the related pricing information. So, we create a view that he or she can perform very simple queries on:
+
+USE AdventureWorks2008;
+
+GO
+
+CREATE VIEW CustomerOrders_vw
+
+AS
+
+SELECT o.SalesOrderID,
+
+o.OrderDate,
+
+od.ProductID,
+
+p.Name,
+
+od.OrderQty,
+
+od.UnitPrice,
+
+od.LineTotal
+
+FROM Sales.SalesOrderHeader AS o
+
+JOIN Sales.SalesOrderDetail AS od
+
+ON o.SalesOrderID = od.SalesOrderID
+
+JOIN Production.Product AS p
+
+ON od.ProductID = p.ProductID;
+
+Now do a SELECT:
+
+SELECT *
+
+FROM CustomerOrders_vw;
+
+You wind up with a bunch of rows—over 100,000—but you also wind up with information that is far simpler for the average manager to comprehend and sort out. What's more, with not that much training, the manager (or whoever the user might be) can get right to the heart of what he or she is looking for:
+
+SELECT ProductID, OrderQty, LineTotal
+
+FROM CustomerOrders_vw
+
+WHERE OrderDate = '5/15/2003';
+
+The user didn't need to know how to do a three-table join—that was hidden in the view. Instead, he or she needs only limited skill (and limited imagination for that matter) in order to get the job done.
+
+ProductID OrderQty LineTotal
+
+----------- -------- ---------------------------------------
+
+791 1 2443.350000
+
+781 1 2071.419600
+
+794 1 2181.562500
+
+798 1 1000.437500
+
+783 1 2049.098200
+
+801 1 1000.437500
+
+784 1 2049.098200
+
+779 1 2071.419600
+
+797 1 1000.437500
+
+(9 row(s) affected)
+
+However, we could make our query even more targeted. Let's say that we want our view to return only yesterday's sales. We'll make only slight changes to our query:
+
+USE AdventureWorks2008;
+
+GO
+
+CREATE VIEW YesterdaysCustomerOrders_vw
+
+AS
+
+SELECT o.SalesOrderID,
+
+o.OrderDate,
+
+od.ProductID,
+
+p.Name,
+
+od.OrderQty,
+
+od.UnitPrice,
+
+od.LineTotal
+
+FROM Sales.SalesOrderHeader AS o
+
+JOIN Sales.SalesOrderDetail AS od
+
+ON o.SalesOrderID = od.SalesOrderID
+
+JOIN Production.Product AS p
+
+ON od.ProductID = p.ProductID
+
+WHERE CONVERT(varchar(12),o.OrderDate,101) =
+
+CONVERT(varchar(12),DATEADD(day,-1,GETDATE()),101);
+
+All the dates in the AdventureWorks2008 database are old enough that this view wouldn't return any data, so let's add a row to test it. Execute the following script all at one time:
+
+USE AdventureWorks2008;
+
+DECLARE @Ident int;
+
+INSERT INTO Sales.SalesOrderHeader
+
+(
+
+CustomerID,
+
+OrderDate,
+
+DueDate,
+
+BillToAddressID,
+
+ShipToAddressID,
+
+ShipMethodID
+
+)
+
+VALUES
+
+(
+
+1, -- CustomerID
+
+DATEADD(day,-1,GETDATE()), -- OrderDate (yesterday)
+
+GETDATE(), -- DueDate (today)
+
+1, -- BillToAddressID
+
+1, -- ShipToAddressID
+
+1 -- ShipMethodID
+
+);
+
+SELECT @Ident = @@IDENTITY;
+
+INSERT INTO Sales.SalesOrderDetail
+
+(SalesOrderID,
+
+OrderQty,
+
+ProductID,
+
+SpecialOfferID,
+
+UnitPrice,
+
+UnitPriceDiscount)
+
+VALUES
+
+(@Ident, 4, 765, 1, 50, 0);
+
+SELECT 'The OrderID of the INSERTed row is ' + CONVERT(varchar(8),@Ident);
+
+Most of what's going on in this script shouldn't be a big mystery for non-beginners, but I'll be explaining all of what is going on here in Chapter 9. For now, just trust me that we'll need to run all of this in order for us to have a value in AdventureWorks2008 that will come up for our view. You should see a result from the Management Studio that looks something like this:
+
+(1 row(s) affected)
+
+(1 row(s) affected)
+
+-------------------------------------------
+
+The OrderID of the INSERTed row is 75124
+
+(1 row(s) affected)
+
+Be aware that some of the messages shown in the preceding code will appear only on the Messages tab if you are using the Management Studio's Results In Grid mode. Also remember that your particular OrderID may be different from mine depending on what experimenting you've already been doing in the AdventureWorks2008 database.
+
+The SalesOrderID might vary, but the rest should hold pretty true.
+
+Now let's run a query against our view and see what we get:
+
+SELECT SalesOrderID, OrderDate FROM YesterdaysCustomerOrders_vw;
+
+You can see that the 75124 does indeed show up:
+
+SalesOrderID OrderDate
+
+------------ -----------------------
+
+75124 2008-12-31 01:00:00.000
+
+(1 row(s) affected)
+
+Don't get stuck on the notion that your SalesOrderID numbers are going to be the same as mine—these are set by the system (since SalesOrderID is an identity column) and are dependent on just how many rows have already been inserted into the table. As such, your numbers will vary.
+
+Using a View to Change Data—Before INSTEAD OF Triggers
+
+As we've said before, a view works mostly like a table does from an in-use perspective (obviously, creating them works quite a bit differently). Now, however, we're going to come across some differences.
+
+It's surprising to many, but you can run INSERT, UPDATE, and DELETE statements against a view successfully. There are several things, however, that you need to keep in mind when changing data through a view:
+
+ * If the view contains a join, you won't, in most cases, be able to INSERT or DELETE data unless you make use of an INSTEAD OF trigger. An UPDATE can, in some cases (as long as you are only updating columns that are sourced from a single table), work without INSTEAD OF triggers, but it requires some planning, or you'll bump into problems very quickly.
+ * If your view references only a single table, then you can INSERT data using a view without the use of an INSTEAD OF trigger provided all the required fields in the table are exposed in the view or have defaults. Even for single-table views, if there is a column not represented in the view that does not have a default value, then you must use an INSTEAD OF trigger if you want to allow an INSERT.
+ * You can, to a limited extent, restrict what is and isn't inserted or updated in a view.
+
+Now, I've already mentioned INSTEAD OF triggers several times. INSTEAD OF triggers are a special, fairly complex kind of trigger that we will look at extensively in Chapter 12. The problem here is that we haven't discussed triggers to any significant extent yet. As is often the case with SQL Server, we have something of the old chicken versus egg thing going ("Which came first?"). I need to discuss INSTEAD OF triggers because of their relevance to views, but we're also not ready to talk about INSTEAD OF triggers unless we understand both of the objects (tables and views) that they can be created against.
+
+The way we are going to handle things for this chapter is to address views the way they used to be—before there was such a thing as INSTEAD OF triggers. While we won't deal with the specifics of INSTEAD OF triggers in this chapter, we'll make sure we understand when they must be used. We'll then come back and address these issues more fully when we look at INSTEAD OF triggers in Chapter 12.
+
+Having said that, I will provide this bit of context—an INSTEAD OF trigger is a special kind of trigger that essentially runs "instead" of whatever statement caused the trigger to fire. The result is that it can see what your statement would have done, and then make decisions right in the trigger about how to resolve any conflicts or other issues that might have come up. It's very powerful but also fairly complex stuff, which is why we defer it for now.
+
+Dealing with Changes in Views with Joined Data
+
+If the view has more than one table, then using a view to modify data is, in many cases, out—sort of, anyway—unless you use an INSTEAD OF trigger. Since multiple tables create some ambiguities in the key arrangements, Microsoft locks you out by default when there are multiple tables. To resolve this, you can use an INSTEAD OF trigger to examine the altered data and explicitly tell SQL Server what you want to do with it.
+
+Required Fields Must Appear in the View or Have a Default Value
+
+By default, if you are using a view to insert data (the underlying query must reference a single table, or at least the insert must be limited to affecting just one table with all required columns represented), then you must be able to supply some value for all required fields (fields that don't allow NULLs). Note that by "supply some value" I don't mean that it has to be in the SELECT list—a default covers the bill rather nicely. Just be aware that any columns that do not have defaults and do not accept NULL values will need to appear in the view in order to perform INSERTs through the view. The only way to get around this is—you guessed it—with an INSTEAD OF trigger.
+
+Limit What's Inserted into Views—WITH CHECK OPTION
+
+The WITH CHECK OPTION is one of those lesser-known to almost completely unknown features in SQL Server. The rules are simple—in order to update or insert data using the view, the resulting row must qualify to appear in the view results. Restated, the inserted or updated row must meet any WHERE criterion that's used in the SELECT statement that underlies your view.
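+
+To see that rule in action, here is a minimal sketch using a throwaway table (both the table and view names are mine, purely for illustration):
+
+CREATE TABLE CheckOptionExample
+
+(
+
+RowID int NOT NULL PRIMARY KEY,
+
+Region varchar(10) NOT NULL
+
+);
+
+GO
+
+CREATE VIEW WestRows_vw
+
+AS
+
+SELECT RowID, Region
+
+FROM CheckOptionExample
+
+WHERE Region = 'West'
+
+WITH CHECK OPTION;
+
+GO
+
+-- This INSERT succeeds; the new row qualifies to appear in the view
+INSERT INTO WestRows_vw VALUES (1, 'West');
+
+-- This one fails (you should see error 550 or similar); the new row
+-- would never be visible through the view
+INSERT INTO WestRows_vw VALUES (2, 'East');
+
+Without the WITH CHECK OPTION, that second INSERT would have gone through just fine; the row simply would never have shown up when querying the view.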
+
+Editing Views with T-SQL
+
+The main thing to remember when you edit views with T-SQL is that you are completely replacing the existing view. The only differences between using the ALTER VIEW statement and the CREATE VIEW statement are:
+
+ * ALTER VIEW expects to find an existing view, whereas CREATE doesn't.
+ * ALTER VIEW retains any permissions that have been established for the view.
+ * ALTER VIEW retains any dependency information.
+
+The second of these is the biggie. If you perform a DROP and then use a CREATE, you have almost the same effect as using an ALTER VIEW statement. The problem is that you will need to entirely reestablish your permissions on who can and can't use the view.
+
+Dropping Views
+
+It doesn't get much easier than this:
+
+DROP VIEW <view name> [, <view name> [, ...n]]
+
+And it (or they) is gone.
+
+Auditing: Displaying Existing Code
+
+What do you do when you have a view, but you're not sure what it does? The first option should be easy at this point—just go into the Management Studio as if you're going to edit the view. Go to the Views sub-node, select the view you want to edit, right-click, and either choose Design or Script View As and then choose the specific type of script you want. Either way, you'll see the code behind the view complete with color-coding.
+
+Note that the Design feature brings up a special view builder utility. While the view builder is fabulous for those with little SQL experience (it works much like a similar tool in Access), I find it to be overly invasive about the way I want my view formatted, and it inevitably leaves me with a view that is much more wordy (and therefore harder to read) than I would like; therefore, I usually stick to using the scripting tool and my own SQL writing skills.
+
+Unfortunately, we don't always have the option of having the Management Studio around to hold our hand through this stuff (we may be using a lighter-weight tool of some sort, or we may need to build the actual requests into our own application). The bright side is that we have a few ways of getting at the actual view definition:
+
+ * sp_helptext
+ * The OBJECT_DEFINITION() system function
+ * The sys.syscomments system view
+
+Let's look at the first of these by running sp_helptext against one of the supplied views in the AdventureWorks2008 database—vStateProvinceCountryRegion:
+
+EXEC sp_helptext 'Person.vStateProvinceCountryRegion';
+
+Note the quotes. This is because this stored proc expects only one argument, and the period is a delimiter of sorts—if you pass Person.vStateProvinceCountryRegion in without the quotes, it sees the period, isn't sure what to do with it, and therefore errors out. If the view were in our default schema, we could supply just the view name (no schema) and would not need to wrap it in quotes.
+
+SQL Server obliges us with the code for the view:
+
+Text
+
+------------------------------------------------------------------------------
+
+CREATE VIEW [Person].[vStateProvinceCountryRegion]
+
+WITH SCHEMABINDING
+
+AS
+
+SELECT
+
+sp.[StateProvinceID]
+
+,sp.[StateProvinceCode]
+
+,sp.[IsOnlyStateProvinceFlag]
+
+,sp.[Name] AS [StateProvinceName]
+
+,sp.[TerritoryID]
+
+,cr.[CountryRegionCode]
+
+,cr.[Name] AS [CountryRegionName]
+
+FROM [Person].[StateProvince] sp
+
+INNER JOIN [Person].[CountryRegion] cr
+
+ON sp.[CountryRegionCode] = cr.[CountryRegionCode];
+
+Now, sp_helptext is great, but I would classify it as somewhat antiquated at this point. Why? Well, since sp_helptext is a stored procedure, you can't easily include the result set as part of a more complex data operation. Fortunately, Microsoft has given us OBJECT_DEFINITION() to deal with that issue.
+
+OBJECT_DEFINITION() should be your preferred choice for a couple of reasons:
+
+ * When new releases come out, it will automatically be updated for changes to the system tables (so you don't have to worry about such things).
+ * The value returned can easily be used within a broader query (for example, as one column, with the source code for many objects being returned).
+
+The syntax looks like this:
+
+OBJECT_DEFINITION(<object id>)
+
+The negative in this is that we rarely know what our object's id is without doing a special lookup. Fortunately, SQL Server provides us a simple way to look up an object's id by using the OBJECT_ID() function. For example, if we wanted to use OBJECT_DEFINITION() to get the code for the same view we looked at earlier, we could write:
+
+SELECT OBJECT_DEFINITION (OBJECT_ID(N'Person.vStateProvinceCountryRegion'));
+
+Object IDs are SQL Server's internal way of keeping track of things. They are integer values rather than the names that you're used to for your objects. In general, they are outside the scope of this book, but it is good to realize they are there, as you will find them used by scripts you may copy from other people or just bump into them later in your SQL endeavors.
+
+Try it, and you'll see the result is nearly identical to when we used sp_helptext (it just doesn't name the column for us unless we provide an alias in our query definition).
+
+We can take this a bit further and easily return the code for every view in our database:
+
+SELECT '------------------------', OBJECT_DEFINITION(so.object_id)
+
+FROM sys.objects so
+
+WHERE so.type = 'V';
+
+I've omitted the results here in the book lest thousands of trees die needlessly—it's that lengthy. That said, running the previous query should give you all of the views in the AdventureWorks2008 database.
+
+We couldn't have done that with sp_helptext without utilizing a cursor—making it easy to see the usefulness of system functions such as OBJECT_DEFINITION() versus the system stored procedure objects we had in earlier versions of SQL Server.
+
+Now let's try it the last of our ways—using sys.syscomments.
+
+You may see sys.syscomments (a system view) used interchangeably with the older, and far less desirable, syscomments (a system table). syscomments is one of many system tables that gave us most of our system information in SQL Server versions of old. Microsoft has been trying to move us away from direct calls to system tables for years, and they have finally given us the set of tools that allows us to comply with their wish.
+
+Even when system tables were the only directly queryable way to get system information, their use was somewhat risky, as Microsoft has always warned that system tables can change at any time (even in service packs, though I've never seen that actually happen). Now that Microsoft has given us the views in the sys schema and a wide variety of table valued functions for metadata (see Appendix [CHECK] for more on those), it is downright silly to go directly against the system tables. I highly recommend that you migrate old code that may be accessing the system tables directly to utilize the equivalent view (usually easily found by just prefixing the old system table name with "sys.").
+
+sys.syscomments provides an actual view of your underlying source code, and thus provides something you can join directly to if you so choose. Like OBJECT_DEFINITION(), any use of sys.syscomments is going to require you to know your object's id. You can either join to the sys.objects system view, much as I did in the previous example, or utilize the OBJECT_ID() function as I did in the example before that. Note, however, that, when using sys.objects, you need to treat the object name and schema name separately (which means that you also need to involve the sys.schemas system view). For example:
+
+SELECT sc.text
+
+FROM sys.syscomments sc
+
+JOIN sys.objects so
+
+ON sc.id = so.object_id
+
+JOIN sys.schemas ss
+
+ON so.schema_id = ss.schema_id
+
+WHERE so.name = 'vStateProvinceCountryRegion'
+
+AND ss.name = 'Person';
+
+Again, you get the same block of code we saw in the previous two methods.
+
+Protecting Code: Encrypting Views
+
+If you're building any kind of commercial software product, odds are that you're interested in protecting your source code. All you have to do to encrypt your view (and most other forms of server stored code) is use the WITH ENCRYPTION option. This one has a couple of tricks to it if you're used to the WITH CHECK OPTION clause:
+
+ * WITH ENCRYPTION goes after the name of the view, but before the AS keyword.
+ * WITH ENCRYPTION does not use the OPTION keyword.
+
+In addition, remember that if you use an ALTER VIEW statement, you are entirely replacing the existing view except for access rights. This means that the encryption is also replaced. If you want the altered view to be encrypted, then you must use the WITH ENCRYPTION clause in the ALTER VIEW statement.
+
+Let's do an ALTER VIEW on the CustomerOrders_vw view that we created earlier in the chapter. If you haven't yet created the CustomerOrders_vw view, then just change the ALTER to CREATE (don't forget to run this against AdventureWorks2008):
+
+ALTER VIEW CustomerOrders_vw
+
+WITH ENCRYPTION
+
+AS
+
+SELECT o.SalesOrderID,
+
+o.OrderDate,
+
+od.ProductID,
+
+p.Name,
+
+od.OrderQty,
+
+od.UnitPrice,
+
+od.LineTotal
+
+FROM Sales.SalesOrderHeader AS o
+
+JOIN Sales.SalesOrderDetail AS od
+
+ON o.SalesOrderID = od.SalesOrderID
+
+JOIN Production.Product AS p
+
+ON od.ProductID = p.ProductID;
+
+Now do an sp_helptext on our CustomerOrders_vw:
+
+EXEC sp_helptext CustomerOrders_vw;
+
+SQL Server promptly tells us that it can't do what we're asking:
+
+The text for object 'CustomerOrders_vw' is encrypted.
+
+The heck you say, and promptly go to the sys.syscomments view:
+
+SELECT sc.text
+
+FROM sys.syscomments sc
+
+JOIN sys.objects so
+
+ON sc.id = so.object_id
+
+JOIN sys.schemas ss
+
+ON so.schema_id = ss.schema_id
+
+WHERE so.name = 'CustomerOrders_vw'
+
+AND ss.name = 'dbo';
+
+But that doesn't get you very far either—SQL Server recognizes that the view was encrypted and will give you a NULL result.
+
+In short—your code is safe and sound. Even if you pull it up in other viewers (such as Management Studio, which actually won't even give you the Design option on an encrypted view), you'll find it useless.
+
+Make sure you store your source code somewhere before using the WITH ENCRYPTION option. Once it's been encrypted, there is no way to get it back. If you haven't stored your code away somewhere and you need to change it, then you may find yourself rewriting it from scratch.
+
+About Schema Binding
+
+Schema binding essentially takes the things that your view is dependent upon (tables or other views), and "binds" them to that view. The significance of this is that no one can make alterations to those objects (ALTER, DROP) unless they drop the schema-bound view first.
+
+Why would you want to do this? Well, there are a few reasons why this can come in handy:
+
+ * It prevents your view from becoming "orphaned" by alterations in underlying objects. Imagine, for a moment, that someone performs a DROP or makes some other change (even deleting a column could cause your view grief) but doesn't pay attention to your view. Oops. If the view is schema bound, then this is prevented from happening.
+ * To allow indexed views. If you want an index on your view, you must create it using the SCHEMABINDING option. (We'll look at indexed views just a few paragraphs from now.)
+ * If you are going to create a schema-bound user-defined function (and there are instances where your UDF must be schema bound) that references your view, then your view must also be schema bound.
+
+Keep these in mind as you are building your views.
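+
+Here is a minimal sketch of schema binding doing its job (the table and view names are mine, created purely for this illustration):
+
+CREATE TABLE BoundExample
+
+(
+
+BoundID int NOT NULL PRIMARY KEY,
+
+BoundValue varchar(50) NOT NULL
+
+);
+
+GO
+
+CREATE VIEW BoundExample_vw
+
+WITH SCHEMABINDING
+
+AS
+
+SELECT BoundID, BoundValue
+
+FROM dbo.BoundExample;
+
+GO
+
+-- Both of these now fail with an error along the lines of "Cannot
+-- ALTER/DROP ... because it is being referenced by object 'BoundExample_vw'"
+ALTER TABLE dbo.BoundExample DROP COLUMN BoundValue;
+
+DROP TABLE dbo.BoundExample;
+
+Notice the two-part name (dbo.BoundExample) in the view's SELECT; as we're about to see with indexed views, schema binding insists on it.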
+
+Making Your View Look Like a Table with VIEW_METADATA
+
+This option has the effect of making your view look very much like an actual table to DB-LIB, ODBC, and OLE-DB clients. Without this option, the metadata passed back to the client API is that of the base table(s) that your view relies on.
+
+Providing this metadata information is required to allow for any client-side cursors (cursors your client application manages) to be updatable. Note that, if you want to support such cursors, you're also going to need to use an INSTEAD OF trigger.
+
+Indexed (Materialized) Views
+
+In SQL Server 2000, this one was supported only in the Enterprise Edition (okay, the Developer and Evaluation Editions also supported it, but you aren't allowed to use test and development editions in production systems). It is, however, supported in all editions since SQL Server 2005.
+
+When a view is referred to, the logic in the query that makes up the view is essentially incorporated into the calling query. Unfortunately, this means that the calling query just gets that much more complex. The extra overhead of figuring out the impact of the view (and what data it represents) on the fly can actually get very high. What's more, you're often adding additional joins into your query in the form of the tables that are joined in the view. Indexed views give you a way of taking care of some of this impact before the query is ever run.
+
+An indexed view is essentially a view that has had a set of unique values "materialized" into the form of a clustered index. The advantage of this is that it provides a very quick lookup in terms of pulling the information behind a view together. After the first index (which must be a clustered index against a unique set of values), SQL Server can also build additional indexes on the view using the cluster key from the first index as a reference point. That said, nothing comes for free—there are some restrictions about when you can and can't build indexes on views (I hope you're ready for this one—it's an awfully long list!):
+
+ * The view must use the SCHEMABINDING option.
+ * If it references any user-defined functions (more on these later in the book), then these must also be schema bound.
+ * The view must not reference any other views—just tables and UDFs.
+ * All tables and UDFs referenced in the view must utilize a two-part naming convention (three-part and four-part names are not allowed)—for example, dbo.Customers or BillyBob.SomeUDF—and must also have the same owner as the view.
+ * The view must be in the same database as all objects referenced by the view.
+ * The ANSI_NULLS and QUOTED_IDENTIFIER options must have been turned on (using the SET command) at the time the view and all underlying tables were created.
+ * Any functions referenced by the view must be deterministic.
+
+To create an example indexed view, let's start by reviewing the CustomerOrders_vw object that we created earlier in the chapter. I'm showing this using the ALTER statement we used in the section on encryption, but, really, it could just as easily be the original version we created very early in the chapter as long as the WITH SCHEMABINDING is properly added.
+
+ALTER VIEW CustomerOrders_vw
+
+WITH SCHEMABINDING
+
+AS
+
+SELECT o.SalesOrderID,
+
+o.OrderDate,
+
+od.ProductID,
+
+p.Name,
+
+od.OrderQty,
+
+od.UnitPrice,
+
+od.LineTotal
+
+FROM Sales.SalesOrderHeader AS o
+
+JOIN Sales.SalesOrderDetail AS od
+
+ON o.SalesOrderID = od.SalesOrderID
+
+JOIN Production.Product AS p
+
+ON od.ProductID = p.ProductID;
+
+Some important things to notice here are:
+
+ * We had to make our view use the SCHEMABINDING option.
+ * In order to utilize the SCHEMABINDING option, we must have two-part naming for the objects (in this case, all tables) that we reference (in this case, we did anyway, but not all views you come across will already be configured that way).
+
+This is really just the beginning—we don't have an indexed view as yet. Instead, what we have is a view that can be indexed. When we create the index, the first index created on the view must be both clustered and unique.
+
+CREATE UNIQUE CLUSTERED INDEX ivCustomerOrders
+
+ON CustomerOrders_vw(SalesOrderID, ProductID, Name);
+
+Once this command has executed, we have an indexed view. We also, however, have a small problem that will become clear in just a moment.
+
+Let's test our view by running a simple SELECT against it:
+
+SELECT * FROM CustomerOrders_vw;
+
+If you execute this, you'll see that the graphical showplan, as shown in Figure 8.1, shows us using our new index. (Display Estimated Execution Plan is the tooltip for this; you'll find it toward the center of the toolbar, or in the menus at Query ⇒ Display Estimated Execution Plan.)
+
+Figure 8.1
+
+The index supporting an indexed view may be utilized by SQL Server even if you do not explicitly use the view. For example, if you are performing joins that are similar to those the index is supporting for the view, SQL Server may recognize this and utilize the index.
+
+Partitioned Views
+
+These have been around since SQL Server 2000, but Microsoft has, since 2005, considered partitioned tables to be the preferred partitioning method. I bring partitioned views up here because they were one of the leading scalability options put forth by Microsoft for many years, and you need to know how they work in case you run into them in legacy code. In addition, there are some partitioning problems that are difficult, if not impossible, to solve utilizing partitioned tables, so it's good to know and understand another option.
+
+A partitioned view is a view that unifies multiple identical (in terms of structure—not actual data) tables and makes them appear to be a single table. At first, this seems like an easy thing to do with simple UNION clauses, but the concept actually becomes somewhat tricky when you go to handle insert and update scenarios.
+
+With partitioned views, we define a constraint on one of the tables in our view. We then define a similar, but mutually exclusive, constraint on a second (and possibly many more) table. When you build the view that unifies these mutually exclusive tables, SQL Server is able to sort out the exclusive nature of the tables in a logical manner. By doing this, SQL Server can determine exactly which table is to get the new data (by determining which table can accept the data—if you created them as mutually exclusive as you should have, then the data will be able to get into only one table and there is no conflict). The only catch is that the so-called "partitioning column" must participate in the primary key. Let's see how this works by building our own little mini-sample.
+
+Imagine for a moment that you are running a very large Internet site, and you are taking in thousands of orders daily. Your Orders table is getting to be huge, and your purge job (to delete older records) is causing blocking issues while the DELETE statement is running.
+
+By utilizing partitioned views (or, as we'll learn later, partitioned tables), we can essentially silo our data such that we can spread the data out physically (by using different filegroups, or even different servers for each table) and have SQL Server sort out where everything is supposed to go.
+
+Here's what a two-month set of data might look like:
+
+CREATE TABLE OrderPartitionJan08
+
+(OrderID int NOT NULL,
+
+OrderDate date NOT NULL
+
+CONSTRAINT CKIsJanOrder
+
+CHECK (OrderDate >= '2008/01/01'
+
+AND OrderDate < '2008/02/01'),
+
+CustomerID int NOT NULL,
+
+CONSTRAINT PKOrderIDOrderDateJan
+
+PRIMARY KEY (OrderID, OrderDate)
+
+);
+
+CREATE TABLE OrderPartitionFeb08
+
+(OrderID int NOT NULL,
+
+OrderDate date NOT NULL
+
+CONSTRAINT CKIsFebOrder
+
+CHECK (OrderDate >= '2008/02/01'
+
+AND OrderDate < '2008/03/01'),
+
+CustomerID int NOT NULL,
+
+CONSTRAINT PKOrderIDOrderDateFeb
+
+PRIMARY KEY (OrderID, OrderDate)
+
+);
+
+GO
+
+CREATE VIEW Orders
+
+AS
+
+SELECT *
+
+FROM OrderPartitionJan08
+
+UNION ALL
+
+SELECT *
+
+FROM OrderPartitionFeb08;
+
+Once we have created these tables along with the view that unites them into a partitioned view, we're ready to insert a few rows of data:
+
+INSERT INTO Orders
+
+VALUES
+
+(1, '2008-01-15', 1),
+
+(2, '2008-02-15', 1);
+
+Orders is a view, and therefore has no data of its own—so where does the data go? Under the covers, SQL Server analyzes the data being inserted and figures out that, based on the constraints in our tables, the data can, in each case, go to one and only one table. Let's check that out with a few queries:
+
+SELECT * FROM Orders;
+
+SELECT * FROM OrderPartitionJan08;
+
+SELECT * FROM OrderPartitionFeb08;
+
+This gets us, in order, both rows we inserted, then the row from January, then the one from February.
+
+OrderID OrderDate CustomerID
+
+----------- ---------- -----------
+
+1 2008-01-15 1
+
+2 2008-02-15 1
+
+(2 row(s) affected)
+
+OrderID OrderDate CustomerID
+
+----------- ---------- -----------
+
+1 2008-01-15 1
+
+(1 row(s) affected)
+
+OrderID OrderDate CustomerID
+
+----------- ---------- -----------
+
+2 2008-02-15 1
+
+(1 row(s) affected)
+
+As you can see, our data has been split up into separate tables based on the partitioning column. We can easily create additional tables to partition our data into (for example, an OrderPartitionMar08 table) and then alter our view to union in the additional table. Likewise, we can easily remove a block of data by excluding it from the view and then dropping the table.
+
+You can also spread the tables that support the partitioned view over multiple servers utilizing linked servers. This distributes the query load for those tables out to the various servers that house them, and is usually referred to as a "distributed partitioned view." The servers that support a given distributed partitioned view are said to be "federated."
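+
+As a rough sketch of what the distributed flavor looks like (the server and database names here are hypothetical, and real distributed partitioned views carry a number of additional requirements, such as linked server definitions and matching settings on each member server), the only structural difference is that remote member tables are referenced with four-part names:
+
+CREATE VIEW DistributedOrders
+
+AS
+
+SELECT * FROM OrderPartitionJan08 -- local member table
+
+UNION ALL
+
+SELECT * FROM ServerB.OrdersDB.dbo.OrderPartitionFeb08; -- remote member table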
+
+Summary
+
+Views tend to be either the most overused or most underused tools in most of the databases I've seen. Some people like to use them to abstract seemingly everything (often forgetting that they are adding another layer to the process when they do this). Others just seem to forget that views are even an option. Personally, as with most things, I think you should use a view when it's the right tool to use—not before, not after.
+
+Common uses for views include:
+
+ * Filtering rows
+ * Protecting sensitive data
+ * Reducing database complexity
+ * Abstracting multiple physical databases into one logical database
+ * Creating indexes that effectively pre-join data between tables
+
+Things to remember with views include:
+
+ * Stay away from building views based on views—instead, adapt the appropriate query information from the first view into your new view.
+ * Remember that a view using the WITH CHECK OPTION provides some flexibility that can't be duplicated with a normal CHECK constraint.
+ * Encrypt views when you don't want others to be able to see your source code—either for commercial products or general security reasons.
+ * Using an ALTER VIEW completely replaces the existing view other than permissions. This means you must include the WITH ENCRYPTION and WITH CHECK OPTION clauses in the ALTER statement if you want encryption and restrictions to be in effect in the altered view.
+ * Use the OBJECT_DEFINITION() system function to display the supporting code for a view—avoid using system tables.
+ * Minimize the use of views for production queries—they can add additional overhead and hurt performance.
+ * Indexing a view puts additional load on any data modification process that affects the data participating in the indexed view.
+ * Distributed partitioned views can be utilized to distribute data and query load across multiple servers, but, for single server partitioning, partitioned tables are typically a better choice.
+
+In our next chapter, we'll take a look at batches and scripting. Batches and scripting will lead us right into stored procedures and user-defined functions—the closest thing that SQL Server has to its own programs.
+9
+
+Scripts and Batches
+
+Geez. I've been writing too long. For some reason, when I see the phrase "Scripts and Batches" it reminds me of the old song "Love and Marriage" (Frank Sinatra, for the curious). While scripts and batches do go together like a horse and carriage, they are hardly as lyrical—but I digress....
+
+We have, of course, already written many SQL scripts in this book. My assumption, given that this is a "Professional" level book, is that you already have most of the script basics down. After all, every CREATE statement that you write, every ALTER, every SELECT is all (if you're running a single statement) or part (multiple statements) of a script. It is, however, rather difficult to get excited about a script with one line in it. Could you imagine Hamlet's "To be, or not to be...?" if it had never had the following lines? We wouldn't have any context for what he was talking about.
+
+SQL scripts are much the same way. Things get quite a bit more interesting when we string several commands together into a longer script—a full play, or at least an act, to finish our Shakespeare analogy. Now imagine that we add a richer set of language elements from .NET to the equation. Now we're ready to write an epic!
+
+Scripts generally have a unified goal. That is, all the commands that are in a script are usually building up to one overall purpose. Examples include scripts to build a database (these might be used for a system installation), scripts for system maintenance (such as backups and Database Consistency Checker utilities (DBCCs)), and scripts for anything where several commands are usually run together.
+
+We will be reviewing the notion of scripts during this chapter, and adding in the notion of batches, which control how SQL Server groups your commands together. In addition, we will take a look at SQLCMD, the command-line utility, and how it relates to scripts.
+
+SQLCMD was introduced as the new command-line scripting tool in SQL Server 2005. For backward compatibility only, SQL Server continues to support osql.exe (the previous tool that did command-line work). You may also see references to isql.exe, which served this same function in earlier releases. (Do not confuse this with isqlw.exe.) Isql.exe is no longer supported, but, since the options are pretty much the same, migration to osql or SQLCMD is generally not that difficult.
+
+Script Basics
+
+A script technically isn't a script until you store it in a file where it can be pulled up and reused. SQL scripts are stored as text files. SQL Server Management Studio provides many tools to help you with your script writing, but, technically, you can do the writing in any text editor. Keep in mind, however, that to actually test your script, you're going to have to use something that can connect to a SQL Server. With SQL Server 2008, the Management Studio gains the additional advantage of supporting IntelliSense.
+
+I continue to occasionally make use of a highly robust text editor for its ability to handle regular expressions and other text-editing features that Management Studio, and even Visual Studio, will likely never have. That said, the Management Studio has, as it has added more features, become my preferred tool for editing SQL scripts for SQL Server.
+
+Scripts are usually treated as a unit. That is, you are normally executing the entire script or nothing at all. They can make use of both system functions and local variables. As an example, let's look at a simple script that could be used to INSERT order records into a typical order header and order detail table scenario:
+
+USE SomeDatabase
+
+DECLARE @Ident int
+
+INSERT INTO Orders
+
+(CustomerID,OrderDate)
+
+VALUES
+
+(25, DATEADD(day,-1,GETDATE())) -- this always sets the OrderDate to yesterday
+
+SELECT @Ident = @@IDENTITY
+
+INSERT INTO Details
+
+(OrderID, ProductID, UnitPrice, Quantity)
+
+VALUES
+
+(@Ident, 1, 50, 25)
+
+SELECT 'The OrderID of the INSERTed row is ' + CONVERT(varchar(8),@Ident)
+
+We have six distinct commands working here, covering a range of different things that we might do in a script. We're using both system functions and local variables, the USE statement, INSERT statements, and both assignment and regular versions of the SELECT statement. They are all working in unison to accomplish one task—to insert complete orders into the database.
+
+Batches
+
+A batch is a grouping of T-SQL statements into one logical unit, and, while this seems a pretty basic concept (indeed, I cover it at length in my Beginning title), I find it to be one of the more frequently misunderstood concepts in SQL Server, even among experienced administrators and developers.
+
+All of the statements within a batch are combined into one execution plan, so all statements are parsed together and must pass a validation of the syntax or none of the statements will execute. Note, however, that this does not prevent runtime errors from happening. In the event of a runtime error, any statement that has been executed prior to the runtime error will still be in effect. To summarize, if a statement fails at parse-time, then nothing runs. If a statement fails at runtime, then all statements up to the statement that generated the error have already run.
+
+All the scripts we have run up to this point are made up of one batch each. Even the script we've been analyzing so far in this chapter is just one batch. To separate a script into multiple batches, we make use of the GO statement. The GO statement:
+
+ * Must be on its own line (nothing other than a comment can be on the same line); there is an exception to this discussed shortly, but think of a GO as needing to be on a line to itself
+ * Causes all statements since the beginning of the script or the last GO statement (whichever is closer) to be compiled into one execution plan and sent to the server independently of any other batches
+ * Is not a T-SQL command, but, rather, a command recognized by the various SQL Server command utilities (such as SQLCMD, OSQL, and Management Studio)
+
+A Line to Itself
+
+The GO command should stand alone on its own line. Technically, you can start a new batch on the same line after the GO command, but you'll find this puts a serious damper on readability. T-SQL statements cannot precede the GO statement, or the GO statement will often be misinterpreted and cause either a parsing error or some other unexpected result. For example, if I use a GO statement after a WHERE clause:
+
+SELECT * FROM Customers WHERE CustomerID = 2 GO
+
+The parser becomes somewhat confused:
+
+Msg 102, Level 15, State 1, Line 1
+
+Incorrect syntax near 'GO'.
+
+Each Batch Is Sent to the Server Separately
+
+Because each batch is processed independently, an error in one batch does not preclude another batch from running. To illustrate, take a look at some code:
+
+USE AdventureWorks2008;
+
+DECLARE @MyVarchar varchar(50); --This DECLARE only lasts for this batch!
+
+SELECT @MyVarchar = 'Honey, I''m home...';
+
+PRINT 'Done with first Batch...';
+
+GO
+
+PRINT @MyVarchar; --This generates an error since @MyVarchar
+
+--isn't declared in this batch
+
+PRINT 'Done with second Batch';
+
+GO
+
+PRINT 'Done with third batch'; -- Notice that this still gets executed
+
+-- even after the error
+
+GO
+
+If there were any dependencies between these batches, then either everything would fail—or, at the very least, everything after the point of error would fail—but it doesn't. Look at the results if you run the preceding script:
+
+Done with first Batch...
+
+Msg 137, Level 15, State 2, Line 2
+
+Must declare the scalar variable "@MyVarchar".
+
+Done with third batch
+
+Again, each batch is completely autonomous in terms of runtime issues. Keep in mind though that you can build in dependencies in the sense that one batch may try to perform work that depends on the first batch being complete. We'll see some of this in the next section when we talk about what can and can't span batches.
+
+GO Is Not a T-SQL Command
+
+Thinking that GO is a T-SQL command is a common mistake. GO is a command that is recognized only by the editing tools (Management Studio, SQLCMD). If you use a third-party tool, then it may or may not support the GO command, but most that claim SQL Server support will.
+
+When the editing tool encounters a GO statement, it sees it as a flag to terminate that batch, package it up, and send it as a single unit to the server, without including the GO. That's right; the server itself has absolutely no idea what GO is supposed to mean.
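+
+You can see this for yourself right from a query window with a small (and admittedly contrived) sketch: EXEC ships its string to the server as a batch with no client tool in the middle, so a GO buried in that string reaches the parser as-is.
+
+-- The server's parser, not the tool, sees the GO here and rejects it;
+-- you should get something along the lines of "Incorrect syntax near 'GO'."
+EXEC ('SELECT BusinessEntityID FROM Person.Person WHERE BusinessEntityID = 1 GO');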
+
+If you try to execute a GO command in a pass-through query using ODBC, OLE DB, ADO, ADO.NET, or any other access method, you'll get an error message back from the server. The GO is merely an indicator to the tool that it is time to end the current batch, and time, if appropriate, to start a new one. In the case of the aforementioned access methods, they each have the concept of a "command" object. That command object may include multiple statements, but each execution of the command object is implied to represent exactly one batch.
+
+Keep this notion in mind if you are building scripts you want to be compatible with other RDBMSs. Your non-SQL Server target system will likely fail if you pass it the GO keyword.
+
+Errors in Batches
+
+Errors in batches fall into two categories:
+
+ * Syntax errors
+ * Runtime errors
+
+If the query parser finds a syntax error, the processing of that batch is canceled immediately. Since syntax checking happens before the batch is compiled or executed, a failure during the syntax check means none of the batch will be executed, regardless of the position of the syntax error within the batch.
+
+Runtime errors work quite a bit differently. Any statement that has already executed before the runtime error was encountered is already done, so anything that statement did will remain intact unless it is part of an uncommitted transaction. (Transactions are covered in Chapter 11, but the relevance here is that they imply an all or nothing situation.) What happens beyond the point of the runtime error depends on the nature of the error. Generally speaking, runtime errors will terminate execution of the batch from the point where the error occurred to the end of the batch. Some runtime errors, such as a referential-integrity violation, will prevent only the offending statement from executing; all other statements in the batch will still be executed. This latter scenario is why error checking is so important. We will cover error checking in full in our chapter on stored procedures (see Chapter 10).
+
+When to Use Batches
+
+Batches have several purposes, but they all have one thing in common: They are used when something has to happen either before or separately from everything else in your script.
+
+Statements That Require Their Own Batch
+
+There are several commands that absolutely must be part of their own batch. These include:
+
+ * CREATE DEFAULT
+ * CREATE FUNCTION
+ * CREATE PROCEDURE
+ * CREATE RULE
+ * CREATE SCHEMA
+ * CREATE TRIGGER
+ * CREATE VIEW
+
+If you want to combine any of these statements with other statements in a single script, then you will need to break them up into their own batch by using a GO statement.
+
+Note that, if you DROP an object, you may want to place the DROP in its own batch or at least with a batch of other DROP statements. Why? Well, if you're going to create an object later with the same name, the CREATE will fail during the parsing of your batch unless the DROP has already happened. That means you need to run the DROP in a separate and prior batch so it will be complete when the batch with the CREATE statement executes.
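+
+A minimal illustration of the rule (the view name is mine, invented for the example). Run as a single batch, this fails to parse, with an error along the lines of "'CREATE VIEW' must be the first statement in a query batch":
+
+USE AdventureWorks2008;
+
+CREATE VIEW TrivialPerson_vw
+
+AS
+
+SELECT FirstName FROM Person.Person;
+
+Add a GO, so the CREATE VIEW gets a batch to itself, and the same script runs fine:
+
+USE AdventureWorks2008;
+
+GO
+
+CREATE VIEW TrivialPerson_vw
+
+AS
+
+SELECT FirstName FROM Person.Person;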
+
+Using Batches to Establish Precedence
+
+Perhaps the most likely scenario for using batches is when precedence is required—that is, you need one task to be completely done before the next task starts. Most of the time, SQL Server deals with this kind of situation just fine. The first statement in the script is the first executed, and the second statement in the script can rely on the server being in the proper state when the second statement runs. There are times, however, when SQL Server can't resolve this kind of issue.
+
+Let's take the example of creating a database together with some tables:
+
+CREATE DATABASE Test;
+
+CREATE TABLE TestTable
+
+(
+
+col1 int,
+
+col2 int
+
+);
+
+Execute this and, at first, it appears that everything has gone well:
+
+Command(s) completed successfully.
+
+However, things are not as they seem. Check out the INFORMATION_SCHEMA in the Test database, and you'll notice something is missing:
+
+SELECT TABLE_CATALOG
+
+FROM INFORMATION_SCHEMA.TABLES
+
+WHERE TABLE_NAME = 'TestTable';
+
+TABLE_CATALOG
+
+-----------------------------------------------------------------------------
+
+master
+
+(1 row(s) affected)
+
+Hey! Why was the table created in the wrong database? The answer lies in what database was current when we ran the CREATE TABLE statement. In our case, it happened to be the master database, so that's where our table was created.
+
+Note that you may have been somewhere other than the master database when you ran this, so you may get a different result. That's kind of the point though. You could be in pretty much any database. That's why making use of the USE statement is so important.
+
+When you think about it, this seems like an easy thing to fix. Just make use of the USE statement, but before we test our new theory, we have to get rid of the old (okay, not that old) database:
+
+USE MASTER;
+
+DROP DATABASE Test;
+
+We can then run our newly modified script:
+
+CREATE DATABASE Test;
+
+USE Test;
+
+CREATE TABLE TestTable
+
+(
+
+col1 int,
+
+col2 int
+
+);
+
+Unfortunately, this has its own problems:
+
+Msg 911, Level 16, State 1, Line 3
+
+Database 'Test' does not exist. Make sure that the name is entered correctly.
+
+The parser tries to validate our code and finds that we are referencing a database with a USE command that doesn't exist. Ahh, now we see the need for our batches. We need the CREATE DATABASE statement to be completed before we try to use the new database:
+
+CREATE DATABASE Test;
+
+GO
+
+USE Test;
+
+CREATE TABLE TestTable
+
+(
+
+col1 int,
+
+col2 int
+
+);
+
+Now things work a lot better. Our immediate results look the same:
+
+Command(s) completed successfully.
+
+But when we run our INFORMATION_SCHEMA query, things are confirmed:
+
+TABLE_CATALOG
+
+------------------------------------------------------------------------------
+
+Test
+
+(1 row(s) affected)
+
+Let's move on to another example that shows an even more explicit need for precedence.
+
+When you use an ALTER TABLE statement that significantly changes the type of a column or adds columns, you cannot make use of those changes until the batch that makes the changes has completed.
+
+If we add a column to our TestTable table in our Test database and then try to reference that column without ending the first batch:
+
+USE Test;
+
+ALTER TABLE TestTable
+
+ADD col3 int;
+
+INSERT INTO TestTable
+
+(col1, col2, col3)
+
+VALUES
+
+(1,1,1);
+
+We get an error message. SQL Server cannot resolve the new column name and therefore complains:
+
+Msg 207, Level 16, State 1, Line 6
+
+Invalid column name 'col3'.
+
+Add one simple GO statement after the ADD col3 int though, and everything works fine:
+
+(1 row(s) affected)
+
+SQLCMD
+
+SQLCMD is a utility that allows you to run scripts from a command prompt in a Windows command box. This can be very nice for executing conversion or maintenance scripts, as well as a quick-and-dirty way to capture query output to a text file.
+
+SQLCMD replaces the older OSQL. OSQL is still included with SQL Server for backward compatibility only. An even older command-line utility—ISQL—is no longer supported.
+
+The syntax for running SQLCMD from the command line includes a large number of different switches, and looks like this:
+
+sqlcmd
+
+[
+
+{ { -U <login id> [ -P <password> ] } | -E }
+
+]
+
+[ -S <server> [ \<instance> ] ] [ -H <workstation> ] [ -d <database> ]
+
+[ -l <login timeout> ] [ -w <column width> ] [ -a <packet size> ]
+
+[ -e ] [ -I ]
+
+[ -c <command terminator> ] [ -L [ c ] ] [ -q "<query>" ] [ -Q "<query>" ]
+
+[ -m <error level> ] [ -V <severity level> ] [ -W ] [ -u ] [ -r [ 0 | 1 ] ]
+
+[ -i <input file> ] [ -o <output file> ]
+
+[ -f <codepage> | i:<codepage> [ ,o:<codepage> ] ]
+
+[ -k [ 1 | 2 ] ]
+
+[ -y <display width> ] [ -Y <display width> ]
+
+[ -p [ 1 ] ] [ -R ] [ -b ] [ -v <var> = <value> ] [ -A ] [ -X [ 1 ] ] [ -x ]
+
+[ -? ]
+
+]
+
+The single biggest thing to keep in mind with these flags is that many of them (but, oddly enough, not all of them) are case sensitive. For example, both -Q and -q will execute queries, but the first will exit SQLCMD when the query is complete, and the second won't.
+
+So, let's try a quick query direct from the command line. Again, remember that this is meant to be run from the Windows command prompt (don't run it from inside Management Studio):
+
+SQLCMD -Usa -Pmypass -Q "SELECT * FROM AdventureWorks2008.HumanResources.Employee"
+
+The -P is the flag that indicates the password. If your server is configured with something other than a blank password (and it should be!), then you'll need to provide that password immediately following the -P with no space in between.
+
+If you run this from a command prompt, you should get something like 290 rows back. Now, let's create a quick text file to see how it works when including a file. At the command prompt, type the following:
+
+C:\>copy con testsql.sql
+
+This should take you down to a blank line (with no prompt of any kind), where you can enter this:
+
+SELECT * FROM AdventureWorks2008.HumanResources.Employee
+
+Then press F6 and Return (this ends the creation of our text file). You should get back a message like:
+
+1 file(s) copied.
+
+Now let's retry our earlier query, using a script file this time. The command line at the prompt has only a slight change to it:
+
+C:\>sqlcmd -Usa -Pmypass -i testsql.sql
+
+This should get us exactly the same results as when we ran the query using -Q. The major difference is, of course, that we took the command from a file. The file could have had hundreds—if not thousands—of different commands in it.
+
+There are a wide variety of different parameters for SQLCMD, but the most important are the login, the password, and the one that says what you want to do (straight query or input file). You can mix and match many of these parameters to obtain fairly complex behavior from this seemingly simple command-line tool.
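+
+As a small aside, if your server is set up for Windows authentication, the -E flag is worth knowing about; it uses your current Windows credentials, so no password ends up on the command line or in your command history. A quick sketch (the server name here is an assumption; substitute your own, and -o simply sends the results to a file):
+
+C:\>sqlcmd -E -S localhost -i testsql.sql -o results.txt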
+
+Dynamic SQL: Generating Your Code on the Fly with the EXEC Command
+
+Okay, so all this saving stuff away in scripts is all fine and dandy, but what if you don't know what code you need to execute until runtime?
+
+As a side note, notice that we are done with SQLCMD for now. The following examples should be run utilizing the Management Studio.
+
+SQL Server allows us, with a few gotchas, to build our SQL statement on the fly using string manipulation. The need to do this usually stems from not being able to know the details about something until runtime. The syntax looks like this:
+
+EXEC ({<string variable> | '<literal command string>'})
+
+Or:
+
+EXECUTE ({<string variable> | '<literal command string>'})
+
+As with executing a stored proc, whether you use EXEC or EXECUTE makes no difference.
+
+Let's build an example in the AdventureWorks2008 database by creating a dummy table from which to grab our dynamic information:
+
+USE AdventureWorks2008;
+
+GO
+
+--Create the table. We'll pull info from here for our dynamic SQL
+
+CREATE TABLE DynamicSQLExample
+
+(
+
+TableID int IDENTITY NOT NULL
+
+CONSTRAINT PKDynamicSQLExample
+
+PRIMARY KEY,
+
+SchemaName varchar(128) NOT NULL,
+
+TableName varchar(128) NOT NULL
+
+);
+
+GO
+
+/* Populate the table. In this case, we're grabbing every user
+
+** table object in this database */
+
+INSERT INTO DynamicSQLExample
+
+SELECT s.name AS SchemaName, t.name AS TableName
+
+FROM sys.schemas s
+
+JOIN sys.tables t
+
+ON s.schema_id = t.schema_id;
+
+This should get us a response something like:
+
+(78 row(s) affected)
+
+To quote the old advertising disclaimer: "actual results may vary." It's going to depend on which examples you've already followed along with in the book, which ones you haven't, and for which ones you took the initiative and did a DROP on once you were done with them. In any case, don't sweat it too much.
+
+Okay, so what we now have is a list of all the tables in our current database. Now let's say that we wanted to select some data from one of the tables, but we wanted to identify the table only at runtime by using its ID. For example, I'll pull out all the data for the table with an ID of 15:
+
+DECLARE @SchemaName varchar(128);
+
+DECLARE @TableName varchar(128);
+
+-- Now, grab the table name that goes with our ID
+
+SELECT @SchemaName = SchemaName, @TableName = TableName
+
+FROM DynamicSQLExample
+
+WHERE TableID = 15;
+
+-- Finally, pass that value into the EXEC statement
+
+EXEC ('SELECT * FROM ' + @SchemaName + '.' + @TableName);
+
+If your table names went into the DynamicSQLExample table the way mine did, then a TableID of 15 should equate to the ProductProductPhoto table. If so, you should wind up with something like this:
+
+ProductID ProductPhotoID Primary ModifiedDate
+
+----------- -------------- ------- -----------------------
+
+1 1 1 1998-05-02 00:00:00.000
+
+2 1 1 1998-05-02 00:00:00.000
+
+3 1 1 1998-05-02 00:00:00.000
+
+...
+
+...
+
+997 102 1 2003-06-01 00:00:00.000
+
+998 102 1 2003-06-01 00:00:00.000
+
+999 102 1 2003-06-01 00:00:00.000
+
+(504 row(s) affected)
+
+The Gotchas of EXEC
+
+Like most things that are of interest, using EXEC is not without its little trials and tribulations. Among the gotchas of EXEC are:
+
+ * It runs under a separate scope from the code that calls it—that is, the calling code can't reference variables inside the EXEC statement, and the EXEC can't reference variables in the calling code after they are resolved into the string for the EXEC statement.
+ * By default, it runs under the same security context as the current user—not that of the calling object. Use the EXECUTE AS option to override this.
+ * It runs under the same connection and transaction context as the calling object (we'll discuss this further with transactions in Chapter 11).
+ * Concatenation that requires a function call must be performed on the EXEC string prior to actually calling the EXEC statement. You can't do the concatenation of a function in the same statement as the EXEC call.
+ * EXEC cannot be used inside a user-defined function.
+
+Each of these can be a little difficult to grasp, so let's look at each individually.
+
+The Scope of EXEC
+
+Determining variable scope with the EXEC statement is something less than intuitive. The actual statement line that calls the EXEC statement has the same scope as the rest of the batch or procedure that the EXEC statement is running in, but the code that is performed as a result of the EXEC statement is considered to be in its own batch. As is so often the case, this is best shown with an example:
The Scope of EXEC

Determining variable scope with the EXEC statement is something less than intuitive. The actual statement line that calls the EXEC statement has the same scope as the rest of the batch or procedure in which the EXEC statement is running, but the code that is performed as a result of the EXEC statement is considered to be in its own batch. As is so often the case, this is best shown with an example:

USE AdventureWorks2008;

/* First, we'll declare two variables. One for stuff we're putting into
** the EXEC, and one that we think will get something back out (it won't)
*/
DECLARE @InVar varchar(50);
DECLARE @OutVar varchar(50);

-- Set up our string to feed into the EXEC command
SET @InVar = 'SELECT @OutVar = FirstName FROM Person.Person
WHERE BusinessEntityID = 1';

-- Now run it
EXEC (@InVar);

-- Now, just to show there's no difference, run the SELECT without using an input variable
EXEC ('SELECT @OutVar = FirstName FROM Person.Person WHERE BusinessEntityID = 1');

-- @OutVar will still be NULL because we haven't been able to put anything in it
SELECT @OutVar;

Now, look at the output from this:

Msg 137, Level 15, State 1, Line 1
Must declare the scalar variable '@OutVar'.
Msg 137, Level 15, State 1, Line 1
Must declare the scalar variable '@OutVar'.

--------------------------------------------------
NULL

(1 row(s) affected)

SQL Server wastes no time in telling us that we are scoundrels and clearly don't know what we're doing. Why do we get a "Must declare" error message when we have already declared @OutVar? Because we've declared it in the outer scope—not within the EXEC itself.

Let's look at what happens if we run things a little differently:

USE AdventureWorks2008;

-- This time, we only need one variable. It does need to be longer though.
DECLARE @InVar varchar(200);

/* Set up our string to feed into the EXEC command. This time we're going
** to feed it several statements at a time. They will all execute as one
** batch.
*/
SET @InVar = 'DECLARE @OutVar varchar(50)
SELECT @OutVar = FirstName FROM Person.Person
WHERE BusinessEntityID = 1
SELECT ''The Value Is '' + @OutVar';

-- Now run it
EXEC (@InVar);

This time we get back results closer to what we expect:

---------------------------------------------------------------
The Value Is Ken

Notice the way that I'm using two single quotation marks right next to each other to indicate that I really want a quotation mark rather than to terminate my string.

So, what we've seen here is that we have two different scopes operating, and never the twain shall meet. There is, unfortunately, no way to pass information between the inside and outside scopes without using an external mechanism such as a temporary table. If you decide to use a temp table to communicate between scopes, just remember that any temporary table created within the scope of your EXEC statement will live only for the life of that EXEC statement.

This behavior of a temp table lasting only for the life of the scope that created it will show up again when we are dealing with triggers and sprocs.
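As a quick, hedged illustration of that temp table approach (the table name #Comm is mine, not from the text): create the table in the outer scope, and the EXEC, being an inner scope, can still see it and write to it:

USE AdventureWorks2008;

-- Created in the OUTER scope, so it survives the EXEC
CREATE TABLE #Comm (OutVal varchar(50));

EXEC ('INSERT INTO #Comm (OutVal)
SELECT FirstName FROM Person.Person WHERE BusinessEntityID = 1');

-- The value is still here, because #Comm belongs to our scope, not the EXEC's
SELECT OutVal FROM #Comm;

DROP TABLE #Comm;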
A Small Exception to the Rule

There is one thing that happens inside the scope of the EXEC that can be seen after the EXEC is done—system functions. So, things like @@ROWCOUNT can still be used. Again, let's look at a quick example:

USE AdventureWorks2008;

EXEC('SELECT * FROM Sales.Customer');
SELECT 'The Rowcount is ' + CAST(@@ROWCOUNT as varchar);

This yields us (after the result set):

The Rowcount is 19820

Security Contexts and EXEC

When you give someone the right to run a stored procedure, you imply that he or she also gains the right to perform the actions called for within the sproc. For example, let's say we had a stored procedure that lists all the employees hired within the last year. Someone who has rights to execute the sproc can do so (and get results back) even if he or she does not have rights to the HumanResources.Employee table directly. This is really handy, as it allows you to grant access to information for a very specific need without granting more general access to the underlying object.

Developers usually assume that this same implied right is valid for an EXEC statement also—not necessarily. Indeed, by default, any reference made inside an EXEC statement will be run under the security context of the current user. So, let's say I have the right to run a procedure called spNewEmployee, but I do not have rights to the Employee table. If spNewEmployee gets the values by running a simple SELECT statement, then everything is fine. If, however, spNewEmployee uses an EXEC statement to execute that SELECT statement, the EXEC statement will fail because I don't have the rights to perform a SELECT on the Employee table.

Fortunately, we now have some (albeit limited) options to get around this by utilizing the EXECUTE AS option that was added beginning in SQL Server 2005. We'll discuss the specifics of how to do so as we work with security in Chapter 19, when we will discuss how to run under a specific user context.

The security context of an EXEC statement run within a stored procedure, user-defined function, or trigger can be overridden using the EXECUTE AS clause within the sproc, function, or trigger. EXECUTE AS will be discussed more fully when we discuss security in Chapter 19.

Use of Functions in Concatenation and EXEC

This one is actually more of a nuisance than anything else, since there is a reasonably easy workaround. Simply put, you can't run a function against your EXEC string in the argument for an EXEC. For example:

USE AdventureWorks2008;

-- This won't work
DECLARE @NumberOfLetters int;
SET @NumberOfLetters = 3;
EXEC('SELECT LEFT(LastName,' + CAST(@NumberOfLetters AS varchar) + ') AS FilingName
FROM Person.Person');
GO

-- But this does
DECLARE @NumberOfLetters AS int;
SET @NumberOfLetters = 3;
DECLARE @str AS varchar(255);
SET @str = 'SELECT LEFT(LastName,' + CAST(@NumberOfLetters AS varchar) + ') AS
FilingName FROM Person.Person';
EXEC(@str);

The first instance gets us an error message because the CAST function needs to be fully resolved prior to the EXEC line:

Msg 102, Level 15, State 1, Line 6
Incorrect syntax near 'CAST'.

But the second works just fine because the argument is already a complete string:

FilingName
----------
Abb
Abe
Abe
...
Zuk
Zwi
Zwi

(19972 row(s) affected)

EXEC and UDFs

In short, you can't get there from here. You are not allowed to use EXEC to run dynamic SQL within a UDF—period. (Using EXEC to run a sproc is, however, legal in a few cases.)
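Before we leave dynamic SQL entirely: although this section sticks to EXEC, SQL Server also provides the sp_executesql system stored procedure, which accepts genuine parameters rather than values concatenated into the string. A minimal sketch (the query itself is my own illustration, not from the text):

USE AdventureWorks2008;

-- @id travels as a real parameter instead of being built into the string
EXEC sp_executesql
N'SELECT FirstName FROM Person.Person WHERE BusinessEntityID = @id',
N'@id int',
@id = 1;

Among other things, this sidesteps the function-concatenation gotcha we just saw and reduces the opportunity for SQL injection when the values come from users.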
Control-of-Flow Statements

Control-of-flow statements are a veritable must for any programming language these days. I can't imagine having to write code without being able to change which commands run depending on a condition.

Given that we're assuming at least an intermediate knowledge of both programming and SQL, we're not going to dwell on these a lot, but since "intermediate" means different things to different people, we had best give these the once-over.

T-SQL offers most of the classic choices for control-of-flow situations, including:

 * IF...ELSE
 * GOTO
 * WHILE
 * WAITFOR
 * TRY/CATCH

We also have the CASE statement (a.k.a. SELECT CASE, DO CASE, and SWITCH/BREAK in other languages), but it doesn't have quite the level of control-of-flow capability that you've come to expect from other languages.

The IF...ELSE Statement

IF...ELSE statements work much as they do in any language, although I equate them most closely to C in the way they are implemented. The basic syntax is:

IF <Boolean expression>
<SQL statement> | BEGIN <code series> END
[ELSE
<SQL statement> | BEGIN <code series> END]

The expression can be pretty much any expression that evaluates to a Boolean.

This brings us back to one of the most common traps that I see SQL programmers fall into—improper use of NULLs. I can't tell you how often I have debugged stored procedures only to find a statement like:

IF @myvar = NULL

This will, of course, never be true on most systems (see the exception shortly) and will wind up bypassing all their NULL values. Instead, it needs to read:

IF @myvar IS NULL

The exception to this is dependent on whether you have set the ANSI_NULLS option ON or OFF. The default is ON, in which case you'll see the behavior described previously. You can change this behavior by setting ANSI_NULLS to OFF. I strongly recommend against this, since it violates the ANSI standard (it's also just plain wrong).

Note that only the very next statement after the IF will be considered to be conditional (as per the IF). You can include multiple statements as part of your control-of-flow block using BEGIN...END, but we'll discuss that one a little later in the chapter.

To show off a simple version of this, let's run an example that's very common in build scripts. Imagine for a moment that we want to CREATE a table if it's not there, but leave it alone if it already exists. We could make use of the EXISTS operator. (You may recall my complaint that the Books Online calls EXISTS a keyword when I consider it an operator.)

-- We'll run a SELECT for our table to start with to prove it's not there
SELECT 'Found Table ' + s.name + '.' + t.name
FROM sys.schemas s
JOIN sys.tables t
ON s.schema_id = t.schema_id
WHERE s.name = 'dbo'
AND t.name = 'OurIFTest';

-- Now we'll run our conditional CREATE statement
IF NOT EXISTS (
SELECT s.name AS SchemaName, t.name AS TableName
FROM sys.schemas s
JOIN sys.tables t
ON s.schema_id = t.schema_id
WHERE s.name = 'dbo'
AND t.name = 'OurIFTest'
)
CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
);

-- And now look again to prove that it's been created.
SELECT 'Found Table ' + s.name + '.' + t.name
FROM sys.schemas s
JOIN sys.tables t
ON s.schema_id = t.schema_id
WHERE s.name = 'dbo'
AND t.name = 'OurIFTest';
The meat of this is in the middle. Notice that our CREATE TABLE statement runs only if no matching table already exists:

------------------------------------------------------------------------------

(0 row(s) affected)

------------------------------------------------------------------------------
Found Table dbo.OurIFTest

(1 row(s) affected)

The ELSE Clause

Now this thing about being able to run a statement conditionally is just great, but it doesn't really deal with all the scenarios we might want to handle. Quite often—indeed, most of the time—when we deal with an IF condition, we have specific statements we want to execute not just for the true condition, but also a separate set of statements that we want to run if the condition is false—the ELSE condition.

You will run into situations where a Boolean cannot be evaluated—that is, the result is unknown (for example, if you are comparing to a NULL). Any expression that returns a result that would be considered an unknown result will be treated as FALSE.

The ELSE statement works pretty much as it does in any other language. The exact syntax may vary slightly, but the nuts and bolts are still the same: the statements in the ELSE clause are executed if the statements in the IF clause are not.

To expand our earlier example just a bit, let's actually print a warning message if we do not create our table:

-- Now we'll run our conditional CREATE statement
IF NOT EXISTS (
SELECT s.name AS SchemaName, t.name AS TableName
FROM sys.schemas s
JOIN sys.tables t
ON s.schema_id = t.schema_id
WHERE s.name = 'dbo'
AND t.name = 'OurIFTest'
)
CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
);
ELSE
PRINT 'WARNING: Skipping CREATE as table already exists';

If you have already run the preceding example, then the table will already exist, and running this second example should get you the warning message:

WARNING: Skipping CREATE as table already exists
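One more pattern worth noting before we move on (my own quick sketch, not from the text): because ELSE simply governs the next statement, that statement can itself be another IF, which gives you the familiar ELSE IF chain:

DECLARE @myvar int;
SET @myvar = 5;

IF @myvar IS NULL
PRINT 'No value supplied';
ELSE IF @myvar < 0
PRINT 'Negative';
ELSE
PRINT 'Zero or positive'; -- this is what prints for 5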
Grouping Code into Blocks

Sometimes you need to treat a group of statements as though they were all one statement. (If you execute one, then you execute them all; otherwise, you don't execute any of them.) For instance, the IF statement will, by default, consider only the very next statement after the IF to be part of the conditional code. What if you want the condition to require several statements to run? Life would be pretty miserable if you had to create a separate IF statement for each line of code you wanted to run if the condition holds.

Thankfully, like most any language with an IF statement, SQL Server gives us a way to group code into blocks that are considered to all belong together. The block is started when you issue a BEGIN statement and continues until you issue an END statement. It works like this:

IF <expression>
BEGIN --First block of code starts here -- executes only if
--expression is TRUE
Statement that executes if expression is TRUE
Additional statements
...
...
Still going with statements from TRUE expression
IF <expression> --Only executes if this block is active
BEGIN
Statement that executes if both outside and inside
expressions are TRUE
Additional statements
...
...
Still statements from both TRUE expressions
END
Out of the condition from inner condition, but still
part of first block
END --First block of code ends here
ELSE
BEGIN
Statement that executes if expression is FALSE
Additional statements
...
...
Still going with statements from FALSE expression
END

Notice our ability to nest blocks of code. In each case, the inner blocks are considered to be part of the outer block of code. I have never heard of there being a limit to how many levels deep you can nest your BEGIN...END blocks, but I would suggest that you minimize them. There are definite practical limits to how deep you can go and still keep them readable—even if you are particularly careful about the formatting of your code.

Just to put this notion into play, let's make yet another modification to our table creation. This time, we're going to provide an informational message regardless of whether the table was created or not:

-- This time we're adding a check to see if the table DOES already exist
-- We'll remove it if it does so that the rest of our example can test the
-- IF condition. Just remove this first IF EXISTS block if you want to test
-- the ELSE condition below again.
IF EXISTS (
SELECT s.name AS SchemaName, t.name AS TableName
FROM sys.schemas s
JOIN sys.tables t
ON s.schema_id = t.schema_id
WHERE s.name = 'dbo'
AND t.name = 'OurIFTest'
)
DROP TABLE OurIFTest;

-- Now we'll run our conditional CREATE statement
IF NOT EXISTS (
SELECT s.name AS SchemaName, t.name AS TableName
FROM sys.schemas s
JOIN sys.tables t
ON s.schema_id = t.schema_id
WHERE s.name = 'dbo'
AND t.name = 'OurIFTest'
)
BEGIN
PRINT 'Table dbo.OurIFTest not found.'
PRINT 'CREATING: Table dbo.OurIFTest'
CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
);
END
ELSE
PRINT 'WARNING: Skipping CREATE as table already exists';

Now, we've mixed all sorts of uses of the IF statement here. We have the most basic IF statement—with no BEGIN...END or ELSE. In our other IF statement, the IF portion uses a BEGIN...END block, but the ELSE does not.

I did one this way just to illustrate how you can mix them. That said, I recommend you go back to my old axiom of "be consistent." It can be really hard to tell which statement is being controlled by which IF...ELSE condition if you are mixing the way you group things. In practice, if I'm using BEGIN...END on any statement within a given IF, then I use it for every block of code in that IF statement, even if there is only one statement for that particular condition.

The CASE Statement

The CASE statement is, in some ways, the equivalent of one of several different statements, depending on the language from which you're coming. Statements in procedural programming languages that work in a similar way to CASE include:

 * Switch: C, C++, C#, Delphi
 * Select Case: Visual Basic
 * Do Case: Xbase
 * Evaluate: COBOL

I'm sure there are others; these are just from the languages that I've worked with in some form or another over the years. The big drawback in using a CASE statement in T-SQL is that it is, in many ways, more of a substitution operator than a control-of-flow statement.

There is more than one way to write a CASE statement: with an input expression or a Boolean expression. The first option is to use an input expression that will be compared with the value used in each WHEN clause. The SQL Server documentation refers to this as a simple CASE:

CASE <input expression>
WHEN <when expression> THEN <result expression>
[...n]
[ELSE <result expression>]
END

Option number two is to provide an expression with each WHEN clause that will evaluate to TRUE/FALSE.
The docs refer to this as a searched CASE:

CASE
WHEN <Boolean expression> THEN <result expression>
[...n]
[ELSE <result expression>]
END

Perhaps what's nicest about CASE is that you can use it "inline" with (that is, as an integral part of) a SELECT statement. This can actually be quite powerful.

A Simple CASE

A simple CASE takes an expression that equates to a Boolean result. Let's get right to an example:

USE AdventureWorks2008;
GO

SELECT TOP 10 SalesOrderID, SalesOrderID % 10 AS 'Last Digit', Position =
CASE SalesOrderID % 10
WHEN 1 THEN 'First'
WHEN 2 THEN 'Second'
WHEN 3 THEN 'Third'
WHEN 4 THEN 'Fourth'
ELSE 'Something Else'
END
FROM Sales.SalesOrderHeader;

For those of you who aren't familiar with it, the % operator is the modulus operator. A modulus works in a similar manner to divide (/), but it gives you only the remainder. Therefore, 16 % 4 = 0 (4 goes into 16 evenly), but 16 % 5 = 1 (16 divided by 5 has a remainder of 1). In the example, since we're dividing by 10, the modulus gives us the last digit of the number we're evaluating.

Let's see what we got with this:

SalesOrderID Last Digit Position
------------ ----------- --------------
75124 4 Fourth
43793 3 Third
51522 2 Second
57418 8 Something Else
43767 7 Something Else
51493 3 Third
72773 3 Third
43736 6 Something Else
51238 8 Something Else
53237 7 Something Else

(10 row(s) affected)

Notice that whenever there is a matching value in the list, the THEN clause is invoked. Since we have an ELSE clause, any value that doesn't match one of the previous values will be assigned whatever we've put in our ELSE. If we had left the ELSE out, then any such value would be given a NULL.

Let's go with one more example that expands on what we can use as an expression. This time, we'll use another column from our query:

USE AdventureWorks2008;
GO

SELECT TOP 10 SalesOrderID % 10 AS 'OrderLastDigit',
ProductID % 10 AS 'ProductLastDigit',
"How Close?" = CASE SalesOrderID % 10
WHEN ProductID % 1 THEN 'Exact Match!'
WHEN ProductID % 1 - 1 THEN 'Within 1'
WHEN ProductID % 1 + 1 THEN 'Within 1'
ELSE 'More Than One Apart'
END
FROM Sales.SalesOrderDetail
ORDER BY SalesOrderID DESC;

Notice that we've used equations at every step of the way on this one, yet it still works....

OrderLastDigit ProductLastDigit How Close?
-------------- ---------------- -------------------
4 5 More Than One Apart
3 2 More Than One Apart
3 9 More Than One Apart
3 8 More Than One Apart
2 2 More Than One Apart
2 8 More Than One Apart
1 7 Within 1
1 0 Within 1
1 1 Within 1
0 2 Exact Match!

(10 row(s) affected)

As long as the expression evaluates to a specific value that is of a type compatible with the input expression, it can be analyzed, and the proper THEN clause applied.

A Searched CASE

This one works pretty much the same as a simple CASE, with only two slight twists:

 * There is no input expression. (Remember, that's the part between the CASE and the first WHEN.)
 * The WHEN expression must evaluate to a Boolean value (whereas in the simple CASE examples we've just looked at, we used values such as 1, 3, and ProductID + 1).

Perhaps what I find the coolest about this kind of CASE is that we can completely change around what is forming the basis of our expression—mixing and matching column expressions, depending on our different possible situations.
As usual, I find the best way to get across how this works is via an example:

SELECT TOP 10 SalesOrderID % 10 AS 'OrderLastDigit',
ProductID % 10 AS 'ProductLastDigit',
"How Close?" = CASE
WHEN (SalesOrderID % 10) < 3 THEN 'Ends With Less Than Three'
WHEN ProductID = 6 THEN 'ProductID is 6'
WHEN ABS(SalesOrderID % 10 - ProductID) <= 1 THEN 'Within 1'
ELSE 'More Than One Apart'
END
FROM Sales.SalesOrderDetail
ORDER BY SalesOrderID DESC;

This is substantially different from our simple CASE examples, but it still works:

OrderLastDigit ProductLastDigit How Close?
-------------- ---------------- -------------------------
4 5 More Than One Apart
3 2 More Than One Apart
3 9 More Than One Apart
3 8 More Than One Apart
2 2 Ends With Less Than Three
2 8 Ends With Less Than Three
1 7 Ends With Less Than Three
1 0 Ends With Less Than Three
1 1 Ends With Less Than Three
0 2 Ends With Less Than Three

(10 row(s) affected)

These are a few of the things to pay particular attention to in how SQL Server evaluated things:

 * Even when two conditions evaluate to TRUE, only the first condition is used. For example, the second-to-last row meets both the first (the last digit is smaller than 3) and third (the last digit is within 1 of the ProductID) conditions. Many languages, including Visual Basic, work this way. If you're from the C world, however, you'll need to remember this when you are coding: no "break" statement is required—a CASE statement always terminates after one condition is met.
 * You can mix and match which fields you're using in your condition expressions. In this case, we used SalesOrderID, ProductID, and both together.
 * You can perform pretty much any expression, as long as, in the end, it evaluates to a Boolean result.

Looping with the WHILE Statement

The WHILE statement works much as it does in other languages to which you have probably been exposed. Essentially, a condition is tested each time you come to the top of the loop. If the condition is still TRUE, then the loop executes again; if not, you exit.

The syntax looks like this:

WHILE <Boolean expression>
<sql statement> |
[BEGIN
<statement block>
[BREAK]
<sql statement> |
[CONTINUE]
END]

While you can just execute one statement (much as you do with an IF statement), you'll almost never see a WHILE that isn't followed by a BEGIN...END with a full statement block.

The BREAK statement is a way of exiting the loop without waiting for the bottom of the loop to come and the expression to be re-evaluated.

I'm sure I won't be the last to tell you this, but using a BREAK is generally thought of as something of bad form in the classical sense. I tend to sit on the fence on this one. I avoid using them if reasonably possible. Most of the time, I can indeed avoid them just by moving a statement or two around, while still coming up with the same results. The advantage of this is usually more readable code. It is simply easier to handle a looping structure (or any structure, for that matter) if you have a single point of entry and a single exit. Using a BREAK violates this notion.

All that being said, sometimes you can actually make things worse by reformatting the code to avoid a BREAK. In addition, I've seen people write much slower code for the sake of not using a BREAK statement—bad idea.

The CONTINUE statement is something of the opposite of a BREAK statement. In short, it tells the WHILE loop to go back to the beginning. Regardless of where you are in the loop, you immediately go back to the top and re-evaluate the expression (exiting if the expression is no longer TRUE).
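Here is a tiny sketch of both keywords in action (mine, not the book's; the numbers are arbitrary). It counts from 1 to 10, skips the even numbers with CONTINUE, and bails out early with BREAK:

DECLARE @Counter int;
SET @Counter = 0;

WHILE @Counter < 10
BEGIN
SET @Counter = @Counter + 1;
IF @Counter % 2 = 0
CONTINUE; -- even number: back to the top of the loop
IF @Counter > 7
BREAK; -- past 7: exit the loop entirely
PRINT @Counter; -- prints 1, 3, 5, 7
END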
We'll go ahead and do something of a short example here just to get our feet wet. As I mentioned before, WHILE loops tend to be rare in non-cursor situations, so forgive me if this example seems lame.

What we're going to do is create something of a monitoring process using our WHILE loop and a WAITFOR command. (We'll look at the specifics of WAITFOR in the next section.) We're going to automatically update our statistics once per day:

WHILE 1 = 1
BEGIN
WAITFOR TIME '01:00'
EXEC sp_updatestats
RAISERROR('Statistics Updated for Database', 1, 1) WITH LOG
END

This would update the statistics for every table in our database every night at 1 AM and write a log entry of that fact to both the SQL Server log and the Windows application log. If you want to check whether this works, leave it running all night and then check your logs in the morning.

Note that using an infinite loop like this isn't the way you would normally want to schedule a task. If you want something to run every day, set up a job using Management Studio. In addition to not keeping a connection open all the time (which the preceding example would do), you also get the capability to make follow-up actions dependent on the success or failure of your script. Also, you can e-mail or net send messages regarding the completion status.

The WAITFOR Statement

There are often things that you either don't want to or simply can't have happen right this moment, but you also don't want to have to hang around waiting for the right time to execute something.

No problem—use the WAITFOR statement and have SQL Server wait for you. The syntax is incredibly simple:

WAITFOR
DELAY <'time'> | TIME <'time'>

The WAITFOR statement does exactly what it says it does. It waits for whatever you specify as the argument to occur. You can specify either an explicit time of day for something to happen, or you can specify an amount of time to wait before doing something.

The DELAY Parameter

The DELAY parameter choice specifies an amount of time to wait. You cannot specify a number of days—just time in hours, minutes, and seconds. The maximum allowed delay is 24 hours. So, for example:

WAITFOR DELAY '01:00'

would run any code prior to the WAITFOR, then reach the WAITFOR statement, and stop for one hour, after which execution of the code would continue with whatever the next statement was.

The TIME Parameter

The TIME parameter choice specifies to wait until a specific time of day. Again, we cannot specify any kind of date—just the time of day using a 24-hour clock. Once more, this gives us a one-day limit for the maximum amount of delay. For example:

WAITFOR TIME '01:00'

would run any code prior to the WAITFOR, then reach the WAITFOR statement, and stop until 1 AM, after which execution of the code would continue with whatever the next statement was after the WAITFOR.
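If you would rather not wait an hour to see the effect, here is a quick sanity check of my own (the five-second value is arbitrary). Note that DELAY accepts seconds as well:

SELECT GETDATE() AS TimeBefore;
WAITFOR DELAY '00:00:05'; -- hours:minutes:seconds
SELECT GETDATE() AS TimeAfter; -- roughly five seconds later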
TRY/CATCH Blocks

This is yet another one of those areas that I would consider critical when you're learning your basics, so, in theory, you should know it well by the time you get to the "Professional" level. That said, TRY/CATCH is still relatively new (it was added in SQL Server 2005), and, if you've grown up supporting an older application, you may not have seen this lovely addition, or you may have been avoiding it for backward compatibility reasons.

In days of yore (meaning anything before SQL Server 2005), our error-handling options were pretty limited. We could check for error conditions, but we had to do so proactively. Indeed, in some cases we could have errors that would cause us to leave our procedure or script without an opportunity to trap them at all. (This can still happen, but is much more limited.) We're going to save a fuller discussion of error handling for our stored procedures discussion in Chapter 10, but we'll touch on the fundamentals of TRY/CATCH blocks here.

A TRY/CATCH block in SQL Server works remarkably similarly to those used in any C-derived language (C, C++, C#, Delphi, and a host of others). The syntax looks like this:

BEGIN TRY
{ <sql statement(s)> }
END TRY
BEGIN CATCH
{ <sql statement(s)> }
END CATCH [ ; ]

In short, SQL Server will "try" to run anything within the BEGIN...END that goes with your TRY block. If, and only if, an error condition occurs that has an error level of 11–19, SQL Server will exit the TRY block immediately and begin with the first line in your CATCH block. Since there are more possible error levels than just 11–19, take a look at what we have:

Error Level | Nature
---|---
1–10 | Informational only. This would include things like context changes, such as settings being adjusted or NULL values found while calculating aggregates. These will not trigger a CATCH block, so if you need to test for this level of error, you'll need to do so manually by checking @@ERROR.
11–19 | Relatively severe errors, but ones that can be handled by your code (foreign key violations, as an example). Some of these can be severe enough that you are unlikely to want to continue processing (such as a memory exceeded error), but at least you can trap them and exit gracefully.
20–25 | Very severe. These are generally system-level errors. Your server-side code will never know this kind of error happened, as the script and connection will be terminated immediately, and the CATCH block will never execute.

Keep these in mind—if you need to handle errors outside the 11–19 level range, then you'll need to make other plans. The good news is that most errors we need to trap fall in that 11–19 range.
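Just to see the mechanics in isolation first, here is a minimal sketch of my own. A divide by zero raises error 8134 at severity 16, so it lands squarely in the CATCH block:

BEGIN TRY
SELECT 1 / 0; -- severity 16: control jumps straight to the CATCH block
END TRY
BEGIN CATCH
PRINT 'Caught error ' + CAST(ERROR_NUMBER() AS varchar); -- prints 8134
END CATCH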
Now, to test this out against real code, we'll make some alterations to the CREATE script that we built back when we were looking at IF...ELSE statements. You may recall that part of the reason for our original test to see whether the table already existed was to avoid creating an error condition that might have caused our script to fail. That kind of test is the way things have been done historically (and there really wasn't much in the way of other options). With the advent of TRY/CATCH blocks, we can just try the CREATE and then handle the error if one is given:

BEGIN TRY
-- Try and create our table
CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
)
END TRY
BEGIN CATCH
-- Uh oh, something went wrong, see if it's something
-- we know what to do with
DECLARE @ErrorNo int,
@Severity tinyint,
@State smallint,
@LineNo int,
@Message nvarchar(4000)
SELECT
@ErrorNo = ERROR_NUMBER(),
@Severity = ERROR_SEVERITY(),
@State = ERROR_STATE(),
@LineNo = ERROR_LINE(),
@Message = ERROR_MESSAGE()
IF @ErrorNo = 2714 -- Object exists error, we knew this might happen
PRINT 'WARNING: Skipping CREATE as table already exists'
ELSE -- hmm, we don't recognize it, so report it and bail
RAISERROR(@Message, 16, 1)
END CATCH

Notice that I used some special functions to retrieve the error condition, so let's take a look at those.

Also note that I moved them into variables that I controlled so they would not be lost. I must admit this is a holdover habit from the days before TRY/CATCH, when you would lose the error code on the next statement. The functions used here persist within the scope of the particular CATCH block, so you are relatively safe against losing their values. The primary reason to move the values over, at this point, is if you want to utilize the error values after you exit the CATCH block.

Function | Returns
---|---
ERROR_NUMBER() | The actual error number. If this is a system error, there will be an entry in sys.messages that matches that error and contains some of the information you'll get from the other error-related functions.
ERROR_SEVERITY() | This equates to what is sometimes called "error level" in other parts of this book and Books Online. My apologies for the inconsistency; I'm guilty of perpetuating something that Microsoft started doing a version or two ago. Again, the severity must be 11–19 before the error will wind up in a CATCH block. (See the previous table in this chapter for further discussion of this.)
ERROR_STATE() | I use this as something of a place marker. This will always be 1 for system errors. When we discuss error handling in more depth in Chapter 10, you'll see how to raise your own errors. At that point, you can use state to indicate things like at what point in your stored procedure, function, or trigger the error occurred (this helps with situations where a given error can be handled in any one of many places).
ERROR_PROCEDURE() | We did not use this in the preceding example, as it is only relevant to stored procedures, functions, and triggers. This supplies the name of the procedure that caused the error—very handy if your procedures are nested at all, as the procedure that causes the error may not be the one to actually handle that error.
ERROR_LINE() | Just what it says—the line number of the error.
ERROR_MESSAGE() | The text that goes with the message. For system messages, this is the same as what you'll see if you select the message from the sys.messages catalog view. For user-defined errors, it will be the text supplied to the RAISERROR function.

In our example, I utilized a known error ID that SQL Server raises if we attempt to create an object that already exists. You can see all system error messages by selecting them from the sys.messages catalog view.

Beginning with SQL Server 2005, the sys.messages output grew so lengthy that it's hard to find what you're looking for by just scanning it. My solution is less than elegant but rather effective: I just artificially create the error I'm looking for and see what error number it gives me (simple solutions for simple minds like mine!).
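An alternative that avoids creating the error at all is to filter the view directly. A quick sketch of my own (the LIKE pattern is just an example):

SELECT message_id, severity, text
FROM sys.messages
WHERE language_id = 1033 -- English messages only
AND text LIKE '%already an object named%'; -- finds error 2714, among others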
Either way, the usage itself is simple: I execute the code I want to execute (in this case, the CREATE statement) and handle the error if there is one; there really isn't much more to it than that.

We will look at error handling in a far more thorough fashion in Chapter 10. In the meantime, you can use TRY/CATCH to give basic error handling to your scripts.

Summary

Understanding scripts and batches is the cornerstone of programming with SQL Server. The concepts of scripts and batches lay the foundation for a variety of functions, from scripting complete database builds to programming stored procedures and triggers.

Local variables have scope for only one batch. Even if you have declared the variable within the same overall script, you will still get an error message if you don't redeclare it (and start over with assigning values) before referencing it in a new batch.

You can use batches to create precedence between different parts of your scripts. The first batch starts at the beginning of the script and ends at the end of the script or the first GO statement, whichever comes first. The next batch (if there is another) starts on the line after the first one ends and continues to the end of the script or the next GO statement—again, whichever comes first. The process continues to the end of the script. The first batch from the top of the script is executed first, the second is executed second, and so on. All commands within each batch must pass validation in the query parser, or none of that batch will be executed; however, any other batches will be parsed separately and will still be executed (if they pass the parser).

In addition, we reviewed the constructs that deal with control-of-flow and error-handling conditions. We can use these to build complex scripts that are able to adapt to different runtime environments (such as recognizing that a script needs to process an upgrade of a database instead of an installation, or even determining which version of your schema it is upgrading from).

Finally, we also saw how we can create and execute SQL dynamically. This affords us the opportunity to deal with scenarios that aren't always 100 percent predictable, or situations where something we need to construct our statement is actually itself a piece of data.

In the next couple of chapters, we will take the notions of scripting and batches to the next level and apply them to stored procedures and triggers—the closest things that SQL Server has to actual programs. We will also see how we can utilize any .NET language to add more complex language functionality to our stored procedures, functions, and triggers.

10

Advanced Programmability

When deciding where the cutoff should be between my Beginning and Professional titles, this was, perhaps, the most difficult area for me to reconcile. The thing is, how much a supposed "SQL Server jock" knows about things beyond basic DML really varies a lot, so what exactly qualifies someone as ready for the "Professional" level title?

In this chapter, I'm going to assume that you already know the basics of stored procedures and user-defined functions (the differences between them, types of SQL-based user-defined functions, parameterization, and basic control-of-flow statements).
After all, if they are "the basics," then they seem more appropriate for a beginning title (and, indeed, I cover them at length in Beginning SQL Server 2008 Programming). So what, then, is this chapter all about? Well, it's about all the things that go beyond the basics. In this chapter, we'll cover:

 * OUTPUT parameters (often misunderstood by even advanced SQL programmers)
 * Error handling (again, I cover this somewhat in the Beginning title, but it's so often misunderstood even amongst advanced SQL programmers that it deserves revisiting)
 * Table-valued parameters (new with SQL Server 2008)
 * .NET-based stored procedures and user-defined functions

Even paring out the so-called basics, there is a lot to be covered, so let's get to it.

Most of the concepts provided in this chapter apply relatively equally to both stored procedures and user-defined functions.

A More Advanced Look at Stored Procedures

Stored procedures—or "sprocs"—have long been fundamental to truly "programming" in SQL Server. Prior to SQL Server 2005, they could be complex, but even the most complex was still relatively mundane given the limitations of T-SQL. With each release, however, Microsoft has added more to the puzzle. It was a rather big leap in the case of .NET assemblies (again, beginning with SQL Server 2005—we'll cover those a little later in this chapter), and the addition of table-valued parameters in SQL Server 2008 brings a lot of continuity to what we can do inside of a stored procedure.

Let's start this section off with a review of the general sproc syntax:

CREATE PROCEDURE|PROC <sproc name>
[<parameter name> [<schema>.]<data type> [VARYING]
[= <default value>] [OUT[PUT]] [READONLY]
[, ...n]]
[WITH
RECOMPILE | ENCRYPTION | [EXECUTE AS { CALLER|SELF|OWNER|'<user name>' }]]
[FOR REPLICATION]
AS
<code> | EXTERNAL NAME <assembly name>.<class name>.<method name>

Most of this should be second nature at this point, but, before this chapter is done, we will have captured any elements of the syntax that you may not be as comfortable with.

Let's start by taking a look at output parameters.

Output Parameters

Sometimes, you want to pass non-recordset information out to whatever called your sproc. Perhaps one of the most common uses for this is with sprocs that do inserts into tables with identity values. Often the code calling the sproc wants to know what the identity value was when the process is complete.

To show this off, we'll utilize a stored procedure that is already in the AdventureWorks2008 database—uspLogError. It looks like this:

-- uspLogError logs error information in the ErrorLog table about the
-- error that caused execution to jump to the CATCH block of a
-- TRY...CATCH construct. This should be executed from within the scope
-- of a CATCH block otherwise it will return without inserting error
-- information.
CREATE PROCEDURE [dbo].[uspLogError]
@ErrorLogID [int] = 0 OUTPUT -- contains the ErrorLogID of the row inserted
AS -- by uspLogError in the ErrorLog table
BEGIN
SET NOCOUNT ON;
-- Output parameter value of 0 indicates that error
-- information was not logged
SET @ErrorLogID = 0;
BEGIN TRY
-- Return if there is no error information to log
IF ERROR_NUMBER() IS NULL
RETURN;
-- Return if inside an uncommittable transaction.
-- Data insertion/modification is not allowed when
-- a transaction is in an uncommittable state.
IF XACT_STATE() = -1
BEGIN
PRINT 'Cannot log error since the current transaction is in an
uncommittable state. '
+ 'Rollback the transaction before executing uspLogError in order to
successfully log error information.';
RETURN;
END
INSERT [dbo].[ErrorLog]
(
[UserName],
[ErrorNumber],
[ErrorSeverity],
[ErrorState],
[ErrorProcedure],
[ErrorLine],
[ErrorMessage]
)
VALUES
(
CONVERT(sysname, CURRENT_USER),
ERROR_NUMBER(),
ERROR_SEVERITY(),
ERROR_STATE(),
ERROR_PROCEDURE(),
ERROR_LINE(),
ERROR_MESSAGE()
);
-- Pass back the ErrorLogID of the row inserted
SET @ErrorLogID = @@IDENTITY;
END TRY
BEGIN CATCH
PRINT 'An error occurred in stored procedure uspLogError: ';
EXECUTE [dbo].[uspPrintError];
RETURN -1;
END CATCH
END;

Note the sections that I've highlighted here—these are the core of our output parameter. The first declares the parameter as an output parameter. The second makes the insert that generates the identity value, and finally the SET statement captures that identity value. When the procedure exits, the value in @ErrorLogID is passed to the calling script.

Let's utilize our TRY/CATCH example from the tail end of the previous chapter, but this time we'll make the call to uspLogError:

USE AdventureWorks2008;

BEGIN TRY
-- Try and create our table
CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
)
END TRY
BEGIN CATCH
-- Uh oh, something went wrong, see if it's something
-- we know what to do with
DECLARE @MyOutputParameter int;
IF ERROR_NUMBER() = 2714 -- Object exists error, we knew this might happen
BEGIN
PRINT 'WARNING: Skipping CREATE as table already exists';
EXEC dbo.uspLogError @ErrorLogID = @MyOutputParameter OUTPUT;
PRINT 'An error was logged. The Log ID for our error was '
+ CAST(@MyOutputParameter AS varchar);
END
ELSE -- hmm, we don't recognize it, so report it and bail
RAISERROR('something not good happened this time around', 16, 1);
END CATCH

If you run this in a database that does not already have the OurIFTest table, you will get a simple:

Command(s) completed successfully.

But run it where the OurIFTest table already exists (for example, run it twice if you haven't run the CREATE code before), and you get something to indicate the error:

WARNING: Skipping CREATE as table already exists
An error was logged. The Log ID for our error was 1

Now run a little SELECT against the error log table:

SELECT ErrorLogID, UserName, ErrorMessage
FROM ErrorLog
WHERE ErrorLogID = 1; -- change this value to whatever your
-- results said it was logged as

And you can see that the error was indeed properly logged:

ErrorLogID UserName ErrorMessage
----------- ----------- ---------------------------------------------------
1 dbo There is already an object named 'OurIFTest'...

(1 row(s) affected)

There are several things that you should take note of between the sproc itself and the usage of it by the calling script:

 * The OUTPUT keyword was required for the output parameter in the sproc declaration.
 * You must use the OUTPUT keyword when you call the sproc, much as you did when you declared the sproc. This gives SQL Server advance warning about the special handling that parameter will require. Be aware, however, that forgetting to include the OUTPUT keyword won't create a runtime error (you won't get any messages about it), but the value for the output parameter won't be moved into your variable (you'll just wind up with what was already there—most likely a NULL value). This means that you'll have what I consider to be the most dreaded of all computer terms—unpredictable results.
 * The variable you assign the output result to does not have to have the same name as the internal parameter in the sproc. For example, in our previous sproc, the internal parameter in the error logging sproc was called @ErrorLogID, but the variable the value was passed to was called @MyOutputParameter.
 * The EXEC (or EXECUTE) keyword was required, since the call to the sproc wasn't the first thing in the batch (you can leave off the EXEC if the sproc call is the first thing in a batch)—personally, I recommend that you train yourself to use it regardless.
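To see that "forgot the OUTPUT keyword" trap for yourself, here is a quick sketch of my own. Called outside of a CATCH block, uspLogError just sets its parameter to 0 and returns, which is enough to show the difference:

DECLARE @ID int;

EXEC dbo.uspLogError @ErrorLogID = @ID; -- no OUTPUT: @ID is left untouched
SELECT @ID AS WithoutOutput; -- still NULL

EXEC dbo.uspLogError @ErrorLogID = @ID OUTPUT; -- the value comes back out
SELECT @ID AS WithOutput; -- 0 (nothing was logged)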
Dealing with Errors

This is one of those sections that squarely overlaps with my "Beginning" title. If you think about it a while, I hope you won't be too surprised.

The problem is fairly simple: Many who learn SQL do so almost by accident. That is, they either don't have a beginning book to read at all, or, at best, they skim an SQL title just enough to get some statements crammed into their client language and eventually move on to some basic sprocs. While they know error handling in their client environment, they suddenly find themselves writing fairly complex stored procedures, having learned the things required to actually make a sproc run, but not much about how a sproc really should be written. I overlap here to back up and catch a spot that a lot of intermediate to fairly advanced stored procedure authors often have very little real exposure to.

If you already have the whole error handling in SQL thing down cold, then I'd suggest just skimming through this section for new ideas and otherwise moving on to the coverage of table-valued parameters and .NET programming in SQL Server.

Four common types of errors can happen in SQL Server:

 * Errors that create runtime errors and stop your code from proceeding further.
 * Errors that are informational in nature and do not create runtime errors. A non-zero error number is returned (if you ask), but no error is raised (and so no error trapping will be activated unless you are testing for that specific error).
 * Errors that create runtime errors but continue execution within SQL Server, such that you can trap them and respond in the manner of your choosing.
 * Errors that are more logical in nature and to which SQL Server is essentially oblivious.

Now, here things get a bit sticky, and versions become important, so hang with me as I lead you down a winding road.

We touched on TRY/CATCH blocks in our last chapter and examined how to make use of them, but they weren't always a part of T-SQL. The possibilities for error handling have changed a lot over the years, particularly back in SQL Server 2005. Today, we have genuine error traps in the form of the aforementioned TRY/CATCH blocks. There is, as you might expect, backward compatibility to consider, but that continues to be less of a consideration as SQL Server 2000 fades in support.

One thing remains common between the old and new error-handling models: higher-level runtime errors. Some general errors cause SQL Server to terminate the script immediately. This was true prior to TRY/CATCH, and it remains true even in the TRY/CATCH era. Errors that have enough severity to generate a runtime error are problematic from the SQL Server side of the equation.
The new TRY/CATCH logic is a bit more flexible for some errors than the error-trapping model that preceded it, but even now your sproc won't necessarily know when something bad happens (it just depends how bad "bad" is). On the bright side, all the current data access object models pass through the message on such errors, so you know about them in your client application and can do something about them there.

The Way We Were

In older versions of SQL Server (prior to 2005), there was no formal error handler. You didn't have an option that essentially said, "If any error happens, go run this code over in this other spot." Instead, you had to monitor for error conditions within your own code and then decide what to do at the point you detected the error—possibly well after the actual error occurred. Let's go ahead and take a look at how we handle errors in that model.

In case you're in the "since we have the new TRY/CATCH blocks, why do I even care about this?" frame of mind, let me point out that there is tons of code out there written for those earlier versions of SQL Server (before TRY/CATCH), and older-style code continues to be written by developers who either don't know the newer way or are just too much creatures of habit to use it. In short, it's important to understand this way of doing things so you understand other code that you will see in your career.

Handling Inline Errors

Inline errors are those pesky little things where SQL Server keeps running as such, but hasn't, for some reason, succeeded in doing what you wanted it to do. For example, try to insert a record into the Person.EmailAddress table that doesn't have a corresponding record in the Person.BusinessEntity table:

USE AdventureWorks2008;
GO

INSERT INTO Person.EmailAddress
(BusinessEntityID, EmailAddress)
VALUES
(0, 'robv@professionalsql.com');

SQL Server won't perform this insert for you because there is a FOREIGN KEY constraint on BusinessEntityID that references another table. Since there is no matching record in that table, the record we are trying to insert into Person.EmailAddress violates that foreign key constraint and is rejected:

Msg 547, Level 16, State 0, Line 2
The INSERT statement conflicted with the FOREIGN KEY constraint
"FK_EmailAddress_Person_BusinessEntityID". The conflict occurred in database
"AdventureWorks2008", table "Person.Person", column 'BusinessEntityID'.
The statement has been terminated.

Pay attention to that error 547 up there. That's something you can use.

Using @@ERROR

@@ERROR contains the error number of the last T-SQL statement executed. If the value is zero, then no error occurred. This is somewhat similar to the ERROR_NUMBER() function we saw in the previous chapter when we first discussed TRY/CATCH blocks. The difference is that ERROR_NUMBER() is valid only within a CATCH block (where it remains the same for the life of that CATCH block), whereas @@ERROR receives a new value with each statement you execute.

The caveat with @@ERROR, then, is that it is reset with each new statement. This means that if you want to defer analyzing the value, or you want to use it more than once, you need to move the value into some other holding bin—a local variable that you have declared for this purpose.

Let's play with this just a bit using the INSERT example from before:

USE AdventureWorks2008;
GO

DECLARE @Error int;

-- Bogus INSERT - there is no BusinessEntityID of 0.
INSERT INTO Person.EmailAddress
(BusinessEntityID, EmailAddress)
VALUES
(0, 'robv@professionalsql.com');

-- Move our error code into safekeeping. Note that, after this statement,
-- @@ERROR will be reset to whatever error number applies to this statement
SELECT @Error = @@ERROR;

-- Print out a blank separator line
PRINT '';

-- The value of our holding variable is just what we would expect
PRINT 'The Value of @Error is ' + CONVERT(varchar, @Error);

-- The value of @@ERROR has been reset - it's back to zero
PRINT 'The Value of @@ERROR is ' + CONVERT(varchar, @@ERROR);

Now execute the script, and you can examine how @@ERROR is affected:

Msg 547, Level 16, State 0, Line 6
The INSERT statement conflicted with the FOREIGN KEY constraint
"FK_EmailAddress_Person_BusinessEntityID". The conflict occurred in database
"AdventureWorks2008", table "Person.Person", column 'BusinessEntityID'.
The statement has been terminated.
The Value of @Error is 547
The Value of @@ERROR is 0

This illustrates pretty quickly the issue of saving the value from @@ERROR. The first error statement is only informational in nature: SQL Server has thrown that error but hasn't stopped the code from executing. Indeed, the only part of that message that your sproc has access to is the error number. That error number resides in @@ERROR for just the next T-SQL statement; after that, it's gone.

Notice that @Error and @@ERROR are two separate and distinct variables and can be referred to separately. This isn't because of the case difference. (Depending on how you have your server configured, case sensitivity can affect your variable names.) It's because the @ or @@ prefix is part of the name, so the number of @ symbols on the front makes each one separate and distinct from the other.

Using @@ERROR in a Sproc

OK, so let's start with an assumption here: If you're using @@ERROR, then the likelihood is that you are not using TRY/CATCH blocks. If you have not made this choice for backward compatibility reasons, I'm going to bop you upside the head and suggest you reconsider—TRY/CATCH is much cleaner and all-around better.

TRY/CATCH will handle varieties of errors that, in previous versions, would have terminated the execution of your script.

That said, TRY/CATCH is out of the equation if backward compatibility with SQL Server 2000 or prior is what you need, so let's take a quick look.

What we're going to do is look at two short procedures to see how inline error checking works when it works, and how it doesn't when it doesn't (in particular, where inline does not work but TRY/CATCH would).

Let's start with the referential integrity example we did a moment ago:

USE AdventureWorks2008;
GO

INSERT INTO Person.EmailAddress
(BusinessEntityID, EmailAddress)
VALUES
(0, 'robv@professionalsql.com');

You may recall this got us a simple 547 error. This is one of those that is trappable. We could trap this in a simple script, but let's do it as a sproc, since procedural stuff is supposedly what we're working on here....
USE AdventureWorks2008;
GO

CREATE PROC spInsertValidatedEmailAddress
@BusinessEntityID int,
@EmailAddress nvarchar(50)
AS
BEGIN
DECLARE @Error int;

INSERT INTO Person.EmailAddress
(BusinessEntityID, EmailAddress)
VALUES
(@BusinessEntityID, @EmailAddress);

SET @Error = @@ERROR;

IF @Error = 0
PRINT 'New Record Inserted';
ELSE
BEGIN
IF @Error = 547 -- Foreign Key violation. Tell them about it.
PRINT 'At least one provided parameter was not found. Correct and retry';
ELSE -- something unknown
PRINT 'Unknown error occurred. Please contact your system admin';
END
END

Now try executing this with values that work:

EXEC spInsertValidatedEmailAddress 1, 'robv@professionalsql.com';

Our insert happens correctly, so no error condition is detected (because there isn't one):

(1 row(s) affected)
New Record Inserted

Now, try something that should blow up:

EXEC spInsertValidatedEmailAddress 0, 'robv@professionalsql.com';

And you see not only the actual SQL Server message but also the message from our error trap (note that there is no way of squelching the SQL Server message):

Msg 547, Level 16, State 0, Procedure spInsertValidatedEmailAddress, Line 10
The INSERT statement conflicted with the FOREIGN KEY constraint
"FK_EmailAddress_Person_BusinessEntityID". The conflict occurred in database
"AdventureWorks2008", table "Person.Person", column 'BusinessEntityID'.
The statement has been terminated.
At least one provided parameter was not found. Correct and retry

As you can see, we were able to detect our error without a TRY/CATCH block.

Now, let's move on to an example of why TRY/CATCH is better—a situation where a TRY/CATCH works fine, but where inline error checking fails. To show this one off, all we need to do is use our TRY/CATCH example from the scripting chapter. It looked like this:

BEGIN TRY
-- Try and create our table
CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
)
END TRY
BEGIN CATCH
-- Uh oh, something went wrong, see if it's something
-- we know what to do with
DECLARE @ErrorNo int,
@Severity tinyint,
@State smallint,
@LineNo int,
@Message nvarchar(4000)
SELECT
@ErrorNo = ERROR_NUMBER(),
@Severity = ERROR_SEVERITY(),
@State = ERROR_STATE(),
@LineNo = ERROR_LINE(),
@Message = ERROR_MESSAGE()
IF @ErrorNo = 2714 -- Object exists error, we knew this might happen
PRINT 'WARNING: Skipping CREATE as table already exists'
ELSE -- hmm, we don't recognize it, so report it and bail
RAISERROR(@Message, 16, 1)
END CATCH

It worked just fine. But if I try to do this using inline error checking, I have a problem:

CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
);

IF @@ERROR != 0
PRINT 'Problems!';
ELSE
PRINT 'Everything went OK!';

Run this (you'll need to run it twice to generate the error if the table isn't already there), and we quickly find out that, without the TRY block, SQL Server aborts the script entirely on the particular error we're generating here:

Msg 2714, Level 16, State 6, Line 2
There is already an object named 'OurIFTest' in the database.

Notice that our PRINT statements never got a chance to execute—SQL Server had already terminated processing. With TRY/CATCH we were able to trap and handle this error, but with inline error checking, our attempts to trap an error like this fail.
Manually Raising Errors

Sometimes you have errors that SQL Server doesn't really know about, but you wish it did. For example, perhaps in the previous example you don't want to return -1000. Instead, you'd like to be able to create a runtime error at the client end that the client would then use to invoke an error handler and act accordingly. To do this, you use the RAISERROR command in T-SQL. The syntax is pretty straightforward:

RAISERROR (<message ID|message string|@variable>, <severity>, <state>
    [, <argument>
    [,<...n>]])
    [WITH option[,...n]]

Message ID/Message String

The message ID or message string you provide determines which message is sent to the client.

Using a message ID creates a manually raised error with the ID that you specified and the message that is associated with that ID as found in the sys.messages system view.

If you want to see what your SQL Server has as predefined messages, you can always perform a SELECT * FROM sys.messages. This includes any messages you've manually added to your system using the sp_addmessage stored procedure or through Management Studio.

You can also just supply a message string in the form of ad hoc text without creating a more permanent message in sys.messages:

RAISERROR ('Hi there, I''m an error', 1, 1);

This raises a rather simple error message:

Hi there, I'm an error
Msg 50000, Level 1, State 1

Notice that the assigned message number, even though you didn't supply one, is 50000. This is the default error value for any ad hoc error. It can be overridden using the WITH SETERROR option. (We'll look at that briefly in a moment.)

Severity

We got a quick overview of this when looking at TRY/CATCH in the chapter on scripting. For those of you already familiar with Windows servers, severity should be an old friend. Severity is an indication of just how bad things really are based on this error. For SQL Server, however, what severity codes mean can get a little bizarre. They can range from informational (severities 1–18), to system level (19–25), and even catastrophic (20–25). If you raise an error of severity 19 or higher (system level), the WITH LOG option must also be specified. 20 and higher automatically terminates the users' connections. (They hate that!)

So, to get back to what I meant by bizarre: SQL Server actually varies its behavior into more ranges than Windows does, or even than the Books Online will tell you about. Errors fall into five major groupings, as shown in the following table:

Severity | Behavior
---|---
1–10 | Purely informational, but will return the specific error code in the message information.
11–16 | If you do not have a TRY/CATCH block set up, then these terminate execution of the procedure and raise an error at the client. The state is shown to be whatever value you set it to. If you have a TRY/CATCH block defined, then that handler will be called rather than raising an error at the client.
17 | Usually, only SQL Server should use this severity. Basically, it indicates that SQL Server has run out of resources—for example, tempdb was full—and can't complete the request. Again, a TRY/CATCH block will get this before the client does.
18–19 | Both of these are severe errors and imply that the underlying cause requires system administrator attention. With 19, the WITH LOG option is required, and the event will show up in the NT or Windows Event Log if you are using that OS family. These are the final levels at which you can trap the error with a TRY/CATCH block—after this, it will go straight to the client.
20–25 | Your world has just caved in, as has the user's connection. Essentially, this is a fatal error. The connection is terminated. As with 19, you must use the WITH LOG option, and a message will, if applicable, show up in the Event Log.
State

State is an ad hoc value. It's something that recognizes that exactly the same error may occur at multiple places within your code. The notion is that this gives you an opportunity to send something of a place marker for where exactly the error occurred.

State values can be between 1 and 127. If you are troubleshooting an error with Microsoft tech support, they apparently have some arcane knowledge that hasn't been shared with us about what some of these mean. I'm told that if you make a tech support call to Microsoft, they are likely to ask about and make use of this state information.

One way I make use of State when raising my own errors is as a location tool. There will be instances where your procedure has the potential to raise the same error in multiple places in the sproc—I will change the State information in my RAISERROR to provide an extra indication of which specific line raised the error.

Error Arguments

Some predefined errors accept arguments. These allow you to make the error somewhat more dynamic, adapting to the specific nature of the error. You can also format your own error messages to accept arguments.

When you want to use dynamic information in what is otherwise a static error message, you need to format the fixed portion of your message so that it leaves room for the parameterized section of the message. You do so by using placeholders. If you're coming from the C or C++ world, then you'll recognize the parameter placeholders immediately; they are similar to printf() arguments. If you're not from the C world, these may seem a little odd to you. All the placeholders start with the % sign and are then coded for the kind of information you'll be passing to them, as shown in the following table.

Placeholder Type Indicator | Type of Value
---|---
d | Signed integer. Books Online indicates that i is an acceptable choice, but I've had problems getting it to work as expected.
o | Unsigned octal.
p | Pointer.
s | String.
u | Unsigned integer.
X or x | Unsigned hexadecimal.

In addition, there is the option to prefix any of these placeholder indicators with some additional flag and width information:

Flag | What It Does
---|---
− (dash or minus sign) | Left-justify. Only makes a difference when you supply a fixed width.
+ (plus sign) | Indicates the positive or negative nature if the parameter is a signed numeric type.
0 | Tells SQL Server to pad the left side of a numeric value with zeros until it reaches the width specified in the width option.
# (pound sign) | Applies only to octal and hex values. Tells SQL Server to use the appropriate prefix (0 or 0x) depending on whether it is octal or hex.
' ' (space) | Pads the left of a numeric value with spaces if positive.

Last, but not least, you can also set the width, precision, and long/short status of a parameter:

* Width—Set by simply supplying an integer value for the amount of space you want to hold for the parameterized value. You can also specify a *, in which case SQL Server will automatically determine the width according to the value you've set for precision.
* Precision—Determines the maximum number of digits output for numeric data.
* Long/Short—Set by using an h (short) or l (long) when the type of the parameter is an integer, octal, or hex value.

Let's use these in an example:

RAISERROR ('This is a sample parameterized %s, along with a zero
padding and a sign%+010d', 1, 1, 'string', 12121);

If you execute this, you get back something that looks a little different from what's in the quotation marks:

This is a sample parameterized string, along with a zero
padding and a sign+000012121
Msg 50000, Level 1, State 1

The extra values supplied were inserted, in order, into your placeholders, with the final value being reformatted as specified.

WITH

Currently, there are three options you can specify with WITH: LOG, which writes the error to the SQL Server error log and the Windows Event Log (and is required for severities of 19 and higher); NOWAIT, which sends the message to the client immediately instead of waiting for the output buffer to fill; and SETERROR, which sets @@ERROR to the message ID (50000 for an ad hoc message) regardless of the severity used.
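Here's a quick sketch of NOWAIT and SETERROR in action (the message text is purely a placeholder):

-- NOWAIT pushes the message to the client right away, which is handy
-- for progress reporting in long-running scripts
RAISERROR ('Step 1 complete...', 0, 1) WITH NOWAIT;

-- SETERROR forces @@ERROR to pick up the message ID (50000 for ad hoc
-- text), even though a severity this low would normally leave it at 0
RAISERROR ('Hi there, I''m an error', 1, 1) WITH SETERROR;

PRINT 'The Value of @@ERROR is ' + CONVERT(varchar, @@ERROR); -- 50000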
The syntax for creating an aggregate from an assembly is mercifully brief:

CREATE AGGREGATE [<schema name>.]<aggregate name>
    (@param_name <input type>)
RETURNS <return type>
EXTERNAL NAME <assembly name>[.<class name>][;]

So, to create the aggregate from our assembly, we would do something like:

CREATE AGGREGATE dbo.Product(@input float)
RETURNS float
EXTERNAL NAME ExampleAggregate.Product;

And, with that, we're ready to try it out. To test it, we'll create a small sample table that includes some data that can be multiplied along with a grouping column, so we can test how our aggregate works in a GROUP BY scenario.

CREATE TABLE TestAggregate
(
    PK int NOT NULL PRIMARY KEY,
    GroupKey int NOT NULL,
    Value float NOT NULL
);

Now we just need some test data:

INSERT INTO TestAggregate(PK, GroupKey, Value)
VALUES (1, 1, 2),
       (2, 1, 6),
       (3, 1, 1.5),
       (4, 2, 2),
       (5, 2, 6);

And we're ready to give our aggregate a try. What we're going to be doing is returning the PRODUCT of all the rows within each group (our sample data has two groups, so this should work out to two rows).

SELECT GroupKey, dbo.Product(Value) AS Product
FROM TestAggregate
GROUP BY GroupKey;

Run this and we get back two rows (just as we expected):

GroupKey    Product
----------- ----------------------
1           18
2           12

(2 row(s) affected)

Do the math on our sample data, and you'll see we got back just what we wanted.

If you're thinking about it, you should be asking yourself, "OK, this is great, but how often am I really going to use this?" For most of you, the answer will be "never." There are, however, those times where what's included just isn't ever going to do the job. Aggregates are one of those places where special cases come rarely, but when they come, they really need exactly what they need and nothing else. In short, I wouldn't crowd your brain cells by memorizing every little thing about this section, but do take the time to learn what's involved and get a concept for what it can and can't do so you know what's available should you need it.

Creating Triggers from Assemblies

Note that we have a bit of a "chicken or the egg" (which came first?) thing going on with triggers and .NET. Triggers are not covered until Chapter 12, but I wanted to keep all .NET items close together for reference reasons. If you understand the basics of triggers, you'll be fine with this—if not, you may want to read Chapter 12 first, and then come back to this.

Much like the other assembly types we've worked with so far in this chapter, triggers have a lot in common with the rest, but also their own little smattering of special things.

The differences will probably come to mind quickly if you think about it for any length of time:

* How do we deal with the contextual nature of triggers? That is, how do we know to handle things differently if it's an INSERT trigger situation versus a DELETE or UPDATE trigger?
* How do we access the inserted and deleted tables?

You may recall from earlier examples how we can obtain the "context" of the current connection—it is by utilizing this context that we are able to gain access to different objects that we are interested in.
For example, the SqlContext object that we've obtained a connection from in prior examples also contains a SqlTriggerContext object—we can use that to get properties such as whether we are dealing with an insert, update, or delete scenario (the first question we had). The fact that we have access to the current connection also implies that we are able to access the inserted and deleted tables simply by querying them. Let's get right to putting this to use in an example.

Start by creating a new SQL Server project in Visual Studio (I've called mine ExampleTrigger this time). Once your project is up, right-click the project in the Solution Explorer and select Add ⇒ Trigger.

Visual Studio is nice enough to provide you with what is, for the most part, a working template. Indeed, it would run right as provided except for one issue:

using System;
using System.Data;
using System.Data.SqlClient;
using Microsoft.SqlServer.Server;

public partial class Triggers
{
    // Enter existing table or view for the target and uncomment the attribute line
    // [Microsoft.SqlServer.Server.SqlTrigger (Name="ExampleTrigger",
    //     Target="Table1", Event="FOR UPDATE")]
    public static void ExampleTrigger()
    {
        // Replace with your own code
        SqlContext.Pipe.Send("Trigger FIRED");
    }
}

The commented-out attribute is the key code line here. At issue is that we must provide more information to SQL Server than we do in our other object types. Specifically, we must identify what table and events we're going to be executing our trigger against. We're actually going to create a special demonstration table for this before the trigger is put into action, so we can just use the table name TestTrigger for now.

[Microsoft.SqlServer.Server.SqlTrigger (Name="ExampleTrigger",
    Target="TestTrigger", Event="FOR INSERT, UPDATE, DELETE")]

Notice that I've also altered what events will fire our trigger to include all event types.

Now we'll update the meat of things just a bit, so we can show off different actions we might take in our trigger and, perhaps more importantly, how we can check the context of things and make our actions specific to what has happened to our table. We'll start by getting our method going:

public static void ExampleTrigger()
{
    // Get a handle to our current connection
    SqlConnection cn = new SqlConnection("context connection=true");
    cn.Open();

    SqlTriggerContext ctxt = SqlContext.TriggerContext;
    SqlCommand cmd = new SqlCommand();
    cmd.Connection = cn;

So far, this isn't much different from what we've used in our other .NET examples. Perhaps the only significant difference from things we've seen already is the SqlTriggerContext object—we will use this later on to determine what action caused the trigger to fire.

We're ready to start the code that is conditional on the action the trigger is firing for (based on the TriggerAction property of the TriggerContext of the SqlContext). For this, I'm going to use a simple switch command (though there are those that will call me a programming charlatan for using a switch statement—to them I say "deal with it!"). I'm also going to pipe out various things to the client to report what we're doing.

In practice, you generally do not want to be outputting information from a trigger—figure that they should usually run silently as far as the client is concerned. I've gone ahead and output several items in this example just to make it readily apparent what the trigger is doing under what scenario.
    switch (ctxt.TriggerAction)
    {
        case TriggerAction.Insert:
            cmd.CommandText = "SELECT COUNT(*) AS NumRows FROM INSERTED";
            SqlContext.Pipe.Send("Insert Trigger Fired");
            SqlContext.Pipe.ExecuteAndSend(cmd);
            break;
        case TriggerAction.Update:
            // This time, we'll use datareaders to show how we can
            // access the data from the inserted/deleted tables
            SqlContext.Pipe.Send("Update Trigger Fired");
            SqlContext.Pipe.Send("inserted rows...");
            cmd.CommandText = "SELECT * FROM INSERTED";
            SqlContext.Pipe.Send(cmd.ExecuteReader());
            break;
        case TriggerAction.Delete:
            // And now we'll go back to what we did with the inserted rows...
            cmd.CommandText = "SELECT COUNT(*) AS NumRows FROM DELETED";
            SqlContext.Pipe.Send("Delete Trigger Fired");
            SqlContext.Pipe.ExecuteAndSend(cmd);
            break;
    }

    SqlContext.Pipe.Send("Trigger Complete");
    }
}

And, with that, we're ready to compile and upload it. The assembly upload works just as most of them have so far (we're back to not needing anything other than the default PERMISSION_SET):

CREATE ASSEMBLY ExampleTrigger
FROM '\ExampleTrigger\bin\Debug\ExampleTrigger.dll';

Before we get to creating the reference to the trigger, however, we need a table. For this example, we'll just create something very simple:

CREATE TABLE TestTrigger
(
    PK int NOT NULL PRIMARY KEY,
    Value varchar(max) NOT NULL
);

With the assembly uploaded and the table created, we're ready to create our trigger reference.

Much like stored procedures and functions, a .NET trigger creation is made from the same statement as T-SQL-based triggers. We eliminate the T-SQL side of things and replace it with the EXTERNAL NAME declaration:

CREATE TRIGGER trgExampleTrigger
ON TestTrigger
FOR INSERT, UPDATE, DELETE
AS EXTERNAL NAME ExampleTrigger.Triggers.ExampleTrigger;

And with that, our trigger should be in place on our table and ready to be fired whenever one of its trigger actions occurs (which happens to be every action), so let's test it.

We'll start by getting a few rows inserted into our table. And, wouldn't you just know it? That will allow us to test the insert part of our trigger. (Note that these are two separate statements, so the trigger will fire twice.)

INSERT INTO TestTrigger
(PK, Value)
VALUES
(1, 'first row');

INSERT INTO TestTrigger
(PK, Value)
VALUES
(2, 'second row');

Run this, and we not only get our rows in, but we also get a little bit of feedback that is coming out of our trigger:

Insert Trigger Fired

NumRows
-----------
1

(1 row(s) affected)

Trigger Complete

(1 row(s) affected)

Insert Trigger Fired

NumRows
-----------
1

(1 row(s) affected)

Trigger Complete

(1 row(s) affected)

As you can see, we're getting output from our trigger. Notice that we're getting the "(1 row(s) affected)" both from the query running inside the trigger and from the one that actually inserted the data. We could have taken any action that could have been done in a T-SQL trigger (though many are more efficient if you stay in the T-SQL world). The key is that we could do so much more if we had the need. We could, for example, make an external call or perform a calculation that isn't doable in the T-SQL world.
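As an aside, if you ever lose track of which .NET modules are actually registered in a database, the catalog views will tell you. A small sketch (this lists the methods from this chapter's assemblies, along with any others you've loaded):

-- One row per method exposed by each uploaded assembly
SELECT a.name AS AssemblyName,
       m.assembly_class,
       m.assembly_method
FROM sys.assemblies AS a
JOIN sys.assembly_modules AS m
    ON a.assembly_id = m.assembly_id;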
There is an old saying: "Caution is the better part of valor." This could have been written with triggers in mind. I can't possibly express enough about the "be careful" when it comes to what you're doing in triggers. Just because you can make an external call doesn't make it a smart thing to do. Assess the need—is it really that important that the call be made right then? Realize that these things can be slow, and whatever transaction that trigger is participating in will not complete until the trigger completes—this means you may be severely damaging performance.

Okay, so with all that done, let's try an update:

UPDATE TestTrigger
SET Value = 'Updated second row'
WHERE PK = 2;

And let's see what we get back:

Update Trigger Fired
inserted rows...

PK          Value
----------- ---------------------------------------
2           Updated second row

(1 row(s) affected)

Trigger Complete

(1 row(s) affected)

The result set we're getting back is the one our trigger is outputting. That's followed by some of our other output as well as the base "(1 row(s) affected)" that we would normally expect from our single-row update. Just as with the insert statement, we were able to see what had happened and could have adapted accordingly.

And so, that leaves us with just the delete statement. This time, we'll delete all the rows, and we'll see how the count of our deleted table does indeed reflect both of the deleted rows.

DELETE TestTrigger;

And again check the results:

Delete Trigger Fired

NumRows
-----------
2

(1 row(s) affected)

Trigger Complete

(2 row(s) affected)

Now, these results may be just a little confusing, so let's look at what we have.

We start with the notification that our trigger fired. That comes from our trigger. (Remember, we send that message down the pipe ourselves.) Then comes the result set from our SELECT COUNT(*). Notice the "(1 row(s) affected)"—that's from our result set rather than the DELETE that started it all. We then get to the end of execution of our trigger (again, we dropped that message in the pipe), and, finally, the "(2 row(s) affected)" that was from the original DELETE statement.

And there we have it. We've done something to address every action scenario, and we could have, of course, done a lot more within each. We could also do something to address an INSTEAD OF trigger if we needed to.

Custom Data Types

Sometimes you have the need to store data that you want to be strongly typed but that doesn't fit within SQL Server's simple data type list. Indeed, you may need to invoke a complex set of rules in order to determine whether or not the data properly meets the type requirement.

Requests for support of complex data types have been around a very long time. Indeed, I can recall being at the Sphinx Beta 2.0—known to most as Beta 2 for SQL Server 7.0—event in 1998, and having that come up as something like the second most requested item in a request session I was at. Well, it took a lot of years, but it's finally here.

By utilizing a .NET assembly, we can achieve a virtually limitless number of possibilities in our data types. The type can have complex rules or even contain multiple properties.

Before we get to the syntax for adding assemblies, let's get an assembly constructed.

The sample used here will be the ComplexNumber.sln solution included in the SQL Server samples. You will need to locate the base directory for the solution—the location of which will vary depending on your particular installation.

We need to start by creating the signature keys for this project.
To do this, I recommend starting with your solution directory as the current directory and then calling sn.exe using a fully qualified path (or, if your .NET Framework directory is already in your PATH, then it's that much easier!). For me, it looks like this:

C:\Program Files\Microsoft.NET\SDK\v2.0 64bit\LateBreaking\SQLCLR\UserDefinedDataType>
"C:\Program Files (x86)\Microsoft Visual Studio 8\SDK\v2.0\Bin\sn" -k temp.snk

And with that, you're ready to build your DLL.

Let's go ahead and upload the actual assembly (alter this to match the paths on your particular system):

CREATE ASSEMBLY ComplexNumber
FROM '\ComplexNumber\bin\debug\ComplexNumber.dll'
WITH PERMISSION_SET = SAFE;

And with the assembly loaded, we're ready to begin.

Creating Your Data Type from Your Assembly

So, you have an assembly that implements your complex data type and have uploaded it to SQL Server using the CREATE ASSEMBLY command. You're ready to instruct SQL Server to use it. This works pretty much as other assemblies have. The syntax (you may recall from Chapter 7) looks like this:

CREATE TYPE [<schema name>.]<type name>
EXTERNAL NAME <assembly name>[.<class name>][;]

You'll notice immediately that it looks like our previous assembly-related constructs, and, indeed, the use is the same.

So, utilizing our complex type created in the last section, it would look like this:

CREATE TYPE ComplexNumber
EXTERNAL NAME [ComplexNumber].[Microsoft.Samples.SqlServer.ComplexNumber];

Accessing Your Complex Data Type

Microsoft has provided a file called test.sql for testing the assembly we just defined as our complex data type, but I find it falls just slightly short of where we want to be in our learning here. What I want to emphasize is how the various functions of the supporting class for our data type are still available. In addition, each individual property of the variable is fully addressable. So, let's run a modified version of the provided script:

USE AdventureWorks2008;
GO

-- create a variable of the type, create a value of the type and invoke
-- a behavior over it
DECLARE @c ComplexNumber;
SET @c = CONVERT(ComplexNumber, '(1, 2i)');

SELECT @c.ToString() AS FullValueAsString;
SELECT @c.Real AS JustRealProperty;
GO

Now run it, and check out the results:

FullValueAsString
------------------
(1,2i)

(1 row(s) affected)

JustRealProperty
------------------
1

(1 row(s) affected)

In the first result that was returned, the ToString function was called as defined as a method of our class. The string is formatted just as our method desires. If we had wanted to reverse the order of the numbers or some silly thing like that, we would only have needed to change the ToString function in the class, recompile it, and re-import it into our database.

In our second result, we address just one property of our complex data type. The simple dot (.) delimiter told SQL Server that we were looking for a property—just as it would in C# or VB.NET.

Dropping Data Types

As you might expect, the syntax for dropping a user-defined data type works just like other drop statements:

DROP TYPE [<schema name>.]<type name>[;]

And it's gone—maybe.

Okay, so why a "maybe" this time? Well, if there is most any object out there that references this data type, then the DROP will be disallowed and will fail. So, if you have a table that has a column of this type, then an attempt to drop it would fail.
Likewise, if you have a schema-bound view, stored procedure, trigger, or function defined that utilizes this type, then a drop would also fail.

Note that this form of restriction appears in other places in SQL Server—such as dropping a table when it is the target of a foreign key reference—but those restrictions tend to be less all-encompassing than this one is (virtually any use of the type in your database at all will block the drop), so I haven't felt as much need to point them out (they were more self-explanatory).

Summary

Well, if you aren't thinking to yourself something along the lines of "Wow, some of that stuff is pretty powerful," then I can only guess you somehow skipped straight to the summary without reading the rest of the chapter. That's what this chapter is all about—giving you the power to do very complex things (or, in a few cases, simple things that still weren't possible before).

There is a lot to think about coming out of this chapter. You have table-valued parameters, which allow a sharp reduction in round trips from the client and further allow you to bundle more logic into a single parent sproc.

When using assemblies, you need to be careful. Think about what you're doing, and analyze each of the steps that your assembly is going to be taking even more thoroughly than you already do. Consider the latency you're going to be adding if you create long-running processes. Consider the external dependencies you are creating if you make external calls—how reliable are those external processes? You need to know, as your system is now only as reliable as the external systems you're calling.

As always, think about what you need, and don't make your solution any more complex than it needs to be. Keep in mind, however, that what seems at first to be the more complex solution may actually be simpler in the end. I've seen assemblies that solved the seemingly unsolvable T-SQL problem. Keeping your system away from assemblies would seem to make it simpler, but what's better: a 300-line, complex T-SQL stored proc or an assembly that is concise and takes only 25 lines including declarations?

Choose wisely.

11

Transactions and Locks

What to do...? What to do...? This I pondered when considering this chapter. Since I usually teach this topic even to so-called "beginners" (and I have coverage of it in Beginning SQL Server 2008 Programming), I seriously debated removing this subject from the Professional title. The problem with that, however, is that, while fundamental in nature, transactions and locks are a fundamental that even lots of fairly advanced users don't quite "get." You see, while nothing in this chapter is wildly difficult, transactions and locks tend to be two of the most misunderstood areas in the database world.

This is one of those chapters that, when you go back to work, will make you sound like you've had your Wheaties today. As such, this "beginning" (or at least I think it's a basic) concept is going to make you start to look like a real pro.

In this chapter, we're going to:

* Examine transactions
* Examine how the SQL Server log and "checkpoints" work
* Unlock your understanding of locks

Now, lest you think that I've suddenly decided to treat you like a rookie, rest assured, we will look a tad more in depth in several places than I necessarily do for beginning readers.

Transactions

Transactions are all about atomicity. Atomicity is the concept that something should act as a unit.
From our database standpoint, it's about the smallest grouping of one or more statements that should be considered to be "all or nothing."

Often, when dealing with data, we want to make sure that if one thing happens, another thing happens, or that neither of them does. Indeed, this can be carried out to the degree where 20 things (or more) all have to happen together or nothing happens. Let's look at a classic example.

Imagine that you are a banker. Sally comes in and wants to transfer $1,000 from checking to savings. You are, of course, happy to oblige, so you process her request.

Behind the scenes, we have something like this happening:

UPDATE checking
SET Balance = Balance - 1000
WHERE Account = 'Sally';

UPDATE savings
SET Balance = Balance + 1000
WHERE Account = 'Sally';

This is a hypersimplification of what's going on, but it captures the main thrust of things: You need to issue two different statements—one for each account.

Now, what if the first statement executes and the second one doesn't? Sally would be out a thousand dollars! That might, for a short time, seem okay from your perspective (heck, you just made a thousand bucks!), but not for long. By that afternoon you'd have a steady stream of customers leaving your bank. It's hard to stay in the bank business with no depositors.

What you need is a way to be certain that if the first statement executes, the second statement executes. At first, it would seem that there really isn't a way that we can be certain of that. All sorts of things can go wrong, from hardware failures to simple things such as violations of data integrity rules. Fortunately, however, there is a way to do something that serves the same overall purpose. We can essentially forget that the first statement ever happened. We can enforce at least the notion that if one thing didn't happen, then nothing did—at least within the scope of our transaction.

In order to capture this notion of a transaction, however, we need to be able to define boundaries. A transaction has to have very definitive begin and end points. Actually, every SELECT, INSERT, UPDATE, and DELETE statement you issue in SQL Server is part of an implicit transaction. Even if you issue only one statement, that one statement is considered to be a transaction. Everything about the statement will be executed, or none of it will. Indeed, by default, that is the length of a transaction—one statement.

Again: Every SELECT, INSERT, UPDATE, and DELETE statement you issue in SQL Server is part of an implicit transaction. Even if you issue only one statement, that one statement is considered to be a transaction. Everything about the statement will be executed, or none of it will.

But what if we need to have more than one statement be all or nothing—such as our preceding bank example? In such a case, we need a way of marking the beginning and end of a transaction, as well as the success or failure of that transaction. To that end, there are several T-SQL statements that we can use to "mark" these points in a transaction. We can:

* BEGIN a transaction: Set the starting point.
* COMMIT a transaction: Make the transaction a permanent, irreversible part of the database.
* ROLLBACK a transaction: Essentially say that you want to forget that it ever happened.
* SAVE a transaction: Establish a specific marker to allow us to do only a partial rollback.

Let's look over all of these individually before we put them together into our first transaction.
BEGIN TRAN

The beginning of the transaction is probably one of the easiest concepts to understand in the transaction process. Its sole purpose in life is to denote the point that is the beginning of a unit. If, for some reason, we are unable to or do not want to commit the transaction, this is the point to which all database activity will be rolled back. That is, everything beyond this point that is not eventually committed will effectively be forgotten, as far as the database is concerned.

The syntax is:

BEGIN TRAN[SACTION] [<transaction name>|<@transaction variable>]
[WITH MARK ['<description>']][;]

The WITH MARK section is optional and is, in practice, rarely used, but don't discount it as unimportant—quite the contrary!

If you're marking the transaction, you must include the transaction name. (Note that it's the name, not the description, that is required. The name is optional if you're not marking the transaction.) If supplied, the description should be a maximum of 255 characters. (It can be longer, but, if so, it will be truncated to 255.)

Regarding Marking Transactions

Beginning back in SQL Server 2005, we gained the ability, when restoring a database from backups and logs, to restore to a specific point in time. You could specify an exact time that you wanted a backup rolled forward to (utilizing a log), and SQL Server would recover everything up to that point, and nothing beyond. A marked transaction expands this capability by creating a special notation in the transaction log. When performing a point-in-time recovery, you can specify the marked transaction as the point you want to recover to, instead of the time, by simply specifying the description of the mark. You can use this for things such as:

* Marking a point when a critical action took place so that, if necessary, you can recover to just that point
* Marking activity in two databases so that those databases can be restored to a synchronized point in time

This concept of marking your point in time can be a handy thing to have available. While it is something of an extreme use, you will find scenarios where you need to synchronize with external systems (not even necessarily a SQL Server) on backups.

COMMIT TRAN

The committing of a transaction is the end of a completed transaction. At the point that you issue the COMMIT TRAN, the transaction is considered to be what is called durable. That is, the effect of the transaction is now permanent and will last even if you have a system failure (as long as you have a backup or the database files haven't been physically destroyed). The only way to "undo" whatever the transaction accomplished is to issue a new transaction that, functionally speaking, is a reverse of your first transaction.

The syntax for a COMMIT looks pretty similar to a BEGIN:

COMMIT [TRAN[SACTION] [<transaction name>|<@transaction variable>]][;]

Note that, similar to the way EXECUTE can be truncated to EXEC, TRANSACTION can be truncated down to TRAN. While TRANSACTION is the more full and clear form of the word, you'll find, in practice, that most developers use the shortened TRAN. (What can I say? We're apparently a rather lazy bunch.)

SQL Server also supports a more ANSI-compliant syntax in the form of:

COMMIT [WORK][;]

The notion of a transaction name moniker is not supported under this syntax, and, while it is more ANSI compliant, it has, for whatever reason (probably its late addition to the product), been virtually nonutilized with SQL Server in actual practice.
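Tying BEGIN and COMMIT back to the bank example from the start of the chapter, a minimal sketch might look like this (the checking and savings tables are, of course, illustrative only, and we'll fold error handling and ROLLBACK into the mix next):

BEGIN TRAN MoveMoney;

-- Everything from BEGIN to COMMIT is one unit of work
UPDATE checking
SET Balance = Balance - 1000
WHERE Account = 'Sally';

UPDATE savings
SET Balance = Balance + 1000
WHERE Account = 'Sally';

-- Make it durable
COMMIT TRAN MoveMoney;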
ROLLBACK TRAN

Whenever I think of a ROLLBACK, I think of the old movie The Princess Bride. If you've ever seen the film (if you haven't, I highly recommend it), you'll know that the character Vizzini (considered a genius in the film) always said, "If anything goes wrong, go back to the beginning."

That was some mighty good advice. A ROLLBACK does just what Vizzini suggested. It goes back to the beginning. In this case, it's your transaction that goes back to the beginning. Anything that happened since the associated BEGIN statement is effectively forgotten. The only exception to going back to the beginning occurs when using what are called savepoints, which I'll describe shortly.

The syntax for a ROLLBACK again looks pretty much the same, with the exception of allowance for a savepoint:

ROLLBACK TRAN[SACTION] [<transaction name>|<savepoint name>|
<@transaction variable>|<@savepoint variable>][;]

Alternatively, you can use the ANSI syntax similar to what we saw with COMMIT:

ROLLBACK [WORK][;]

SAVE TRAN

To save a transaction is essentially to create something of a bookmark. You establish a name for your bookmark. (You can have more than one.) After this "bookmark" is established, you can reference it in a rollback. What's nice about this is that you can roll back to the exact spot in the code that you want just by naming the savepoint to which you want to roll back.

Names for savepoints must conform to the rules for identifiers that we discussed back in Chapter 1. There is, however, a difference; savepoint names are limited to 32 characters in length.

The syntax is simple enough:

SAVE TRAN[SACTION] [<savepoint name>|<@savepoint variable>][;]

The thing to remember about savepoints is that they are cleared on ROLLBACK—that is, even if you save five savepoints, once you perform one ROLLBACK they are all gone. You can start setting new savepoints again, and rolling back to those, but whatever savepoints you had when the ROLLBACK was issued are gone.

Savepoints were something of a major confusion area for me when I first came across them. Books Online indicates that, after rolling back to a savepoint, you must run the transaction to a logical conclusion. (This is technically correct.) Where the confusion came was in the Books Online implication that seemed to indicate that you had to go to a ROLLBACK or COMMIT without using any more savepoints. This is not the case. You just can't use the savepoints that you declared prior to the ROLLBACK. Savepoints set after that are just fine.

Let's test this out with a bit of code to see what happens when we mix the different types of TRAN commands. Type the following code in, and then we'll run through an explanation of it:

USE AdventureWorks2008; -- We're making our own table - what DB doesn't matter

-- Create table to work with
CREATE TABLE MyTranTest
(
    OrderID INT PRIMARY KEY IDENTITY
);

-- Start the transaction
BEGIN TRAN TranStart;

-- Insert our first piece of data using default values.
-- Consider this record No1. It is also the 1st record that stays
-- after all the rollbacks are done.
INSERT INTO MyTranTest
DEFAULT VALUES;

-- Create a "Bookmark" to come back to later if need be
SAVE TRAN FirstPoint;

-- Insert some more default data (this one will disappear
-- after the rollback).
-- Consider this record No2.
INSERT INTO MyTranTest
DEFAULT VALUES;

-- Roll back to the first savepoint. Anything up to that
-- point will still be part of the transaction. Anything
-- beyond is now toast.
ROLLBACK TRAN FirstPoint;

-- Insert some more default data.
-- Consider this record No3. It is the 2nd record that stays
-- after all the rollbacks are done.
INSERT INTO MyTranTest
DEFAULT VALUES;

-- Create another point to roll back to.
SAVE TRAN SecondPoint;

-- Yet more data. This one will also disappear,
-- only after the second rollback this time.
-- Consider this record No4.
INSERT INTO MyTranTest
DEFAULT VALUES;

-- Go back to second savepoint
ROLLBACK TRAN SecondPoint;

-- Insert a little more data to show that things
-- are still happening.
-- Consider this record No5. It is the 3rd record that stays
-- after all the rollbacks are done.
INSERT INTO MyTranTest
DEFAULT VALUES;

-- Commit the transaction
COMMIT TRAN TranStart;

-- See what records were finally committed.
SELECT TOP 3 OrderID
FROM MyTranTest
ORDER BY OrderID DESC;

-- Clean up after ourselves
DROP TABLE MyTranTest;

First, we create a table to work with for our test:

-- Create table to work with
CREATE TABLE MyTranTest
(
    OrderID INT PRIMARY KEY IDENTITY
);

Since we're creating our own table to play with, what database we are using doesn't really matter for this demonstration.

Then it's time to begin the transaction. This starts our grouping of "all or nothing" statements. We then INSERT a row. At this juncture, we have just one row inserted:

-- Start the transaction
BEGIN TRAN TranStart;

-- Insert our first piece of data using default values.
-- Consider this record No1. It is also the 1st record that stays
-- after all the rollbacks are done.
INSERT INTO MyTranTest
DEFAULT VALUES;

Next, we establish a savepoint called FirstPoint and insert yet another row. At this point, we have two rows inserted, but remember, they are not committed yet, so the database doesn't consider them to be part of the database:

-- Create a "Bookmark" to come back to later if need be
SAVE TRAN FirstPoint;

-- Insert some more default data (this one will disappear
-- after the rollback).
-- Consider this record No2.
INSERT INTO MyTranTest
DEFAULT VALUES;

We then ROLLBACK—explicitly saying that it is not the beginning that we want to roll back to, but just to FirstPoint. With the ROLLBACK, everything between the ROLLBACK and the FirstPoint savepoint is undone. Since we have one INSERT statement between the ROLLBACK and the SAVE, that statement is rolled back. At this juncture, we are back down to just one row inserted. Any attempt to reference a savepoint would now fail, since all savepoints have been reset with our ROLLBACK:

-- Roll back to the first savepoint. Anything up to that
-- point will still be part of the transaction. Anything
-- beyond is now toast.
ROLLBACK TRAN FirstPoint;

We add another row, putting us back up to a total of two rows inserted, and we also create a brand new savepoint. This is perfectly valid, and we can now refer to this savepoint, since it is established after the ROLLBACK:

-- Insert some more default data.
-- Consider this record No3. It is the 2nd record that stays
-- after all the rollbacks are done.
INSERT INTO MyTranTest
DEFAULT VALUES;

-- Create another point to roll back to.
SAVE TRAN SecondPoint;

Time for yet another row to be inserted, bringing our total number of still-valid inserts up to three:

-- Yet more data. This one will also disappear,
-- only after the second rollback this time.
-- Consider this record No4.
INSERT INTO MyTranTest
DEFAULT VALUES;

Now we perform another ROLLBACK, this time referencing our new savepoint (which happens to be the only one valid at this point, since FirstPoint was reset after the first ROLLBACK). This one undoes everything between it and the savepoint it refers to—in this case, just one INSERT statement. That puts us back at two INSERT statements that are still valid:

-- Go back to second savepoint
ROLLBACK TRAN SecondPoint;

We then issue yet another INSERT statement, bringing our total number of INSERT statements that are still part of the transaction back up to three:

-- Insert a little more data to show that things
-- are still happening.
-- Consider this record No5. It is the 3rd record that stays
-- after all the rollbacks are done.
INSERT INTO MyTranTest
DEFAULT VALUES;

Last (for our transaction anyway), but certainly not least, we issue the COMMIT TRAN statement that locks our transaction in and makes it a permanent part of the history of the database:

-- Commit the transaction
COMMIT TRAN TranStart;

-- See what records were finally committed.
SELECT TOP 3 OrderID
FROM MyTranTest
ORDER BY OrderID DESC;

Note that if either of these ROLLBACK statements had not included the name of a savepoint, or had included a name that had been set with the BEGIN statement, then the entire transaction would have been rolled back, and the transaction would be considered to be closed.

With the transaction complete, we can issue a little statement that shows us our three rows. When you look at this, you'll be able to see what's happened in terms of rows being added to and then removed from the transaction:

OrderID
-----------
5
3
1

(3 row(s) affected)

Sure enough, every other row was inserted.

Finally, we clean up after ourselves. This really has nothing to do with the transaction:

DROP TABLE MyTranTest;

How the SQL Server Log Works

You definitely must have the concept of transactions down before you get into trying to figure out the way that SQL Server tracks what's in your database. You see, what you think of as your database is only rarely a complete version of all the data. Except for rare moments when it happens that everything has been written to disk, the data in your database is made up of not only the data in the physical database file(s) but also any transactions that have been committed to the log since the last checkpoint.

In the normal operation of your database, most activities that you perform are "logged" to the transaction log rather than written directly to the database. A checkpoint is a periodic operation that forces all dirty pages for the database currently in use to be written to disk. Dirty pages are log or data pages that have been modified after they were read into the cache, but whose modifications have not yet been written to disk. Without a checkpoint, the log would fill up and/or use all the available disk space. The process works something like the diagram in Figure 11.1.

Figure 11.1

Don't mistake all this as meaning that you have to do something special to get your data out of the cache. SQL Server handles all of this for you. This information is only provided here to facilitate your understanding of how the log works, and, from there, the steps required to handle a transaction. Whether something is in cache or not can make a big difference to performance, so understanding when things are logged and when things go in and out of the cache can be a big deal when you are seeking maximum performance.
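If you'd like to watch the log at work on your own system, there's a simple way to see how full it is at any given moment; run something long and log-intensive, and then take a look. A minimal sketch (no assumptions here beyond being connected to a server):

-- Reports one row per database: Log Size (MB), Log Space Used (%), and Status
DBCC SQLPERF(LOGSPACE);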
Note that the need to read data into a cache that is already full is not the only reason that a checkpoint would be issued. Checkpoints can be issued under the following circumstances:

* By a manual statement—using the CHECKPOINT command.
* At normal shutdown of the server (unless the WITH NOWAIT option is used).
* When you change any database option (for example, single user only, dbo only, and so on).
* When the Simple Recovery option is used and the log becomes 70 percent full.
* When the amount of data in the log since the last checkpoint (often called the active portion of the log) exceeds the size that the server could recover in the amount of time specified in the recovery interval option.

Let's look at each of these more carefully.

Using the CHECKPOINT Command

One way—but probably the least often used way—for the database to have a checkpoint issued is for it to be done manually. You can do this anytime by just typing in the word:

CHECKPOINT

It's just that simple.

SQL Server does a very good job of managing itself in the area of checkpoints, so the times when issuing a manual checkpoint makes sense are fairly rare.

One place that I will do this is during the development cycle, when I have the simple recovery model turned on for my database (you are very unlikely to want that for a production database). It's not at all uncommon during the development stage of your database to perform actions that are long running and fill up the log rather quickly. While I could always just issue the appropriate command to truncate the log myself, CHECKPOINT is a little shorter and faster and, when using the simple recovery model, has the same effect.

At Normal Server Shutdown

Ever wonder why SQL Server can sometimes take a very long time to shut down? Besides the deallocation of memory and other destructor routines that have to run to unload the system, SQL Server must also first issue a checkpoint before the shutdown process can begin. This means that you'll have to wait for any data that's been committed in the log to be written out to the physical database before your shutdown can continue. Checkpoints also occur when the server is stopped:

* Using the Management Studio
* Using the NET STOP MSSQLSERVER instruction at a command window (a DOS box, some would call it) prompt
* Using the Services icon in the Windows Control Panel, selecting the MSSQLSERVER service, and clicking the stop button

Unlike Checkpoint on Recovery, this is something that I like. I like the fact that all my committed transactions are in the physical database (not split between the log and database), which just strikes me as being cleaner, with less chance of data corruption.

There is a way you can get around the delay if you so choose. To use it, you must be shutting down using the SHUTDOWN command in T-SQL. To eliminate the delay associated with the checkpoint (and the checkpoint itself, for that matter), you just add the WITH NOWAIT key phrase to your shutdown statement:

SHUTDOWN [WITH NOWAIT]

Note that I recommend highly against using this unless you have some programmatic need to shut down your server.
It will cause the subsequent restart to take a longer time than usual to recover the databases on the server, and it means that your shutdown is not as clean. (Some data is only in the log rather than all of it being in the database file.)

At a Change of Database Options

A checkpoint is issued anytime you issue a change to your database options, regardless of how the option gets changed (such as using sp_dboption or ALTER DATABASE). The checkpoint is issued prior to making the actual change in the database.

When the Truncate on Checkpoint Option Is Active

If you have turned on the Truncate On Checkpoint database option (which is a common practice during the development phase of your database), then SQL Server will automatically issue a checkpoint any time the log becomes more than 70 percent full.

When Recovery Time Would Exceed the Recovery Interval Option Setting

As we saw briefly earlier (and will see more closely next), SQL Server performs a process called recovery every time the SQL Server is started up. SQL Server will automatically issue a checkpoint any time the estimated time to run the recovery process would exceed the amount of time set in a database option called recovery interval. By default, the recovery interval is set to zero, which means that SQL Server will decide for you. (In practice, this means about one minute.)

Failure and Recovery

A recovery happens every time that SQL Server starts up. SQL Server takes the database file and then applies (by writing them out to the physical database file) any committed changes that are in the log since the last checkpoint. Any changes in the log that do not have a corresponding commit are rolled back—that is, they are essentially forgotten about.

Let's take a look at how this works depending on how transactions have occurred in your database. Imagine five transactions that span the log, as pictured in Figure 11.2.

Figure 11.2

Let's look at what would happen to these transactions one by one.

Transaction 1

Absolutely nothing would happen. The transaction has already been through a checkpoint and has been fully committed to the database. There is no need to do anything at recovery, because any data that is read into the data cache would already reflect the committed transaction.

Transaction 2

Even though the transaction existed at the time that a checkpoint was issued, the transaction had not been committed (the transaction was still going). Without that commitment, the transaction does not actually participate in the checkpoint. This transaction would, therefore, be "rolled forward." This is just a fancy way of saying that we would need to read all the related pages back into cache and then use the information in the log to re-run all the statements that we ran in this transaction. When that's finished, the transaction should look exactly as it did before the system failed.

Transaction 3

It may not look the part, but this transaction is exactly the same as Transaction 2 from the standpoint of what needs to be done. Again, because Transaction 3 wasn't finished at the time of the last checkpoint, it did not participate in that checkpoint, just like Transaction 2 didn't. The only difference is that Transaction 3 didn't even exist at that time, but, from a recovery standpoint, it makes no difference—it's where the commit is issued that makes all the difference.

Transaction 4

This transaction wasn't completed at the time of system failure and must, therefore, be rolled back.
In effect, it never happened from a row data perspective. The user would have to re-enter any data, and any process would need to start from the beginning.

Transaction 5

This one is no different than Transaction 4. It appears to be different because the transaction has been running longer, but that makes no difference. The transaction was not committed at the time of system failure and must therefore be rolled back.

Implicit Transactions

Primarily for compatibility with other major RDBMS systems, such as Oracle or DB2, SQL Server supports (it is off by default but can be turned on if you choose) the notion of what is called an implicit transaction. Implicit transactions do not require a BEGIN TRAN statement—instead, they are automatically started with your first statement. They then continue until you issue a COMMIT TRAN or ROLLBACK TRAN statement. The next transaction then begins with your next statement.

Theoretically, the purpose behind this is to make sure that every statement is part of a transaction. SQL Server also wants every statement to be part of a transaction but, by default, takes a different approach—if there is no BEGIN TRAN, then SQL Server assumes you have a transaction of just one statement and automatically begins and ends that transaction for you. With some other systems, though, you'll find the implied transaction approach. Those systems will assume that any one statement is only the beginning of the transaction and therefore require that you explicitly end every transaction with a COMMIT or ROLLBACK.

By default, the IMPLICIT_TRANSACTIONS option is turned off (and the connection is in autocommit transaction mode). You can turn it on by issuing the command:

SET IMPLICIT_TRANSACTIONS ON;

After that, any of the following statements will initiate a transaction:

CREATE
ALTER TABLE
GRANT
REVOKE
SELECT
UPDATE
DELETE
INSERT
TRUNCATE TABLE
DROP
OPEN
FETCH

The transaction will continue until you COMMIT or ROLLBACK. Note that the implicit transactions option will affect only the current connection—any other users will still have the option turned off unless they have also executed the SET statement.

The implicit transactions option is dangerous territory, and I highly recommend that you leave this option off unless you have a very specific reason to turn it on (such as compatibility with code written for another system).

Here's a common scenario: A user calls up and says, "I've been inserting data for the last half hour, and none of my changes are showing." So, you go run a DBCC OPENTRAN, and discover that there's a transaction that's been there for a while—you can take a guess at what's happened. The user has a transaction open, and his or her changes won't appear until that transaction is committed. The user may have done it using an explicit BEGIN TRAN statement, but he or she may also have executed some code that turned implicit transactions on and then didn't turn it off. A mess follows.
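If you want to see the behavior for yourself, here's a minimal sketch (I'm reusing the MyTranTest table from earlier in the chapter, so re-create it first if you've already dropped it):

SET IMPLICIT_TRANSACTIONS ON;

-- This INSERT silently opens a transaction...
INSERT INTO MyTranTest
DEFAULT VALUES;

-- ...and that transaction is still open (and holding locks)
SELECT @@TRANCOUNT AS OpenTranCount; -- returns 1

COMMIT TRAN; -- or ROLLBACK TRAN

SET IMPLICIT_TRANSACTIONS OFF;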
Locks and Concurrency

Concurrency is a major issue for any database system. It addresses the notion of two or more users trying to interact with the same object at the same time. The nature of that interaction may be different for each user (updating, deleting, reading, inserting), and the ideal way to handle the competition for control of the object changes depending on just what all the users in question are doing and just how important their actions are. The more users—more specifically, the more transactions—that you can run with reasonable success at the same time, the higher your concurrency is said to be.

In the Online Transaction Processing (OLTP) environment, concurrency is usually the first thing we deal with in data, and it is the focus of most of the database notions put forward in this book. (Online Analytical Processing [OLAP] is usually something of an afterthought; it shouldn't necessarily be that way, but it is.) Dealing with the issue of concurrency can be critical to the performance of your system. At the foundation of dealing with concurrency in databases is a process called locking.

Locks are mechanisms for preventing a process from performing an action on an object that conflicts with something already being done to that object. That is, you can't do some things to an object if someone else got there first. What you can and cannot do depends on what the other user is doing. A lock is also a means of describing what is being done, so the system knows whether or not the second process's action is compatible with the first process. For example, 1, 2, 10, 100, 1,000, or whatever number of user connections the system can handle are usually all able to share the same piece of data at the same time, as long as they all want the record on a read-only basis. Think of it as being like a crystal shop: Lots of people can be in looking at things—even the same thing—as long as they don't move it, buy it, or otherwise change it. If more than one person does that at the same time, you're liable to wind up with broken crystal. That's why the shopkeeper usually keeps a close eye on things and will usually decide who gets to handle it first.

The SQL Server lock manager is that shopkeeper. When you come into the SQL Server "store," the lock manager asks what your intent is—what it is you're going to be doing. If you say "just looking," and no one else already there is doing anything but "just looking," then the lock manager will let you in. If you want to "buy" (update or delete) something, then the lock manager will check to see if anyone's already there. If so, then you must wait, and everyone who comes in behind you will also wait. When you are let in to "buy," no one else will be let in until you are done.

By doing things this way, SQL Server is able to help us avoid a mix of different problems that can be created by concurrency issues. We will examine the possible concurrency problems and how to set a transaction isolation level that will prevent each, but for now, let's move on to what can and cannot be locked, and what kinds of locks are available.

What Problems Can Be Prevented by Locks

Locks can address four major problems:

* Dirty reads
* Non-repeatable reads
* Phantoms
* Lost updates

Each of these presents a separate set of problems and can be handled by a mix of solutions that usually includes proper setting of the transaction isolation level. Just to help make things useful as you look back at this chapter later, I'm going to include information on which transaction isolation level is appropriate for each of these problems. We'll take a complete look at isolation levels shortly, but for now, let's first make sure that we understand what each of these problems is all about.

Dirty Reads

Dirty reads occur when a transaction reads a record that is part of another transaction that isn't complete yet. If the first transaction completes normally, then it's unlikely there's a problem. But what if the transaction were rolled back? You would have information from a transaction that never happened from the database's perspective! Let's look at it in an example series of steps.
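Here's a minimal sketch of those steps that you can run in two query windows (the Accounts table, the values, and the timing are all illustrative only; note that Connection 2 has to opt in to dirty reads with READ UNCOMMITTED, a point we'll come back to in a moment):

-- Connection 1: change a value inside a transaction, then change your mind
BEGIN TRAN;

UPDATE Accounts
SET Balance = Balance + 500
WHERE AccountID = 1;

WAITFOR DELAY '00:00:10'; -- leave the uncommitted change sitting there

ROLLBACK TRAN; -- the +500 never really happened

-- Connection 2: run this during the delay to see the dirty value
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT Balance
FROM Accounts
WHERE AccountID = 1; -- reads the +500 from the uncommitted transaction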
But what if the transaction were rolled back? You would have information from a transaction that never happened from the database's perspective!

Let's look at it in an example series of steps, using our bank account once more:

Transaction 1 | Transaction 2
---|---
Begins a transaction. |
Adds a $50 deposit; the balance goes from $125 to $175. |
| Reads the balance: $175.
Rolls back; the balance reverts to $125. |
| Continues working with $175—a value that, officially, never existed.

Oops—problem!!!

Transaction 2 has now made use of a value that isn't valid! If you try to go back and audit to find where this number came from, you'll wind up with no trace and an extremely large headache.

Fortunately, this scenario can't happen if you're using the SQL Server default for the transaction isolation level (called READ COMMITTED, which will be explained later in the section "Setting the Isolation Level").

Non-Repeatable Reads

It's really easy to get this one mixed up with a dirty read. Don't worry about that—it's only terminology. Just get the concept.

A non-repeatable read is caused when you read the record twice in a transaction, and a separate transaction alters the data in the interim. For this one, let's go back to our bank example. Remember that we don't want the value of the account to go below 0 dollars:

Transaction 1 | Transaction 2
---|---
Begins a transaction. |
Reads the balance: $125. |
IF check passes: $125 is enough to cover a $100 withdrawal. |
| Withdraws $50; the balance is now $75.
UPDATE subtracts $100; the balance is now –$25. |

Again, we have a problem. Transaction 1 has prescanned (which can be a good practice in some instances) to make sure that the value is valid and that the transaction can go through (there's enough money in the account). The problem is that, before the UPDATE was made, Transaction 2 beat Transaction 1 to the punch. If there isn't any CHECK constraint on the table to prevent the negative value, then it will indeed be set to –25—even though it logically appeared that we prevented this through the use of our IF statement.

We can prevent this problem in only two ways:

 * Create a CHECK constraint and monitor for the 547 Error.
 * Set our ISOLATION LEVEL to be REPEATABLE READ or SERIALIZABLE.

The CHECK constraint seems fairly obvious. The thing to realize here is that you are taking something of a reactive rather than a proactive approach with this method. Nonetheless, it is my preferred choice in most circumstances where we have the potential for non-repeatable reads.

We'll be taking a full look at isolation levels shortly, but for now, suffice to say that there's a good chance that setting it to REPEATABLE READ or SERIALIZABLE is going to cause you as many headaches (or more) as it solves. Still—it's an option.

Phantoms

No—we're not talking the "of the opera" kind here—what we're talking about are records that appear mysteriously, as if unaffected by an UPDATE or DELETE statement that you've issued. This can happen quite legitimately in the normal course of operating your system, and doesn't require any kind of elaborate scenario to illustrate. Here's a classic example of how this happens.

Let's say you are running a fast-food restaurant. If you're typical of that kind of establishment, you probably have a fair number of employees working at the "minimum wage" as defined by the government. The government has just decided to raise the minimum wage from $6.55 to $7.25 per hour, and you want to run an update on a table called Employees to move anyone making less than $7.25 per hour up to the new minimum wage. No problem, you say, and you issue the rather simple statement:

UPDATE Employees
SET HourlyRate = 7.25
WHERE HourlyRate < 7.25;

ALTER TABLE Employees
ADD CONSTRAINT ckWage CHECK (HourlyRate >= 7.25);

GO

That was a breeze, right? Wrong!
Just for illustration, we're going to say that you get an error message back:

Msg 547, Level 16, State 1, Line 1
ALTER TABLE statement conflicted with COLUMN CHECK constraint 'ckWage'. The conflict occurred in database 'FastFood', table 'Employees', column 'HourlyRate'.

So, you run a quick SELECT statement checking for values below $7.25, and sure enough you find one. The question is likely to come rather quickly, "How did that get there? I just ran the UPDATE that should have fixed that!" You did run the statement, and it ran just fine—you just got a phantom.

The instances of phantom reads are rare and require just the right circumstances to happen. In short, someone performed an INSERT statement at the very same time your UPDATE was running. Since it was an entirely new row, it didn't have a lock on it, and it proceeded just fine.

The only cure for this is setting your transaction isolation level to SERIALIZABLE, in which case any inserts or updates to the table must not fall within your WHERE clause, or they will be locked out.

Lost Updates

Lost updates happen when one update is successfully written to the database but is accidentally overwritten by another transaction. I can just hear you right about now, "Yikes! How could that happen?"

Lost updates can happen when two transactions read an entire record, and then each writes updated information back to the record—the second write silently overwriting the first. Let's look at an example.

Let's say that you are a credit analyst for your company. You get a call that customer X has reached his or her credit limit and would like an extension, so you pull up the customer information to take a look. You see a credit limit of $5,000, and a history of paying on time.

While you're looking, Sally, another person in your credit department, pulls up customer X's record to enter a change in the address. The record she pulls up also shows the credit limit of $5,000.

At this point, you decide to go ahead and raise customer X's credit limit to $7,500, and press Enter. The database now shows $7,500 as the credit limit for customer X.

Sally now completes her update to the address, but she's using the same edit screen that you are—that is, she updates the entire record. Remember what her screen showed as the credit limit? $5,000. Oops, the database now shows customer X with a credit limit of $5,000 again. Your update has been lost!

The solution to this depends on your code somehow recognizing that another connection has updated your record between the time when you read the data and when you went to update it. How this recognition happens varies depending on what access method you're using.
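One common recognition scheme is optimistic concurrency based on a rowversion (timestamp) column. What follows is only a sketch—the Customers table, its RowVer column, and the variable are all hypothetical:

DECLARE @RowVerWhenRead binary(8);
-- Imagine @RowVerWhenRead was captured when the record was first displayed.

UPDATE Customers
SET CreditLimit = 7500
WHERE CustomerID = 42
  AND RowVer = @RowVerWhenRead;  -- Succeeds only if no one changed the row since we read it

IF @@ROWCOUNT = 0
   RAISERROR('The record was changed by another user since you read it.', 16, 1);

Because SQL Server bumps the rowversion value on every modification, a zero row count tells us someone else got there first, and we can re-read and retry rather than silently clobbering their change.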
Lockable Resources

There are six different lockable resources for SQL Server, and they form a hierarchy. The higher the level of the lock, the less granularity it has (that is, you're choosing a higher and higher number of objects to be locked in something of a cascading action just because the object that contains them has been locked). These include, in ascending order of granularity:

 * Database: The entire database is locked. This usually happens during database schema changes.
 * Table: The entire table is locked. This includes all the data-related objects associated with that table, including the actual data rows (every one of them) and all the keys in all the indexes associated with the table in question.
 * Extent: The entire extent is locked. Remember that an extent is made up of eight pages, so an extent lock means that the lock has control of the extent, the eight data or index pages in that extent, and all the rows of data in those eight pages.
 * Page: All the data or index keys on that page are locked.
 * Key: There is a lock on a particular key or series of keys in an index. Other keys in the same index page may be unaffected.
 * Row or Row Identifier (RID): Although the lock is technically placed on the row identifier (an internal SQL Server construct), it essentially locks the entire row.

Lock Escalation and Lock Effects on Performance

Escalation is all about recognizing that maintaining a finer level of granularity (say, a row lock instead of a page lock) makes a lot of sense when the number of items being locked is small. However, as we get more and more items locked, the overhead associated with maintaining those locks actually hinders performance. It can cause the lock to be in place longer, thus creating contention issues; the longer the lock is in place, the more likely it is that someone will want that particular record. When you think about this for a bit, you'll realize there's probably a balancing act to be done somewhere, and that's exactly what the lock manager uses escalation to do.

When the number of locks being maintained reaches a certain threshold, the lock is escalated to the next higher level, and the lower-level locks no longer have to be individually managed (freeing resources and reducing overhead).

Note that the escalation is based on the number of locks rather than the number of users. The importance here is that you can single-handedly lock a table by performing a mass update: a large number of row or page locks can escalate to a full table lock. That means that you could potentially be locking every other user out of the table. If your query makes use of multiple tables, it's actually quite possible to wind up locking everyone out of all of those tables.

While you certainly would prefer not to lock all the other users out of your object, there are times when you still need to perform updates that are going to have that effect. There is very little you can do about escalation other than to keep your queries as targeted as possible. Recognize that escalations will happen, so make sure you've thought about what the possible ramifications of your query are.

Lock Modes

Beyond considering just what resource level you're locking, you also should consider what lock mode your query is going to acquire. Just as there are a variety of resources to lock, there are also a variety of lock modes.

Some modes are exclusive of each other (which means they don't work together). Some modes do nothing more than essentially modify other modes. Whether modes can work together is based on whether they are compatible. We'll take a closer look at compatibility between locks later in this chapter.

Just as we did with lockable resources, let's take a look at lock modes one by one.

Shared Locks

This is the most basic type of lock there is. A shared lock is used when you only need to read the data—that is, when you won't be changing anything. A shared lock wants to be your friend, as it is compatible with other shared locks. That doesn't mean it still won't cause you grief—while a shared lock doesn't mind most other kinds of locks, there are other locks that don't like shared locks at all.

Shared locks tell other locks that you're out there. It's the old "Look at me! Ain't I special?" thing.
They don't serve much of a purpose, yet they can't really be ignored. However, one thing that shared locks do accomplish is to prevent users from performing dirty reads.

Exclusive Locks

Exclusive locks are just what they sound like. Exclusive locks are not compatible with any other lock. They cannot be acquired if any other lock exists, nor will they allow a new lock of any form to be created on the resource while the exclusive lock is still active. This prevents two people from updating, deleting, or doing whatever at the same time.

Update Locks

Update locks are something of a hybrid between shared locks and exclusive locks. An update lock is a special kind of placeholder. Think about it—in order to do an UPDATE, you need to validate your WHERE clause (assuming there is one) to figure out just what rows you're going to be updating. That means that you only need a shared lock until you actually go to make the physical update. At the time of the physical update, you'll need an exclusive lock.

Update locks indicate that you have a shared lock that's going to become an exclusive lock after you've done your initial scan of the data to figure out what exactly needs to be updated. This acknowledges the fact that there are two distinct stages to an update:

 * First, the stage where you are figuring out what meets the WHERE clause criteria (what's going to be updated). This is the part of an update query that has an update lock.
 * Second, the stage where, if you actually decide to perform the update, the lock is upgraded to an exclusive lock. Otherwise, the lock is converted to a shared lock.

What's nice about this is that it forms a barrier against one variety of deadlock. A deadlock is not a type of lock in itself, but rather a situation where a paradox has been formed by other locks: one transaction's lock can't clear or upgrade because a second transaction is holding the resource it needs, while that second transaction is itself stuck waiting for the first transaction's lock to clear.

Without update locks, these deadlocks would crop up all the time. Two update queries would be running in shared mode. Query A completes its scan of the data and is ready for the physical update. It wants to upgrade to an exclusive lock, but it can't, because Query B is still scanning and still holds its shared lock. Query B then finishes its scan, except that it, too, needs to do the physical update. To do that, Query B must upgrade to an exclusive lock, but it can't, because Query A still holds its shared lock. This creates an impasse.

An update lock prevents any other update locks from being established. The instant that a second transaction attempts to acquire an update lock, the new transaction will be put into a wait status for whatever the lock timeout is; the lock will not be granted. If the first lock clears before the lock timeout is reached, then the lock will be granted to the new requester, and that process can continue. If not, an error will be generated.

Update locks are compatible only with shared locks and intent shared locks.
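You can take an update lock deliberately as well, by using the UPDLOCK hint covered later in this chapter. A quick sketch of the read-then-update pattern (the dbo.Inventory table here is hypothetical):

BEGIN TRAN;

-- Take an update lock during the read, reserving our right to upgrade to exclusive later
SELECT Quantity
FROM dbo.Inventory WITH (UPDLOCK)
WHERE ProductID = 1;

-- Other readers are still allowed in, but no other connection can take an update
-- or exclusive lock on this row until we finish
UPDATE dbo.Inventory
SET Quantity = Quantity - 1
WHERE ProductID = 1;

COMMIT TRAN;

Two connections running this code at the same time will serialize on the update lock rather than deadlocking during the upgrade to exclusive.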
Intent Locks

An intent lock is a true placeholder and is meant to deal with the issue of object hierarchies. Imagine a situation where you have a lock established on a row, but someone wants to establish a lock on a page or extent, or to modify a table. You wouldn't want another transaction to go around yours by going higher up the hierarchy, would you?

Without intent locks, the higher-level objects wouldn't even know that you had the lock at the lower level. Intent locks improve performance, as SQL Server needs to examine intent locks only at the table level, and not check every row or page lock on the table, to determine whether a transaction can safely lock the entire table. Intent locks come in three different varieties:

 * Intent Shared Lock: A shared lock has been or is going to be established at some lower point in the hierarchy. For example, a page is about to have a page-level shared lock established on it. This type of lock applies only to tables and pages.
 * Intent Exclusive Lock: This is the same as intent shared, but with an exclusive lock about to be placed on the lower-level item.
 * Shared with Intent Exclusive Lock: A shared lock has been or is about to be established lower down the object hierarchy, but the intent is to modify data, so it will become an intent exclusive at some point.

Schema Locks

These come in two flavors:

 * Schema Modification Lock (Sch-M): A schema change is being made to the object. No queries or other CREATE, ALTER, or DROP statements can be run against this object for the duration of the Sch-M lock.
 * Schema Stability Lock (Sch-S): This is very similar to a shared lock; this lock's sole purpose is to prevent a Sch-M while there are already locks for other queries (or CREATE, ALTER, or DROP statements) active on the object. This is compatible with all other lock types.

Bulk Update Locks

A bulk update lock (BU) is really just a variant of a table lock with one little (but significant) difference. Bulk update locks allow parallel loading of data—that is, the table is locked from any other "normal" (T-SQL statement) activity, but multiple BULK INSERT or bcp operations can be performed at the same time.

Ranged Keylocks

Ranged keylocks are merely a way for SQL Server to control individual locks more efficiently internally. Rather than being its own lock, this is, instead, just a method of tracking which locks are being held. Instead of holding an individual lock for each row in a range being accessed, SQL Server is able to maintain one lock that addresses the entire range (thus saving memory and lock operations).

Lock Compatibility

The table that follows shows the compatibility of the resource lock modes (listed in increasing lock strength). Existing locks are shown by the columns; requested locks by the rows:

Requested mode | IS | S | U | IX | SIX | X
---|---|---|---|---|---|---
IS (Intent Shared) | Yes | Yes | Yes | Yes | Yes | No
S (Shared) | Yes | Yes | Yes | No | No | No
U (Update) | Yes | Yes | No | No | No | No
IX (Intent Exclusive) | Yes | No | No | Yes | No | No
SIX (Shared with Intent Exclusive) | Yes | No | No | No | No | No
X (Exclusive) | No | No | No | No | No | No

Also:

 * The Sch-S is compatible with all lock modes except the Sch-M.
 * The Sch-M is incompatible with all lock modes.
 * The BU is compatible only with schema stability and other bulk update locks.
 * RangeS-S, RangeS-U, RangeI-N, and RangeX-X are range locks that match with the corresponding S, U, and X lock types where applicable, and, in the case of RangeI-N (the N stands for null), lock a range of potential rows to prevent phantoms.

Specifying a Specific Lock Type—Optimizer Hints

Sometimes you want to have more control over how the locking goes, either in your query or perhaps in your entire transaction. You can do this by making use of what are called optimizer hints.

Optimizer hints are ways of explicitly telling SQL Server to use a specific type or granularity of lock.
They are included right after the name of the table (in your SQL statement) that they are to act against, and are designated as follows:

Hint | Description
---|---
SERIALIZABLE/HOLDLOCK | Once a lock is established by a statement in a transaction, that lock is not released until the transaction is ended (via ROLLBACK or COMMIT). Inserts are also prevented if the inserted record would match the criteria in the WHERE clause in the query that established the lock (no phantoms). This is the highest isolation level, and guarantees absolute consistency of data.
READUNCOMMITTED/NOLOCK | Obtains no lock (not even a shared lock) and does not honor other locks. While a very fast option, it can generate dirty reads as well as a host of other problems.
READCOMMITTED | The default. Honors all locks, but how it handles acquiring locks depends on the database option READ_COMMITTED_SNAPSHOT. If that setting is on, then READCOMMITTED will not acquire locks, and will instead use a row versioning scheme to determine whether any conflicts have occurred. In practice, this should work just fine, and READCOMMITTED should be the way for you to go for both backward compatibility and what is likely better performance.
READCOMMITTEDLOCK | This is nuance stuff here. Consider this one to be largely the same as READCOMMITTED in most situations. (Indeed, this one works exactly as READCOMMITTED did in prior versions of SQL Server.) It honors all locks but releases any locks held as soon as the object in question is no longer needed. Performs the same as the READ COMMITTED isolation level.
REPEATABLEREAD | Once a lock is established by a statement in a transaction, that lock is not released until the transaction is ended (via ROLLBACK or COMMIT). New data can be inserted, however.
READPAST | Rather than waiting for a lock to clear, skips all locked rows. The skip is limited to row locks (it still waits for page, extent, and table locks) and can only be used with a SELECT statement.
NOWAIT | Causes the query to fail immediately rather than wait if any locks are detected.
ROWLOCK | This forces the initial level of the lock to be at the row level, even if the optimizer would have otherwise selected a less granular locking strategy. It does not prevent the lock from being escalated to those less granular levels if the number of locks reaches the system's lock threshold.
PAGLOCK | Uses a page-level lock regardless of the choice that otherwise would have been made by the optimizer. The usefulness of this can go both ways—sometimes you know that a page lock is more appropriate than a row lock for resource conservation—other times you want to minimize contention where the optimizer might have chosen a table lock.
TABLOCK | Forces a full table lock rather than whatever the lock manager would have used. Can really speed up known table-scan situations but creates big contention problems if other users want to modify data in the table.
TABLOCKX | Similar to TABLOCK, but creates an exclusive lock—locks all other users out of the table for the duration of the statement or transaction, depending on how the TRANSACTION ISOLATION LEVEL is set.
UPDLOCK | Uses an update lock instead of a shared lock. This is a highly underutilized tool in the war against deadlocks, as it still allows other users to obtain shared locks but ensures that no data modifications (or other update locks) are established until you end the statement or transaction (presumably after going ahead and updating the rows).
XLOCK | With its roots in TABLOCKX, this one first appeared in SQL Server 2000. The advantage here is that you can specify an exclusive lock regardless of what lock granularity you have chosen (or not chosen) to specify.

Most of these can be very useful in specific situations, but, before you get too attached to using these, make sure that you also check out the concept of isolation levels later in the chapter.

The syntax for using lock hints is fairly easy—just add the hint after the table name, or after the alias if you're using one:

....
FROM <table name> [[AS] <table alias>] [[WITH] (<table hint>)]

So, to put this into a couple of examples, any of these would be legal, and all would force a table lock (rather than the more likely key or row lock) on the SalesOrderHeader table:

SELECT * FROM Sales.SalesOrderHeader AS ord WITH (TABLOCKX)

SELECT * FROM Sales.SalesOrderHeader AS ord (TABLOCKX)

SELECT * FROM Sales.SalesOrderHeader WITH (TABLOCKX)

SELECT * FROM Sales.SalesOrderHeader (TABLOCKX)

Now look at it from a multiple-table perspective. The following queries would do the same thing as the previous ones in terms of locking. They would force an exclusive table lock on the SalesOrderHeader table. The thing to note, though, is that they do not place any kind of special lock on the SalesOrderDetail table; the SQL Server lock manager is still in complete control of that table.

SELECT *
FROM Sales.SalesOrderHeader AS ord WITH (TABLOCKX)
JOIN Sales.SalesOrderDetail AS od
   ON ord.SalesOrderID = od.SalesOrderID;

SELECT *
FROM Sales.SalesOrderHeader AS ord (TABLOCKX)
JOIN Sales.SalesOrderDetail AS od
   ON ord.SalesOrderID = od.SalesOrderID;

SELECT *
FROM Sales.SalesOrderHeader WITH (TABLOCKX)
JOIN Sales.SalesOrderDetail AS od
   ON Sales.SalesOrderHeader.SalesOrderID = od.SalesOrderID;

SELECT *
FROM Sales.SalesOrderHeader (TABLOCKX)
JOIN Sales.SalesOrderDetail AS od
   ON Sales.SalesOrderHeader.SalesOrderID = od.SalesOrderID;

We also could have done something completely different here and placed a totally separate hint on the SalesOrderDetail table. It's all up to you.

Determining Locks Using the Management Studio

Perhaps the nicest way of all to take a look at your locks is by using Management Studio. Management Studio will show you locks in two different ways—by process ID or by object—by utilizing the Activity Monitor.

To make use of Management Studio's lock display, just navigate to the server, right-click, and choose Activity Monitor. You should come up with a new window that looks something like Figure 11.3 (I've expanded the Processes frame).

Figure 11.3

Just expand the node that you're interested in (either the Process ID or the Object), and you'll see the various locks.

Perhaps the coolest feature in Management Studio shows itself when you double-click a specific lock in the right-hand side of the window. A dialog box will come up and tell you the last statement that was run by that process ID. This can be very handy when you are troubleshooting deadlock situations.
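If you'd rather stay in T-SQL than use the GUI, the sys.dm_tran_locks dynamic management view exposes much of the same information the Activity Monitor displays (viewing it assumes you hold the VIEW SERVER STATE permission):

SELECT resource_type,       -- what kind of resource (KEY, PAGE, OBJECT, and so on)
       request_mode,        -- the lock mode (S, U, X, IS, and so on)
       request_status,      -- GRANT, WAIT, or CONVERT
       request_session_id   -- which connection holds or wants the lock
FROM sys.dm_tran_locks;

A row with a request_status of WAIT is a blocked request—exactly the kind of thing you'd go hunting for when users complain about hangs.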
Setting the Isolation Level

We've seen that several different kinds of problems can be prevented by different locking strategies. We've also seen what kinds of locks are available and how they have an impact on the availability of resources. Now it's time to take a closer look at how these process management pieces work together to ensure overall data integrity and to make certain that you can get the results you expect.

The first thing to understand about the relationship between transactions and locks is that they are inextricably linked with each other. By default, any lock that is data-modification related will, once created, be held for the duration of the transaction. If you have a long transaction, this means that your locks may be preventing other processes from accessing the objects you have a lock on for a long time. It probably goes without saying that this can be rather problematic.

However, that's only the default. In fact, there are five different isolation levels that you can set at the transaction level:

 * READ COMMITTED (the default)
 * READ UNCOMMITTED
 * REPEATABLE READ
 * SERIALIZABLE
 * SNAPSHOT

The syntax for switching between them is pretty straightforward:

SET TRANSACTION ISOLATION LEVEL <READ COMMITTED | READ UNCOMMITTED | REPEATABLE READ | SERIALIZABLE | SNAPSHOT>

The change in isolation level will affect only the current connection, so you don't need to worry about adversely affecting other users (or them affecting you).

Let's start by looking at the default situation (READ COMMITTED) a little more closely.

READ COMMITTED

With READ COMMITTED, any shared locks you create will be automatically released as soon as the statement that created them is complete. That is, if you start a transaction, run several statements, run a SELECT statement, and then run several more statements, the locks associated with the SELECT statement are freed as soon as the SELECT statement is complete. SQL Server doesn't wait for the end of the transaction.

Action queries (UPDATE, DELETE, and INSERT) are a little different. If your transaction performs a query that modifies data, then those locks will be held for the duration of the transaction (in case you need to roll back).

By keeping this default, with READ COMMITTED, you can be sure that you have enough data integrity to prevent dirty reads. However, non-repeatable reads and phantoms can still occur.

READ UNCOMMITTED

READ UNCOMMITTED is the most dangerous of all isolation level choices, but it also has the highest performance in terms of speed.

Setting the isolation level to READ UNCOMMITTED tells SQL Server not to set any locks, and not to honor any locks. With this isolation level, it is possible to experience any of the various concurrency issues we discussed earlier in the chapter (most notably a dirty read).

Why would one ever want to risk a dirty read? When I watch the newsgroups on Usenet, I see the question come up on a regular basis. It's surprising to a fair number of people, but there are actually good reasons to have this isolation level, and they almost always have to do with reporting.

In an OLTP environment, locks are both your protector and your enemy. They prevent data integrity problems, but they also often prevent, or block, you from getting at the data you want. It is extremely commonplace to see a situation where management wants to run reports regularly, but the data entry people are often prevented from or delayed in entering data because of locks held by the manager's reports.

By using READ UNCOMMITTED, you can often get around this problem—at least for reports where the numbers don't have to be exact. For example, let's say that a sales manager wants to know just how much has been done in sales so far today. Indeed, we'll say he's a micro-manager and asks this same question (in the form of re-running the report) several times a day.

If the report happened to be a long-running one, then there's a high chance that his running it would damage the productivity of other users due to locking considerations. What's nice about this report, though, is that it is a truly nebulous report: The exact values are probably meaningless. The manager is really just looking for ballpark numbers.

By having an isolation level of READ UNCOMMITTED, we do not set any locks, so we don't block any other transactions. Our numbers will be somewhat suspect (because of the risk of dirty reads), but we don't need exact numbers anyway, and we know that the numbers are still going to be close even on the off chance that a dirty read is rolled back.
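As a sketch of what the manager's ballpark report might look like (the date filter shown here is purely illustrative):

SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT SUM(TotalDue) AS SalesSoFarToday
FROM Sales.SalesOrderHeader
WHERE OrderDate >= CAST(GETDATE() AS date);  -- today's orders only

SET TRANSACTION ISOLATION LEVEL READ COMMITTED;  -- remember to set it back when done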
You can get the same effect as READ UNCOMMITTED by adding the NOLOCK optimizer hint in your query. The advantage to setting the isolation level is that you don't have to use a hint for every table in your query, or use it in multiple queries. The advantage to using the NOLOCK optimizer hint is that you don't need to remember to set the isolation level back to the default for the connection. (With READ UNCOMMITTED, you do.)

REPEATABLE READ

REPEATABLE READ escalates your isolation level somewhat and provides an extra level of concurrency protection by preventing not only dirty reads (the default already does that) but also non-repeatable reads.

That prevention of non-repeatable reads is a big upside, but holding even shared locks until the end of the transaction can block users' access to objects, and therefore hurt productivity. Personally, I prefer to use other data integrity options (such as a CHECK constraint together with error handling) rather than this choice, but it remains an available option.

The equivalent optimizer hint for the REPEATABLE READ isolation level is REPEATABLEREAD (the same words, only without the space).

SERIALIZABLE

SERIALIZABLE is something of the fortress of isolation levels. It prevents all forms of concurrency issues except for a lost update. Even phantoms are prevented.

When you set your isolation to SERIALIZABLE, you're saying that any UPDATE, DELETE, or INSERT to the table or tables used by your transaction must not meet the WHERE clause of any statement in that transaction. Essentially, if another user is going to do something that your transaction would be interested in, then that user must wait until your transaction has been completed.

The SERIALIZABLE isolation level can also be simulated by using the SERIALIZABLE or HOLDLOCK optimizer hint in your query. Again, as with the READ UNCOMMITTED and NOLOCK debate, the option of not having to set it every time versus not having to remember to change the isolation level back is the big issue.

Going with an isolation level of SERIALIZABLE would, on the surface, appear to be the way you want to do everything. Indeed, it does provide your database with the highest level of what is called consistency—that is, the update process works the same for multiple users as it would if all your users did one transaction at a time (processed things serially).

As with most things in life, however, there is a trade-off. Consistency and concurrency can, in a practical sense, be thought of as polar opposites. Making things SERIALIZABLE can prevent other users from getting to the objects they need; that equates to lower concurrency. The reverse is also true: Increasing concurrency (by dropping down to REPEATABLE READ, for example) reduces the consistency of your database.

My personal recommendation on this is to stick with the default (READ COMMITTED) unless you have a specific reason not to.

SNAPSHOT

Note that the SNAPSHOT transaction isolation level is not available by default. To utilize it, you must enable the ALLOW_SNAPSHOT_ISOLATION option for your database utilizing the ALTER DATABASE command.

This was first added in SQL Server 2005, and was not particularly well publicized (and still isn't well documented, if you ask me!).
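For reference, enabling the option looks like this (shown against AdventureWorks2008 purely as an example):

ALTER DATABASE AdventureWorks2008 SET ALLOW_SNAPSHOT_ISOLATION ON;

A related option, READ_COMMITTED_SNAPSHOT (discussed in a moment), is enabled the same way:

ALTER DATABASE AdventureWorks2008 SET READ_COMMITTED_SNAPSHOT ON;

Be aware that the READ_COMMITTED_SNAPSHOT change generally requires that no other connections be active in the database at the time you run it.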
SNAPSHOT utilizes what is referred to as "row versioning." Transactions that would have been blocked from a given record are instead allowed read access to that record in its last known good state—which is to say, the way it was before whatever transaction is blocking began its modifications to the row.

SNAPSHOT is something of a mixed blessing. On one hand, concurrency is increased, as read transactions are allowed to continue forward unabated with a value that is technically the correct value for that moment in time (at least in terms of what data has been truly committed). The down side, however, is that those transactions are being allowed to continue with data that has a significant chance of being inaccurate soon.

Which should you use? Well, as you can imagine, my answer would be "It depends." The safer answer is to stick with the default of READ COMMITTED. Sometimes, however, we don't need that safety, and higher concurrency is the better choice.

The default isolation level of READ COMMITTED can be switched over to a version that utilizes row versioning, effectively the same as SNAPSHOT, by enabling the READ_COMMITTED_SNAPSHOT database option with the ALTER DATABASE command. Make certain, however, that you fully understand the differences between the two READ COMMITTED implementations before making such a change.

Dealing with Deadlocks (a.k.a. "A 1205")

Okay. So now you've seen locks, and you've also seen transactions. Now that you've got both, we can move on to the rather pesky problem of dealing with deadlocks.

As we've already mentioned, a deadlock is not a type of lock in itself, but rather a situation where a paradox has been formed by other locks. Like it or not, you'll bump into these on a regular basis (particularly when you're just starting out), and you'll be greeted with an error number 1205. So prolific is this particular problem that you'll hear many a database developer refer to deadlocks simply by the number.

Deadlocks are caused when one lock can't do what it needs to do in order to clear because a second lock is holding that resource, and vice versa. When this happens, somebody has to win the battle, so SQL Server chooses a deadlock victim. The deadlock victim's transaction is then rolled back, and it is notified that this happened through the 1205 error. The other transaction can continue normally. (Indeed, it will be entirely unaware that there was a problem, other than seeing an increased execution time.)

How SQL Server Figures Out There's a Deadlock

Every 5 seconds, SQL Server checks all the current transactions for what locks they are waiting for but haven't yet been granted. As it does this, it essentially makes a note that the request exists. It will then re-check the status of all open lock requests again, and, if one of the previous requests has still not been granted, it will recursively check all open transactions for a circular chain of lock requests. If it finds such a chain, then one or more deadlock victims will be chosen.

How Deadlock Victims Are Chosen

By default, a deadlock victim is chosen based on the "cost" of the transactions involved. The transaction that costs the least to roll back will be chosen (in other words, the one SQL Server has to do the fewest things to undo). You can, to some degree, override this by using the DEADLOCK_PRIORITY SET option available in SQL Server; this is, however, generally both ill-advised and outside the scope of this book. (I consider this to be very much in the camp of the administrator rather than the developer.)
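Because the victim is told about its fate via error 1205, your code can catch it and simply try again. A minimal retry sketch using TRY...CATCH (the dbo.UpdateInventory procedure is hypothetical, standing in for whatever unit of work you want to protect):

DECLARE @retries int = 3;

WHILE @retries > 0
BEGIN
   BEGIN TRY
      EXEC dbo.UpdateInventory;  -- the real work, wrapped in its own transaction
      SET @retries = 0;          -- success, so stop looping
   END TRY
   BEGIN CATCH
      IF ERROR_NUMBER() = 1205 AND @retries > 1
         SET @retries = @retries - 1;  -- we were the deadlock victim: try again
      ELSE
      BEGIN
         SET @retries = 0;
         RAISERROR('Giving up after repeated deadlocks or other error.', 16, 1);
      END
   END CATCH
END

A retry loop treats the symptom, though; the rules that follow treat the disease.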
Avoiding Deadlocks

Deadlocks can't be avoided 100 percent of the time in complex systems, but you can almost always totally eliminate them from a practical standpoint—that is, make them so rare that they have little relevance to your system.

To cut down on or eliminate deadlocks, follow these simple (okay, usually simple) rules:

 * Use your objects in the same order.
 * Keep your transactions as short as possible and in one batch.
 * Use the lowest transaction isolation level necessary.
 * Do not allow open-ended interruptions (user interactions, batch separations) within the same transaction.
 * In controlled environments, use bound connections.

Nearly every time I run across deadlocking problems, at least one (usually more) of these rules has been violated. Let's look at each one individually.

Using Objects in the Same Order

This is the most common problem area within the few rules that I consider to be basic. What's great about using this rule is that it almost never costs you anything to speak of; it's more a way of thinking. You decide early in your design process how you want to access your database objects, including order, and it becomes a habit in every query, procedure, or trigger that you write for that project.

Think about it for a minute. If our problem is that our two connections each have what the other wants, then it implies that we're dealing with the problem too late in the game. Let's look at a simple example.

Consider that we have two tables: Suppliers and Products. Now say that we have two processes that make use of both of these tables. Process 1 accepts inventory entries, updates Products with the new amount of product on hand, and then updates Suppliers with the total amount of product that we've purchased. Process 2 records sales; it updates the total amount of product sold in the Suppliers table and then decreases the inventory quantity in Products.

If we run these two processes at the same time, we're begging for trouble. Process 1 will grab an exclusive lock on the Products table. Process 2 grabs an exclusive lock on the Suppliers table. Process 1 then attempts to grab a lock on the Suppliers table, but it will be forced to wait for Process 2 to clear its existing lock. In the meantime, Process 2 tries to create a lock on the Products table, but it will have to wait for Process 1 to clear its existing lock. We now have a paradox: Both processes are waiting for each other. SQL Server will have to pick a deadlock victim.

Now let's rearrange that scenario, with Process 2 changed to first decrease the inventory quantity in Products and then update the total amount of product sold in the Suppliers table. This is functionally equivalent to the first way we organized the processes, and it will cost us nothing to do it this new way. The impact, though, will be stunning: no more deadlocks (at least not between these two processes)! Let's walk through what will now happen.

When we run these two processes at the same time, Process 1 will grab an exclusive lock on the Products table (so far, it's the same). Process 2 then also tries to grab a lock on the Products table, but will be forced to wait for Process 1 to finish. (Notice that we haven't done anything with Suppliers yet.) Process 1 finishes with the Products table but doesn't release the lock because the transaction isn't complete yet.
Process 2 is still waiting for the lock on Products to clear. Process 1 now moves on to grab a lock on the Suppliers table. Process 2 continues to wait for the lock to clear on Products. Process 1 finishes and commits or rolls back the transaction as required, but frees all locks in either case. Process 2 now is able to obtain its lock on the Products table and moves through the rest of its transaction without further incident.

Just swapping the order in which these two queries are run has eliminated a potential deadlock problem. Keep things in the same order wherever possible and you, too, will experience far fewer deadlocks.

Keeping Transactions As Short As Possible

This is another of the basics. Again, it should become just an instinct—something you don't really think about, something you just do.

This one really never has to cost you anything. Put what you need to put in the transaction, and keep everything else out. It's just that simple. The reason this works isn't rocket science: The longer the transaction is open, and the more it touches (within the transaction), the higher the likelihood that you're going to run into some other process that wants one or more of the objects that you're using (reducing concurrency). If you keep your transactions short, you minimize the number of objects that can potentially cause a deadlock, plus you cut down on the time that you have your locks on them. It's as simple as that.

Keeping transactions in one batch minimizes network round-trips during a transaction, reducing possible delays in completing the transaction and releasing locks.

Using the Lowest Transaction Isolation Level Possible

This one is considerably less basic and requires some serious thought. As such, it isn't surprising just how often it isn't thought of at all. Consider it Rob's axiom: That which requires thought is likely not to be thought of. Be different—think about it.

We have several different transaction isolation levels available. The default is READ COMMITTED. Using a lower isolation level holds shared locks for a shorter duration than a higher isolation level, thereby reducing locking contention.

Allowing No Open-Ended Transactions

This one probably makes the most common sense out of all the recommendations here, but it's one that's often violated because of past practices.

One of the ways we used to prevent lost updates (mainframe days here, folks!) was just to grab the lock and hold it until we were done with it. I can't tell you how problematic this was. (Can you say yuck?)

Imagine this real-life example: Someone in your service department likes to use update (exclusive lock) screens instead of display (shared lock) screens to look at data. "After all," he says, "that way I'm right there, ready to edit if I see something that needs to be changed." He goes on to look at a work order. Now his buddy calls and asks if he's ready for lunch. "Sure!" comes the reply, and the service clerk heads off to a rather long lunch (1–2 hours). Everyone who is interested in this record is now locked out of it for the duration of this clerk's lunch.

Wait—it gets worse. In the days of the mainframe, you used to see the concept of queuing far more often. (It actually can be quite efficient.) Now someone submits a print job (which is queued) for this work order. It sits in the queue waiting for the record lock to clear.
Since it's a queue environment, every print job your company has for work orders now piles up behind that first print job (which is going to wait for that person's lunch before clearing).

This is a rather extreme example, but it is a real-life scenario I've seen many times, and I hope that it clearly illustrates the point. Don't ever create locks that will still be open when you begin some form of open-ended process. Usually we're talking user interaction (like our lunch lover), but it could be any process that has an open-ended wait to it.

Using Bound Connections

Hmm. I had to debate even including this one, because it's something of a can of worms: Once you open it, you're never going to get them all back in. I'll just say that this is one that is used extremely rarely and is not for the faint of heart.

It's not that it doesn't have its uses; it's just that things can become convoluted rather quickly, so you need to manage things well. It's my personal opinion that there is usually a better solution.

That brings up the question of what exactly a bound connection is. Bound connections are connections that have been associated with one another and are essentially allowed to share the same set of locks. What that means is that the two transactions can operate in tandem without any fear of deadlocking each other or being blocked by one another. The flip side of this is that you essentially are on your own in terms of dealing with most concurrency issues—locks aren't keeping you safe anymore.

Given my distaste for these in 99.9 percent of situations, we're going to forget that they exist now that we've seen that they are an option. If you're going to insist on using them, just remember that you're going to be dealing with an extremely complex relationship between connections, and you need to manage the activities in those connections rather closely if you are going to maintain data integrity within the system.

Summary

Transactions and locks are both cornerstones of how SQL Server works and, therefore, of maximizing your development of solutions in SQL Server.

By using transactions, you can make sure that everything you need to have happen as a unit happens, or none of it does. SQL Server's use of locks ensures that we avoid the pitfalls of concurrency to the maximum extent possible. (You'll never avoid them entirely, but it's amazing how close you can come with a little—OK, a lot—of planning.) By using the two together, you are able to pass what the database industry calls the ACID test. If a transaction is ACID, then it has:

 * Atomicity: The transaction is all or nothing.
 * Consistency: All constraints and other data integrity rules have been adhered to, and all related objects (data pages, index pages) have been updated completely.
 * Isolation: Each transaction is completely isolated from any other transaction. The actions of one transaction cannot be interfered with by the actions of a separate transaction.
 * Durability: After a transaction is completed, its effects are permanently in place in the system. The data is "safe," in the sense that things such as a power outage or other non-disk system failure will not lead to data that is only half-written.

In short, by using transactions and locks, you can minimize deadlocks, ensure data integrity, and improve the overall efficiency of your system.

In our next chapter, we'll be looking at triggers.
Indeed, we'll see that, for many of the likely uses of triggers, the concepts of transactions and rollbacks will be at the very center of the trigger.

12

Triggers

I am often asked, "Should I use triggers?" The answer is, as with most things in SQL, "It depends." There's little that's black and white in the wonderful world of SQL Server; triggers are definitely a very plain shade of gray.

Know what you're doing before you go the triggers route; it's important for the health and performance of your database. The good news is that's what we're here to learn.

As with most of the core subjects we've covered in this book (save for a few that were just too important to rush), we're going to be moving along quickly on the assumption that you already know the basics. Still, this also happens to be one of those topics where you can have become a relatively advanced user of SQL Server and never hit it at all. That is, triggers can be needed by the beginner for some installations, and yet never be touched by the "Pro" in others (SQL is just that way...). The result is that, if you've read my Beginning SQL Server 2008 Programming title, then you'll definitely notice some overlap (but you'll find much more depth here). If you're in that group of people, feel free to skip ahead to the INSTEAD OF triggers section.

In this chapter, we'll try to look at triggers in all of their colors—from black all the way to white and a whole lot in between. The main issues we'll be dealing with include:

 * What is a trigger (the very quick and dirty version)?
 * Using triggers for more flexible referential integrity
 * Using triggers to create flexible data integrity rules
 * Using INSTEAD OF triggers to create more flexible updatable views
 * Other common uses for triggers
 * Controlling the firing order of triggers
 * Performance considerations

By the time we're done, you should have an idea of just how complex the decision about when—and when not—to use triggers really is. You'll also have an inkling of just how powerful and flexible they can be.

Most of all, if I've done my job well, you won't be a trigger extremist (which so many SQL Server people I meet are) with the distorted notion that triggers are evil and should never be used. Neither will you side with the other end of the spectrum: those who think that triggers are the solution to all the world's problems. The right answer in this respect is that triggers can do a lot for you, but they can also cause a lot of problems. The trick is to use them when they are the right things to use, and not to use them when they aren't.

Some common uses of triggers include:

 * Enforcement of referential integrity: Although I recommend using declarative referential integrity (DRI) whenever possible, there are many things that DRI won't do (for example, referential integrity across databases or even servers, many complex types of relationships, and so on). The use of triggers for RI is becoming a very special-case thing, but it's still out there.
 * Creating audit trails, which means writing out records that keep track of not just the most current data but also the actual change history for each record.
 * Functionality similar to a CHECK constraint, but which works across tables, databases, or even servers.
 * Substituting your own statements in the place of a user's action statement (usually used to enable inserts in complex views).
In addition, you have the newer and likely much rarer case of the DDL trigger (these are still relatively new, so only time will tell for sure)—which is about monitoring changes in the structure of your database.

And these are just a few. So, with no further ado, let's look at exactly what a trigger is.

What Is a Trigger?

A trigger is a special kind of stored procedure that responds to specific events. There are two kinds of triggers: Data Definition Language (DDL) triggers and Data Manipulation Language (DML) triggers.

DDL triggers fire in response to someone changing the structure of your database in some way (CREATE, ALTER, DROP, and similar statements). These were first added back in SQL Server 2005 and are critical to some installations (particularly high-security installations) but are pretty narrow in use. In general, you will need to look into using these only where you need extreme auditing of changes to, and the history of, your database structure. We will save these until last.

DML triggers are pieces of code that you attach to a particular table or view. Unlike sprocs, where you need to explicitly invoke the code, the code in triggers is automatically run whenever the event(s) you attached the trigger to occurs in the table. Indeed, you can't explicitly invoke triggers—the only way to do this is by performing the required action in the table that they are assigned to.

Beyond not being able to explicitly invoke a trigger, you'll find two other things that exist for sprocs but are missing from triggers: parameters and return codes.

While triggers take no parameters, they do have a mechanism for figuring out what records they are supposed to act on (we'll investigate this further later in the chapter). And, while you can use the RETURN keyword, you cannot return a specific return code (because you didn't explicitly call the trigger, what would you return a return code to?).

What events can you attach triggers to? The three "action" query types you use in SQL. So, you wind up with triggers based on inserts, updates, and/or deletes (you can mix and match which events you want the trigger to be attached to).

It's worth noting that there are times when a trigger will not fire—even though it seems that the action you are performing falls into one of the preceding categories. At issue is whether or not the operation you are doing is a logged activity. For example, a DELETE statement is a normal, logged activity that would fire any delete trigger, but a TRUNCATE TABLE, which has the effect of deleting rows, just deallocates the space used by the table. There is no individual deletion of rows logged, and no trigger is fired.

The syntax for creating triggers looks an awful lot like all of our other CREATE syntax, except that it has to be attached to a table, somewhat like an index; a trigger can't stand on its own.

Let's take a look:

CREATE TRIGGER <trigger name>

ON [<schema name>.]<table or view name>
[WITH ENCRYPTION | EXECUTE AS <CALLER | SELF | 'user name'>]

{{FOR | AFTER} {[DELETE] [,] [INSERT] [,] [UPDATE]} | INSTEAD OF}

[WITH APPEND]

[NOT FOR REPLICATION]

AS

<sql statements> | EXTERNAL NAME <assembly method specifier>

As you can see, the all-too-familiar CREATE is still there, as well as the execution stuff we've seen in many other objects—we've just added the ON clause to indicate the table to which this trigger is going to be attached, as well as when and under what conditions it fires.

ON

This part just names what object you are creating the trigger against. Keep in mind that if the type of the trigger is an AFTER trigger (if it uses FOR or AFTER to declare the trigger), then the target of the ON clause must be a table—AFTER triggers are not supported for views.

WITH ENCRYPTION

This works just as it does for views and sprocs. If you add this option, you can be certain that no one will be able to view your code (not even you!). This is particularly useful if you are going to be building software for commercial distribution, or if you are concerned about security and don't want your users to be able to see what data you're modifying or accessing. Obviously, you should keep a copy of the code required to create the trigger somewhere else, in case you want to re-create it sometime later.

As with views and sprocs, the thing to remember when using the WITH ENCRYPTION option is that you must reapply it every time you ALTER your trigger. If you make use of an ALTER TRIGGER statement and do not include the WITH ENCRYPTION option, then the trigger will no longer be encrypted.

The FOR|AFTER versus the INSTEAD OF Clause

In addition to deciding what kinds of queries will fire your trigger (INSERT, UPDATE, and/or DELETE), you also have some choice as to when the trigger fires. While the FOR (alternatively, you can use the keyword AFTER if you choose) trigger is the one that has been around a long time and is the one people generally think of, you also have the ability to run what is called an INSTEAD OF trigger. Choosing between these two will affect whether you enter your trigger before or after the data has been modified. In either case, you will be in your trigger before any changes are truly committed to the database.

Confusing? Probably. Let's try it a different way with a diagram that shows where each choice fires (see Figure 12.1).

The thing to note here is that, regardless of which choice you make, SQL Server will put together two working tables—one holding a copy of the records that were inserted (and, incidentally, called INSERTED) and one holding a copy of any records that were deleted (called DELETED). We'll look into the details of the uses of these working tables a little later. For now, realize that with INSTEAD OF triggers the creation of these working tables happens before any constraints are checked, while with FOR triggers, these tables are created after constraints are checked.

The key to INSTEAD OF triggers is that you can actually run your own code in the place of whatever the user requested. This means we can clean up ambiguous insert problems in views (remember the problem back in Chapter 8 with inserting when there was a JOIN in the view?). It also means that we can take action to clean up constraint violations before the constraint is even checked.

Triggers using the FOR and AFTER declarations behave identically to each other. The big difference between them and INSTEAD OF triggers is that they build their working tables after any constraints have been checked.
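Before we dig into each clause, a minimal sketch may help make the INSERTED and DELETED tables concrete. This AFTER trigger audits price changes on the Production.Product table; the dbo.PriceHistory audit table is hypothetical:

CREATE TRIGGER trgProductPriceAudit

ON Production.Product

AFTER UPDATE

AS

INSERT INTO dbo.PriceHistory (ProductID, OldPrice, NewPrice, ChangedOn)

SELECT d.ProductID, d.ListPrice, i.ListPrice, GETDATE()

FROM Deleted d

JOIN Inserted i

   ON d.ProductID = i.ProductID

WHERE d.ListPrice <> i.ListPrice;  -- only rows whose price actually changed

Notice how the DELETED table supplies the "before" image and the INSERTED table the "after" image of each updated row—exactly the two working tables just described.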
The AFTER (or, alternatively, you can use FOR) clause indicates under what type of action(s) you want this trigger to fire. You can have the trigger fire whenever there is an INSERT, UPDATE, or DELETE, or any mix of the three. So, for example, your FOR clause could look something like:

AFTER INSERT, DELETE

... or:

AFTER UPDATE, INSERT

... or:

AFTER DELETE

As was stated in the section about the ON clause, triggers declared using the AFTER or FOR clause can only be attached to tables—no views are allowed (see INSTEAD OF triggers for those).

It's worth noting that, unlike prior editions of this book, I actually do advise a specific choice between AFTER and FOR. While both are equally usable, and there is no indication that either will be deprecated, the AFTER clause is the "standard" way of doing things, so it is more likely to be supported by other database vendors.

Figure 12.1

INSERT Trigger

The code for any trigger that you mark as being FOR INSERT will be executed any time that someone inserts a new row into your table. For each row that is inserted, SQL Server will create a copy of that new row and insert it in a special table that exists only within the scope of your trigger. That table is called INSERTED, and we'll see much more of it over the course of this chapter. The big thing to understand is that the INSERTED table only lives as long as your trigger does. Think of it as not existing before your trigger starts or after your trigger completes.

DELETE Trigger

This works much the same as an INSERT trigger does, save that the INSERTED table will be empty (after all, you deleted rather than inserted, so there are no records for the INSERTED table). Instead, a copy of each record that was deleted is inserted into another table called DELETED. That table, like the INSERTED table, is limited in scope to just the life of your trigger.

UPDATE Trigger

More of the same, save for a twist. The code in a trigger declared as being FOR UPDATE will be fired whenever an existing record in your table is changed. The twist is that there's no such table as UPDATED. Instead, SQL Server treats each row as if the existing record had been deleted and a totally new record was inserted. As you can probably guess from that, a trigger declared as FOR UPDATE contains not one but two special tables called INSERTED and DELETED. The two tables have exactly the same number of rows, of course.

WITH APPEND

WITH APPEND is something of an oddball and, in all honesty, you're pretty unlikely to use it; nonetheless, since this is, after all, a "Professional" title, we'll cover it here for that "just-in-case" scenario. WITH APPEND applies only when you are running in 6.5 compatibility mode (which can be set using sp_dbcmptlevel).

SQL Server 6.5 and prior did not allow multiple triggers of the same type on any single table. For example, if you had already declared a trigger called trgCheck to enforce data integrity on updates and inserts, then you couldn't create a separate trigger for cascading updates. Once one update (or insert, or delete) trigger was created, that was it—you couldn't create another trigger for the same type of action.

This was a real pain. It meant that you had to combine logically different activities into one trigger. Trying to get what amounted to two entirely different procedures to play nicely together could, at times, be quite a challenge. In addition, it made reading the code something of an arduous task.
+ +Along came SQL Server 7.0 and the rules changed substantially. No longer do we have to worry about how many triggers we have for one type of action query—you can have several if you like. When running our database in 6.5 compatibility mode, though, we run into a problem: Our database is still working on the notion that there can only be one trigger of a given type on a given table. + +WITH APPEND gets around this problem by explicitly telling SQL Server that we want to add this new trigger even though we already have a trigger of that type on the table; both will be fired when the appropriate trigger action (INSERT, UPDATE, DELETE) occurs. It's a way of having a bit of both worlds. + +Again, this option is not really needed unless you're running SQL Server in the "way back machine" version, that is, 6.5 compatibility mode. Do not use this unless you know you have a very specific reason you need it. + +At this juncture, running in 6.5 compatibility mode means that you are asking SQL Server to run as it was more than a decade ago, and with a version compatibility level that is now four versions old. If the code is important enough to still be running after this much time has passed, it would seem important enough to warrant updating to a more recent version of support. + +NOT FOR REPLICATION + +Adding this option slightly alters the rules as to when the trigger is fired. With this option in place, the trigger will not be fired whenever a replication-related task modifies your table. Usually a trigger is fired (to do the housekeeping/cascading/and so on) when the original table is modified and there is no point in doing it again. + +AS + +Exactly as it was with sprocs, this is the meat of the matter. The AS keyword tells SQL Server that your code is about to start. From this point forward, we're into the scripted portion of your trigger. + +Using Triggers for Data Integrity Rules + +Although they shouldn't be your first option, triggers can also perform the same functionality as a CHECK constraint or even a DEFAULT. The answer to the question "Should I use triggers or CHECK constraints?" is the rather definitive: "It depends." If a CHECK can do the job, then it's probably the preferable choice. There are times, however, when a CHECK constraint just won't do the job, or when something inherent in the CHECK process makes it less desirable than a trigger. Examples of where you would want to use a trigger over a CHECK include: + + * Your business rule needs to reference data in a separate table. + * Your business rule needs to check the delta (difference between before and after) of an update. + * You require a customized error message. + +This really just scratches the surface of things. Since triggers are highly flexible, deciding when to use them really just comes down to whenever you need something special done. To provide at least some guidance though, here's a comparison table I've included in past books: + +Restriction | Pros | Cons +---|---|--- +Constraints | Fast. | Must be redefined for each table. +| Can reference other columns. | Can't reference other tables. +| Happens before the command occurs. | Can't be bound to data types. +| ANSI compliant. | +Triggers | Ultimate flexibility. | Happens after the command occurs. +| Can reference other columns and other tables. | High overhead. +| Can even use .NET to reference information that is external to your SQL Server. | + +Note that this is deliberately non-specific. 
Every situation varies, so what I've tried to provide here is a set of guidelines about where each option succeeds or fails.

Some of you may have noticed that, when I included the preceding table, I did not include the option for Rules and Defaults as I have in previous editions. Why not? Well, because Rules and Defaults (the Default object, not the DEFAULT constraint) have been considered deprecated for several releases now, so I am gradually intensifying my presentation of the idea that they are there for backward compatibility only.

Dealing with Requirements Sourced from Other Tables

CHECK constraints are great—fast and efficient—but they don't do everything you'd like them to. Perhaps the biggest shortcoming shows up when you need to verify data across tables.

To illustrate this, let's take a look at the Products and SalesOrderDetail tables in AdventureWorks2008 as well as the related SpecialOfferProduct table. The relationship looks like Figure 12.2.

So, under normal DRI, you can be certain that no order line item can be entered into the SalesOrderDetail table unless there is a matching ProductID in the Products table (via the chain through the SpecialOfferProduct table). We are, however, looking for something more than just the "norm" here.

Figure 12.2

Our Inventory department has been complaining that our Customer Support people keep placing orders for products that are discontinued. They would like to have such orders rejected before they get into the system.

We can't deal with this using a CHECK constraint because the discontinued status lives in a separate table (the Products table) from the one on which we are placing the restriction (the SalesOrderDetail table). Don't sweat it though; you can tell the Inventory department, "No problem!" You just need to use a trigger:

USE AdventureWorks2008;
GO

CREATE TRIGGER OrderDetailNotDiscontinued
ON Sales.SalesOrderDetail
AFTER INSERT, UPDATE
AS
IF EXISTS
(
   SELECT 'True'
   FROM Inserted i
   JOIN Production.Product p
      ON i.ProductID = p.ProductID
   WHERE p.DiscontinuedDate IS NOT NULL
)
BEGIN
   RAISERROR('Order Item is discontinued. Transaction Failed.',16,1);
   ROLLBACK TRAN;
END

Let's go ahead and test our handiwork. First, we need at least one record that will fail when it hits our trigger. That means we need a discontinued item in the Products table; the problem is, there is no such record currently.

SELECT ProductID, Name
FROM Production.Product
WHERE DiscontinuedDate IS NOT NULL;

ProductID Name
----------- --------------------------------------------------

(0 row(s) affected)

So, we'll pick one and change it ourselves for test purposes:

UPDATE Production.Product
SET DiscontinuedDate = GETDATE()
WHERE ProductID = 680;

With that done, we're ready to see if our trigger works, so let's go ahead and add a line item that violates this constraint. I'm going to make use of a SalesOrderHeader that already exists, so we don't have to get overly elaborate building up a full order:

INSERT Sales.SalesOrderDetail
(
   SalesOrderID,
   OrderQty,
   ProductID,
   SpecialOfferID,
   UnitPrice,
   UnitPriceDiscount
)
VALUES
(
   43660,
   5,
   680,
   1,
   1431,
   0
);

This gets the rejection that we expect:

Msg 50000, Level 16, State 1, Procedure OrderDetailNotDiscontinued, Line 14
Order Item is discontinued. Transaction Failed.
Msg 3609, Level 16, State 1, Line 1
The transaction ended in the trigger. The batch has been aborted.

Remember that we could, if desired, also create a custom error message to raise, instead of the ad hoc message that we used with the RAISERROR command.

Using Triggers to Check the Delta of an Update

Sometimes, you're not interested as much in what the value was or is as you are in how much it changed. While there isn't any one column or table that gives you that information, you can calculate it by making use of both the Inserted and Deleted tables in your trigger.

A quick example of this might be to write audit records for security reasons. Let's say, for example, that you wanted to track every adjustment to inventory, regardless of what initiated it, for auditing purposes (for example, inventory adjustments might be made directly against inventory tables rather than via an order item).

To implement something like this, we need an audit table, plus a trigger that makes use of both the Inserted and Deleted tables:

USE AdventureWorks2008;

CREATE TABLE Production.InventoryAudit
(
   TransactionID int IDENTITY PRIMARY KEY,
   ProductID int NOT NULL
      REFERENCES Production.Product(ProductID),
   NetAdjustment smallint NOT NULL,
   ModifiedDate datetime DEFAULT(CURRENT_TIMESTAMP)
);
GO

CREATE TRIGGER ProductAudit
ON Production.ProductInventory
FOR INSERT, UPDATE, DELETE
AS
INSERT INTO Production.InventoryAudit
   (ProductID, NetAdjustment)
SELECT COALESCE(i.ProductID, d.ProductID),
   ISNULL(i.Quantity, 0) - ISNULL(d.Quantity, 0) AS NetAdjustment
FROM Inserted i
FULL JOIN Deleted d
   ON i.ProductID = d.ProductID
   AND i.LocationID = d.LocationID
WHERE ISNULL(i.Quantity, 0) - ISNULL(d.Quantity, 0) != 0;

Before we test this, let's analyze what we're doing here. I've started by adding an audit table to receive information about changes to our base table. From there, I've created a trigger that will fire on any change to the table and will write the net change out to our new audit table.
Now, let's check this out by running a test script:

PRINT 'The values before the change are:';

SELECT ProductID, LocationID, Quantity
FROM Production.ProductInventory
WHERE ProductID = 1
  AND LocationID = 50;

PRINT 'Now making the change';

UPDATE Production.ProductInventory
SET Quantity = Quantity + 7
WHERE ProductID = 1
  AND LocationID = 50;

UPDATE Production.ProductInventory
SET Quantity = Quantity - 7
WHERE ProductID = 1
  AND LocationID = 50;

PRINT 'The values after the change are:';

SELECT ProductID, LocationID, Quantity
FROM Production.ProductInventory
WHERE ProductID = 1
  AND LocationID = 50;

SELECT * FROM Production.InventoryAudit;

And we can use the before and after output to verify that our audit records were properly written:

The values before the change are:

ProductID LocationID Quantity
----------- ---------- --------
1 50 353

(1 row(s) affected)

Now making the change

(1 row(s) affected)

(1 row(s) affected)

(1 row(s) affected)

(1 row(s) affected)

The values after the change are:

ProductID LocationID Quantity
----------- ---------- --------
1 50 353

(1 row(s) affected)

TransactionID ProductID NetAdjustment ModifiedDate
------------- ----------- ------------- -----------------------
1 1 7 2008-12-15 22:29:11.900
2 1 -7 2008-12-15 22:29:11.900

(2 row(s) affected)

Using Triggers for Custom Error Messages

We've already touched on this in some of our other examples, but remember that triggers can be handy for retaining control over the error message or number that gets passed out to your user or client application.

With a CHECK constraint, for example, you're just going to get the standard 547 error along with its rather nondescript explanation. As often as not, this is less than helpful in terms of the user really figuring out what went wrong; indeed, your client application often doesn't have enough information to make an intelligent and helpful response on behalf of the user.

In short, sometimes you create triggers when there is already something that would give you the data integrity that you want but won't give you enough information to handle it.

Other Common Uses for Triggers

In addition to the straight data integrity uses, triggers have a number of other uses. Indeed, the possibilities are fairly limitless, but here are a few common examples:

 * Updating summary information
 * Feeding de-normalized tables for reporting
 * Setting condition flags

Updating Summary Information

Sometimes we like to keep aggregate information around to help with reporting or to speed performance when checking conditions.

Take, for instance, the example of a customer's credit limit versus their current balance. The limit is a fairly static thing and is easily stored with the rest of the customer information. The current balance is another matter. We can always figure out the current balance by running a query to total all of the unpaid balances for any orders the customer has, but think about that for a moment. Let's say that you work for Sears, and you do literally millions of transactions every year. Now think about how your table is going to have many millions of records for your query to sort through, and that you're going to be competing with many other transactions in order to run your query. Things would perform an awful lot better if we could just go to a single place to get that total—but how to maintain it?
We certainly could just make sure that we always use a stored procedure for adding and paying order records, and then have the sproc update the customer's current balance. But that would mean that we would have to be sure that every sproc with a potential effect on the customer's balance contains the update code. If just one sproc leaves it out, then we have a major problem, and figuring out which sproc is the offending one is a hassle at best, and problematic at worst. By using a trigger, however, the updating of the customer balance becomes pretty easy.

We could maintain virtually any aggregation we want to keep track of. Keep in mind, however, that every trigger that you add increases the amount of work that has to be done to complete your transactions. That means that you are placing an additional burden on your system and increasing the chances that you will run into deadlock problems.

Feeding Data into De-normalized Tables for Reporting

I'm going to start right off by saying this isn't the way you should do things in most circumstances. Usually, this kind of data transfer should be handled as part of a batch process run at night or during non-peak hours for your system—depending on the nature of what you are moving, replication may also be an excellent answer. We will be discussing replication in detail in Chapter 17.

That being said, sometimes you need the data in your reporting tables to be right up-to-the-minute. The only real ways to take care of this are to modify all your sprocs and other access points into your system so that they update the reporting tables at the same time as they update the Online Transaction Processing (OLTP) tables (YUCK!), or to use triggers to propagate any updates to records.

What's nice about using triggers to propagate the data is that you are always certain to be up-to-the-minute on what's happening in the OLTP tables. That being said, it defeats a large part of the purpose of keeping separate reporting tables. While keeping the data in a de-normalized format can greatly improve query performance, one of its main goals, in most installations, is to clear reporting needs out of the main OLTP database and minimize concurrency issues. If all your OLTP updates still have to update information in your reporting tables, then all you've done is move the database in which the actual deadlock or other concurrency issue happens. From the OLTP standpoint, you've added work without gaining any benefits.

The thing you have to weigh here is whether you're going to gain enough performance in your reporting to make it worth the damage you're going to do to performance on your OLTP system.

Setting Condition Flags

Condition flags are typically used much as aggregations are—you maintain a flag as changes are made rather than having to look for a certain condition across a complete table. Lookup flags are one of those little things that, while they usually break the rules of normalization (you're not supposed to store data that can be derived elsewhere), can really boost system performance substantially.

For the example on this topic, let's assume that we maintain a variety of information on the products that we sell. Material Safety Data Sheets (MSDS), information on suppliers—imagine there can be an unlimited number of different documents that all provide some sort of information on our products.
Now, further imagine that we have something more than the mere 504 products that are in the AdventureWorks2008 database (it's not at all uncommon for businesses to have 50,000 or more different line items in their catalog). The number of possible informational records could get extremely high.

We want to be able to put a flag on our Customer Support screens that tells the order taker whether there is any additional information available for this product. If we were living by the rules of a normalized database, we would have to look in the ProductDocument table to see if it had any records that matched up with our ProductID.

Rather than do those lookups, we can just place a bit field in our Products table that is a yes/no indicator of whether other information is available. We would then put a trigger on the ProductDocument table that updates the bit flag in the Products table. If a record is inserted into ProductDocument, then we set the bit flag to TRUE for the corresponding product. When a ProductDocument record is deleted, we look to see whether it was the last one, and, if so, set the bit flag in the Products table back to FALSE.

We'll go for an ultra-quick example. First, we need to set up by adding the bit flag field to the Product table:

ALTER TABLE Production.Product
ADD InformationFlag bit NOT NULL
   CONSTRAINT InformationFlagDefault
   DEFAULT 0 WITH VALUES;

Then we need to fix the data in the table to allow for documentation we already have:

UPDATE p
SET p.InformationFlag = 1
FROM Production.Product p
WHERE EXISTS
(
   SELECT 1
   FROM Production.ProductDocument pd
   WHERE pd.ProductID = p.ProductID
);

Then we're ready to add our trigger:

CREATE TRIGGER DocumentBelongsToProduct
ON Production.ProductDocument
FOR INSERT, DELETE
AS
DECLARE @Count int;

SELECT @Count = COUNT(*) FROM Inserted;
IF @Count > 0
BEGIN
   UPDATE p
   SET p.InformationFlag = 1
   FROM Inserted i
   JOIN Production.Product p
      ON i.ProductID = p.ProductID;
END

IF @@ERROR != 0
   ROLLBACK TRAN;

SELECT @Count = COUNT(*) FROM Deleted;
IF @Count > 0
BEGIN
   -- Clear the flag only for products that no longer have any documents
   UPDATE p
   SET p.InformationFlag = 0
   FROM Deleted d
   JOIN Production.Product p
      ON d.ProductID = p.ProductID
   WHERE NOT EXISTS
   (
      SELECT 1
      FROM Production.ProductDocument pd
      WHERE pd.ProductID = p.ProductID
   );
END

IF @@ERROR != 0
   ROLLBACK TRAN;

And we're ready to test:

SELECT ProductID, InformationFlag
FROM Production.Product p
WHERE p.ProductID = 1;

INSERT INTO Production.ProductDocument
   (ProductID, DocumentNode)
VALUES
   (1, 0x);

SELECT ProductID, InformationFlag
FROM Production.Product p
WHERE p.ProductID = 1;

This yields the proper update:

ProductID InformationFlag
----------- ---------------
1 0

(1 row(s) affected)

(1 row(s) affected)

(1 row(s) affected)

ProductID InformationFlag
----------- ---------------
1 1

(1 row(s) affected)

And the delete:

DELETE Production.ProductDocument
WHERE ProductID = 1
  AND DocumentNode = 0x;

SELECT ProductID, InformationFlag
FROM Production.Product p
WHERE p.ProductID = 1;

Again, this gets the proper update:

ProductID InformationFlag
----------- ---------------
1 0

(1 row(s) affected)

Now we can find out whether there's product documentation right in the very same query with which we grab the base information on the product. We won't incur the overhead of the query to the ProductDocument table unless there really is something out there for us to retrieve.
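For contrast, here's a sketch of what the normalized alternative would force the support screen to run on every refresh (the HasMoreInfo alias is hypothetical); this is exactly the per-lookup cost the flag avoids:

SELECT p.ProductID, p.Name,
   CASE WHEN EXISTS (SELECT 1
                     FROM Production.ProductDocument pd
                     WHERE pd.ProductID = p.ProductID)
        THEN 1 ELSE 0
   END AS HasMoreInfo
FROM Production.Product p
WHERE p.ProductID = 1;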
Other Trigger Issues

You have most of it now, but if you're thinking you are finished with triggers, then think again. As I indicated early in the chapter, triggers create an awful lot to think about. The sections that follow attempt to point out some of the biggest issues you need to consider, plus they provide some information on additional trigger features and possibilities.

Triggers Can Be Nested

A nested trigger is one that does not fire directly as a result of a statement that you issued but rather because of a statement that was issued by another trigger.

This can actually set off quite a chain of events—with one trigger causing another trigger to fire which, in turn, causes yet another trigger to fire, and so on. Just how deep the chain can go depends on:

 * Whether nested triggers are turned on for your system (this is a system-wide, not database-level, option; it is set using Management Studio or sp_configure, and defaults to on).
 * The built-in nesting limit—a chain of triggers can go at most 32 levels deep.
 * Whether a trigger has already been fired. A trigger can, by default, only be fired once per trigger transaction. Once fired, it will ignore any other calls as a result of activity that is part of the same trigger action. Once you move on to an entirely new statement (even within the same overall transaction), the process can start all over again.

In most circumstances, you actually want your triggers to nest (thus the default), but you need to think about what's going to happen if you get into a circle of triggers firing triggers. If it comes back around to the same table twice, then the trigger will not fire the second time, and something you think is important may not happen; for example, a data integrity violation may get through. It's also worth noting that, if you do a ROLLBACK anywhere in the nesting chain, then the entire chain is rolled back. In other words, the entire nested trigger chain behaves as a transaction.

Triggers Can Be Recursive

What is a recursive trigger? A trigger is said to be recursive when something the trigger does eventually causes that same trigger to be fired. This may happen directly (by an action query done to the table on which the trigger is set) or indirectly (through the nesting process).

Recursive triggers are rare. Indeed, by default, recursive triggers are turned off. Recursion is, however, a way of dealing with the situation just described, where you are nesting triggers and you want the update to happen the second time around. Recursion, unlike nesting, is a database-level option; it is set with the RECURSIVE_TRIGGERS option of ALTER DATABASE (or, in older releases, the sp_dboption system sproc).

The danger in recursive triggers is that you'll get into some form of unintended loop. As such, you'll need to make sure that you get some form of recursion check in place to stop the process if necessary.

Debugging Triggers

Debugging triggers is a hassle at best. Since you have something of a level of indirection (you write a statement that causes the trigger to fire, rather than explicitly firing it yourself), it always seems like you have to second-guess what's going on.

You can utilize the same debugger we utilized in Chapter 10—you just need to get tricky to do it. The trick? Create a block of code (stored procedure or batch) that will cause your trigger to fire, and then step into that block of code. You can then step your way right into the trigger, as the sketch that follows shows.
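A minimal sketch, assuming the ProductAudit trigger created earlier in this chapter is still in place:

BEGIN TRAN;

-- Step into this UPDATE in the debugger; from here you can step
-- into any UPDATE triggers on the table (ProductAudit, in our case).
UPDATE Production.ProductInventory
SET Quantity = Quantity   -- a no-op change still fires the trigger
WHERE ProductID = 1
  AND LocationID = 50;

ROLLBACK TRAN;   -- leave the data exactly as we found it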
When debugging with the built-in tool is a trial, use PRINT and SELECT statements to output your values from within the triggers. Beyond telling you what your variables are doing along the way, they can also tip you off to recursion and, in some cases, nesting problems.

Nesting issues can be one of the biggest gotchas of trigger design. It is not at all uncommon to see situations where you execute a command and wind up with unexpected results because you didn't realize how many other triggers were, in turn, going to be fired. What's more, if a nested trigger performs updates to the initiating table, the initiating trigger will not fire a second time—this can open data integrity holes in tables where you were certain your trigger was preventing them. The trigger probably has the right code for the first firing, but it doesn't even run the second time around in a nested situation.

You can also make use of SELECT @@NESTLEVEL to show just how deep into a nesting situation you've gotten.

Keep in mind, though, that PRINT and result set generating SELECT statements don't really have anywhere to send their data other than the screen (in Management Studio) or as an informational message (data access models). This is usually far more confusing than anything else. As such, I highly recommend removing these statements once you've finished debugging, and before you go to production release.

Triggers Don't Get in the Way of Architecture Changes

This is a classic good news/bad news story.

Using triggers is positively great in terms of making it easy to make architecture changes. Indeed, I often use triggers for referential integrity early in the development cycle (when I'm more likely to be making lots of changes to the design of the database) and then change to DRI late in the cycle when I'm close to production.

When you want to drop a table and re-create it using DRI, you must first drop all of the constraints before dropping the table. This can create quite a maze in terms of dropping multiple constraints, making your changes, and then adding the constraints again. It can be quite a wild ride trying to make sure that everything drops that is supposed to so that your changed scripts will run. Then it's just as wild a ride to make sure that you've got everything back on that needs to be. Triggers take care of all this because they don't care that anything has changed until they actually run.

There's the rub though—when they run. You see, it means that you may change architecture and break several triggers without even realizing that you've done it. It won't be until the first time that those triggers try to address the object(s) in question that you find the error of your ways. By that time, you may find difficulty in piecing together exactly what you did and why.

Both sides have their hassles; just keep the hassles in mind no matter which method you're employing.

Triggers Can Be Turned Off without Being Removed

Sometimes, just like with CHECK constraints, you want to turn off the integrity feature, so you can do something that will violate the constraint but still has a valid reason for happening (importation of data is probably the most common of these).

Another common reason for doing this is when you are performing some sort of bulk insert (importation again), but you are already 100 percent certain the data is valid. In this case, you may want to turn off the triggers to eliminate their overhead and speed up the insert process.
You can turn a trigger off and on by using an ALTER TABLE statement. The syntax looks like this:

ALTER TABLE <table name>
<ENABLE|DISABLE> TRIGGER <ALL | <trigger name>>
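For instance, to suspend the document flag trigger from earlier in the chapter around a bulk load (a sketch; substitute your own table and trigger names, or use ALL to hit every trigger on the table):

ALTER TABLE Production.ProductDocument
DISABLE TRIGGER DocumentBelongsToProduct;

-- ... perform the bulk import here ...

ALTER TABLE Production.ProductDocument
ENABLE TRIGGER DocumentBelongsToProduct;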
As you might expect, my biggest words of caution in this area are, "Don't forget to re-enable your triggers!"

One last thing: If you're turning them off to do some form of mass importation of data, I highly recommend that you kick out all your users and go to RESTRICTED_USER mode. This will make sure that no one sneaks in behind you while you have the triggers turned off.

Be sure to consider the ability to disable triggers when addressing security concerns. If you are counting on triggers to perform audits for you, but you are allowing the disabling of triggers (granted, they would have to have some degree of security already, but you still need to fully consider the possibilities), then you have a loophole in your auditing.

Trigger Firing Order

In long ago releases of SQL Server (7.0 and prior), we had no control over firing order. Indeed, you may recall me discussing how there was only one of any particular kind of trigger (INSERT, UPDATE, DELETE) prior to 7.0, so firing order was something of a moot point. Later releases of SQL Server provide a limited amount of control over which triggers go in what order. For any given table (not views, since firing order can only be specified for AFTER triggers and views accept only INSTEAD OF triggers), you can elect to have one (and only one) trigger fired first. Likewise, you may elect to have one (and only one) trigger fired last. All other triggers are considered to have no preference on firing order—that is, you have no guarantee in what order a trigger with a firing order of "none" will fire, other than that it will fire after the FIRST trigger (if there is one) is complete and before the LAST trigger (again, if there is one) begins (see Figure 12.3).

The creation of a trigger that is to be first or last works just the same as for any other trigger. You state the firing order preference after the trigger has already been created by using a special system stored procedure, sp_settriggerorder.

The syntax of sp_settriggerorder looks like this:

sp_settriggerorder [@triggername =] '<trigger name>',
   [@order =] '{FIRST|LAST|NONE}',
   [@stmttype =] '{INSERT|UPDATE|DELETE}'

There can be only one trigger that is considered to be "first" for any particular action (INSERT, UPDATE, or DELETE). Likewise, there can be only one "last" trigger for any particular action. Any number of triggers can be considered to be "none"—that is, the number of triggers that don't have a particular firing order is unlimited.

Figure 12.3

So, the question should be, "Why do I care what order they fire in?" Well, often you won't care at all. At other times, it can be important logic-wise or just a good performance idea. Let's consider what I mean in a bit more detail.

Controlling Firing Order for Logic Reasons

Why would you need to have one trigger fire before another? The most common reason would be that the first trigger lays some sort of foundation for, or otherwise validates, what will come afterward. Under SQL Server 6.5 and earlier, we didn't have to think about this kind of thing much—we were only allowed one trigger of any particular type (UPDATE, DELETE, or INSERT) for a given table. This meant that having one thing happen before another wasn't really a problem. Because you combined all logic into one trigger, you just put the first thing that needed to happen first in the code and the last part last (no real rocket science there at all).

Version 7.0 came along and made things both better and worse than they were before.
You were no longer forced to jam all of your logic into one trigger. This was really cool because it meant that you could physically separate parts of your trigger code that were logically different, which, in turn, both made the code much easier to manage and allowed one part of the code to be disabled (remember the ALTER TABLE...DISABLE TRIGGER trick from a few sections ago?) while other parts of the code continued to function. The downside was that if you went ahead and separated out your code that way, you lost the logical stepping order that the code had when it was in one trigger.

By gaining at least a rudimentary level of control over firing order, we now have something of the best of both worlds: We can logically separate our triggers but still maintain the necessary order of precedence on what piece of code runs first or last.

Controlling Firing Order for Performance Reasons

On the performance front, a FIRST trigger is the only one that really has any big thing going for it. If you have multiple triggers, but only one of them is likely to generate a rollback (for example, it may be enforcing a complex data integrity rule that a constraint can't handle), you would want to consider making such a trigger a FIRST trigger. This ensures that the most likely cause of a rollback is already complete before you invest any more activity in your transaction. The more you do before the rollback is detected, the more that will have to be rolled back. So, deal with the most likely source of a rollback before performing any additional activity.

INSTEAD OF Triggers

While it can work against tables, the primary purpose of an INSTEAD OF trigger is usually to allow updates to views in places where they were previously not possible.

Essentially, an INSTEAD OF trigger is a block of code we can use as something of an interceptor for anything that anyone tries to do to our table or view. We can either elect to go ahead and do whatever the user requests or, if we choose, we can go so far as to do something entirely different.

As with FOR/AFTER triggers, INSTEAD OF triggers come in three different flavors—INSERT, UPDATE, and DELETE. Unlike FOR/AFTER triggers, however, you can only have one trigger per table or view for each of the different flavors (one each for INSERT, UPDATE, and DELETE).

If we're going to explore these, we need to get some appropriate sample tables out there.
To that end, let's take the following four tables (you can change the script to use an existing database if you wish):

CREATE DATABASE OurInsteadOfTest;
GO

USE OurInsteadOfTest;

CREATE TABLE dbo.Customers
(
   CustomerID varchar(5) NOT NULL PRIMARY KEY,
   Name varchar(40) NOT NULL
);

CREATE TABLE dbo.Orders
(
   OrderID int IDENTITY NOT NULL PRIMARY KEY,
   CustomerID varchar(5) NOT NULL
      REFERENCES Customers(CustomerID),
   OrderDate datetime NOT NULL
);

CREATE TABLE dbo.Products
(
   ProductID int IDENTITY NOT NULL PRIMARY KEY,
   Name varchar(40) NOT NULL,
   UnitPrice money NOT NULL
);

CREATE TABLE dbo.OrderItems
(
   OrderID int NOT NULL
      REFERENCES dbo.Orders(OrderID),
   ProductID int NOT NULL
      REFERENCES dbo.Products(ProductID),
   UnitPrice money NOT NULL,
   Quantity int NOT NULL,
   CONSTRAINT PKOrderItem PRIMARY KEY CLUSTERED
      (OrderID, ProductID)
);

-- INSERT sample records
INSERT dbo.Customers
VALUES ('ABCDE', 'Bob''s Pretty Good Garage');

INSERT dbo.Orders
VALUES ('ABCDE', CURRENT_TIMESTAMP);

INSERT dbo.Products
VALUES ('Widget', 5.55),
       ('Thingamajig', 8.88);

INSERT dbo.OrderItems
VALUES (1, 1, 5.55, 3);

We will use these tables for all three of the upcoming examples of INSTEAD OF triggers.

INSTEAD OF INSERT Triggers

The INSTEAD OF INSERT trigger allows us to examine the data that is about to go into our table or view and decide what we want to do with it prior to the insert physically occurring. The typical use will be on a view—where manipulating the data before the actual physical insert is attempted can mean the difference between the insert succeeding or failing.

Let's look at an example by creating an updatable view—specifically, one that will accept INSERTs where, before INSTEAD OF INSERT triggers, we wouldn't have been able to do it.

In this case, we'll create a view that demonstrates the update problem and then look at how to fix it. Let's take the case of showing some order line items, but with fuller information about the products (be sure you're using the database you created the sample tables in):

USE OurInsteadOfTest;
GO

CREATE VIEW CustomerOrders_vw
WITH SCHEMABINDING
AS
SELECT o.OrderID,
   o.OrderDate,
   od.ProductID,
   p.Name,
   od.Quantity,
   od.UnitPrice
FROM dbo.Orders AS o
JOIN dbo.OrderItems AS od
   ON o.OrderID = od.OrderID
JOIN dbo.Products AS p
   ON od.ProductID = p.ProductID;

The view is not fully updatable in its current state. How would SQL Server know which data went to which table? Sure, one could make a case for a straight update statement working, but we don't have the primary key for every table here. Even worse, what if we wanted to do an insert (which, as it happens, we do)?

The answer is something that SQL Server can't give you by itself—you need to provide more instructions as to what you want to do in such complex situations. That's where INSTEAD OF triggers really shine.
Let's take a look at our example order:

SELECT *
FROM CustomerOrders_vw
WHERE OrderID = 1;

This gets us back the one row we used to prime our sample:

Bob's Pretty Good Garage...1...2006-04-13 05:14:22.780...1...Widget...3...5.55

Now, just to prove it doesn't work, let's try to INSERT a new order item:

INSERT INTO CustomerOrders_vw
(
   OrderID,
   OrderDate,
   ProductID,
   Quantity,
   UnitPrice
)
VALUES
(
   1,
   '1998-04-06',
   2,
   10,
   6.00
);

As expected, it doesn't work:

Server: Msg 4405, Level 16, State 1, Line 2
View or function 'CustomerOrders_vw' is not updatable because the modification affects multiple base tables.

It's time for us to take care of this with an INSTEAD OF trigger. What we need to do here is decide ahead of time what scenarios we want to handle (in this case, just the insert of new OrderItem records) and what we want to do about it.

We're going to treat any INSERT as an attempt to add a new order item. We're going to assume for this example that the customer already exists (if we wanted to get complex, we could break things up further) and that we have an OrderID available. Our trigger might look something like:

CREATE TRIGGER trCustomerOrderInsert ON CustomerOrders_vw
INSTEAD OF INSERT
AS
BEGIN
   -- Check to see whether the INSERT actually tried to feed us any rows.
   -- (A WHERE clause might have filtered everything out)
   IF (SELECT COUNT(*) FROM Inserted) > 0
   BEGIN
      INSERT INTO dbo.OrderItems
      SELECT i.OrderID,
         i.ProductID,
         i.UnitPrice,
         i.Quantity
      FROM Inserted AS i
      JOIN Orders AS o
         ON i.OrderID = o.OrderID;

      -- If we have records in Inserted, but no records could join to
      -- the orders table, then there must not be a matching order
      IF @@ROWCOUNT = 0
         RAISERROR('No matching Orders. Cannot perform insert',10,1);
   END
END

So, let's try that insert again:

INSERT INTO CustomerOrders_vw
(
   OrderID,
   OrderDate,
   ProductID,
   Quantity,
   UnitPrice
)
VALUES
(
   1,
   '1998-04-06',
   2,
   10,
   6.00
);

We've explicitly addressed what table we're going to insert into, and so SQL Server is happy. We could easily extend this to address non-nullable columns that don't participate in the view if we needed to. (The customer can't provide values to those columns because they are not in the view the customer is using.)

INSTEAD OF UPDATE Triggers

We've now seen how INSERT statements against views can lead to ambiguous situations and also how to fix them with an INSTEAD OF INSERT trigger—but what about updates?

Even on the update side of things our statements can become ambiguous; if we update the ProductName in CustomerOrders_vw, does that mean we want to change the actual name on the product or does it mean that we want to change what product this line item is selling? The answer, of course, is that it depends on the situation. For one system, changing the ProductName might be the correct answer. For another system, changing the product sold might be the thing.

Much like INSTEAD OF INSERT triggers, INSTEAD OF UPDATE triggers give us the chance to trap what is coming in and address it explicitly. In our ProductName example, we could have chosen to do it either way. By default, SQL Server would update the name in the Products table. We could, however, use an INSTEAD OF UPDATE trigger to trap it and explicitly look up the ProductName to find the ProductID if that is what the user intended.
From there, we could generate an error if the provided ProductID did not match the one that went with the name.

INSTEAD OF DELETE Triggers

Okay, this is the last of our INSTEAD OF triggers and, most likely, the one that you'll run into the least often. As with the other two INSTEAD OF trigger types, these are used almost exclusively to allow views to delete data in one or more underlying tables.

So, continuing with our CustomerOrders_vw example, we'll add some delete functionality. This time, however, we're going to raise the complexity bar a bit. We want to delete all the rows for a given order, but if deleting those rows means that the order has no detail items left, then we also want to delete the order header.

We know from our last section (assuming you've been playing along) that we have two rows in Order 1 (the one we seeded when we built the table and the one we inserted in the INSTEAD OF INSERT example) but, before we start trying to delete things, let's build our trigger:

CREATE TRIGGER trCustomerOrderDelete ON CustomerOrders_vw
INSTEAD OF DELETE
AS
BEGIN
   -- Check to see whether the DELETE actually tried to feed us any rows
   -- (A WHERE clause might have filtered everything out)
   IF (SELECT COUNT(*) FROM Deleted) > 0
   BEGIN
      DELETE oi
      FROM dbo.OrderItems AS oi
      JOIN Deleted AS d
         ON d.OrderID = oi.OrderID
         AND d.ProductID = oi.ProductID;

      -- If no detail rows remain for the order, remove the header too
      DELETE Orders
      FROM Orders AS o
      JOIN Deleted AS d
         ON o.OrderID = d.OrderID
      LEFT JOIN OrderItems AS oi
         ON oi.OrderID = d.OrderID
      WHERE oi.OrderID IS NULL;
   END
END

And now we're ready to test. We'll start off by deleting just a single row from our CustomerOrders_vw view:

DELETE CustomerOrders_vw
WHERE OrderID = 1
  AND ProductID = 2;

We're ready to run our select again:

SELECT ProductID, UnitPrice, Quantity
FROM CustomerOrders_vw
WHERE OrderID = 1;

Sure enough, the row that we first inserted in our INSTEAD OF INSERT section is now gone:

ProductID UnitPrice Quantity
----------- --------------------- -----------
1 5.55 3

(1 row(s) affected)

So, our deleting of individual detail lines is working just fine. Now let's get a bit more cavalier and delete the entire order:

DELETE CustomerOrders_vw
WHERE OrderID = 1;

To really check that this worked okay, we need to go all the way to our Orders table:

SELECT * FROM Orders WHERE OrderID = 1;

Sure enough—the order has been removed.

While we don't have to think about individual columns with INSTEAD OF DELETE triggers (you delete by row, not by column), we do need to be aware of what referential integrity actions exist on any table (not view) for which we are defining an INSTEAD OF DELETE trigger. Just like INSTEAD OF UPDATE triggers, INSTEAD OF DELETE triggers are not allowed on tables that have referential integrity actions.

IF UPDATE() and COLUMNS_UPDATED()

In an UPDATE trigger, we can often limit the amount of code that actually executes within the trigger by checking to see whether the column(s) we are interested in are the ones that have been changed. To do this, we make use of the UPDATE() or COLUMNS_UPDATED() functions. Let's look at each.

The UPDATE() Function

The UPDATE() function has relevance only within the scope of a trigger. Its sole purpose in life is to provide a Boolean response (true/false) to whether a particular column has been updated or not.
You can use this function to decide whether or not a particular block of code needs to run—for example, if that code is only relevant when a particular column is updated.

Let's run a quick example of this by modifying one of our earlier triggers:

USE AdventureWorks2008;
GO

ALTER TRIGGER Production.ProductAudit
ON Production.ProductInventory
FOR INSERT, UPDATE, DELETE
AS
IF UPDATE(Quantity)
BEGIN
   INSERT INTO Production.InventoryAudit
      (ProductID, NetAdjustment)
   SELECT COALESCE(i.ProductID, d.ProductID),
      ISNULL(i.Quantity, 0) - ISNULL(d.Quantity, 0) AS NetAdjustment
   FROM Inserted i
   FULL JOIN Deleted d
      ON i.ProductID = d.ProductID
      AND i.LocationID = d.LocationID
   WHERE ISNULL(i.Quantity, 0) - ISNULL(d.Quantity, 0) != 0;
END

With this change, we will now limit the rest of the code to run only when the Quantity column (the one we care about) has been changed. The user can change the value of any other column, and we don't care. This means that we'll be executing fewer lines of code and, therefore, this trigger will perform slightly better than our previous version.

The COLUMNS_UPDATED() Function

This one works somewhat differently from UPDATE() but has the same general purpose. What COLUMNS_UPDATED() gives us is the ability to check multiple columns at one time. In order to do this, the function uses a bit mask that relates individual bits in one or more bytes of varbinary data to individual columns in the table. It ends up looking something like Figure 12.4.

In this case, our single byte of data is telling us that the second, third, and sixth columns were updated—the rest were not.

In the event that there are more than eight columns, SQL Server just adds another byte on the right-hand side and keeps counting (see Figure 12.5).

Figure 12.4

Figure 12.5

This time the second, ninth, and fourteenth columns were updated.

I can hear you out there: "Gee, that's nice—but how do I make any use of this?" Well, to answer that, we have to get into the world of Boolean algebra.

Making use of this information means that you need to add up the binary value of each bit you care about, remembering that the leftmost bit shown is the least significant. So, if you want your comparison to take into account columns 2, 5, and 7, then you need to add the binary values of those bits: 2 + 16 + 64. Then you compare the sum of the binary values of your columns to the bit mask by using bitwise operators:

 * | Represents bitwise OR
 * & Represents bitwise AND
 * ^ Represents bitwise Exclusive OR

As I read back over what I've just written, I realize that it is correct, but about as clear as mud, so let's look a little closer at what I mean with a couple of examples.

Imagine that we updated a table that contained five columns. If we updated the first, third, and fifth columns, the bit mask used by COLUMNS_UPDATED() would contain 10101000, which equates to 1 + 4 + 16 = 21.
We could use:

 * COLUMNS_UPDATED() > 0 to find out whether any column was updated
 * COLUMNS_UPDATED() ^ 21 = 0 to find out whether all of the columns specified (in this case 1, 3, and 5) were updated and nothing else was
 * COLUMNS_UPDATED() & 21 = 21 to find out whether all of the columns specified were updated, but the state of other columns doesn't matter
 * COLUMNS_UPDATED() | 21 != 21 to find out whether any column other than those we're interested in was updated

Understand that this is tough stuff—Boolean math is not exactly the easiest of concepts to grasp for most people, so check things carefully and TEST, TEST, TEST!

Performance Considerations

I've seen what appear to be almost holy wars over the pros and cons, evil and good, and light and dark of triggers. The worst of it tends to come from purists—people who love the theory and that's all they want to deal with, or people who have figured out how flexible triggers are and want to use them for seemingly everything.

My two bits' worth on this is, as I stated early in the chapter, use them when they are the right things to use. If that sounds sort of noncommittal and ambiguous—good! Programming is rarely black and white, and databases are almost never that way. I will, however, point out some facts for you to think about.

Triggers Are Reactive Rather Than Proactive

What I mean here is that triggers happen after the fact. By the time that your trigger fires, the entire query has run and your transaction has been logged (but not committed, and only to the point of the statement that fired your trigger). This means that, if the trigger needs to roll things back, it has to undo what is potentially a ton of work that's already been done. Slow! Keep this knowledge in balance, though. How big an impact this adds up to depends strongly on how big your query is.

"So what?" you say. Well, compare this to the notion of constraints, which are proactive—that is, they happen before your statement is really executed. That means that they prevent things that would eventually fail from happening before the majority of the work has been done. This will usually mean that they will run at least slightly faster—much faster on more complex queries. Note that this extra speed really only shows itself to any significant extent when a rollback occurs.

What's the end analysis here? Well, if you're dealing with very few rollbacks, and/or the complexity and runtime of the statements affected are low, then there probably isn't much of a difference between triggers and constraints. There's some, but probably not much. If, however, the number of rollbacks is unpredictable or if you know it's going to be high, you'll want to stick with constraints if you can (and frankly, I suggest sticking with constraints unless you have a very specific reason not to).

Triggers Don't Have Concurrency Issues with the Process That Fires Them

You may have noticed throughout this chapter that we often make use of the ROLLBACK statement, even though we don't issue a BEGIN TRAN. That's because a trigger is always implicitly part of the same transaction as the statement that caused the trigger to fire.

If the firing statement was not part of an explicit transaction (one where there was a BEGIN TRAN), then it would still be part of its own one-statement transaction. In either case, a ROLLBACK TRAN issued inside the trigger will still roll back the entire transaction.
Another upshot of this part-of-the-same-transaction business is that triggers inherit the locks already open on the transaction they are part of. This means that we don't have to do anything special to make sure that we don't bump into the locks created by the other statements in the transaction. We have free access within the scope of the transaction, and we see the database based on the modifications already placed by previous statements within the transaction.

Keep It Short and Sweet

I feel like I'm stating the obvious here, but it's for a good reason.

I can't tell you how often I see bloated, stupid code in sprocs and triggers. I don't know whether it's that people get in a hurry, or if they just think that the medium they are using is fast anyway, so it won't matter.

Remember that a trigger is part of the same transaction as the statement in which it is called. This means the statement is not complete until your trigger is complete. Think about it—if you write long-running code in your trigger, then every piece of code you create that causes that trigger to fire will, in turn, be long running. This can really cause heartache in terms of trying to figure out why your code is taking so long to run. You write what appears to be a very efficient sproc, but it performs terribly. You may spend weeks and yet never figure out that your sproc is fine—it just fires a trigger that isn't.

Don't Forget Triggers When Choosing Indexes

Another common mistake: You look through all your sprocs and views figuring out what the best mix of indexes is—and totally forget that you have significant code running in your triggers.

This is the same notion as the "Short and Sweet" section—long-running queries make for long-running statements which, in turn, lead to long-running everything. Don't forget your triggers when you optimize!

Try Not to Roll Back within Triggers

This one's hard, since rollbacks are so often a major part of what you want to accomplish with your triggers.

Just remember that AFTER triggers (which are far and away the most common type of trigger) happen after most of the work is already done—that means a rollback is expensive. This is where DRI picks up almost all of its performance advantage. If you are using many ROLLBACK TRAN statements in your triggers, then make sure that you pre-process looking for errors before you execute the statement that fires the trigger. That is, because SQL Server can't be proactive in this situation, be proactive for it. Test for errors beforehand rather than waiting for the rollback.

Dropping Triggers

Dropping triggers is as easy as it has been for almost everything else thus far:

DROP TRIGGER <trigger name>;

And it's gone.

Summary

Triggers are an extremely powerful tool that can add tremendous flexibility to both your data integrity and the overall operation of your system. That being said, they are not something to take lightly. Triggers can greatly enhance the performance of your system if you use them for proper summarization of data, but they can also be the bane of your existence. They can be very difficult to debug (even now that we have the debugger), and a poorly written trigger affects not only the trigger itself but any statement that causes that trigger to fire.

13

SQL Cursors

Throughout this book thus far, we've been dealing with data in sets. This tends to go against the way that the more procedure-driven languages go about things.
Indeed, when the data gets to the client end, client applications almost always have to take our set and then deal with it row by row. What they are dealing with is a cursor. Indeed, even in traditional SQL Server tools, we can wind up in something of a cursor mode if we utilize a non-SQL-oriented language in our scripts using the new CLR-based language support.

In this chapter, we will be looking at:

 * What a cursor is
 * The life span of a cursor
 * Cursor types (sensitivity and scrollability)
 * Uses for cursors

We'll discover that there's a lot to think about when creating cursors.

Perhaps the biggest thing to think about when creating cursors is, "Is there a way I can get out of doing this?" If you ask yourself that question every time you're about to create a cursor, then you will be on the road to a better performing system. That being said, we shall see that there are times when nothing else will do.

What Is a Cursor?

Cursors are a way of taking a set of data and being able to interact with a single record at a time. It doesn't happen nearly as often as one tends to think, but there are indeed times when you just can't obtain the results you want by modifying or even selecting the data in an entire set. The set is generated by something all of the rows have in common (as defined by a SELECT statement), but then you need to deal with those rows on a one-by-one basis.

The result set that you place in a cursor has several distinct features that set it apart from a normal SELECT statement:

 * You declare the cursor separately from actually executing it.
 * The cursor and, therefore, its result set are named at declaration; you then refer to it by name.
 * The result set in a cursor, once opened, stays open until you close it.
 * Cursors have a special set of commands used to navigate the recordset.

While SQL Server has its own engine to deal with cursors, there are actually a few different object libraries that can also create cursors in SQL Server:

 * SQL Native Client (used by ADO.NET)
 * OLE DB (used by ADO)
 * ODBC (used by RDO, DAO, and in some cases, OLE DB/ADO)
 * JDBC (used by Java)
 * DB-Lib (now a distant legacy offering, but still used in some older apps)

These are the libraries that client applications will typically use to access individual records. Each provides its own syntax for navigating the recordset and otherwise managing the cursor. Each, however, shares the same set of basic concepts, so once you have one object model down for cursors, you're most of the way there for all of them.

Every data access API out there (ADO.NET, ADO, ODBC, OLE DB, JDBC, and so on) returns data to a client application or component in a cursor. It's simply the only way that non-SQL programming languages can currently deal with things. This is the source of a big difference between this kind of cursor and SQL Server cursors. With SQL Server cursors, you usually have a choice to perform things as a set operation, which is what SQL Server was designed to do. With the API-based cursors, all you have is cursors, so you don't have the same cursor versus no cursor debate that you have in your server-side activities.

The client-side part of your data handling is going to be done using cursors. That's a given, so don't worry about it. Instead, worry about making the server side of your data access as efficient as possible; that means not using cursors on the server side if you can possibly help it.
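To make that advice concrete: anything you find yourself wanting to do one row at a time should first be attempted as a single set operation. A hypothetical 10 percent price rise on a hypothetical dbo.Products table, for example, needs no cursor at all:

-- Set-based: one statement, one pass, and the optimizer does the work
UPDATE dbo.Products
SET UnitPrice = UnitPrice * 1.10
WHERE UnitPrice < 10.00;

The cursor equivalent would be twenty-odd lines of DECLARE, OPEN, FETCH, CLOSE, and DEALLOCATE, and it would touch the table one row at a time.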
The Life Span of a Cursor

Cursors have lots of little pieces to them, but I think that it's best if we get right into looking first at the most basic form of cursor and then build up from there.

Before we get into the actual syntax, though, we need to understand that using a cursor requires more than one statement. Indeed, it takes several. The main parts include:

 * The declaration
 * Opening
 * Utilizing/navigating
 * Closing
 * Deallocating

That being said, the basic syntax for declaring a cursor looks like this:

DECLARE <cursor name> CURSOR
FOR <select statement>
[FOR UPDATE [OF <column name> [,...n]]][;]

Or for better ANSI/ISO support:

DECLARE <cursor name> [INSENSITIVE] [SCROLL] CURSOR
FOR <select statement>
[FOR {READ ONLY | UPDATE [OF <column name> [,...n]]}][;]

The <select statement> section of the cursor declaration is at the very heart of the matter. This is a section that is required under even the most basic of cursor syntax, and that's because it's the one and only clause that determines what data should be placed in the cursor.

Almost any SELECT statement is valid—even those including an ORDER BY clause. As long as your SELECT statement provides a single result set, you should be fine. Examples of options that would create problems would be any of the summary options such as a CUBE or ROLLUP.

FOR UPDATE

By default, any cursor that is updatable at all is completely updatable—that is, if one column can be edited, then any of them can.

The FOR UPDATE option allows you to specify that only certain columns are to be editable within this cursor. If you include this option, then only the columns in your column list will be updatable. Any columns not explicitly mentioned will be considered read-only.

Navigating the Cursor: The FETCH Statement

I figure that whoever first created the SQL cursor syntax must have really liked dogs. They probably decided to think of the data they were after as being the bone, with SQL Server the faithful bloodhound. From this, I'm guessing, the FETCH keyword was born.

It's an apt term if you think about it. In a nutshell, it tells SQL Server to "go get it, boy!" With that, our faithful mutt (in the form of SQL Server) is off to find the particular bone (row) we were after. We've gotten a bit of a taste of the FETCH statement in some of the previous cursors in this chapter, but it's time to look at this very important statement more closely.

FETCH actually has many more options than what we've seen so far. Up to this point, we've seen three different options for FETCH (NEXT, PRIOR, and FIRST). These really aren't a bad start. Indeed, we really only need to add one more for the most basic set of cursor navigation commands, and a few after that for the complete set.

Let's look at each of the cursor navigation commands and see what they do for us:

FETCH Option | Description
---|---
NEXT | This moves you forward exactly one row in the result set and is the backbone option. Ninety percent or more of your cursors won't need any more than this. Keep this in mind when deciding whether to declare as FORWARD_ONLY or not. When you try to do a FETCH NEXT and it results in moving beyond the last record, you will have a @@FETCH_STATUS of −1.
PRIOR | As you have probably surmised, this one is the functional opposite of NEXT. This moves backward exactly one row. If you perform a FETCH PRIOR when you are at the first row in the result set, then you will get a @@FETCH_STATUS of −1 just as if you had moved beyond the end of the file.
FIRST | Like most cursor options, this one says what it is pretty clearly. If you perform a FETCH FIRST, then you will be at the first record in the recordset. The only time this option should generate a @@FETCH_STATUS of −1 is if the result set is empty.
If you perform a FETCH FIRST, then you will be at the first record in the recordset. The only time this option should generate a @@FETCH_STATUS of −1 is if the result set is empty.
LAST | The functional opposite of FIRST, FETCH LAST moves you to the last record in the result set. Again, the only way you'll get a −1 for @@FETCH_STATUS on this one is if you have an empty result set.
ABSOLUTE | With this one, you supply an integer value that indicates how many rows you want from the beginning of the cursor. If the value supplied is negative, then it is that many rows from the end of the cursor. Note that this option is not supported with dynamic cursors (since the membership in the cursor is redone with every fetch, you can't really know where you're at). This equates roughly to navigating to a specific "absolute position" in a few of the client access object models.
RELATIVE | No—this isn't your mother-in-law kind of thing. Instead, this is about navigating by moving a specified number of rows forward or backward relative to the current row.
+
We've already gotten a fair look at a few of these in our previous cursors. The other navigational choices work pretty much the same.
+
Altering Data within Your Cursor
+
Up until now, we've kind of glossed over the notion of changing data directly in the cursor. Now it's time to take a look at updating and deleting records within a cursor.
+
Since we're dealing with a specific row rather than set data, we need some special syntax to tell SQL Server that we want to update. Happily, this syntax is quite easy given that you already know how to perform an UPDATE or DELETE.
+
Essentially, we're going to update or delete data in the table that underlies our cursor. Doing this is as simple as running the same UPDATE and DELETE statements that we're now used to, but qualifying them with a WHERE clause that matches our cursor row. We just add one line of syntax to our DELETE or UPDATE statement:
+
WHERE CURRENT OF <cursor name>
+
Nothing remarkable about it at all. Just for grins though, we'll go ahead and implement a cursor using this syntax:
+
USE AdventureWorks2008;
+
/* Build the table that we'll be playing with this time */
+
SELECT SalesOrderID, CustomerID
+
INTO CursorTable
+
FROM Sales.SalesOrderHeader
+
WHERE SalesOrderID BETWEEN 43661 AND 43665;
+
-- Now create a unique index on it in the form of a primary key
+
ALTER TABLE CursorTable
+
ADD CONSTRAINT PKCursor
+
PRIMARY KEY (SalesOrderID);
+
/* The IDENTITY property was automatically brought over when
+
** we did our SELECT INTO, but I want to use my own OrderID
+
** value, so I'm going to turn IDENTITY_INSERT on so that I
+
** can override the identity value.
+
*/
+
SET IDENTITY_INSERT CursorTable ON;
+
-- Declare our cursor
+
DECLARE CursorTest CURSOR
+
SCROLL -- So we can scroll back and see if the changes are there
+
KEYSET
+
FOR
+
SELECT SalesOrderID, CustomerID
+
FROM CursorTable;
+
-- Declare our two holding variables
+
DECLARE @SalesOrderID int;
+
DECLARE @CustomerID varchar(5);
+
-- Get the cursor open and the first record fetched
+
OPEN CursorTest;
+
FETCH NEXT FROM CursorTest INTO @SalesOrderID, @CustomerID;
+
-- Now loop through them all
+
WHILE @@FETCH_STATUS = 0
+
BEGIN
+
IF (@SalesOrderID % 2 = 0) -- Even number, so we'll update it
+
BEGIN
+
-- Make a change. This time though, we'll do it using cursor syntax
+
UPDATE CursorTable
+
SET CustomerID = -99999
+
WHERE CURRENT OF CursorTest;
+
END
+
ELSE -- Must be odd, so we'll delete it.
+
BEGIN
+
-- Now we'll delete a record so we can see how to deal with that
+
DELETE CursorTable
+
WHERE CURRENT OF CursorTest;
+
END
+
FETCH NEXT FROM CursorTest INTO @SalesOrderID, @CustomerID;
+
END
+
-- Now go back to the top. We can do this since we have a scrollable cursor
+
FETCH FIRST FROM CursorTest INTO @SalesOrderID, @CustomerID;
+
-- And loop through again.
+
WHILE @@FETCH_STATUS != -1
+
BEGIN
+
IF @@FETCH_STATUS = -2
+
BEGIN
+
PRINT ' MISSING! It probably was deleted.';
+
END
+
ELSE
+
BEGIN
+
PRINT CAST(@SalesOrderID AS varchar) + ' ' + CAST(@CustomerID AS varchar);
+
END
+
FETCH NEXT FROM CursorTest INTO @SalesOrderID, @CustomerID;
+
END
+
-- Now it's time to clean up after ourselves
+
CLOSE CursorTest;
+
DEALLOCATE CursorTest;
+
DROP TABLE CursorTable;
+
Again, I'm treating this one as an entirely new cursor. We've done enough deletions, additions, and updates that I suspect you'll find it easier to just key things in a second time rather than having to look through row by row to see what you might have missed.
+
We are also again using the modulus operator (%) that we saw earlier in the book. Remember that it gives us nothing but the remainder. Therefore, if the remainder of any number divided by 2 is zero, then we know the number was an even number.
+
The rest of the nuts and bolts of this don't require any rocket science, yet we can quickly tell that we got some results:
+
(5 row(s) affected)
+
(1 row(s) affected)
+
(1 row(s) affected)
+
(1 row(s) affected)
+
(1 row(s) affected)
+
(1 row(s) affected)
+
MISSING! It probably was deleted.
+
43662 *
+
MISSING! It probably was deleted.
+
43664 *
+
MISSING! It probably was deleted.
+
You can see the multiple "1 row affected" messages that are returned for every row affected by the UPDATE and DELETE statements. When we get down to the last result set enumeration, you can quickly tell that we deleted all the odd numbers (which is what we told our code to do), and that we updated the even-numbered rows with a new CustomerID. (The asterisks appear because −99999 won't fit in our varchar(5) holding variable; when SQL Server converts an int to a character type too small to hold it, it returns an asterisk rather than the value.)
+
No tricks—just a WHERE clause that makes use of the WHERE CURRENT OF syntax.
+
Summary
+
Cursors give us those memories of the old days when we could address things row by row. Ahhh, it sounds so romantic with that "old days" kind of thought. WRONG! I'd stick to set operations any day if I thought I could get away with it.
+
The fact is that set operations can't do everything. Cursors are going to be the answer any time a solution must be done on a row-by-row basis. Notice that I used the word "must" in there, and that's the way you should think of it. Cursors are great for taking care of some problems that can't be solved by any other means.
+
That being said, remember to avoid cursor use wherever possible. Cursors are resource pigs, and a cursor-based solution will often carry a performance penalty of 100 times or worse versus the equivalent set operation. It is extremely tempting—especially if you come from the mainframe world or from a dBase background—to just keep thinking in that row-by-row method. Don't fall into that trap! Cursors are meant to be used only when no other options are available.
+
14
+
Reporting Services
+
There are a few chapters in my books where I've chosen to overlap content between the Beginning and Professional titles.
Now, it may seem like beginning and professional topics would be mutually exclusive, but that holds true only in a perfect world where everyone is gaining experience in the same way and in the same order, and where everyone has the same definition of beginning and professional. + +In case you haven't already guessed it, this is one of those chapters where, if you've read my Beginning title, you're going to notice a little bit of overlap. In the case of Reporting Services, the reasons are multifold, but a couple of the key ones are: + + * Some people get into database development specifically driven by the need to control more of their own reporting destiny (in which case they may have almost started with Reporting Services, and then started learning the queries they need to support the data in the report). Others are long-term database "experts" who are just getting around to using one of those "extras" that SQL Server provides. + * It's a relatively new feature (in the grand life of SQL Server as a product), so it's "new" to many professional-level people. + +Now, don't go rushing off yet if you read the chapter on Reporting in the Beginning title. While we do repeat some key items, we go a bit deeper here, and focus on more of the true developer-oriented items (and less on the model-driven aspects). Feel free, however, to skip ahead to the section on the data sources and data source views, where we will take a far more "Pro" look at things including parameterization, drill-throughs, and charting. + +A Quick Look at Reports as a Concept + +After all the queries have been written, and after all the stored procedures have been run, there remains a rather important thing we need to do in order to make our data useful—make it available to end users. + +Reporting is one of those things that seems incredibly simple, but turns out to be rather tricky. You see, you can't simply start sticking numbers in front of people's faces. The numbers must make sense and, if at all possible, capture the attention of the person you're reporting for. To produce reports that actually get used and, therefore, are useful, there are a few things to keep in mind: + + * Use Just the Right Amount of Data: Do not try to do too much in one report; nor should you do too little. A report that is a jumble of numbers is going to lose a reader's attention quickly, and you'll find that it doesn't get utilized after the first few times it is generated. Likewise, a barren report will get just a glance and get tossed without any real thought. Find a balance of mixing the right amount of data with the right data. + * Make it Appealing: Sad as it is to say, another important element in reporting is what one of my daughters would call making it "prettiful," which is to say, making it look nice and pleasing to the eye. An ugly report is a dead report. + +In this chapter, we're going to be taking a look at a few key concepts of Reporting Services (often referred to as SSRS), and then moving on to some more advanced aspects. While I do indeed skip some of the "basics," I cover some fundamental items necessary to make any sense out of the more advanced topics, but then quickly move on to the Report Designer, which allows for the most advanced reporting options Reporting Services has to offer. + +For the sake of brevity (and to minimize overlap), I cover report models in this book only with a discussion of what they are there for, not with a specific example. 
This is one of the places where I draw the line between Beginning- and Pro-level information. That said, even if you did not already understand report models before reading this chapter, you'll find that learning about core items such as data sources and the Report Designer will make learning how to use the Report Modeler and the Report Model designer largely intuitive. The building of actual reports will be similarly easy. + +Reporting Services 101 + +Odds are that you've already generated some reports in your day. They may have been paper reports off a printer (perhaps in something as rudimentary as Access's reporting area, which is actually one of the best parts of Access to me). Or perhaps you have used a rather robust reporting engine such as Crystal Reports. Even if you haven't used tools that fancy, one can argue that handing your boss the printout from a stored procedure is essentially a very simple (albeit not necessarily nice-looking) report. I would tend to agree with that argument. + +The reality, however, is that our managers and coworkers today expect something more. This is where Reporting Services comes in. Reporting Services really has two different varieties of operation: + + * Report Models: This is making use of a relatively simple, Web-driven interface that is meant to allow end users to create their own simple reports. + * Reports Generated in the Business Intelligence Development Studio: While this doesn't necessarily mean you have to write code (you can actually create some fairly robust reports using drag-and-drop functionality), you can get pretty fancy and do very complex things depending on just how far you want to take it. + +Note that, while your users can eventually access these reports from the same Reporting Services Web host, they are based on somewhat different architectures (and are created in different fashions). + +In addition, Reporting Services provides features for pre-generating reports (handy if the queries that underlie the report take a while to run) as well as for distributing the report via e-mail. Exported reports can be rendered in PDF, Excel, and Word formats. + +Tools Used with Reporting Services + +Reporting Services has several tools to help you create, use, and manage reports. These include: + + * Reporting Services Configuration Manager: This tool can be found in the Configuration Tools subfolder under the main SQL Server. This allows you to configure such things as the account Reporting Services runs under, the IP addresses and ports the supporting Web server will respond to, the virtual directory names used for Reporting Services, e-mail accounts to be used, and the database used to keep track of Reporting Services information, as well as encryption keys and scalability configuration information. + * Business Intelligence Development Studio (BIDS): This is essentially Visual Studio with a set of templates installed that focus on Reporting Services, Analytics, Integration Services, and Data Mining. If you already have Visual Studio 2008 installed, BIDS just adds some more templates and shortcuts to get to Visual Studio. We will be utilizing the Development Studio extensively over several of the remaining chapters of this book (sometimes in its base SQL Server installed form, and sometimes as part of a full Visual Studio installation). + * SQL Server Management Studio: In the Management Studio, you can connect to virtually all of the different SQL Server–related services in order to (can you see this one coming? 
Of course you can!), manage things about that particular service. While only the base data engine has what I would consider "full functionality" entirely wrapped up in the Management Studio, the Studio is the place to perform most security-related tasks as well as anything tied to job scheduling.
 * The Report Server Website: This is where you go to actually run most of the reports you'll want executed in Reporting Services but, through the Site Settings link (in the upper-right side of the browser), it is also a place to manage some elements of your server (in particular, caching, assigning roles, and scheduling).
+
Unfortunately, no individual tool does everything involved in Reporting Services. Indeed, none of them even comes close to covering everything in the way that Management Studio does for the database engine. But by utilizing a combination of the various tools, we're able to manage all the aspects of our Report Server.
+
Other Means of Accessing Reporting Services
+
Reporting Services also supports a fairly robust Web service model. There is a set of libraries provided to support .NET projects accessing the Reporting Services Web Service API. We will take a look at the basics of that toward the end of the chapter.
+
Report Server Projects
+
Report Models (the primary discussion of Reporting Services in my Beginning title) can be considered "scratching the surface" of things. Reporting Services has much more flexibility than that. (Indeed, there are entire books solely about Reporting Services; there is that much to it.) In addition to the Report Modeler, the Business Intelligence Development Studio will allow you to create Report Server Projects.
+
As I said earlier, there are entire books about this subject, so the approach we're going to take here is to start with a little taste of the possibilities through a simple example. We'll then expand on things a bit.
+
A lot has changed with the look and feel of Report Server Projects for this release. Microsoft licensed a number of the Reporting Services components from Dundas (a component development company). These are a significant upgrade in the componentry for Reporting Services.
+
In our journey to look at Report Server Projects, we'll start with several core items that are common to both the Report Modeler and Report Server Projects. If you are already familiar with data sources and data source views, you can just scan the next two sections to pick up the relevant parts of the project example and then skip ahead to the section where we discuss the actual report layout.
+
So, let's get started with a Report Server Project. Start by opening the Business Intelligence Development Studio and creating a new project. You'll want to use the Report Server Project template found under the Business Intelligence Projects node, as shown in Figure 14.1.
+
Note that the exact appearance of this dialog may vary somewhat depending on whether you have Visual Studio installed and, if so, which specific languages and templates you've installed. The image shown is of a full version of Visual Studio, as it is required for some of the more advanced topics of this book.
+
Figure 14.1
+
This will serve as the project for most of what we are going to do in this chapter. With our project now created, we're ready to get into some of the key concepts of a report.
Some of these will be a review if you've read my Beginning title, but you'll want to get this first report together to have it available for some of the more robust examples later.
+
Data Sources
+
Data sources and data source views (we'll be looking at those next) are perhaps the most central items in Reporting Services. Each serves in some fashion regardless of what specific type of report you're building and regardless of whether it's using the Report Modeler or a Report Project. Although they have similar names, they serve slightly different levels in the hierarchy of pulling data together into a report.
+
A data source is essentially the definition required for connecting to wherever you're getting your data from. This can be a connection to a SQL Server or any OLE DB or ODBC data source. If you ponder the possibilities of that for a moment, you should quickly come to the conclusion that, although Reporting Services is associated with SQL Server, you have the prospect of using a wide variety of non–SQL Server data sources in your reports. This is a very powerful concept indeed.
+
There are two types of data sources:
+
 * Embedded: This type of data source is stored within the same file that defines the report. We will take a look at the XML (called Report Definition Language—or RDL) a little later in the chapter, but suffice it to say that all the relevant information for the data source is stored in an XML block within the report definition file. Access to this kind of data source definition is limited to the report in which it is embedded.
 * Shared: This is largely the same as an embedded data source, except that the definition for the data source is stored in its own file (usually with the extension .rds).
+
We will be making use of a shared data source later in the chapter.
+
Regardless of the type, data sources store several pieces of required information, and optionally store additional items to deal with security scenarios.
+
Creating a Data Source
+
Let's go ahead and create a data source that we will use throughout the remainder of this chapter.
+
If your Visual Studio environment is still in its default configuration, you should see the Solution Explorer on the upper-right side. Right-click Shared Data Sources and choose Add New Data Source, as shown in Figure 14.2.
+
Figure 14.2
+
This will bring up the Shared Data Source Properties dialog (as shown in Figure 14.3).
+
Figure 14.3
+
The dialog has two major elements, the first of which allows us to define the name (I've named mine for the database we're going to connect to) as well as the connection string for our data source. (For those not familiar with connection strings, a connection string tells whatever object is connecting to your data source where to go and how to log in.) You can either edit the connection string directly or click the Edit button to bring up the Connection Properties dialog shown in Figure 14.4.
+
The first time I saw this dialog, I was mildly surprised to see that it was different from the connection dialog that had been used repeatedly in the Management Studio; nonetheless, it does contain the same basic elements, just in a slightly different visual package (in short, don't worry if it looks a little different).
+
Figure 14.4
+
In my case, I've selected the local server, the system administrator account (sa), and our old friend, the AdventureWorks2008 database.
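For reference, the connection string this dialog builds for the selections just described comes out looking something like the following (the server name will, of course, vary with your environment):
+
Data Source=localhost;Initial Catalog=AdventureWorks2008
+
Notice that the credentials are not part of the string; Reporting Services keeps those separate, which is exactly what the Credentials page we're about to visit is for.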
+
Go ahead and click OK, then choose the Credentials option in the Data Source Properties dialog, and we get the security options for our data source (see Figure 14.5).
+
Figure 14.5
+
We have several options here worth discussing—they include:
+
 * Use Windows Authentication: This is what it sounds like. It authenticates based on the user who executes the report. This means that the related Windows user account must have access to not only the report, but also all underlying data related to the report.
 * Use this user Name and Password: The user name and password referenced will be SQL Server login information (not Windows).
 * Prompt for Credentials: Again, this is predictable. Credentials are obtained from the user at run time. The credentials supplied will be passed to whatever data provider the report utilizes.
 * No Credentials: This forces anonymous access, so the data provider needs to support such access or you will get an authentication error when you run the report.
+
In Figure 14.5, I've chosen to use sa and provided the related password. This means that the supplied login and password will be persisted (in an encrypted form) with the data source in the .rds file.
+
When we click OK for this dialog, we wind up back at our relatively generic Visual Studio project, but we have our new data source and are ready to create more of the required pieces for our report.
+
Using the Report Wizard
+
Even though we didn't choose the Report Wizard project type when we created this project, elements of the Report Wizard are still available as we create reports. Indeed, the simple act of asking for a new report will, by default, bring up the Report Wizard. You can cancel out of the wizard to create a blank report, but, unless you do, Visual Studio will try to use the wizard to do some of the work for you.
+
To move on with the example we're building, we'll go ahead and add a report to walk through the Report Wizard process. For our example, we'll say that our manager has asked us for a summary report showing the total sales by category for all the sales invoices written by David Campbell in July 2003. She has warned us that she may ask about other salespeople and periods later, but, for now, the information on Mr. Campbell for July 2003 is all she needs.
+
To get started, right-click the Reports node in the Solution Explorer, select Add New Report as shown in Figure 14.6, and it should bring up the Report Wizard Welcome dialog.
+
Figure 14.6
+
Click Next to move on to the data source selection dialog shown in Figure 14.7. Note that, while I've chosen to use the shared data source we created a few moments ago, I could also create a new data source as part of this dialog. (The new data source would be embedded, but could be converted to shared later if we so chose.)
+
Figure 14.7
+
Again click Next to move on to the Query Builder dialog shown in Figure 14.8.
I've already created a query, and it looks like this:
+
SELECT per.FirstName + ' ' + per.LastName AS Employee,
+
ps.Name AS Subcategory,
+
SUM(sod.LineTotal) AS Sales,
+
soh.SalesOrderID,
+
soh.SalesOrderNumber,
+
p.Name AS Product,
+
SUM(sod.OrderQty) AS OrderQty,
+
sod.UnitPrice,
+
pc.Name AS Category
+
FROM Sales.SalesOrderHeader soh
+
JOIN Sales.SalesPerson sp
+
ON sp.BusinessEntityID = soh.SalesPersonID
+
JOIN Sales.SalesOrderDetail sod
+
ON soh.SalesOrderID = sod.SalesOrderID
+
JOIN HumanResources.Employee e
+
ON soh.SalesPersonID = e.BusinessEntityID
+
JOIN Person.Person per
+
ON per.BusinessEntityID = sp.BusinessEntityID
+
JOIN Production.Product p
+
ON sod.ProductID = p.ProductID
+
JOIN Production.ProductSubcategory ps
+
ON p.ProductSubcategoryID = ps.ProductSubcategoryID
+
JOIN Production.ProductCategory pc
+
ON ps.ProductCategoryID = pc.ProductCategoryID
+
WHERE (DATEPART(Year, soh.OrderDate) = 2003)
+
AND (DATEPART(Month, soh.OrderDate) = 7)
+
AND (soh.SalesPersonID = 283)
+
GROUP BY per.FirstName + ' ' + per.LastName,
+
DATEPART(Month, soh.OrderDate),
+
soh.SalesOrderID,
+
soh.SalesOrderNumber,
+
p.Name,
+
ps.Name,
+
sod.UnitPrice,
+
pc.Name
+
There isn't any real rocket science to this query. It is simply gathering up sales totals for the salesperson with an ID of 283 (which happens to be David Campbell) in July of 2003. We will look at how to make this selectable later in the chapter, but, for now, we'll go with the simple, hard-coded query.
+
Figure 14.8
+
Paste in this query code (you can find it in the downloadable sample code on the wrox.com or www.professionalsql.com websites), and click Next to choose between a tabular and a matrix report. A tabular report is a classic row-by-row data layout. A matrix looks for an intersection of data, and is more oriented around displaying totals at the intersection of a column and row. For this particular report, we'll go with the tabular option, and then click Next to move on to the dialog shown in Figure 14.9.
+
Figure 14.9
+
The sales report we're generating is going to show the total for each sales order that Mr. Campbell issued in July 2003. The selections we're making now will have the wizard create part of the formatting we need. Choose SalesOrderNumber as a Group By item, and the Category and Sales fields as detail items, and click Next. In the next dialog (shown in Figure 14.10), I've chosen a block format. There isn't any real magic in it. I've just chosen it because I think it suits this particular data best. I've also chosen to include subtotals. Since we're grouping by SalesOrderNumber, this means we will get a total for each SalesOrderNumber value.
+
Figure 14.10
+
Again click Next to choose a style for the wizard to use in configuring the report. I happen to be choosing Ocean, but anything will work. Click Next one last time to see a summary, as shown in Figure 14.11, of what the wizard is going to do and to name your report. (I've chosen SalesOrderSummary. I'd suggest using that name since we will alter this report as we go through the chapter.) You're then ready to click Finish to generate the actual report.
+
Figure 14.11
+
The report that first comes up (shown in Figure 14.12) doesn't look that complex.
+
Figure 14.12
+
Go ahead and choose the Preview tab to see what the report looks like with real data (shown in Figure 14.13).
+
Figure 14.13
+
This is indeed a nice start, but it has some significant flaws, so let's look at editing the report.
+
Editing Reports
+
To edit a report, we move back to the Design tab for the report in Visual Studio. Continuing our example, we have a few issues we would like to take care of to clean up the look of the report:
+
 * The title should reflect a more proper title format.
 * The number values should look more like currency values.
 * We're seeing each instance of a category sale, not a total as was requested.
+
Let's take each of these in turn.
+
First up, let's change the title. This is the easiest of the changes we'll make. Simply click the area of the title once to select it, and a second time to make your cursor active so you can edit it much as you would any other label object. Double-clicking has the same effect. Go ahead and select it and change the title to D. Campbell, July 2003 Summary.
+
Next, we'll take on the number formatting issue. Again, this isn't that difficult. Simply right-click the field that holds our Sales information, and select Text Box Properties as shown in Figure 14.14.
+
Figure 14.14
+
This brings up the dialog shown in Figure 14.15, which allows us to set a wide variety of properties for the cell of our report table (which, incidentally, is called a tablix). In Figure 14.15, I've chosen the Numbers node, and set our number display to round to the nearest whole unit of currency and to use a separator for thousands.
+
Notice that it doesn't ask what you want to use as a thousands separator, nor does it just assume that you want to use a comma. The thousands separator will vary based on what localization your report server is configured for, and can be overridden on a report-by-report basis.
+
Figure 14.15
+
That takes us to the last and trickiest of the changes we decided to make: rolling up each category to a total within each sales order. To do this, we again right-click the cell that contains the [Sales] value as shown in Figure 14.16. We choose the row group, and modify the properties using the dialog shown in Figure 14.17. This will limit the rows returned to just one per category within the larger SalesOrderNumber group. (Notice the brackets on the far left of the tablix. Remember we added that one by selecting it when we were in the Report Wizard.) We're not quite done in here, though. Since we're focused on categories, we should probably sort the categories to make them a bit more readable. To do that, we can choose the Sorting node in the current dialog, as shown in Figure 14.18.
+
Figure 14.16
+
Figure 14.17
+
Figure 14.18
+
So, with all that accomplished, it would seem that we're ready to preview our report again, but, when we do, we see that, while things are vastly improved, we still have a few problems (as shown in Figure 14.19).
+
Figure 14.19
+
While our report is starting to look good, we have some problems with our numbers. If you were to compare them with the earlier values that were returned (you can go back to Figure 14.13 to see those), you would quickly see that our numbers don't add up. Indeed, the report is not showing the totals for each category, but rather the first row returned in each category. We can't have that!
+
To fix this, we need to explicitly indicate what we want done for each cell. Once again, right-click our [Sales] cell, but, this time, click Expression as shown in Figure 14.20.
+
Figure 14.20
+
The dialog returned shows that we are currently returning the exact value from the Sales field in the data set:
+
=Fields!Sales.Value
+
What we need, however, is a total—or a Sum—for the field within the group. To do this, we can use one of the many built-in functions of Reporting Services. In this case, the Sum function:
+
=Sum(Fields!Sales.Value)
+
So, to see how this looks in the dialog, check out Figure 14.21.
+
Figure 14.21
+
Click OK, and preview the report again, and we now have a reasonably well-formatted report (don't get too carried away formatting it—we're just getting started with this report!) shown in Figure 14.22, and we're ready to run it, print it (or export it to another format), and deliver it to our manager.
+
Figure 14.22
+
Parameterizing Reports
+
Getting this report on David Campbell is all well and good, but it is pretty limiting. Recall that our manager warned us that she might want it for other people and other time periods later on. It's time to implement that functionality.
+
Parameterization is a vital part of most reporting projects. Fortunately, making SQL Server recognize a report as parameterized is relatively easy. Once a report is parameterized, SQL Server will prompt the user in some fashion to supply a parameter value. As we'll learn in this section, we have many options for making parameter choices easy on the user.
+
As our first step, we will add the most rudimentary parameterization to our report. Making our report reliant on parameters starts with simply altering our query to expect those parameters. We'll then just need to tell the report to request the parameters before the report is executed. Let's start by editing our query. Go to the Report Data item in the View menu for the project. (It's also available as a tab in the Solution Explorer pane.) The Report Data tab is shown in Figure 14.23. Just double-click our one data set for this report to bring up the dialog shown in Figure 14.24, which will, among other things, allow us to edit our query. (Some reports can have several data sets. This particular report just has one.)
+
Note that you can also edit the query in a separate Query Editor window by right-clicking the data set and selecting Query.
+
Figure 14.23
+
Figure 14.24
+
I have already changed our hard-coded values for David Campbell's BusinessEntityID, the month of July, and the year of 2003 to be parameter values (@BusinessEntityID, @Month, and @Year, respectively). With this complete, we're ready to move on to the Parameters node of the dialog as shown in Figure 14.25.
+
Figure 14.25
+
I've added each of the parameters in this dialog, so I can now click OK, and I'm ready to preview (or just downright run) the report. In Figure 14.26, I have run it via the Preview tab. Notice at the top of the pane how it has asked for (and I have provided) the three parameters.
+
Figure 14.26
+
In looking over the report, you can see that we wound up with exactly the same values that we had in our original report, only now we could run the report for a different time period or for a different sales rep. Our report just became a lot more flexible.
+
Providing/Controlling Parameter Values and How They Are Used
+
Well, the report as we have it seems pretty nice. We can not only provide a report on David Campbell as we could before, but we can now input different parameters, including a different employee's BusinessEntityID and a completely different time period. We do, however, still have several usability issues.
Some of these include: + + * The input values are free-form, which means users may input illegal values. + * There are no hints at what might be a proper input value, so the user is left to know ahead of time, or guess. This is not too horrible for the date and year, but would be problematic in terms of getting the right salesperson's BusinessEntityID. + * No matter which sales person you input, the header is hard-coded to say David Campbell. A similar issue exists for the month and year. + +Let's take a look at how to fix these issues. + +Creating Pre-set Parameter Lists + +Reporting Services gives you the ability to create pre-defined value lists for your parameters. This functionality utilizes the parameters that we've already defined, and simply adds additional properties to them. + +To add fixed lists to our @Month and @Year parameters, we navigate to the Parameters node of the Report Data tab, expand the list, and then double-click the parameter for which we're interested in supplying values. (You could also right-click the parameter and then select Parameter Properties.) Go ahead and try this for the @Month parameter, which should bring up the dialog shown in Figure 14.27. + +Figure 14.27 + +Notice that I could set a custom prompt for my parameter. (It doesn't have to be the parameter name.) I can also control the initial visibility of the parameter (perhaps for a parameter that is only valid if another parameter is set to a specific value) as well as the nullability or acceptance of blank values. + +I've mostly stuck with the defaults here, but I did change the data type to be an Integer. (Remember we are taking the month number as a parameter.) We're then ready to move on to the Available Values node shown in Figure 14.28. + +Figure 14.28 + +I've made several modifications in this dialog—most notably supplying separate labels and values. The label indicates what the user will be shown to choose from, and the value will be what is passed to the parameter when the report is executed. I was given the ability to create this list as part of choosing the Specify Values option. Note, however, that I could also have made the list query driven. (We'll get to one of those shortly.) + +Go ahead and switch over to the Defaults node, and you can see we are allowed to supply a default value. (In Figure 14.29, I've chosen the value of 7 that we have been working with thus far.) + +Figure 14.29 + +Finally, switch over to the Advanced node (shown in Figure 14.30), and we are given the option of selecting when our report data will change if the user changes the parameter value. We can force a refresh every time, require the user to explicitly call for the refresh, or allow SQL Server to decide when it is the right time. + +Figure 14.30 + +Go ahead and try this out on your own by setting the data type for the @Year parameter to Integer and the default value to 2003. Then we're ready to preview or run the report again to check out the effects of our changes as shown in Figure 14.31. + +Figure 14.31 + +While you can't see any significant difference in the BusinessEntityID and Year parameters, you should quickly notice that Month is now a drop-down list that supplies the name of each month even though the parameter will really use the integer value for the month. You can also test out entering text into the year field. SQL Server will indicate the type mismatch relatively gracefully. (It isn't the prettiest thing ever, but it's better than a full blown error.) 
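Before we move on to query-driven lists, it's worth seeing in one place what the parameterization did to the data set. Here is roughly how the WHERE clause of our original query reads once the hard-coded values are swapped for the three parameters (the rest of the query is unchanged):
+
WHERE (DATEPART(Year, soh.OrderDate) = @Year)
+
AND (DATEPART(Month, soh.OrderDate) = @Month)
+
AND (soh.SalesPersonID = @BusinessEntityID)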
+
Creating Parameter Lists from Queries
+
Supplying a pre-populated list for our @BusinessEntityID parameter is a bit trickier than it was for the other two parameters. We could create a fixed list much as we did with @Month, but that would mean we would have to edit the report every time the list of salespersons changed. While months are likely to remain very stable (unless Einstein comes back from the dead with a new theory on time), salespeople have a tendency to come and go with high frequency. Editing the report each time is very impractical, particularly when we already have salesperson information entered elsewhere in the system.
+
To get this started, we need to create a new data set. Start by right-clicking the data source in the Report Data tab, and select Add Dataset as shown in Figure 14.32.
+
Figure 14.32
+
This, in turn, brings up the dialog shown in Figure 14.33.
+
Figure 14.33
+
I've already supplied a query that lists all of the salespeople. It is entirely visible in the dialog, but just to make it clear, it looks like this:
+
SELECT p.BusinessEntityID, p.LastName + ', ' + p.FirstName AS Name
+
FROM Person.Person p
+
JOIN Sales.SalesPerson sp
+
ON p.BusinessEntityID = sp.BusinessEntityID;
+
Now continue to the Fields node as shown in Figure 14.34. This allows us to select what the returned fields are going to be named (so you can access them) in any reports that use this data set. I am sticking with the defaults here, but we could have altered the names on the results if we had so chosen. Click OK, and our data set is created. We're now ready to use it to populate our parameter list.
+
Figure 14.34
+
Double-click the BusinessEntityID parameter again to open it back up for editing, then move to the Available Values node shown in Figure 14.35. I have again filled in suitable values. I have, as you might expect, chosen the Get values from a query option. I have likewise chosen which data set to use as a source and which fields from the data set relate to the value and label fields (which function just as they did when we manually supplied values for them). I am also going to go to the Default Values node and set a default of 283 (our old friend, David Campbell) before previewing or executing the report as shown in Figure 14.36.
+
Figure 14.35
+
Figure 14.36
+
So, just that quickly, we have all of our parameters defaulted and data typed as appropriate. All that leaves us to deal with is the fixed header.
+
Getting Headings and Other Fields from Parameters
+
Editing a text box to use parameter values is relatively easy. Start by selecting the text box that holds our current fixed value and getting it into an edit mode. To make it dynamic, I need to combine several items. First, I'll start off with a prefix to my dynamic values; I'll use the phrase "Summary for:". I then need to right-click again and choose Create Placeholder, bringing up the dialog in Figure 14.37. The placeholder will allow Reporting Services to distinguish between my literal text and my functional code. Note that the value field has a drop-down box, and that by expanding it you can choose between a wide array of dynamic values. In our case, I've supplied a reference to one of the parameters the user selected. Go ahead and click OK, and then preview or run the report to see the effect (shown in Figure 14.38).
+
Figure 14.37
+
Figure 14.38
+
To finish out this section, let's add another placeholder or two, but this time let's use the expression editor.
Add a comma and a space after the placeholder we just created, and then right-click and again choose Add Placeholder. This time, however, click the Fx button to the right of the Value field to bring up the dialog shown in Figure 14.39. In this figure, I'm in the middle of adding a reference to the Month parameter that the user selected when they ran the report, but notice that Visual Studio is providing me with IntelliSense while I edit. Go ahead and add placeholders for both Month and Year, and your report should now come out looking something like Figure 14.40. + +Figure 14.39 + +Figure 14.40 + +Adding Charts + +Reporting Services also supports chart objects. This is relatively powerful stuff, as it does a lot to allow our reports to become more than just a source for reporting, but also a venue for more genuine analysis. We're going to add one chart to our report to provide a visual representation of the sales this month between categories. + +Start by opening the Visual Studio toolbox and dragging a Chart object onto your report. (I'm placing mine to the right of our tablix.) This brings up the dialog shown in Figure 14.41, and allows us to choose between a wide array of chart types. + +Figure 14.41 + +Given that we don't have that many categories to choose from, I've decided to go with a pie chart in a 3D representation (shown in Figure 14.41). To get this working, I can just drag fields from data sets in my Report Data tab right into special receiver areas on the chart (shown in Figure 14.42). I've dragged the sales field from Dataset1 into the Drop Data Fields Here area, and the categories field into the Drop Category Fields Here area. + +Figure 14.42 + +Also change the (caption) field in the chart properties to Sales by Category, and we're again ready to run or preview the report as shown in Figure 14.43. + +Figure 14.43 + +Just that quickly, we have a basic chart available to provide a visual representation of the numbers in our tablix. + +Note that there is no interdependence between the two objects. They happen to make use of the same data set, but there is no requirement that they do so. Indeed, we did most of this report building without the chart, and we could, if we so desired, delete the tablix and work only with the chart. + +Linking Reports + +Reporting Services also allows you to link multiple reports, either drilling down into finer levels of detail, or drilling across into a totally different report. + +The linking process is supported through what are termed "Actions." Actions support both internal (other reports) and external (such as a website) links. + +Let's add one last element to the report we've been working on in this chapter. To make use of this link, you'll want to download (if you haven't already) the code for this book, and look for the SalesOrderDetail.rdl file that I've pre-created for you. You can add it to your project by right-clicking Reports in the Solution Explorer, and choosing Add⇒Existing Item. + +To make use of this new Sales Order Detail report, you need to edit the properties for the text box that has the Sales Order Number on your report, then access the Actions settings as shown in Figure 14.44. + +Figure 14.44 + +Once you have the SalesOrderDetail.rdl file properly added to the project and have configured the SalesOrderNumber action as shown in Figure 14.44, go ahead and run or preview your summary report one last time. 
Now click the first Sales Order Number for David Campbell in July of 2003, and you should get the Sales Order Detail report shown in Figure 14.45.
+
Figure 14.45
+
Deploying the Report
+
The only thing left to do is deploy the report. To deploy, you right-click the report in the Solution Explorer and choose Deploy. There is, however, a minor catch—you need to define the target to deploy to in the project definition.
+
1. Right-click the Report Server Project and choose Properties.
+
2. In the TargetReportFolder field, put whatever folder you want it to reside in when you log into the Report Manager.
+
3. In the TargetServerURL field, enter the URL to your ReportServer. In my case, this may be as simple as http://localhost/ReportServer, but the server name could be any server to which you have appropriate rights to deploy. (The Virtual Directory may also be something other than ReportServer if you defined it that way at install.)
+
After you've deployed (by right-clicking the project and selecting Deploy), you'll want to view the report. Navigate to your report server. (If it is on the local host and uses the default directory, it would be http://localhost/Reports.) Click your report folder, and choose your SalesOrderSummary report. It may take a bit to come up the first time you load it, but you should see your report just as we defined it in our project. (If you navigate back to it again, the report definition will be cached and thus come up fairly quickly.)
+
A Brief Note on RDL
+
RDL stands for Report Definition Language—an XML-based language that defines reports. All the changes we made to our report over the course of this chapter were translated into RDL by Visual Studio. If you want to see what the RDL for your report project looks like, right-click your report and choose View Code. The following is an excerpt from the report I produced as an example for this chapter. It defines the data set that supplied the values for our sales staff to the appropriate parameter:
+
<DataSet Name="DataSet2">
+
<Fields>
+
<Field Name="BusinessEntityID">
+
<DataField>BusinessEntityID</DataField>
+
<rd:TypeName>System.Int32</rd:TypeName>
+
</Field>
+
<Field Name="Name">
+
<DataField>Name</DataField>
+
<rd:TypeName>System.String</rd:TypeName>
+
</Field>
+
</Fields>
+
<Query>
+
<DataSourceName>AdventureWorks2008</DataSourceName>
+
<CommandText>SELECT p.BusinessEntityID, p.LastName + ', ' +
+
p.FirstName AS Name
+
FROM Person.Person p
+
JOIN Sales.SalesPerson sp
+
ON p.BusinessEntityID = sp.BusinessEntityID;</CommandText>
+
<rd:UseGenericDesigner>true</rd:UseGenericDesigner>
+
</Query>
+
</DataSet>
+
You can modify the RDL directly if you wish. (But be careful. It can be a hassle to figure out what exactly you did wrong if you introduce an error through direct editing.)
+
Summary
+
Reporting Services has had a major impact on many SQL Server installations. For many companies, having a relatively robust reporting server built right into their central data store has been liberating, making it much easier to disseminate information to data consumers. For other organizations, Reporting Services has provided an adequate solution to replace long-standing reporting packages such as Crystal Reports. SQL Server 2008 adds several new features and controls to allow for more elegant and powerful reports, plus the engine has been redesigned to allow for much higher scalability.
+
Even with the relatively robust report used in this chapter, we've really only just begun to taste the possibilities. Reports can be parameterized, you can embed charts, integrate with other products (such as Windows SharePoint Services or Microsoft Office SharePoint Server), drill through from one report to another, and even embed reports inside of other reports.
+
For more information on reporting, I'd suggest a book specific to Reporting Services.
+
15
+
Buying in Bulk: The Bulk Copy Program (BCP) and Other Basic Bulk Operations
+
If your system is going to be operating in something of a bubble, then you can probably skip this chapter and move on. Unfortunately, the real world doesn't work that way, so you probably ought to hang around for a while.
+
For most systems, there will eventually come a time (often, it's many times) when you need to move around large blocks of data. Sometimes you need to bring in data that's in the wrong format or that's sitting in another application's data files. Sometimes, you need to extract data to feed directly to another system. The good thing is that SQL Server has two tools to help you move data fast—the Bulk Copy Program (bcp) and SQL Server Integration Services (SSIS). In this chapter, we'll be looking primarily at the first of these. In addition, we'll take a look at bcp's close cousins—the BULK INSERT command and OPENROWSET (BULK).
+
We will examine SSIS in the next chapter.
+
bcp is something of an old friend. You know the one—where you hardly ever see them anymore, but, when you do, you reminisce about all the crazy things you used to do together. It was, for a very long time, the way we moved around large blocks of data; and it did so (still does, as far as that goes) amazingly fast. What it lacks, however, is sex appeal—well, frankly, since SQL Server 7.0, it has lacked appeal in a whole lot of areas.
+
So, why then am I even spending a chapter on it? Well, because bcp still definitely has its uses. Among its advantages are:
+
 * It's very compact.
 * It can move a lot of data very quickly.
 * It is legacy—that is, there may be code already running that is making effective use of it, so why change it?
 * It uses a cryptic, yet very traditional scripting style (which will probably appeal to some).
 * It is very consistent.
+
bcp is used for transferring text and SQL Server native-format data to and from SQL Server tables. It has changed very little in the last several versions, and other bulk features have continued to erode the usefulness of bcp, but it still holds its own. You can think of bcp as a data pump, with little functionality other than moving data from one place to the other as efficiently as possible. The various other bulk operations we'll look at in this chapter are often easier to use, but usually come at the price of less flexibility.
+
In this chapter, we will look at some of the ins and outs of bcp and then use what we learn about bcp to form the foundations of many of the other features that serve a similar purpose—to get data in and out of your system as quickly as possible.
+
bcp Utility
+
bcp runs from an operating system command prompt to import or export native data (specific to SQL Server), ASCII text, or Unicode text. This means that you can execute bcp from an operating system batch file or user-defined stored procedure, as well as from other places. bcp can also be run as part of a scheduled job, or executed from a .NET object through the use of a shell command.
+
Like most command-line utilities, options can be specified using a hyphen (-) or forward slash (/); however, unlike most DOS or Windows family utilities, option switches are case sensitive.
+
bcp Syntax
+
bcp {[[<database name>.][<owner>].]{<table name> | <view name>} | "<query>"}
{in | out | queryout | format} <data file>
+
[-m <max errors>] [-f <format file>] [-x] [-e <err file>]
+
[-F <first row>] [-L <last row>] [-b <batch size>]
+
[-n] [-c] [-w] [-N] [-V (60 | 65 | 70 | 80 | 90)] [-6]
+
[-q] [-C <code page>] [-t <field terminator>] [-r <row terminator>]
+
[-i <input file>] [-o <output file>] [-a <packet size>]
+
[-S <server name>[\<instance name>]] [-U <login id>] [-P <password>]
+
[-T] [-v] [-R] [-k] [-E] [-h "<hint> [,...n]"]
+
Geez—that's a lot to take in, so let's go through these switches one by one. (Thankfully, most of them are optional, so you will usually include only a fraction of them.)
+
Note that many of the switches for the bcp utility are case sensitive; often, a given letter has an entirely different meaning between cases.
+
Parameter | Description
---|---
<database name> | Exactly what it sounds like. Basically, this is a standard part of the four-part naming scheme. If not specified, the user's default database is assumed.
<owner> | More of the four-part naming scheme stuff. Again, exactly what it sounds like.
<table name>, <view name>, or "<query>" | Can only be one—table, view, or query. This is the input destination or output source table or view. A SQL Server query can be used only as the source for a bcp export, and only when queryout is specified. If the query returns multiple result sets, only the first result set is used by bcp.
in <data file>, out <data file>, queryout <data file>, or format <data file> | Again, can only be one. If using any of these, you must also supply a source or destination file. Establishes the direction of the bcp action. in indicates that you are importing data from a source file into a table or view. out indicates that you are exporting data from a table or view into the destination file. Use queryout only for output to the destination file using a query as its source. Use format to create a format file based on the format option you've selected. You must also specify -f, as well as format options (-n, -c, -w, -6, -C, or -N) or answer prompts from interactive bcp. The source or destination path and filename is specified as <data file> and cannot include more than 255 characters.
-m <max errors> | You can specify a maximum number of errors that you will allow before SQL Server cancels the bulk copy operation, defaulting to 10 errors. Each row that cannot be copied by bcp is counted as one error.
-f <format file> | A format file contains responses saved from a previous bcp operation on the same table or view. This parameter should include the full path and filename of the format file. This option is used primarily with the in and format options to specify the path and filename when making use of or creating a format file.
-x | Generates an XML-based format file instead of the straight-text version that is the default. (The non-XML version is legacy support, but remains the default for now.) It must be used with both the format and -f options.
-e <err file> | You can specify the full path and filename for an error file to store any rows that bcp is not able to transfer. Otherwise, no error file is created. Any error messages will be displayed at the client station.
-F <first row> | Use this option if you want to specify the first row to be copied by the bulk copy operation. If not specified, bcp defaults to a value of 1 and begins copying with the first row in the source data file. This option can be handy if you want to handle your loading in chunks, and can be used to pick back up where you left off in a previous loading run.
-L <last row> | This option is the complement of -F. It provides a method for determining the last row you want loaded as part of this bcp execution. If not specified, bcp defaults to a value of 0, the last row in the source file.
When used in conjunction with –F, this option can allow you to load your data one chunk at a time, loading small blocks of data and then picking up next time where the previous load left off. +-b batch size | You can specify the number of rows copied as a batch. A batch is copied as a single transaction. Like all transactions, the rows of the batch are committed in an "all or nothing" fashion—either every row is committed or the transaction is rolled back and it is as if the batch never happened. The –h (hint) switch has a similar option (ROWS_PER_BATCH), which should be considered to be mutually exclusive with –b (use neither or one of them, but not both). +-n | Native data types (SQL Server data types) are used for the copy operation. Using this option prevents the need to answer the questions regarding the data types to be used in the transfer (it just picks up the native type and goes with it). +-c | This specifies that the operation uses character data (text) for all fields, and, as such, does not require a separate data type question for each field. A tab character is assumed as field delimiter unless you use the –t option and a newline character as row separator unless you specify different terminator using -r. +-w | The -w option is similar to –c but specifies Unicode data type instead of ASCII for all fields. Again, unless you override with –t and –r, the tab character and row separator are assumed to be the field delimiter and newline character, respectively. This option cannot be used with SQL Server version 6.5 or earlier. +-N | This is basically the same as –w, using Unicode for character data but uses native data types (database data types) for non-character data. This option offers higher performance when going from SQL Server to SQL Server. As with –w, this option cannot be used with SQL Server version 6.5 or earlier. +-V (60|65|70|80|90) | Causes bcp to utilize data type formats that were available only in previous versions of SQL Server. 60 uses 6.0 data types, 65 uses 6.5 data types, 70 uses 7.0 data types, 80 uses 2000 data types, and 90 uses 2005 data types. This replaces the –6 option. +-6 | Use this option to force bcp to use SQL Server 6.0 or 6.5 data types. This option is used in conjunction with the -c or -n format options for backward-compatibility reasons only. Use –V whenever possible (when working with SQL Server 7.0 or newer, which should be pretty much always at this point). +-q | Use -q to specify that a table or view name includes non-ANSI characters. This effectively executes a SET QUOTED_IDENTIFIERS ON statement for the connection used by bcp. The fully qualified name, database, owner, and table or view must be enclosed in double quotation marks, in the format "database name.owner.table". +-C | This option is used to specify the code page for the data file data. It is only necessary to use this option with char, varchar, or text data having ASCII character values of less than 32 or greater than 127. A code page value of ACP specifies ANSI/Microsoft Windows (ISO 1252). OEM specifies the default client code page. If RAW is specified, there will be no code page conversion. You also have the option of providing a specific code page value. Avoid this option where possible; instead, use a specific collation in the format file or when asked by bcp. +-t | This option allows you to override the default field terminator. The default terminator is the tab character. 
You can specify the terminator as tab (\t), newline (\n), carriage return (\r), backslash (\\), null terminator (\0), any printable character, or a string of up to 10 printable characters. For example, you would use "-t," for a comma-delimited text file.
-r row terminator | This option works just like –t except that it allows you to override the default row terminator (as opposed to the field terminator). The default terminator is \n, the newline character. The rules are otherwise the same as for -t.
-i input file | You have the option of specifying a response file, as the input file, containing the responses to be used when running bcp in interactive mode. (This can save answering a ton of questions!)
-o output file | You can redirect bcp output from the command prompt to an output file. This gives you a way to capture command output and results when executing bcp from an unattended batch or stored procedure.
-a packet size | You have the option of overriding the default packet size for data transfers across the network. Larger packet sizes tend to be more efficient when you have good line quality (few CRC errors). The specified value must be between 4096 and 65535, inclusive, and overrides whatever default has been set up for the server. At installation, the default packet size is 4096 bytes. This can be overridden using SQL Server Management Studio or the sp_configure system stored procedure.
-S server name | If running bcp from a server, the default is the local SQL Server. This option lets you specify a different server and is required in a network environment when running bcp from a remote system.
-U login id | Unless connecting to SQL Server through a trusted connection, you must provide a valid username for login.
-P password | When you supply a username, you must also supply a password. Otherwise, you will be prompted for a password. Include -P as your last option with no password to specify a null password.
-T | You have the option of connecting to the server using network user credentials through a trusted connection. If a trusted connection is specified, there is no need to provide a login name or password for the connection.
-v | When this option is used, bcp returns version number and copyright information.
-R | Use this option to specify that the regional format defined by the client's locale settings is used when copying currency, date, and time data. The default is that regional settings are ignored.
-k | Use this option to override the use of column default values during bulk copy, ignoring any default constraints. Empty columns will retain a null value rather than the column default.
-E | This option is used during import when the import source file contains identity column values and is essentially equivalent to SET IDENTITY_INSERT ON. If not specified, SQL Server will ignore the values supplied in the source file and automatically generate identity column values. You can use the format file to skip the identity column when importing data from a source that does not include identity values and have SQL Server generate the values.
-h "hint[,...]" | The hint option lets you specify one or more hints to be used by the bulk copy operation. Option -h is not supported for SQL Server version 6.5 or earlier.
ORDER column [ASC|DESC] | You can use this hint to improve performance when the sort order of the source data file matches the clustered index in the destination table. If the destination table does not have a clustered index or if the data is sorted in a different order, the ORDER hint is ignored.
ROWS_PER_BATCH=nn | This can be used in place of the -b option to specify the number of rows to be transferred as a batch. Do not use this hint with the -b option.
KILOBYTES_PER_BATCH=nn | You can optionally specify batch size as the approximate number of kilobytes of data to be transferred in a batch.
TABLOCK | This will cause a table-level lock to be acquired for the duration of the operation. Default locking behavior is set by the table lock on bulk load table option.
CHECK_CONSTRAINTS | By default, check constraints are ignored during an import operation. This hint forces check constraints to be checked during import.
FIRE_TRIGGERS | Similar to CHECK_CONSTRAINTS, this option causes any triggers on the destination table to fire for the transaction. By default, triggers are not fired on bulk operations. This option is not supported in versions of SQL Server prior to 2000.

bcp runs in interactive mode, prompting for format information, unless -f, -c, -n, -w, -6, or -N is specified when the command is executed. When running in interactive mode, bcp will also prompt to create a format file after receiving the format information.

bcp Import

Okay, so up to this point we've been stuck in the preliminaries. Well, it's time to get down to the business of what bcp is all about.

Probably the most common use of bcp is to import bulk data into existing SQL Server tables and views. To import data, you must have access permissions to the server, either through a login ID or a trusted connection, and you must have INSERT and SELECT permissions on the destination table or view.

The source file can contain native-format data, ASCII character data, Unicode data, or mixed native and Unicode data. Remember to use the appropriate option to describe the source data. Also, for the data file to be usable, you must be able to describe the field and row terminators (using –t and –r) or the fields and rows must be terminated with the default tab and newline characters, respectively.

Be sure you know your destination before you start. bcp has a few quirks that can affect data import. Values supplied for timestamp or computed columns are ignored—if the source file has values for those columns, SQL Server simply throws them away. If the source file doesn't have values for these columns, you'll need a format file (which we'll see later in this chapter) so you can skip over them.

This is one of those really bizarre behaviors that you run across from time to time in about any piece of software you might use. In this case, if your destination table contains timestamp or computed columns, your source file is required to have columns to represent them even though SQL Server will just ignore that data—silly, isn't it? Again, the way around this is to use a format file that explicitly says to skip the columns in question.

For bcp operations, rules are ignored. Any triggers and check constraints are ignored unless the FIRE_TRIGGERS and/or CHECK_CONSTRAINTS hints are specified. Unique constraints, indexes, and primary/foreign key constraints are enforced. Default constraints are enforced unless the -k option is specified.

Data Import Example

The easiest way to see how bcp import works is to look at an example. Let's start with a simple one: a tab-delimited file containing department information for the AdventureWorks2008 database.
Here's how the data looks:

1 Smart Guys Research and Development 2006-04-01 00:00:00.000

2 Product Test Research and Development 2006-04-01 00:00:00.000

To import this into the Department table using a trusted connection at the local server, you run:

BCP AdventureWorks2008.HumanResources.Department in c:\DepartmentIn.txt -c -T

Two things are important here: First, up to this point, everything we've run has been done in Management Studio. For bcp, however, you type your command into a command-prompt box. Second, you'll need to change the preceding command line to match wherever you've downloaded the sample files/data for this book.

Because the first column in the Department table is an identity column and the -E option wasn't specified, SQL Server will ignore the identity values in the file and generate new values. The -c option identifies the source data as character data, and -T specifies to use a trusted connection.

Note that, if you have not been using Windows authentication and haven't set up your network login with appropriate rights in SQL Server, then you may need to modify the preceding example to utilize the –U and –P options.

When we execute it, SQL Server quickly tells us some basic information about how our bulk copy operation went:

2 rows copied.

Network packet size (bytes): 4096

Clock Time (ms.) Total : 109 Average : (18.35 rows per sec.)

We can go back into Management Studio and verify that the data went into the Department table as expected:

USE AdventureWorks2008;

SELECT * FROM HumanResources.Department;

which gets us back several rows—most importantly, the two we expect from our bcp operation:

DepartmentID Name GroupName ModifiedDate

------------ ---------------- -------------------------------------- --------------

1 Engineering Research and Development 1998-06-01...

2 Tool Design Research and Development 1998-06-01...

...

...

16 Executive Executive General and Administration 1998-06-01...

17 Smart Guys Research and Development 2006-04-01...

18 Product Test Research and Development 2006-04-01...

As always, note that, other than the two rows we just imported, your data may look a bit different depending on which parts of this book you've run the examples from, which you haven't, and how much playing around of your own you've done. For this example, you just want to see that Smart Guys and Product Test made it into the table with the appropriate information. The identity values will have been reassigned to whatever was next for your particular server.

Now let's look at a more involved example. Let's say we have a table called CustomerList. A CREATE statement to make our CustomerList table looks like this:

CREATE TABLE dbo.CustomerList

(

CustomerID nchar(5) NOT NULL

PRIMARY KEY,

CompanyName nvarchar(40) NOT NULL,

ContactName nvarchar(30) NULL,

ContactTitle nvarchar(30) NULL,

Address nvarchar(60) NULL,

City nvarchar(15) NULL,

Region nvarchar(15) NULL,

PostalCode nvarchar(10) NULL,

Country nvarchar(15) NULL,

Phone nvarchar(24) NULL,

Fax nvarchar(24) NULL

);

We have a comma-delimited file (in the same format as a .csv file) with new customer information. This time, the file looks like:

XWALL,Wally's World,Wally Smith,Owner,,,,,,(503)555-8448,,

XGENE,Generic Sales and Services,Al Smith,,,,,,,,(503)555-9339,,

XMORE,More for You,Paul Johnston,President,,,,,,(573)555-3227,,

What's with all the commas in the source file?
Those are placeholders for columns in the CustomerList table. The source file doesn't provide values for all of the columns, so commas are used to skip over those columns. This isn't the only way to handle a source file that doesn't provide values for all of the columns. You can use a format file to map the source data to the destination. We'll be covering format files a little later in the chapter.

Imagine for a moment that we are going to run bcp to import the data to a remote system. The command is:

BCP AdventureWorks2008.dbo.CustomerList in c:\newcust.txt -c -t, -r\n -Ssocrates

-Usa -Pbubbagump

The line wrapping shown here was added to make the command string easier to read. Do not press Enter to wrap if you try this example yourself. Type the command as a single string and allow it to wrap itself inside the command prompt.

Once again, the data is being identified as character data. The -t, option identifies the file as comma-delimited (terminated) data, and -r\n identifies the newline character as the row delimiter. Server connection information was also provided for a little variety this time, using sa as your login and bubbagump as the password.

Again, bcp confirms the transfer along with basic statistics:

Starting copy...

3 rows copied.

Network packet size (bytes): 4096

Clock Time (ms.) Total : 15 Average : (200.00 rows per sec.)

And again we'll also go verify that the data got there as expected:

USE AdventureWorks2008;

SELECT CustomerID, CompanyName, ContactName

FROM dbo.CustomerList

WHERE CustomerID LIKE 'X%';

And, sure enough, all our data is there...

CustomerID CompanyName ContactName

---------- ---------------------------------------- --------------------------

XGENE Generic Sales and Services Al Smith

XMORE More for You Paul Johnston

XWALL Wally's World Wally Smith

Logged vs. Non-logged

bcp can run in either fast mode (not logged) or slow mode (logged operation). Each has its advantages. Fast mode gives you the best performance, but slow mode provides maximum recoverability. Since slow mode is logged, you can run a quick transaction log backup immediately after the import and be able to recover the database should there be a failure.

Fast mode is usually your best option when you need to transfer large amounts of data. Not only does the transfer run faster, but, since the operation isn't logged, you don't have to worry about running out of space in the transaction log. What's the catch? There are several conditions that must be met for bcp to run as non-logged:

 * The target table cannot be replicated.
 * If the target table is indexed, it must not currently have any rows.
 * If the target table already has rows, it must not have any indexes.
 * The TABLOCK hint is specified.
 * The target table must have no triggers.
 * For versions prior to SQL Server 2000, the select into/bulkcopy option must be set to true.

Obviously, if you want to do a fast mode copy into an indexed table with data, you will need to:

 * Drop the indexes
 * Drop any triggers
 * Run bcp
 * Reindex the target table
 * Re-create any triggers

You need to immediately back up the destination database after a non-logged bcp operation.

If the target table doesn't meet the requirements for fast bcp, then the operation will be logged. This means that you run the risk of filling the transaction log when transferring large amounts of data.
You can run BACKUP LOG using the WITH TRUNCATE_ONLY option to clear the transaction log. The TRUNCATE_ONLY option truncates the inactive portion of the log without backing up any data.

I can't stress enough how deadly bcp operations can be to the size of your log. If you can't achieve a minimally logged operation, then consider adjusting your batch size down and switching to the Simple recovery model (the successor to the old trunc. log on chkpt. option) for the duration of the operation. Another solution is to use the –F and –L options to pull things in a block at a time and truncate the log in between each block of data. Recognize, however, that an important part of your backup strategy—the transaction log—is now missing part of the information it needs to properly restore the database. It is, therefore, critical that you create a fresh backup as soon as your bulk load activity is complete.

bcp Export

If you're going to be accepting data in via bulk operations, then it follows that you probably want to be able to pump data out, too.

bcp allows you to export data from a table, view, or query. You must specify a destination filename. If the file already exists, it will be overwritten. Unlike import operations, you are not allowed to skip columns during export. Timestamp, rowguid, and computed columns are exported in the same manner as any other SQL Server columns (just as if they were "real" data). To run an export, you must have appropriate SELECT authority on the source table or tables.

Let's look at a couple of quick examples using the HumanResources.Department table in the AdventureWorks2008 database.

To export to a data file using the default format, you could run:

BCP AdventureWorks2008.HumanResources.Department out c:\somedir\

DepartmentOut.txt -c -T

Note that if you're running Vista or a later version of the Windows operating system (including Windows Server 2008), new security controls will likely prevent you from doing a bulk extract to the root directory (C:\ on most systems)—thus my use of somedir in the preceding code.

This would create a file that looks like:

1 Engineering Research and Development 1998-06-01 00:00:00.000

2 Tool Design Research and Development 1998-06-01 00:00:00.000

...

...

17 Smart Guys Research and Development 2006-04-01 00:00:00.000

18 Product Test Research and Development 2006-04-01 00:00:00.000

In this case, we didn't have to use a format file, nor were we prompted for any field lengths or similar information. The use of the -c option indicated that we just wanted everything, regardless of type, exported as basic ASCII text in a default format. The default calls for tabs as field separators and the newline character to separate rows.

Keep in mind that the destination file will be overwritten if it already exists. This will happen without any kind of prompt or warning.

To modify the separator to something custom, we could run something like:

BCP AdventureWorks2008.HumanResources.Department out DepartmentOut.txt -c -T -t,

Notice the comma at the end. That is not a typo. The next character after the t is the field separator—in this case, a comma.

This would give us:

1,Engineering,Research and Development,1998-06-01 00:00:00.000

2,Tool Design,Research and Development,1998-06-01 00:00:00.000

...

...

17,Smart Guys,Research and Development,2006-04-01 00:00:00.000

18,Product Test,Research and Development,2006-04-01 00:00:00.000

We used a comma separator instead of a tab, and got what amounts to a .csv file.
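Before we move on, queryout deserves a quick illustration of its own, since it's the one direction keyword we haven't actually run yet. Here's a minimal sketch under the same assumptions as the preceding examples (AdventureWorks2008 on the local server, a trusted connection); the particular SELECT is just for illustration:

BCP "SELECT Name, GroupName FROM AdventureWorks2008.HumanResources.Department ORDER BY Name" queryout c:\somedir\DeptNames.txt -c -T

Because the source is a query rather than a table or view, plain out won't do; you must use queryout. And, as noted back in the switch table, if the query returns multiple result sets, only the first one makes it into the file.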
Format Files

If you have any previous experience dealing with the kinds of files we typically have handed to us with the dreaded "load this data into our database" order, then you have probably looked at my previous import examples and said, "Heh—I wish my data actually came in that cleanly formatted...." Yes indeed, data rarely looks as perfect as we would like it to, and that brings us to the concept of format files.

Format files were first mentioned in the previous section, and provide something of an import template. Among other things, they make it easier to support recurring import operations when:

 * Source file and target table structures or collations do not match.
 * You want to skip columns in the target table.
 * Your file contains data that makes the default data typing and collation difficult or unworkable.

Format files come in two varieties: non-XML and XML. We will start off by looking at the "old" way of doing things (the non-XML version) and then take a look at the newer XML format files.

To get a better idea of how each type of format file works, let's look at some specific examples. First you'll see how the file is structured when the source and destination match. Next, you can compare this to situations where the number of source file fields doesn't match the number of table columns or where source fields are ordered differently than the table columns.

You can create a default format file (which is non-XML for backward-compatibility reasons) to use as your source when you run bcp in interactive mode. After prompting for column value information, you're given the option of saving the file. The default filename is BCP.fmt, but you can give the format file any valid filename.

To create a default format file like this for the AdventureWorks2008 database HumanResources.Department table, you could run:

BCP AdventureWorks2008.HumanResources.Department out c:\somedir\department.txt –T

This is a handy way of creating a quick format file that you can then edit as needed. You can do this with any table, so you can use bcp to get a jump-start on your format file needs.

Accept the default prefix and data length information for each field, and, in this case, a comma as the field terminator. SQL Server will prompt you to save the format file after you've entered all of the format information; in my case, I'm going to save it off as Department.fmt. You can then edit the format file to meet your particular needs with any text editor, such as Windows Notepad.

Let's take a look at the format file we just produced:

10.0

4

1 SQLSMALLINT 0 2 "," 1 DepartmentID ""

2 SQLNCHAR 2 100 "," 2 Name

SQL_Latin1_General_CP1_CI_AS

3 SQLNCHAR 2 100 "," 3 GroupName

SQL_Latin1_General_CP1_CI_AS

4 SQLDATETIME 0 8 "," 4 ModifiedDate ""

The first two lines in the file identify the bcp version number (10.0 for SQL Server 2008, 9.0 for SQL Server 2005, and so on) and the number of fields in the host file. The remaining lines describe the host data file and how the fields match up with target columns and collations.

The first column is the host file field number, numbering from 1 through the total number of fields. Next is the host file data type. The example file has a mix of a few data types. All of the text is in Unicode format, so the data type of the character fields is SQLNCHAR. Given that there are no special characters in this data, we could have just as easily gone with a SQLCHAR (ASCII) format.
The next two columns describe the prefix and data length for the data fields. The prefix is the number of prefix characters in the field. The prefix describes the length of the data in the actual bcp file and allows the data file to be compacted to a smaller size. The data length is the maximum length of the data stored in the field. Next is the field terminator (delimiter). In this case, a comma is used as the field terminator and newline as the row terminator. The next two columns describe the target table columns by providing the server column order and server column name. Since there is a direct match between the server columns and host fields in this example, the column and field numbers are the same, but it didn't necessarily have to work that way. Last, but not least, comes the collation for each column. (Remember that, with SQL Server 2000 and newer, we can have a different collation for every column in a table.)

Now, let's check the XML version. To create this, we run almost the same command, but add the -x switch:

BCP AdventureWorks2008.HumanResources.Department out c:\somedir\department.txt –T –x

The format file we wind up with looks radically different. The general shape of it is shown here (the exact field types and attributes will depend on the answers you give to the interactive prompts):

<?xml version="1.0"?>

<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format"

xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

<RECORD>

<FIELD ID="1" xsi:type="NativeFixed" LENGTH="2"/>

<FIELD ID="2" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="3" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="4" xsi:type="NativeFixed" LENGTH="8"/>

</RECORD>

<ROW>

<COLUMN SOURCE="1" NAME="DepartmentID" xsi:type="SQLSMALLINT"/>

<COLUMN SOURCE="2" NAME="Name" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="3" NAME="GroupName" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="4" NAME="ModifiedDate" xsi:type="SQLDATETIME"/>

</ROW>

</BCPFORMAT>

Notice that everything is explicitly called out. What's more, there is an XML schema document associated with XML format files, which means you can validate the XML in your XML editor of choice.

I'm not going to pick any bones about this. I LOVE the new XML-formatted version. If you don't need to worry about compatibility with versions prior to SQL Server 2005, this one seems a no-brainer to me.

The old format files work, but, every time I work with them extensively, I consider purchasing stock in a pain reliever company. They are that much of a headache if you have to do anything beyond the defaults. Everything about them has to be "just so," and, in larger tables, it's easy to miss a typo since fields are not clearly separated. XML tagging fixes all that and makes clear what every little entry is there for—debugging is much, much easier.

When Your Columns Don't Match

If only the world was perfect and the data files we received always looked just like our tables.

Okay, time to come out of dreamland. I'm reasonably happy with the world I live in, but it's hardly a perfect place, and the kinds of data files I need to do bulk operations on rarely look like their destination. So, what then are we to do when the source file and destination table do not match up the way we want? Or what about going the other way—from a table to an expected data file format that isn't quite the same?

Fortunately, format files allow us to deal with several different kinds of variations we may have between source and destination data. Let's take a look.

Files with Fewer Columns Than the Table

Let's start with the situation where the data file has fewer fields than the destination table. We need to modify the format file we've already been using to identify which columns do not exist in the data file and, accordingly, which columns in our table should be ignored. This is done by setting the prefix and data length to 0 for each missing field and the table column number to 0 for each column we are going to skip.
For example, if, as one might expect, the data file has only DepartmentID, Name, and GroupName, you would modify the file to:

10.0

4

1 SQLSMALLINT 0 2 "," 1 DepartmentID ""

2 SQLNCHAR 2 100 "," 2 Name

SQL_Latin1_General_CP1_CI_AS

3 SQLNCHAR 2 100 "," 3 GroupName

SQL_Latin1_General_CP1_CI_AS

4 SQLDATETIME 0 0 "," 0 ModifiedDate ""

As you can see, the ModifiedDate field and column have been zeroed out. Because ModifiedDate is not supplied and the column has a default value (GETDATE()), that default value will be used for our inserted rows.

The XML version doesn't look all that different, but, instead of zeroing out elements of the definition, we simply don't define them:

<?xml version="1.0"?>

<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format"

xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

<RECORD>

<FIELD ID="1" xsi:type="NativeFixed" LENGTH="2"/>

<FIELD ID="2" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="3" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

</RECORD>

<ROW>

<COLUMN SOURCE="1" NAME="DepartmentID" xsi:type="SQLSMALLINT"/>

<COLUMN SOURCE="2" NAME="Name" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="3" NAME="GroupName" xsi:type="SQLNVARCHAR"/>

</ROW>

</BCPFORMAT>

There was no fourth field in the file to define, so we didn't. We aren't sticking anything in the ModifiedDate column, so we skipped that, too (counting on the default in its case).

More Columns in the File Than in the Table

The scenario for a data file that has more columns than the table does is actually amazingly similar to the short data file scenario we just looked at. The only trick here is that you must add column information for the additional fields, but the prefix length, data length, and column number fields are all set to 0:

10.0

5

1 SQLSMALLINT 0 2 "," 1 DepartmentID ""

2 SQLNCHAR 2 100 "," 2 Name

SQL_Latin1_General_CP1_CI_AS

3 SQLNCHAR 2 100 "," 3 GroupName

SQL_Latin1_General_CP1_CI_AS

4 SQLDATETIME 0 8 "," 4 ModifiedDate ""

5 SQLDATETIME 0 0 "," 0 CreatedDate ""

This time, the host file includes a field for the date the department was created. The target table doesn't have a column to receive this information. The field is added to the original format file as a dummy entry with a column number of 0. This will force bcp to ignore the field.

For this one, the XML version does have to deal with the fact that the file has a column that needs to be addressed. The destination, however, we can continue to ignore:

<?xml version="1.0"?>

<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format"

xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

<RECORD>

<FIELD ID="1" xsi:type="NativeFixed" LENGTH="2"/>

<FIELD ID="2" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="3" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="4" xsi:type="NativeFixed" LENGTH="8"/>

<FIELD ID="5" xsi:type="NativeFixed" LENGTH="8"/>

</RECORD>

<ROW>

<COLUMN SOURCE="1" NAME="DepartmentID" xsi:type="SQLSMALLINT"/>

<COLUMN SOURCE="2" NAME="Name" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="3" NAME="GroupName" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="4" NAME="ModifiedDate" xsi:type="SQLDATETIME"/>

</ROW>

</BCPFORMAT>

Mismatched Field Order

Another possibility is that the host and target have the same fields, but the field orders don't match. This is corrected by changing the server column order to match the host file order:

10.0

4

1 SQLSMALLINT 0 2 "," 1 DepartmentID ""

2 SQLNCHAR 2 100 "," 3 GroupName

SQL_Latin1_General_CP1_CI_AS

3 SQLNCHAR 2 100 "," 2 Name

SQL_Latin1_General_CP1_CI_AS

4 SQLDATETIME 0 8 "," 4 ModifiedDate ""

In this case, the group name is listed before the department name in the source file. The server column order has been changed to reflect this. Notice that the order in which the server columns are listed has not changed, but the server column numbers have been swapped.

So, to translate this to XML, we just need to change a field or two versus our original XML file:

<?xml version="1.0"?>

<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format"

xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

<RECORD>

<FIELD ID="1" xsi:type="NativeFixed" LENGTH="2"/>

<FIELD ID="2" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="3" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="4" xsi:type="NativeFixed" LENGTH="8"/>

</RECORD>

<ROW>

<COLUMN SOURCE="1" NAME="DepartmentID" xsi:type="SQLSMALLINT"/>

<COLUMN SOURCE="2" NAME="GroupName" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="3" NAME="Name" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="4" NAME="ModifiedDate" xsi:type="SQLDATETIME"/>

</ROW>

</BCPFORMAT>

Using Format Files

As an example, let's use a format file for an import. This command will copy records into the Department table based on a file named shortdept.txt. We'll use ShortDept.fmt as our non-XML format file example, and ShortDeptX.fmt as our XML-based format file.

BCP AdventureWorks2008.HumanResources.Department in c:\shortdept.txt

-f c:\shortdept.fmt –Usa -Pbubbagump

Just for a change of flavor, the preceding example command line uses SQL Server authentication instead of Windows authentication.
If you prefer Windows authentication, just replace the –U and –P parameters with the –T we've used frequently.

The sample files used in this example, ShortDept.txt, ShortDept.fmt, and ShortDeptX.fmt, are available for download from the Wrox website or from ProfessionalSQL.com.

Maximizing Import Performance

One obvious way of maximizing bcp performance is to make sure that the target table meets all the requirements for running bcp as a non-logged operation. This may mean you need to:

 * Drop any existing indexes on the target table. While this is actually required only if you want a minimally logged operation, the fact is that leaving indexes off during a bulk operation is greatly beneficial performance-wise regardless of the logging status. Be sure, however, to rebuild your indexes after the bulk operation is complete.
 * Attempt to have your source data files created in the same order as your clustered index (if there is one). During your index rebuild, this will allow you to make use of the SORTED_DATA_REORG option, which greatly speeds index creation (and thus the overall time of your bcp operation). Even if you have to leave a clustered index in place, performing the bcp with sorted data will allow the use of the ORDER column option (within the –h hint option).
 * Make sure your database's recovery model is set to Simple or Bulk-Logged. If it is set to Full Recovery, then bcp will not be allowed a minimally logged operation.

If you're looking for additional improvement when importing data into a table, you can run parallel data loads from multiple clients. To do this, you must:

 * Use the TABLOCK hint.
 * Remove all indexes (you can rebuild them after the operation is complete).
 * Set the server recovery option to Bulk-Logged.

How would this work? Rather than importing one very large file, break it up into smaller files. Then you launch bcp from multiple client systems, each client importing one of the smaller files. Obviously, you will be interested in doing this only if the expected performance increase saves more time on the import than you'll spend preparing the source files and copying them to the clients.

Parallel loads were not supported for SQL Server 6.5 or earlier.

With either of these operations, it will be necessary to re-create any indexes on the target table after completing the operation. Re-create the target table clustered index (if any) before any non-clustered indexes.

You can get additional performance improvement by letting SQL Server ignore check constraints and triggers, the default option. Keep in mind that this can result in loading data that violates the table's check constraints and any data integrity rules that are enforced by your triggers.

BULK INSERT

One of the "cousins" that I mentioned at the beginning of the chapter was the BULK INSERT command. In order to make use of this command, you must be a member of either the sysadmin or bulkadmin server role.

BULK INSERT essentially operates like a limited version of bcp that is available directly within T-SQL. The syntax looks like this:

BULK INSERT [['<database name>'.]['<owner>'].]'<table name>'
FROM '<data file>'

[WITH

(

[BATCHSIZE [= <batch size>]]

[, CHECK_CONSTRAINTS]

[, CODEPAGE [={'ACP'|'OEM'|'RAW'|'<code page>'}]]

[, DATAFILETYPE [={'char'|'native'|'widechar'|'widenative'}]]

[, FIELDTERMINATOR [= '<field terminator>']]

[, FIRSTROW [= <first row>]]

[, FIRE_TRIGGERS]

[, FORMATFILE = '<format file path>']

[, KEEPIDENTITY]

[, KEEPNULLS]

[, KILOBYTES_PER_BATCH [= <kilobytes per batch>]]

[, LASTROW [= <last row>]]

[, MAXERRORS [= <max errors>]]

[, ORDER ({column [ASC|DESC]} [,...n])]

[, ROWS_PER_BATCH [= <rows per batch>]]

[, ROWTERMINATOR [= '<row terminator>']]

[, TABLOCK]

[, ERRORFILE = '<file name>']

)

]

Now, if you are getting a sense of déjà vu, then you're on top of things for sure. These switches pretty much all have equivalents in the basic bcp import syntax with which we started off the chapter.

The special permission requirements of BULK INSERT are something of a hassle (not everyone belongs to sysadmin or bulkadmin), but BULK INSERT does carry with it a couple of distinct advantages:

 * It can be enlisted as part of a user-defined transaction using BEGIN TRAN and its associated statements.
 * It runs in-process to SQL Server, so it should pick up some performance benefits there as it avoids marshalling.
 * It's slightly (very slightly) less cryptic than the command-line syntax used by bcp.

The big issue with BULK INSERT is just that: It's bulk insert. BULK INSERT will not help you build format files. It will not export data for you. It's just a simple and well-performing way to get bcp functionality for moving data into your database from within SQL Server.

OPENROWSET (BULK)

Yet another cousin to bcp, but this one is a far more distant one. You can think of this cousin as being from the side of the family that got most of the money and power. (In case you can't tell, I like this one!) OPENROWSET (BULK) marries the bulk rowset provider with OPENROWSET's ability to be used within queries for fast and relatively flexible access to external files without necessarily needing to load them into an intermediate table.

One of the more common uses for bcp is to load external data files for use by some periodic process. For example, you may receive files that contain things like credit reports, vendor catalogs, and other data that is placed in a generic format by a vendor. This is vital information to you, but you're more interested in a one-time interaction with the data than in truly importing it. OPENROWSET (BULK) allows the possibility of treating that file—or just one portion of that file—as a table. What's more, it can utilize a format file to provide a better translation of the file layout than a simple linked table might provide. The syntax looks like this:

OPENROWSET

( BULK '<data file>',

{ [ FORMATFILE = '<format file path>' ]

[, CODEPAGE [={'ACP'|'OEM'|'RAW'|'<code page>'}]]

[, FIRSTROW [= <first row>]]

[, LASTROW [= <last row>]]

[, MAXERRORS [= <max errors>]]

[, ROWS_PER_BATCH [= <rows per batch>]]

[, ERRORFILE = '<file name>']

| SINGLE_BLOB | SINGLE_CLOB | SINGLE_NCLOB }

)

Keep in mind that OPENROWSET is more of a bulk access method than an insert method. You can most certainly do an INSERT INTO where the source of your data is an OPENROWSET (indeed, that's often how it's used), but OPENROWSET has more flexibility than that. Now, with that in mind, let's look at a couple of important bulk option issues when dealing with OPENROWSET.

ROWS_PER_BATCH

This is misleading. The big thing to remember is that, if you use this, you are essentially providing a hint to the Query Optimizer.
SQL Server will always process the entire file, but whatever you put in this value is going to be a hint to the Optimizer about how many rows are in your file. Try to make it accurate or leave it alone.

SINGLE_BLOB, SINGLE_CLOB, SINGLE_NCLOB

These say to treat the entire file as one thing—one row with just one column. The type will come through as varbinary(max) for SINGLE_BLOB, varchar(max) for SINGLE_CLOB, and nvarchar(max) for SINGLE_NCLOB. Because no conversion is applied, SINGLE_BLOB is the safe choice when the file's Windows encoding matters. SINGLE_CLOB will assume that your data is ASCII, and SINGLE_NCLOB will assume it is Unicode.

Summary

In this chapter, we looked at the first of our two major data import/export utilities. bcp is used primarily for importing and exporting data stored as text files to and from our SQL Server. We also took a look at some of bcp's brethren.

As a legacy utility, bcp will be familiar to most people who have worked with SQL Server for any length of time. Microsoft continues to enhance the core technology behind bcp, so I think it's safe to say that bcp is here to stay.

That said, bcp is quite often not your best option. Be sure to check your options with BULK INSERT (and the benefits of running in-process to SQL Server) as well as OPENROWSET (BULK).

In our next chapter, we will take a look at bcp's major competition—SQL Server Integration Services (SSIS). SSIS has the glamour and glitz that bcp is missing, but it also has its own quirks that can occasionally make the simplicity of bcp seem downright appealing.

16

Getting Integrated

SQL Server Integration Services—or SSIS—is a tool that is a descendant of another tool called Data Transformation Services—or DTS. Remembering DTS is important particularly because of how revolutionary it was at the time it was released (in early 1999 as part of SQL Server 7.0). Never before had a significant tool for moving and transforming large blocks of data been included in one of the major Relational Database Management Systems (RDBMSs). All sorts of things that were either very difficult or required very expensive third-party tools were suddenly a relative piece of cake. As we fast-forward to the SQL Server 2008 era, what is now called SSIS (the name was changed when the service was totally rewritten as part of SQL Server 2005) is still relatively unique in terms of making such an important tool so accessible.

In this chapter, we'll be looking at how to perform basic import and export of data, and we'll discuss some of the other things possible with tools like Integration Services. We will place our primary focus on the basics of SSIS packages, setting us up for a more advanced discussion of SSIS programmability in the Web-based chapter, Chapter 25.

Understanding the Problem

The problems being addressed by Integration Services exist in at least some form in a large percentage of systems—how to get data into or out of our system from or to foreign data sources. It can be things like importing data from the old system into the new, or a list of available items from a vendor—or who knows what. The common thread in all of it, however, is that we need to take data that doesn't necessarily match our table definitions and get that data into those tables anyway.

What we need is a tool that will let us Extract, Transform, and Load data into our database—a tool that does this is usually referred to simply as an "ETL" tool. Just how complex a problem this kind of tool can handle varies, but SQL Server Integration Services—or SSIS—can handle nearly every kind of situation you may have.
This may bring about the question, "Well, why doesn't everybody use it, then, since it's built in?" The answer largely comes down to how intuitive it is, particularly in a cross-platform environment. There are third-party packages out there that are much more seamless and have fancier UI environments. These are really meant to allow unsophisticated users to move data around relatively easily—they are also outrageously expensive. Under the old DTS product, I actually had customers that were Oracle- or other DBMS-oriented but purchased a full license for SQL Server just to make use of DTS. While the price of competing packages has come down, and SQL Server licensing prices have gone up, I'm sure that there are still SQL Server licenses out there that exist largely because of the need for SSIS.

An Overview of Packages

SSIS utilizes the notion of a "package" to contain a set of things to do. Each individual action is referred to as a "task." You can bundle up a series of tasks and even provide control-of-flow choices to conditionally run different tasks in an order of your choosing (for example, if one task were to fail, then run a different task). Packages can be created programmatically (using a rather robust object model that we will take an introductory look at in Chapter 25), but most initial package design is done in a designer that is provided with SQL Server.

Let's go ahead and create a simple package just to get a feel for the environment. To get to SSIS, you need to start the SQL Server Business Intelligence Development Studio from the Programs⇒Microsoft SQL Server 2008 menu on your system—then select Integration Services Project as your project type, as shown in Figure 16.1.

To be honest, I'm still not a fan of the Integration Services modeler being in the Business Intelligence Development Studio rather than the Management Studio. Nonetheless, Microsoft has this nasty habit of not consulting me before they move their tools around, so I guess we'll have to live with it!

So, to reiterate, the SSIS tool is in the Business Intelligence Development Studio (much like the Reporting Services–related items)—not in Management Studio, as most items we've looked at have been.

The exact look of the dialog in Figure 16.1 will vary depending on whether you also have Visual Studio installed and, if so, what parts of Visual Studio you included in your installation.

In this case, I've named my project an ever-so-descriptive "SSISProject"—from there, I simply click OK, and SQL Server creates the project and brings up the default project window, shown in Figure 16.2, for SSIS-related projects.

Figure 16.1

Figure 16.2

For those of you used to the Visual Studio environment, you should feel relatively at home. The only significant difference versus most Dev Studio projects is that, as we build the project, the design tab will be graphical in nature rather than in code.

There are four key windows in our project, so let's start by looking at these. We will then do a walkthrough example later in the chapter.

Tasks

On the left side of our project (depending on your settings, you may have to click a tab to expand it), we have the toolbox window.
The Control Flow Items list is at the top and thus what you first see, but you should also be able to find a section on Maintenance Plan tasks by scrolling down. (These are more in the realm of the administrator, but you should take note of them—they underline my earlier notion that Integration Services is not just about ETL activities but also about a wide array of other actions, including many that you might have expected to find in Management Studio.) You'll notice that many of these Control Flow Items entries are labeled as "tasks."

A task, much as the word implies, is generally an action that you want to take. They range from migration tasks (such as moving objects between servers) to data migration and transformation to tasks that manage the execution of other programs or packages. Though most are called tasks, you will also find some container objects that help organize or wrap the other objects in your package.

It's worth noting that you can reorganize the tasks. You can, for example, drag and drop tasks in the task list to reorder them (perhaps to move those you use most often up to the top where they are more visible), or create your own tabs to contain those tasks you use the most. In addition, you can add new tasks to the list much as you can add new controls to other Dev Studio projects. In short, the environment is very customizable.

There are a ton of tasks here, so let's take a quick look at what the base tasks do.

Task | Description
---|---
Pointer | Okay, it's silly to even have to describe this, but just in case: This puts things into a generic drag-and-drop mode. When the pointer is selected, clicking in the designer pane implies that you merely want to select an object that is already there as opposed to adding a new one.
For Loop Container | This is nothing more than a glorified FOR (or FOR/NEXT, depending on your language of choice) statement. The For Loop container allows you to initialize a control counter and set the conditions by which that counter is adjusted, as well as under what conditions you exit the loop. Use this task to allow for controlled repetition of other tasks.
For Each Container | Again, this is your run-of-the-mill FOR/EACH statement. Like the For Loop, it allows for controlled repetition, but, this time, rather than using a counter, the loop is based on iterating through a collection of some sort (perhaps a collection of tables or other objects). The object list can come from a wide variety of sources, ranging from such things as ADO and ADO.NET rowsets to SMO object lists.
Sequence Container | I think of this one as something of a "sub-package." The Sequence container allows you to group up tasks and treat them as a single unit. This is useful for things like wrapping several tasks into a single transaction (thus allowing your overall package to contain several separate transactions—each potentially having many tasks to perform). Individual Sequence containers can be made active or inactive conditionally, so you could, for example, turn off an entire set of tasks by disabling that Sequence container (you could even do that programmatically, based on conditions found in previous tasks!).
Script Tasks | One of those "what it sounds like" things—these let you run your own custom code using any ActiveX scripting language (JavaScript or VBScript, for example) or any .NET-based language. Use the ActiveX Script task for ActiveX languages, and use the Script task for .NET code.
Analysis Services Tasks | These allow you to construct or alter Analysis Services objects as well as execute them.
Bulk Insert Task | As you might guess, this allows for the bulk importing of data. It uses the same BULK INSERT facilities that we touched on in the bcp chapter, but allows the bulk operation to be part of a larger control flow. The Bulk Insert task is easily the fastest way for an SSIS package to get data into your system. Note, however, that any package containing a Bulk Insert task can be run only by a login that is a member of the sysadmin server role.
Data Flow Task | The Data Flow task wraps the connection between data sources along with any transformations you want to make in moving data between those data sources. The Data Flow task is among the most complex tasks in SSIS in that it operates as both a task and a container. It is a container in the sense that you associate several parts of a given data flow with it. Data Flow tasks define sources and destinations of data, as well as the transformations to take place between the source and destination. Editing Data Flow tasks will automatically take you to a different tab within the main editing window.
Data Mining Query Task | This task requires that you have already defined Data Mining Models in Analysis Services. You can utilize this task to run predictive queries and output the results into tables (you could then define additional tasks to make use of those tables).
Execute Tasks | These are somewhat specific to what you want to execute. They can range from running other packages (there are separate tasks for running old DTS packages versus the newer SSIS packages) to executing external programs to running SQL scripts.
File System Tasks | These allow you to create, move, and delete files and directories. In a wide variety of SSIS environments, the ability to transfer files is key to both performance and execution of your package. For example, you may need to copy a file from a remote location to local storage for performance reasons as you perform operations against that file. Likewise, you may only have network access that allows you to read or to create a file, but not to change it—File System tasks allow you to get just the right thing done.
FTP Tasks | This is something of a different slant on the File System tasks notion. This one, however, allows you to use the FTP protocol to retrieve files (very handy for doing things like transferring files to or from vendors, customers, or other partners).
Message Queue Task | This allows you to send and receive messages via Microsoft Message Queue. This is actually a very powerful tool that allows for the delivery and/or receipt of files and other messages even when the remote host is not currently online. Instead, you can "queue" the file, and that host can be notified that the file is available the next time it is online. Likewise, files can be left in the queue for your process to pick up when you execute the package.
Send Mail | Yup—yet another of those "what it sounds like" things. This allows you to specify a mail message, including attachments that may have been created earlier in your package execution. The only real trick on this one is that you must specify an SMTP connection (basically, the outbound mail server) to use to send the mail. SSL and Windows-based authentication are also supported.
Transfer Tasks | These range from server migration tasks, such as transferring logins, error messages, and master database stored procedures, to more straightforward transfers such as transferring a table.
Web Service Task | This allows you to execute a Web service method and retrieve the result into a variable. You can then make use of that result in the remaining tasks in your package.
WMI Tasks | Windows Management Instrumentation (WMI) is an API that allows for system monitoring and control. It is a Windows-specific implementation of Web-Based Enterprise Management (WBEM), which is an industry standard for accessing system information. SSIS includes tasks for monitoring WMI events (so you can tell when certain things have happened on your system) and for requesting data from WMI in the form of a WMI query. You could, for example, ask WMI what the total system memory is on your server.
XML Tasks | XML tasks allow for a wide variety of XML manipulation. You can apply XSLT transformations, merge documents, filter the XML document using XPath, and the list goes on.
Maintenance Tasks | Much of this is outside the scope of this book, but this set of tasks allows you to perform a wide variety of maintenance tasks on your server. From a developer perspective, a key use here would be things like a backup prior to a major import or another similar activity that is part of your package. Similarly, you may want to do index rebuilds or other maintenance after performing tasks that do major operations against a particular table.

The Main Window

This window makes up the center of your default SSIS package window arrangement in Dev Studio. The thing to note is that it has four tabs available, and each is something of its own realm—so let's take a look at each of them.

It's worth noting that you can change from the default tab-style interface to a window-based interface if you so choose. (It's in the options for Visual Studio.)

Control Flow

This is actually where the meat of your package comes together. No, a package isn't made up of flow alone, but this is where you initially drag all your tasks in and establish the order in which they will execute.

Data Flow

As you place data flow objects into the Control Flow pane, they become available for further definition in the Data Flow pane. Data Flow tasks require additional objects to define such things as data connections, sources, and destinations of data, as well as actual transformations.

Event Handlers

SSIS packages create a ton of events as they execute, and this tab allows you to trap certain events and act upon them. Some of the more key events worth trapping include:

Event | Description
---|---
OnError | This is a glorious new feature with SSIS. DTS had a quasi–error handler, but it was weak at best. This gives you something far more robust.
OnExecStatusChanged | This event is triggered any time the task is going into a different status. The possible statuses are idle, executing, abend (abnormal ending), completed, suspended, and validating. You can set traps for each of these conditions and have code run accordingly.
OnPostExecute | This one fires immediately after execution of the task is complete. In theory, this is the same as OnExecStatusChanged firing and having a status of completed, but I have to be honest and say I haven't tested this enough to swear to it.
OnProgress | This event is called regularly when any reasonably measurable progress happens in the package.
This one is probably more useful when you're controlling a package programmatically than through one of the other execution methods, but it is nice from the standpoint of providing a progress bar for your end users if you need one.

There are several other event methods available, but the preceding gives you a flavor of things.

Package Explorer

I find the location of this one to be a little odd. In a nutshell, this one presents a tree control of your package, complete with all the event handlers, connections, and executables (which include any tasks you have defined in the package). The reason I find this one a little odd is that I would have expected something like this to be part of, or at least similar to, Solution Explorer. Nonetheless, it does give you a way of looking over your project at an overall package level.

Solution Explorer

This is pretty much just like any other explorer window for Dev Studio. You get a listing of all the files that belong to your solution, broken down by their nature (packages and data source views, for example).

The Properties Window

This one is pretty much the same as any other property window you've seen throughout SQL Server and Dev Studio. The only real trick here is paying attention to what exactly is selected so you know what you're setting properties for. If you've selected an object within the package, then it should be that particular task or event object. If you have nothing selected, then it should be the properties for the entire package.

Building a Simple Package

Okay, it's time for us to put some application to all this. This is going to be something of a quick-and-dirty example run, but, in the end, we will have shown off several of the key features of SSIS.

Let's start with a little prep work. For this sample, we're going to be making use of a vbScript file that will generate some data for us to import. You can think of this script as simulating any kind of preprocess script you need to run before a major import or export.

Create a text file called CreateImportText.vbs with the following code:

Dim iCounter

Dim oFS

Dim oMyTextFile

Set oFS = CreateObject("Scripting.FileSystemObject")

Set oMyTextFile = oFS.CreateTextFile("C:\TextImport.txt", True)

For iCounter = 1 to 10

oMyTextFile.WriteLine(cstr(iCounter) & vbTab & """TestCol" & _

cstr(iCounter) & """")

Next

oMyTextFile.Close

This script, when executed, will create a new text file (or replace the existing file if it's there). It will add 10 rows of text to the file, containing two tab-separated columns with a newline row terminator. We will use this in conjunction with a few other tasks to create and populate a table in SQL Server.

In its default form, the CreateImportText.vbs file will try to write the text file it creates to the root directory of the C drive. If your system has User Account Control enabled, you may be prevented from running the script; if so, just move it to a directory below the root and adjust the later paths in this example.

This is a pretty simplistic sample, so please bear with me here. What, in the end, I hope you see from this is the concept of running a preprocess of some sort (that's our script that generates the file in this case, but it could have been any kind of script or external process), followed by SQL Server scripting and data pump activities.

With our sample vbScript created, we're ready to start building a package.
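If you'd like to sanity-check the script before the package runs it for you, you can execute it by hand from a command prompt using cscript (the console-based script host that ships with Windows) and then take a peek at the output file. Adjust the paths, of course, if you moved things out of the root directory:

cscript //nologo C:\CreateImportText.vbs

type C:\TextImport.txt

You should see 10 rows, each with a row number, a tab, and a quoted TestCol value.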
Let's start with the SSISProject project file we created in the previous section. At this point, our Control Flow should be empty. In order to get the proverbial ball rolling on this, we need to make a call to our vbScript to generate the text file we will be importing. Drag an Execute Process Task from the Toolbox into the main Control Flow window. Very little will happen other than SSIS adding the Execute Process Task to the Control Flow window, as shown in Figure 16.3.

Figure 16.3

To do much with our new task, we need to double-click the task to bring up the Execute Process Task Editor shown in Figure 16.4.

Figure 16.4

Note that I've switched to the Process options because they are a bit meatier to show in a screenshot than the General options are, but here's an overview of how we want to set things up:

Option | Setting
---|---
General⇒Name | GenerateImportFile
General⇒Description | Generates the text file for import
Process⇒Executable | CreateImportText.vbs (prefix it with the full path to your script file)
Process⇒Working Directory | C:\ (or other directory of your choosing—just make sure you're consistent)

When you're done making these changes, click OK, and very little will have changed in your Control Flow window, except that the name of the task will have been updated to GenerateImportFile.

Next, drag an Execute SQL Task object into your Control Flow. Now, select the GenerateImportFile task, and it should have an arrow hanging from the bottom of the task box, as shown in Figure 16.5.

Figure 16.5

Now comes the tricky part: Click the "output" of our GenerateImportFile task—that is, click the end of the little arrow. Drag the arrow into the top of the Execute SQL Task, and the builder should connect the two tasks (as shown in Figure 16.6)—notice how the arrow indicates the control flow.

Figure 16.6

For the moment, let's look at what this arrow represents. Double-click it, and you'll get a Precedence Constraint Editor, as shown in Figure 16.7.

Figure 16.7

Notice how it defines under what conditions this flow will be allowed to happen. In our case, it will move on to the Execute SQL Task only if our GenerateImportFile task completes successfully. We could define additional flows to deal with such things as the task failing, or allow our second task to run on completion of the first task regardless of whether it succeeds or fails.

Cancel back out of this dialog, and double-click the Execute SQL Task to bring up the Execute SQL Task Editor, as shown in Figure 16.8.

Figure 16.8

Again, I've edited the name a bit. Next, click the SQLStatement option to bring up the Enter SQL Query dialog shown in Figure 16.9.

We're checking to see whether the table already exists and, if it does, dropping it. Then, knowing that the table cannot already exist (if it did, we just dropped it), we go ahead and create our destination table.

One last thing we need is a connection to work with. Start by clicking in the Connection option and selecting New Connection. This will bring up a connection manager dialog. (If you were paying attention, you may have noticed a connection manager pane below the main pane for our package—this is essentially the same functional area.) Our package doesn't have any connections yet, so we need to click New again to get a somewhat run-of-the-mill OLE DB connection manager dialog.
How I've filled out the connection manager dialog is shown in Figure 16.10, but adjust yours to match your database server name (the simple period "." in mine implies that I mean my local server) and security model.

Figure 16.9

Figure 16.10

We're now able to connect to our database to create the destination table. And, with source data created and a destination table in place, we're ready to start working on actually transferring the data from our source to our destination. For that, we're going to utilize a Bulk Insert task, so go ahead and drag one of those into our model and connect the CreateTable task to the new BulkImport task, as shown in Figure 16.11.

Figure 16.11

Again, double-click our task (the Bulk Insert Task in this case) to bring up a relevant editor box. Of particular interest is the Connection tab shown in Figure 16.12.

Figure 16.12

We have several things to change here. For example, I've already changed the Row Delimiter to be the line feed that is written by our vbScript's WriteLine command. We do, however, need to do even more. Start by selecting the same connection you created to run the CREATE TABLE statement against. Then enter our destination table name ([AdventureWorks].[dbo].[TextImportTable]).

Note that the table must already exist for you to reference it in this dialog. I just manually run the CREATE statement once to prime the database and make sure anything that needs to reference the table at compile time can do so. This should do no harm, since the process will drop the table and create a new one each time anyway.

Finally, click in the File connection box and select New Connection to bring up the File Connection Management Editor for text files shown in Figure 16.13.

Figure 16.13

Notice the error that the file doesn't exist. This is the same issue that we had with the TextImportTable table. Either create an empty dummy file or run the CreateImportText.vbs file once to get an initial file out there; then refresh, and this error should go away.

Click OK all the way back out to our Control Flow, and we're ready to rock.

To execute our package immediately, click the run icon (the green arrow on the toolbar). Watch how Dev Studio indicates progress by changing the color of different tasks as they run.

A few more items of note: SSIS is capable of running multiple tasks at one time for you. For example, I made this project entirely linear (one piece at a time) based on the idea that we didn't want to drop the destination data until the last minute (when we are sure there's new data available). We could, however, have placed the link from the file generation directly to the bulk import, just the same as the CREATE TABLE dependency is linked directly to the import. If we had, SQL Server would have run both the table DROP/CREATE and the file creation at the same time but waited for both to complete before allowing the bulk import to execute.

Go ahead and build your package (choose the Build option in the Build menu), as we will be utilizing it in the next section!

Executing Packages

There are a few different ways to execute an SSIS package. We utilized one of these in something of a test mode within the Dev Studio, but this is hardly how you are likely to run your packages on a day-to-day basis.
The more typical methods of executing a package include:

* The Execute Package Utility: This is essentially an executable in which you can specify the package you want to execute, set up any required parameters, and have the utility run it for you on demand.
* As a Scheduled Task using the SQL Server Agent: I'll talk more about the SQL Server Agent in Chapter 22, but for now, realize that executing an SSIS package is one of the many types of jobs that the agent understands. You can specify a package name and the time and frequency with which to run it, and the SQL Server Agent will take care of it.
* From Within a Program: There is an entire object model supporting the notion of instantiating SSIS objects within your programs, setting properties for the packages, and executing them. This is fairly detailed stuff—so much so that Wrox has an entire book on the subject: Professional SQL Server 2008 Integration Services by Knight et al. (Wiley, 2009). We take a fast and dirty look at this in Chapter 25 (downloadable as special web content from either p2p.wrox.com or professionalsql.com), but if SSIS programmability is what you need, I recommend taking a look at Brian's work.

Using the Execute Package Utility

The Execute Package Utility is a little program by the name of DTExecUI.exe. You can fire it up to specify settings and parameters for existing packages and then execute them. You can also navigate using Windows Explorer to find a package in the file system (they end in .DTSX) and then double-click it to execute it. Do that to our text import package, and you should get the execute dialog shown in Figure 16.14.

Figure 16.14

As you can see, there are a number of different dialogs that you can select by clicking the various options to the left. Coverage of this could take up a book all to itself, but let's look at a few of the important things on several key dialogs within this utility.

General

Many fields on this first dialog are fairly self-explanatory, but let's pay particular attention to the Package Source field. We can store SSIS packages in one of three places:

* The File System: This is what you did with your Import/Export Wizard package. This option is really nice for mobility—you can easily save the package off and move it to another system.
* SQL Server: This one stores the package in SQL Server. Under this approach, your package will be backed up whenever you back up your MSDB database (which is a system database in every SQL Server installation).
* SSIS Package Store: This storage model provides the idea of an organized set of "folders" where you can store your package along with other packages of the same general type or purpose. The folders can be stored in either MSDB or the file system.

Configurations

SSIS allows you to define configurations for your packages. These are essentially a collection of settings to be used, and you can actually combine more than one of them into a suite of settings.

Command Files

These are batch files that you wish to run as part of your package. You can use these to do system-level things such as copying files around to places you need them (they will run under whatever account the Integration Services service is running under, so any required access on your network will need to be granted to that account).

Connection Managers

This is a bit of a misnomer—this isn't so much a list of connection managers as it is a list of connections.
By taking a look at the Description column, you'll see many of the key properties for each connection your package uses. Notice that in our example package, we have two connections; if you look closely, you'll see how one relates to file information (for our connection to the flat file we're using) and the other specifically relates to SQL Server (the destination we're importing into).

Execution Options

Do not underestimate the importance of this one. Not only does it allow you to specify how, at a high level, you want things to happen if something goes wrong (if there's an error), but it also allows you to establish checkpoint tracking—making it easy to see when and where your package is getting to different execution points. This can be critical in performance tuning and debugging.

Reporting

This one is all about letting you know what is happening. You can set up feedback; exactly how much feedback you get is based on which events you decide to track and the level of information you establish.

Logging

This one is fairly complex to set up and get going but has a very high "coolness" factor in terms of giving you a very flexible architecture for tracking even the most complex of packages.

Using this area, you can configure your package to write log information to a number of preconfigured "providers" (essentially, well-understood destinations for your log data). In addition to the preinstalled providers such as text files and even a SQL Server table, you can even create your own custom providers (not for the faint of heart). You can log at the package level, or you can get very detailed levels of granularity and write to different locations for different tasks within your package.

Set Values

This establishes the starting value of any runtime properties your package uses (there are none in our simple package).

Verification

Totally different packages can have the same filename (just in a different spot in the file system, for example). In addition, packages have the ability to retain different versions of themselves within the same file or package store. The Verification dialog is all about filtering or verifying what package/version you want to execute.

Command Line

You can execute SSIS packages from the command line using DTExec.exe (handy when, for example, you're trying to run SSIS packages out of a batch file). This option within the SSIS Package Execution Utility is about specifying parameters you would have used if you had run the package from the command line.

The utility will establish most of this for you—the option here is just to allow you to perform something of an override on the options used when you tell the utility to Execute.

Executing the Package

If you simply click Execute in the Package Execution Utility, your package will be off and running. After it runs, you should find a text file in whatever location you told your package to store it—open it up, take a look, and verify that it was what you expected.

Executing within Management Studio

While Management Studio doesn't give you a package editor, it does give you the ability to run your packages.

In the Object Explorer pane of Management Studio, click the Connect icon and choose Integration Services. Fill out the connection dialog. This should create a connection to Integration Services on that server and add an Integration Services node in your Object Explorer.
+ +To execute a package in this fashion (using Management Studio), the package must be local to that server (not in the file system). Fortunately, if you right-click the File System node under Stored Packages, SQL Server gives you the ability to import your package. Simply navigate the file system to the package we created, give it a name in the package store, and import it. You can then right-click and execute the package at any time. (It will bring up the execution utility we saw in a previous section, so you should be in familiar territory from here.) + +Summary + +SQL Server Integration Services is a robust Extract, Transform, and Load tool. You can utilize Integration Services to provide one-off or repeated import and export of data to and from your databases—mixing a variety of data sources while you're at it. + +In this chapter, we actually went just slightly beyond the basics—touching on external access and multi-stage control of flow. While becoming expert in all that Integration Services has to offer is a positively huge undertaking, getting basic imports and exports up and running is a relative piece of cake. I encourage you to start out simple and then add to it as you go. As you push yourself further and further with what SSIS can do, take a look at other books that are specific to what SSIS has to offer. +17 + +Replication + +Coming off the heels of significant change in 2005, replication is one of a few quiet areas in terms of version differences in SQL Server 2008. Indeed, virtually nothing has changed that isn't directly tied to a non-replication feature. (They had to allow for replication of the new data types, didn't they?) + +Replication is one of those things that everyone loves to ignore—until they need it. Then, it seems, there is a sudden crisis about learning and implementing it instantly (and not necessarily in that order, I'm sorry to say). + +So, what then, exactly, is replication? I'll shy entirely away from the Webster's definition of it and go to my own definition: + +Replication is the process of taking one or more databases and systematically providing a rule-based copy mechanism for that data to and potentially from a different database. + +Replication is often a topology and administration question. As such, many developers have a habit of ignoring it—bad idea. Replication has importance to software architects in a rather big way, as it can be a solution to many complex load and data distribution issues such as: + + * Making data available to clients that are generally not connected to your main network + * Distributing the load associated with heavy reporting demands + * Addressing latency issues with geographically dispersed database needs + * Supporting geographic redundancy + +And those are just a few of the biggies. + +So, with that in mind, we're going to take a long look at replication. I'm going to warn you in advance that this isn't going to have quite as many walkthroughs as I usually do, but patience, my young padawan—there is a reason. In simple terms, once you've built one or two of the styles of replication, you have most of the "constructing" part of the learning out of the way. What's more, the actual building up of the replication instance is indeed mostly an administrator's role. 
Instead, we're going to focus on understanding what's happened, and, from there, save most of the space in this chapter for understanding how different replication methods both create and solve problems for us and how we might use the different replication models to solve different problems. + +In this chapter we will look at things like: + + * General replication concepts + * What replication models are available (we will see an example or two here) + * Security considerations + * Replication Management Objects (RMO)—the programmatic way of managing replication + +In the end, while I can't promise to make you a replication expert (to be honest, I'm not really one myself), you will hopefully have a solid understanding of the fundamentals and have a reasonable understanding of the possibilities. + +Replication Basics + +Replication is like a big puzzle—made up of many pieces in order to form a complete unit. We have topology considerations (publisher, subscriber, and distributor) as well as publication models (merge, transactional, snapshot). Before you get to deciding on those, there are several things to take into account. + +Considerations When Planning for Replication + +There are a number of things to take into account when thinking about the topology and replication methods available. These should be part of an assessment you make at design time to determine what forms of replication should even be considered for your application. Among these are: + + * Autonomy + * Latency + * Data consistency + +Let's take a quick look at each of these. + +Autonomy + +Autonomy is all about how much a replication instance is able to run as its own thing. What data needs to be replicated and at what frequency? For example, you could be supporting a sales application where each site keeps separate customer records. You would want to have these replicated to a central database for reporting and, perhaps, such other things as automatic stock replacement. Each site is highly autonomous (they really don't care whether the central database gets its data or not; they can still continue to make sales based on the data they have on-site). Indeed, even the central database, while dependent, is probably not in a catastrophic situation if it misses data from a site for a day (depends how you're using the reports that come off it or how much lag you can have before you restock). + +Latency + +Latency refers to the time delay between updates; in other words, the time taken for a change at the publishing server to be made available at the subscribing server. The higher the autonomy between sites, the greater the latency between updates can be. + +Determining an acceptable delay can be tricky and will likely be tied into the aforementioned autonomy question. If our site information is only transmitted to the central server for periodic rollup reporting, then we can probably get away with only daily—or even longer—updates. If, however, the sites are drawing from a central shipping facility for some of the sales, then we need to update the central database in a timelier manner, so a product is not oversold (two sites trying to sell the one remaining piece of stock). + +Data Consistency + +Data consistency is obviously going to be a key concern of virtually any distributed system. 
This is, of course, all about making sure that your various replication instances contain the same values from end to end, and this can be accomplished in two ways:

* Data Convergence: All sites eventually end up with the same values; however, the values aren't necessarily the same as they would be if all of the changes had taken place on one server. An example might be our oversold situation. Had our two sales happened on the same server, the second sale would have known about the out-of-stock situation and perhaps not been completed. Instead, each database thought one item was available, and, depending on the way the inventory adjustment is handled, you may wind up with a negative inventory level. In the same vein, your data may wind up with exactly the same end value but may have taken a different set of steps to arrive at that value (the actual ordering of the updates may not be the same depending on how many replication clients were involved and at what time they synchronized).
* Transactional Consistency: The results at any server are the same as if all transactions were executed on a single server. This is implemented by the mechanism implied in the name—transactions. I'm sure, if you ponder this for a bit, you can recognize the latency impact (both good and bad) of this—before your transaction can complete, it has to complete on every server that is participating in that particular replication set.

Schema Consistency

Many developers who are used to developing in non-replicated environments take the ability to easily change the database schema for granted. Need to add or drop a column? No problem. Need to add a new table? No big deal. Well, beyond the basic problems of being so cavalier with your database in any environment, you'll quickly find that life gets a bit more complicated in a replicated world.

Replication or not, remember that any time you alter the schema of your table, you are essentially altering the foundation of your entire system (or at least the part that the schema object in question serves). Schema changes should always be treated as fairly serious alterations and be carefully considered as well as methodically planned. Some changes (additions in particular) can usually be made with relatively minor collateral impact. Things that change or remove existing objects, however, can be deadly when dealing with backward-compatibility issues. Also, keep in mind that others may have built "extensions" to your system that rely on your existing schema; this can mean impacts that are hard to plan for when you change your existing schema.

The good news is that SQL Server continues to increase its support for schema changes during replication. Columns that are added or dropped on the publisher may be propagated to all subscribers during future replication operations. The bad news is that your change procedures need to be much stricter. The bottom line is that, if you need to make frequent schema changes, you'll want to fully plan what your change strategy is going to be before implementing replication at all.

When the concept of replicating schema changes was first added to SQL Server, it was done through the use of special stored procedures called sp_repladdcolumn and sp_repldropcolumn rather than the more familiar ALTER TABLE command. This was changed back in SQL Server 2005, and sp_repladdcolumn and sp_repldropcolumn should be considered deprecated (avoid using them).
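If you're curious what this looks like in practice, the modern pattern is to make sure the publication allows DDL replication and then just issue a normal ALTER TABLE at the publisher. A minimal sketch, assuming a hypothetical transactional publication named InventoryPub (replicate_ddl is on by default for new publications):

-- Make sure the publication propagates schema changes (1 = on)
EXEC sp_changepublication
   @publication = N'InventoryPub',
   @property = N'replicate_ddl',
   @value = 1;

-- A plain ALTER TABLE at the publisher now flows out to subscribers
-- during the next synchronization
ALTER TABLE dbo.Inventory
   ADD QuantityReserved int NOT NULL DEFAULT 0;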
+ +Other Considerations + +Some other things to think about include: + + * How reliable is the connection between your servers? If it is a local connection, then you can probably count on it, but what if it is in a different geographic location? What if it's a different country? + * What kind of connection latency do you have? This falls somewhat into the reliability question, but is really its own issue. Do you really want to enforce transactional replication if it takes even a second or two for a simple ping to return (imagine that with a block of data now)? + * In the same vein as connection latency, how much bandwidth do you have? How much traffic are you going to be flushing over the wire, and what other processes are going to be using that same wire? Do you need to compress your replication related data? + * Is the replication method wired at all? That is, what if you don't have connectivity at all with the servers you want to replicate to? SQL Server supports a disconnected model, but what does that do to you between long updates? + +Replication Roles + +The process of replication is based on three basic roles: The publisher, distributor, and subscriber. Any one server can potentially be serving any one (or any subset) of these roles. Just to paint a picture of how flexible this can be, take a look at Figure 17.1. + +As you can see, multiple publishers can be utilizing the same distributor, and any given publication can have multiple subscribers. Let's take a little bit closer look at these roles. + +The Publisher + +The publisher can be considered to be the source database. Even in situations where the publisher and its various subscribers are sharing data equally, there is one database that can be thought of as something of the control database. + +The Distributor + +The distributor serves as something of the clearinghouse for changes. It has a special distribution database that keeps track of changes, as well as which subscribers have already received those changes. In addition, it will keep track of the results of any synchronization process and will know what happened in the case of any conflicts that had to be resolved (we'll look more into conflict resolution later). + +Figure 17.1 + +The Subscriber + +Any database that is participating in the replication publication, but is not the actual publisher, can be considered a subscriber. This does not, however, mean that the subscriber only receives data—indeed, depending on the specific model chosen (again, more on those later), the subscriber may well be both receiving and disseminating data. + +Subscriptions + +The subscriptions that a subscriber receives are called publications. A publication will contain one or more articles. An article is usually a table or some subsection of the data from a table, but it can be a stored procedure or a group of stored procedures. By subscribing to a publication, the subscriber is subscribing to all of the articles in the publication. The subscriber cannot subscribe to individual articles alone. + +Subscriptions can be set up as push subscriptions or pull subscriptions: + + * With push subscriptions, the publisher determines when updates go out to the subscriber. This is used most frequently when you want to keep latency to a minimum (since the publisher is often the only copy of the database receiving changes, it makes sense that it would be the one to know about changes as they happen and take appropriate action) or you want to keep full control at the publisher for some other reason. 
+ * With pull subscriptions, the subscriber requests updates. This allows for a higher level of autonomy since the subscriber decides when updates should occur. + +A publication can simultaneously support both push and pull subscriptions; however, any given subscriber is restricted to either a push or pull subscription—it cannot have both push and pull to the same publication. + +Types of Subscribers + +SQL Server supports three types of subscribers: + + * The default is a local subscriber. The publisher is the only server that knows about the subscriber. Local subscribers are often used as a security mechanism or when you want to maximize autonomy between servers. + * Global subscribers occur where all servers participating in the publication (be they the publisher or a subscriber) know about all the other subscribers. Global subscribers are commonly used in a multiserver environment where you want to be able to combine data from different publishers at the subscriber. + * Anonymous subscribers are visible only to the publisher while the subscriber is connected. This is useful when setting up Internet-based applications. + +Filtering Data + +SQL Server provides for the idea of horizontally or vertically filtering tables. Horizontal filtering (you may come across the term horizontal partitioning for this as well) identifies rows within the table (by way of a WHERE clause) for publication. For example, you could divide inventory information by warehouse as a way of maintaining separate warehouse totals. Vertical filtering (also known as vertical partitioning) identifies the columns to be replicated. For example, you might want to publish quantity on hand information from an inventory table, but not quantity on order. + +Replication Models + +We have three different models available to us in replication. They trade off between the notions of latency, autonomy, and some of the other considerations we discussed earlier in the chapter. Deciding which to choose is something of a balancing act between: + + * Degree of Autonomy: Is there a constant connection available between the servers? If so, what kind of bandwidth is available? How many transactions will be replicating? + * Conflict Management: What is the risk that the same data will be edited in multiple locations either at the same time or in between replicated updates? What is the tolerance for data on one or more of the replicated servers disagreeing? + +Some replication scenarios don't allow for connectivity except on a sporadic basis—others may never have connectivity at all (save, perhaps, through what is sarcastically referred to as "sneaker net"—where you run, mail, fly, or the like, a disk or other portable storage medium from one site to another). Other replication scenarios have an absolute demand for perfectly consistent data at all sites with zero data loss. + +From highest to lowest in autonomy, the three models are: + + * Snapshot replication + * Merge replication + * Transactional replication + +Let's look at the pros and cons of each replication model, outlining situations where it would be an appropriate solution and any data integrity concerns. + +It's important to note that you can mix and match the replication types as necessary to meet your implementation requirements. There are going to be some publications where you want to allow greater autonomy between sites. There will be other publications where minimizing latency is critical. 
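Before we dig into the models, one quick aside: the horizontal and vertical filtering described earlier is easy to picture in T-SQL terms. When a transactional publication is scripted rather than built through the wizard dialogs, a horizontal filter is just a WHERE clause supplied to sp_addarticle, and a vertical filter is applied with sp_articlecolumn. A rough sketch, with made-up publication and object names:

-- Horizontal filter: publish only one warehouse's inventory rows
EXEC sp_addarticle
   @publication = N'InventoryPub',
   @article = N'Inventory',
   @source_owner = N'dbo',
   @source_object = N'Inventory',
   @filter_clause = N'WarehouseID = 42';

-- Vertical filter: exclude the quantity-on-order column from the article
EXEC sp_articlecolumn
   @publication = N'InventoryPub',
   @article = N'Inventory',
   @column = N'QuantityOnOrder',
   @operation = N'drop';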
+ +Let me take a moment here to point out that a publication is just that—a publication. It does not necessarily map out that one publication equals one database. You may have one publication where the articles included in it make up only part of your subscribing database. Other objects in the subscribing database may be served by a different publication—potentially from a completely different publishing server. + +Snapshot Replication + +With snapshot replication, a "picture" is taken at the source of all of the data to be replicated (as shown in Figure 17.2). This is used to replace the data at the destination server. + +Figure 17.2 + +Snapshot replication, in its simplest form, is the easiest type of replication to set up and manage. Complete tables or table segments (for partitioned tables) are written to the subscribers during replication. Since updates occur on a periodic basis only, most of the time, there is minimal server or network overhead required to support replication. + +Snapshot replication is frequently used to update read-only tables on subscriber systems. It allows for a high level of autonomy at the subscriber, but at the cost of relatively high latency. You are able to keep tight control on when periodic updates occur when using snapshot replication. This means that you can schedule updates to occur when network and server activity is at a lull (or you can even carry the snapshot via disk or other hard medium). There is a potential concern about the time and resources to complete replication during the periodic updates. As source tables grow, the amount of data that has to be transferred during each update increases. Over time, it may become necessary to either change the replication type or partition the table to reduce the amount of data replicated to keep traffic to manageable levels. + +A variation of snapshot replication is snapshot replication with immediate-updating subscribers. With this, changes can be made to the data at the subscriber. Those changes are sent to the publishing server on a periodic basis unless immediate updating has been implemented, in which case distributed transactions are executed in real time. + +How Snapshot Replication Works + +Replication is implemented through replication agents. Each agent is essentially its own, small, independent program that takes care of the tasks of monitoring transactions and distributing data as required for that particular type of agent. + +Snapshot Agent + +The Snapshot Agent supports snapshot replication and initial synchronization of data tables for other types of replication (which all also rely on a snapshot for synchronizing data for the first time). All types of replication require that the source and destination tables must be synchronized, either by the replication agents or through manual synchronization, before replication can begin. In either case, the Snapshot Agent has the same responsibility. It takes the "picture" of the published data and stores the files on the distributor. + +Distribution Agent + +The Distribution Agent is used for moving data for initial synchronization and snapshot replication (and, as we'll see later, for transactional replication) from the publisher to the subscriber(s). For push subscriptions, the Distribution Agent typically runs on the distributor. For pull subscriptions, the Distribution Agent typically runs on the subscriber. The actual location of the Distribution Agent is an option that can be configured within Management Studio or via RMO. 
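To make the agents feel a little less abstract, here is roughly what creating a snapshot publication, along with the Snapshot Agent job that serves it, looks like when scripted in T-SQL (a sketch; the publication name is made up, and this assumes distribution is already configured):

-- Enable the database for publishing
EXEC sp_replicationdboption
   @dbname = N'AdventureWorks',
   @optname = N'publish',
   @value = N'true';

-- Create a publication that replicates via periodic snapshots
EXEC sp_addpublication
   @publication = N'CustomerSnapshotPub',
   @repl_freq = N'snapshot';

-- Create the Snapshot Agent job that generates the snapshot files
EXEC sp_addpublication_snapshot
   @publication = N'CustomerSnapshotPub';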
The Process of Snapshot Replication

Snapshot replication uses periodic updates (the frequency is up to you, but, in general, you'll schedule a job in the job manager to run your snapshot on a regular basis). During the updates, schemas and data files are created and sent to the subscribers. Let's step through the basic procedure (see Figure 17.3):

1. The Snapshot Agent places a shared lock on all articles in the publication to be replicated, ensuring data consistency.

2. A copy of each article's table schema is written to the distribution working folder on the distributor.

3. A snapshot copy of table data is written to the snapshot folder.

4. The Snapshot Agent releases the shared locks from the publication articles.

5. The Distribution Agent creates the destination tables and database objects, such as indexes, on the subscriber and copies in the snapshot data, overwriting the existing tables, if any.

Figure 17.3

Snapshot data is stored as a native bcp file (we explored these back in Chapter 15) if all of the subscribers are Microsoft SQL Servers. Character-mode files, instead of SQL Server bcp files, will be created if you are supporting heterogeneous (non-SQL Server) data sources.

SQL Server supports heterogeneous data sources for replication. Currently, transactional and snapshot replication are supported on all O/S platforms for Oracle as well as most O/S platforms for DB2.

When to Use Snapshot Replication

Use snapshot replication to update lookup data or read-only copies of data on remote servers. You can use snapshot replication when you want (or need) to connect to the publisher only intermittently.

As an example, think of how servers might be managed for a chain of garden supply stores. You have stores in several cities. Some larger cities have multiple stores. What are some good candidates for snapshot replication?

Customer records are an obvious choice. A customer, such as a landscape gardener, may turn up at different locations. In most cases, it won't matter if there's a delay updating customer information. This would also give you a way to make sure that only users who have access to the publishing server can change customer records.

Inventory records could be a little more of a problem. The items you keep in inventory are somewhat constant, with most changes taking place by season. Even then, you would probably keep the items on file, but with a zero quantity on hand. The problem is, you may want to replicate more up-to-date inventory records between stores. This would let you search for items you might not have on hand without having to call each of the stores. Timely updates would most likely mean transactional replication (which we will discuss shortly).

Special Planning Requirements

An important issue when setting up snapshot replication is timing. You need to make sure that users are not going to need write access to any published tables when the Snapshot Agent is generating its snapshot (remember that shared lock that gets set on every article in the publication? Well, that's going to prevent inserts, updates, and deletes to that data for the duration of that lock—which is to say, for the duration of the publishing of the distribution). You also want to be sure that the traffic generated by replication does not interfere with other network operations.

Storage space can also become an issue as published tables grow.
You have to verify that you have enough physical disk space available on the destination folder (CD-ROM, DVD, jump drive, tape, and so on) to support the snapshot folder.

Merge Replication

Snapshot is great, but we do not always live in a "read-only" world. Among the choices for dealing with data changes taking place at multiple servers is merge replication. The changes from all of the sites are merged when they are received by the publisher (see Figure 17.4). Updates can take place either periodically (via schedule—this is the typical way of doing things) or on demand.

Figure 17.4

Merge replication has a high level of autonomy but also has high latency and runs a risk of lower transactional consistency. Unlike transactional and snapshot replication, which guarantee consistency, merge replication does not. This is one of the more critical design considerations that you need to make when implementing merge replication—how important is consistency?

In a way, roles tend to get somewhat blurred in merge replication. The publisher is the initial source for the merge data, but changes can be made at the publisher or the subscribers. Changes can be tracked by row or by column. Transactional consistency is not guaranteed because conflicts can occur when different systems make updates to the same row. Data consistency is maintained through conflict resolution based on criteria you establish (you can even write custom resolution algorithms). You can determine whether conflicts are recognized by row or by column.

As with transactional replication, the Snapshot Agent prepares the initial snapshot for synchronization. The synchronization process is different, however, in that the Merge Agent performs synchronization. It will also apply any changes made since the initial snapshot.

Merge Agent

Just as we saw with snapshot replication, merge replication uses an agent—the Merge Agent. As shown in Figure 17.5, the agent copies the changes from all subscribers and applies them to the publisher. It then copies all changes at the publisher (including those made by the Merge Agent itself during the resolution process) to the subscribers. The Merge Agent typically runs on the distributor for push subscriptions and on the subscriber for pull subscriptions, but as with snapshot and transactional replication, this can be configured to run remotely.

Figure 17.5

The Process of Merge Replication

Assuming that the initial synchronization has already taken place (remember, that will be based on a snapshot), the steps to merge replication are:

1. Triggers installed by SQL Server track changes to published data.

2. Changes from the publisher are applied to subscribers.

3. Changes from subscribers are applied to the publisher, and any conflicts are resolved.

Merge triggers do not interfere with the placement or use of user-defined triggers.

Changes, whether occurring at the publisher or subscriber, are applied by the Merge Agent. Conflicts are resolved automatically through the Merge Agent, using a conflict resolver (you can select one and can even build your own). The Merge Agent tracks every row update for conflicts at the row or column level, depending on how you have configured conflict resolution. You will define the priority scheme to be used when conflicts occur between new (arriving) and current data values.

When to Use Merge Replication

One way of using merge replication is to support partitioned tables.
Going back to the garden supply business, you could set up filtering (partitioning) so that each store can view inventory information for any store but would only be able to directly update its own inventory. Changes would be propagated through merge replication. Data can be filtered horizontally or vertically. You can exclude rows to be replicated from a table, and you can exclude any table columns. Merge replication watches for changes to any column in a replicated row. In this particular scenario, there is little risk of conflict in inventory since each store can only update its own inventory, but what if you were allowing all stores to update customer data (such as a new address for the customer)? The right answer is situational, but this illustrates how different needs can place a different burden on your replication design. + +Special Planning Requirements + +When implementing merge replication, there are checks that you need to make to ensure that your data is ready for replication. While setting up merge replication, some changes may be made automatically by SQL Server to your database objects. Use care when selecting the tables to be published. Any tables required for data validation (such as lookup tables and other foreign key situations) must be included in the publication if you want that validation to apply on the subscribers. + +SQL Server will identify a column as a globally unique identifier for each row in a published table. If the table already has a uniqueidentifier column, SQL Server will automatically use that column. Otherwise, it will add a rowguid column (which will, as it happens, also be called rowguid) to the table and create an index based on the column. + +There will be triggers created on the published tables at both the publisher and the subscribers. These are used to track data changes for Merge Agent use based on row or column changes. + +There will also be several tables added for tracking purposes. These tables are used by the server to manage: + + * Conflict detection and resolution + * Data tracking + * Synchronization + * Reporting + +For example, conflicts are detected through a column in the MSmerge_contents table, one of the tables created when you set up merge replication. + +Transactional Replication + +The difference between transactional replication and snapshot replication is that incremental changes, rather than full tables, are replicated to the subscribers. Any changes logged to published articles, such as INSERT, UPDATE, and DELETE statements, are tracked and replicated to subscribers. In transactional replication, only changed table data is distributed, maintaining the transaction sequence. In other words, all transactions are applied to the subscriber in the same order that they were applied to the publisher. + +Note that only logged actions are properly replicated. Unlogged bulk operations (such as a bcp that has logging turned off) or Binary Large Object (BLOB) operations that do not generate full log entries will not be properly replicated. + +In its simplest form, as shown in Figure 17.6, changes can only be made at the publisher. Changes can be replicated to subscribers at set intervals or as near real-time updates. While you may have less control over when replication occurs, you are typically moving less data with each replication. Updates are occurring much more often and latency is kept to a minimum. 
Reliable and consistent near real-time subscriber updates (immediate transactional consistency) require a reliable network connection between the publisher and subscriber (make sure you have the bandwidth on your connection to handle the chatter between the publisher and the subscriber if it is a very high update frequency and/or volume). + +Figure 17.6 + +Just as with merge replication, the published articles must be initially synchronized between the publisher and the subscriber before transactional replication can take place. This is typically managed through automatic synchronization, using snapshot replication. In situations where automatic synchronization is neither practical nor efficient, manual synchronization can be used to prepare the subscriber. This is a relatively simple process: + +1. Run BACKUP DATABASE to back up the Publisher database. + +2. Deliver the tape backup to the subscriber system. + +3. Run RESTORE DATABASE to create the database and database objects, and to load the data. + +The publisher and subscriber are synchronized as of the point when the backup was run. + +Transactional replication can also be used to replicate stored procedures. In its simplest implementation, changes can only be made at the publishing server. This means that you don't have to worry about conflicts. + +You can also implement transactional replication as transactional replication with immediate-updating subscribers. This means that changes can be made at the publisher or at the subscriber. Transactions occurring at the subscriber are treated as distributed transactions. Microsoft Distributed Transaction Coordinator (MS DTC) is used to ensure that both the local data and data on the publisher are updated at the same time to avoid update conflicts. Queued updating—where updates are placed in an ordered "to be done" list—can be used as a fallback in the event that there is a network connectivity issue such as a disconnection or if the network is physically offline. + +Another option would be to implement distributed transactions directly rather than using transactional replication. This will get you a lower latency than that provided with transactional replication, but you will still have the distribution delay in getting changes posted at the publisher out to all of the subscribers. Assuming a solid connection between the servers involved, distributed transactions could provide near immediate updates to all servers when data is changed at any server. However, depending on the connection speed and reliability between servers, this could result in performance problems, including locking conflicts. + +Log Reader Agent + +The Log Reader Agent is used in transactional replication. After a database is set up for transactional replication, the associated transaction log is monitored by the Log Reader Agent for changes to published tables. The agent then has responsibility for copying those transactions marked for replication from the publisher to the distributor as shown in Figure 17.7. The Distribution Agent is also used in transactional replication and is responsible for moving transactions from the distributor to the subscriber(s). + +The Process of Transactional Replication + +Assuming that initial synchronization has already taken place, transactional replication follows these basic steps: + +1. Modifications are posted to the publisher database and recorded in the associated transaction log. + +2. The Log Reader Agent reads the transaction log and identifies changes marked for replication. 
3. Changes taken from the transaction log are written to the distribution database on the distributor.

4. The Distribution Agent applies the changes to the appropriate database tables.

Figure 17.7

You can set up the Log Reader Agent to read the transaction log continuously or on a schedule that you specify. As before, the Distribution Agent typically runs at the distributor for push subscriptions and at the subscriber for pull subscriptions, but this can be changed through Management Studio or RMO to run remotely.

When to Use Transactional Replication

Use transactional replication when you need or just want to reduce latency and provide subscribers with relatively up-to-date information. Near real-time updates usually require a local area network connection, but slower or less reliable links can often be managed through scheduled updates. If you choose to use scheduled updates, latency increases, but you gain control over when replication occurs.

Let's go back to our garden supply store and the inventory problem discussed earlier. You want each of the stores to have up-to-date, or at the very least relatively up-to-date, inventory information. You would probably use scheduled replication to pass data to the subscribers.

Now let's see if we can make things a little more difficult. Not only do you have a chain of stores; you also have traveling salespeople who visit and take orders from your largest customers. They need to have at least relatively up-to-date inventory information but can't spend their days sitting around waiting for updates from the publisher. For systems of this type, you may want to use pull subscriptions, letting the salespeople decide when they connect to the server and download recent transactions.

You've probably noticed a potential problem in both of these scenarios. The remote servers can receive data, but they are not able to make any changes to the data. We'll cover that problem a little later. Transactional replication, when implemented in this manner, is used to support read-only copies of the data at subscriber systems.

Special Planning Requirements

Space is an important issue when planning for transactional replication. You have to make sure that you allow adequate space for the transaction log on the publisher and for the distribution database on the distributor.

Check each of the tables that you are planning to publish. For a table to be published under transactional replication, it must have a primary key. There are also potential concerns if you are supporting text or image data types in any of the tables. INSERT, UPDATE, and DELETE are supported as for any data type, but you must be sure to use an option that utilizes the transaction log when performing BLOB or bulk operations.

You may encounter problems with the max text repl size parameter, which sets the maximum size of text or image data that can be replicated. Make sure that this server-level parameter is set to a high enough value to support your replication requirements.

Immediate-Update Subscribers

As indicated earlier in the chapter, you have the option of setting up subscribers to snapshot or transactional publications as immediate-updating subscribers. Immediate-updating subscribers have the ability to update subscribed data, as long as the updates can be immediately reflected at the publisher. This is accomplished using the two-phase commit protocol managed by MS DTC. There is effectively no latency in updating the publisher.
Updates to other subscribers are made normally (as if the change was initiated at the publisher), so latency when going to other subscribers will depend on the rate at which those subscribers are updated.

You should consider immediate-updating subscribers when you need to post changes to replicated data at one or more subscribers and propagate near-immediate updates. You might be using multiple servers to support an Online Transaction Processing (OLTP) application as a way of improving performance and providing near real-time redundancy. When a transaction is posted to any server, it will be sent to the publisher, and through the publisher, to the remaining servers.

Much as with any form of merge replication, conflicts can arise when using immediate-updating subscribers. In order to assist with conflict identification and management, a uniqueidentifier column will be added to any published tables that do not already have one (if your table has one, the column in question will have a column-level property of IsRowGUID of true—you can only have one RowGUID column per table).

A high-speed, reliable connection is required between the publisher and any immediate-updating subscribers, such as a local area network connection, unless queued updates are used. If queued updates are configured, then the replication process can tolerate an unreliable connection and will just process any queued transactions as soon as connectivity is restored.

Keep in mind that queued updates increase the opportunities for you to have a conflict. Since the subscriber is making changes that the publisher does not know about, there is an increased prospect for the publisher to be making changes to the same rows that the subscriber is.

Mixing Replication Types

You can mix and match replication types as needed. Indeed, not only can you have different replication types on the same server; you can even have different replication types for the same table.

As an example of why you might want to do this, imagine that a heavy equipment warehouse wants to have up-to-date inventory information and reference copies of invoices available at each of its locations. Each location has its own local SQL Server. Invoices are posted to a central location using an Internet-based application. These are replicated to all local servers through transactional replication so that inventory records are updated. You also want to have invoice and inventory information replicated to yet another server weekly. The information on this last server is used for business analysis and running weekly reports. This server is updated weekly through a separate snapshot publication referencing the same tables used by the distributed inventory servers that were getting immediate updates.

Replication Topology

Over the years, Microsoft has outlined a number of replication topology models to describe how replication can be physically implemented. Let's look at some of these here as examples of how things are commonly implemented. It's worth noting that it is not only possible to mix and modify these models but actually rather common to do so.

Your decisions about the type of replication you need to use and your replication model topology can be made somewhat independently of each other.
That said, there is a chance that restrictions imposed by your physical topology, such as transmission bandwidth, will influence your decisions.

Simple Models

Let's start with a look at the simpler models. Once you've got the basic idea, we can move on to some variations and ways these models are mixed.

Central Publisher/Distributor

This is the default SQL Server model. As shown in Figure 17.8, you have one system acting as publisher and as its own distributor. This publisher/distributor supports any number of subscribers. The publisher owns all replicated data and is the sole data source for replication. The most basic model assumes that all data is being published to the subscribers as read-only data. Read-only access can be enforced at the subscriber by giving users SELECT permission only on the replicated tables.

Figure 17.8

Since this is the easiest model to set up and manage, you should consider its use in any situation where it fits. If you have a single publisher, one or more subscribers, and read-only access to data at the subscriber, this is your best choice.

Central Publisher/Remote Distributor

You may find that the volume of replicated data and/or the amount of activity at the publisher may create the need to implement the publisher and distributor as separate systems. As shown in Figure 17.9, this is effectively, from an operational point of view, the same as the publisher/distributor model. The publisher is still the owner of—and only source for—replicated data. Once again, the simple model assumes that the data will be treated as read-only at the subscriber.

Figure 17.9

Obviously, you usually only use this model when a single publisher/distributor cannot handle both production activity and replication to subscribers.

Central Subscriber

In this model, as shown in Figure 17.10, you have only one subscriber receiving data, but there are multiple publishers. The publishers can be configured as publisher/distributor systems. This model provides a way to keep just local data at the local server but still have a way of consolidating the data at one central location. Horizontal filtering may be necessary to keep publishers from overwriting each other's data at the subscriber.

This is the model to use when you have data consolidation requirements, such as gathering distributed data up for use in a data warehouse.

Figure 17.10

Mixed Models

Now let's look at a few variations based on the idea that we will frequently want to mix and match the basic models. Consider these as just a taste of the possibilities—something of "just the beginning." The possibilities are almost endless.

Publishing Subscriber

Publishing subscribers (that is, subscribers that are also configured as publishers) can be added to any of the basic models. This model has two publishers publishing the same data. The original publisher replicates data to its subscribers, one of which is a publishing subscriber. The publishing subscriber can then pass the same data along to its subscribers.

This model, shown in Figure 17.11, is useful when you have pockets of servers or when you have an especially slow or expensive link between servers. Another possibility is that you don't have a direct link between the initial publisher and all of the potential subscribers. The publisher only needs to pass data to one system on the far side of the link, and the publishing subscriber can then pass the data along to the other subscribers.
+ +Figure 17.11 + +Publisher/Subscriber + +This is another case where you have SQL Servers acting as both publishers and subscribers (Figure 17.12). Each server has its own set of data for which it is responsible. This model can be used when you have data changes taking place at both locations and you want to keep both servers updated. This is different from publishing subscribers in that each server is generating its own data, not just passing along updates received from another server. + +Figure 17.12 + +Multiple Subscribers/Multiple Publishers + +Figure 17.13 shows one of the more complicated scenarios. Under this scenario, you have multiple publishers and multiple subscribers. Systems may or may not act as a publisher/subscriber or publishing subscriber. This model requires very careful planning to provide optimum communications and to ensure data consistency. + +Figure 17.13 + +Self-Publishing + +It is worth specifically calling out that you can have a server subscribe to its own published articles. This is actually fairly common in small installations, where there is a diverse need, but not necessarily enough load to justify more than one physical server. For example, you may want to segregate the data used for online transaction processing from the data used for decision making. You can use replication to make separate read-only copies of your data (updated on any schedule you consider appropriate) to be used as a reference. + +Whether to locate your other databases—such as a data warehouse—on the same physical server as your core system is a matter of taste and your particular scenario. An example of where this can be very valid is the scenario where you have relatively low transactional volume but complex analysis needs. In my experience, companies that have enough need for a separate data warehouse usually have a physical or operational need for that to be on a separate server, but that is far from an "always" scenario. Consider your particular situation: does your server have room to share the load? Can you risk both databases being offline at the same time in the event of a catastrophe? + +Planning for Replication + +Replication is one of those things where it can be easy to "just toss something together." It's also one of those things where it is easy to create a huge mess if you take such a cavalier approach. Keep in mind that SQL Server may automatically make some alterations to your schema to implement replication—do you really want SQL Server adding columns and objects to your database without fully thinking about that first? Of course not. + +Any replication installation worth doing is worth taking the time to plan out. Some planning considerations include: + + * What data is to be replicated + * Replication type + * Replication model + +Along with these are other factors that will influence your decision, such as current network topologies, current server configurations, server growth potential, activity levels, and so forth. Each replication method has its advantages and disadvantages, and there is not a one-size-fits-all approach to replicating data. For instance, if you have a slow network or unreliable connection, then you may not want to implement transactional replication. Instead, you may opt to use merge replication that runs during a scheduled connection time. As has been pointed out repeatedly in this chapter, however, you also need to balance that against consistency needs. 
+ +Data Concerns + +First, you have to consider what you are going to publish and to whom. You need to identify your articles (tables and specific columns to be published) and how you plan to organize them into publications. In addition, there are some other data issues of which you need to be aware. Some of these have already been mentioned, but it's worth our time to review them here. + +timestamp + +Include a timestamp column for transactional publications. That gives you a way of detecting conflicts on updates. By having a timestamp column already in place, you've already met part of the requirements for adding immediate-updating subscribers. + +uniqueidentifier + +A unique index and a globally unique identifier are required for merge replication. Remember, if a published table doesn't have a uniqueidentifier column, one will be added for you. + +User-Defined Data Types + +User-defined data types are not supported unless they exist in the subscriber's destination database. Alternatively, you can have user-defined data types converted to base data types during synchronization. + +NOT FOR REPLICATION + +The NOT FOR REPLICATION clause lets you disable table actions on subscribers. You can disable: + + * The IDENTITY property + * CHECK constraints + * Triggers + +These actions are essentially ignored when and only when the replication process changes data on the subscriber. Any other processes would still use them normally. So, for example, an insert into the database that originally receives it would have an identity value assigned, but as the row was subsequently published (in the form of an INSERT) to subscribers, the existing identity value would be used rather than a new value being generated. + +Mobile Devices + +SQL Server also comes in a "Mobile" version. This is an extremely small footprint version of SQL Server designed to run on Windows Mobile Edition. The Mobile edition supports replication from a subscriber point of view. Snapshot and merge replication are supported—transactional replication is not. + +Many of the considerations for mobile devices are just variants of the same themes that we've seen already in replication—bandwidth and space, for example. Just keep in mind that the constraints for mobile devices may be much more extreme than with a full server class system (or even your salesmen's laptops, for that matter). + +Setting Up Replication in Management Studio + +Setting up replication takes a few steps. In particular, you need to: + + * Configure your publication and distribution server(s) to be ready to perform those tasks + * Configure your actual publications + * Configure subscribers + +Let's take a look at how to do each of these within the Management Studio. + +Configuring the Server for Replication + +Before you can set up any publication or distribution on your server, your server must be configured for replication. + +To get at this in Management Studio, navigate to the Replication node, right-click, and select Configure Distribution. + +Note that, in order to configure replication, you must have connected to the Object Explorer using the actual name of the server ((local), a period (.), localhost, or an IP address are not supported). If you connected using anything other than the server's DNS name, you'll get an error and be required to reconnect.
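If you're not sure exactly what name the instance thinks it has (and therefore what you should connect with), a quick sanity check—just a convenience query on my part, not part of the wizard—is to ask the server itself: + +SELECT @@SERVERNAME; + +Reconnect the Object Explorer using exactly the name this returns, and the wizard should be happy.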
+ +SQL Server greets you with the standard splash screen that we've seen in other wizards, and then moves on to an intro dialog—in this case, it points out some of the options you will have as you go through this wizard. Click Next, and you are moved on to a dialog (shown in Figure 17.14) where you decide whether this publisher is to serve as its own distributor or should utilize an existing distributor. + +Figure 17.14 + +If we selected the option to use a different server as the distributor and chose Add, then we would get a standard connection dialog box (asking for login security information for the distribution server). For our example run, keep the default option (that this box will act as its own distributor) and click Next. + +Note that which dialog comes after the Distributor dialog will change depending on whether or not you have the SQL Server Agent configured to start automatically on system startup. + +If you do not have the SQL Server Agent configured to start automatically (although you almost certainly want it to be on a production server), SQL Server will pop up a dialog, shown in Figure 17.15, to ask you about this. (It will skip this dialog if your agent is already configured to start automatically when you start your system.) + +Feel free to leave your system configured however you already have it (SQL Server will, however, default this dialog to changing your SQL Server Agent service to start automatically), but keep in mind that the agent will need to be running for some forms of replication to work. + +Figure 17.15 + +Click Next. We move on to configuring a snapshot folder, as shown in Figure 17.16. This will default to a directory under your main SQL Server folder, which for many installations may not be on a volume large enough to hold snapshots of large databases. It can be configured as a local path or as a UNC path. Since I'm not going to assume you have a full server farm to try this stuff out on, we're going to take a "one server does everything" approach for this example, so accepting the default should be fine. + +Figure 17.16 + +From there, it's on to configuring the actual distribution database. SQL Server gives us a dialog to collect some typical database creation information (what you want to call it and where to store it), as shown in Figure 17.17. + +Figure 17.17 + +From here, we move on to what, at first, appears to be a rather boring dialog (shown in Figure 17.18) with seemingly nothing new. + +Figure 17.18 + +Looks can, however, be deceiving. If we click on the little ellipsis (...) on the right, we get yet another dialog (shown in Figure 17.19)—one that does have a key item of note. + +Figure 17.19 + +As Figure 17.19 shows, we have the ability to specifically set the connection mode we're going to use when connecting the agent to the publisher. In most cases, the default of impersonating the Agent process will be fine, but keep in mind that we can use specific SQL Server security credentials if need be. + +Cancel out of this properties dialog, and click Next back in the publishers dialog (the one in Figure 17.18). Figure 17.20 shows the confirmation dialog at the end of the wizard, summarizing what we want to do. Note how it provides not only the option of immediately configuring the distribution, but also the notion of scripting the configuration for later or potentially remote use (we'll sketch what such a script might look like in a moment). + +Figure 17.20 + +Go ahead and click Finish (the next dialog is just a summary, so there is no need to dwell there). SQL Server begins processing the configuration request.
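While that runs, it's worth knowing that the wizard's scripting option boils down to calls to the replication system stored procedures. A minimal sketch of such a script—assuming a local distributor and the default distribution database name, and omitting the many folder, retention, and security parameters the wizard fills in—might look like this: + +USE master; + +-- The distributor is this very server + +DECLARE @dist AS sysname = @@SERVERNAME; + +EXEC sp_adddistributor @distributor = @dist; + +-- Create the distribution database with default settings + +EXEC sp_adddistributiondb @database = N'distribution'; + +Treat this as illustrative rather than a drop-in replacement for the wizard's generated script.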
When the process is complete, go ahead and close the dialog. + +And, just that quick, you have a server configured for publication and distribution of replicated data. Obviously, were this a production environment, we might have some other choices to make in terms of specific locations or even whether we wanted the publisher and distributor to be on the same system, but the basic foundations of what we are doing remain the same regardless. + +If you're wondering where the distribution database wound up, you should now be able to find it under the "System Databases" subfolder of the Databases folder. + +Configuring a Publication + +With our server all nice and configured, we're ready to get down to creating an actual publication. + +To do this, navigate to the Replication node in Management Studio, right-click the Local Publications sub-node, and choose New Publication. + +After the usual intro dialog, we come to the Publication Database dialog shown in Figure 17.21. This allows us to choose what database we want to utilize for our publication. As you can see, I've selected our old friend, AdventureWorks2008. + +Figure 17.21 + +Click Next, and you're ready to move on to the Publication Type dialog shown in Figure 17.22. + +Figure 17.22 + +This allows us to select between the replication types that we looked at earlier in the chapter. I've chosen Transactional publication with updatable subscriptions. + +Click Next, and you move on to the Articles dialog. + +In Figure 17.23, I've expanded the Tables node and selected the Person.Person table. I'm taking most of that table, but I'm going to skip the AdditionalContactInfo and Demographics columns since they are schema-bound XML columns, and SQL Server does not allow for the replication of XML columns that are bound to an XML schema collection. I also could have taken other schema objects, such as stored procedures (I'm sticking to just the one object for simplicity's sake). + +Click Next to be taken to the Article Issues dialog, as shown in Figure 17.24. + +Notice that SQL Server detected several issues it wants to let us know about. This is one where I say "kudos to the SQL Server team" for attempting to let a user know about some fundamental things before they become a problem. + +Figure 17.23 + +Figure 17.24 + +Click Next to move on to the Filter Table Rows dialog shown in Figure 17.25. + +Figure 17.25 + +This one allows us to do horizontal partitioning—essentially just applying a WHERE clause so that only rows that meet a specific condition will go across in our publication. + +Click Add to get the dialog shown in Figure 17.26. + +In our example here, we've restricted the rows being replicated to those where the persons in question have been flagged as employees (PersonType = 'EM'). + +Click OK to return to the Filter Table Rows dialog, and then click Next to move on to the Snapshot Agent dialog shown in Figure 17.27. + +Remember that any subscription, regardless of whether it is to a snapshot, merge, or transactional replication model, must start by synchronizing based on a snapshot. Subsequent changes are applied relative to that snapshot. + +Figure 17.26 + +Figure 17.27 + +I've configured mine to run the snapshot immediately, but I could have just as easily scheduled it to be generated at a later time (remember that snapshots place share locks on every table the snapshot utilizes—do not run them at a time when such locks are going to block writes to your database that you need done in a timely fashion).
If, for example, you are getting frequent new subscribers, you may want to schedule a periodic update to the snapshot to give them a more up-to-date baseline to synchronize to. + +Click Next, and you're ready to define the Agent Security, as shown in Figure 17.28. + +I've used the Security Settings dialogs to set the agents to use the SQL Server Agent account. This is not, however, good practice in a production environment for security reasons. Give the agents their own account to impersonate, to both limit agent access and increase your ability to audit. + +Figure 17.28 + +Click Next, and you'll find an Action dialog (just like the one back in Figure 17.20) where you can indicate whether you want the publication created immediately or scheduled for later execution. + +One more click of the Next button, and you're ready for a summary and to define a publication name, as shown in Figure 17.29 (I've chosen Employees). + +Go ahead and click Finish to create your publication, and, just like that, you're ready to have subscribers! + +Figure 17.29 + +Setting Up Subscribers (via Management Studio) + +Setting up subscribers utilizes the same basic notions we've already leveraged with publications. Before we get started with an example, however, let's set up a dummy database to play the part of our subscriber: + +CREATE DATABASE AWSubscriber; + +And, with that created, we're ready to subscribe to some data. + +Start by right-clicking the Local Subscriptions sub-node below the Replication node in Management Studio, and selecting New Subscription. After the usual intro dialog, we move on to identifying our publication, as shown in Figure 17.30. Since we have only one publication, there really isn't a lot to choose from, but the list could have easily been many, many publications. + +Figure 17.30 + +Click Next to move on to the Agent location, as shown in Figure 17.31. Remember that we can run our replication agent on either the subscriber or the distributor. In our case, it doesn't matter much since these are the same box, but you may make different choices depending on server loading issues. + +Figure 17.31 + +Click Next to move on to the Subscribers dialog shown in Figure 17.32. I've already chosen our AWSubscriber database, but notice how we could choose Add SQL Server Subscriber and configure multiple subscribers at one time. + +From there it's on to the Distribution Agent Security dialog. Here we define what security context we want to run under for both the distributor and subscriber (in this case, it's the same system, but it could easily have been remote). In Figure 17.33 I've chosen to impersonate the SQL Server Agent security context, but, again, on a production server you would generally want a more specific security context for your replication agent for security reasons. + +Figure 17.32 + +Figure 17.33 + +We can move quickly through the remaining dialogs by setting the agent to "Run continuously" and leaving the default "Commit at publisher" setting of "Simultaneously commit changes." That takes us to the Login For Updatable Subscriptions dialog shown in Figure 17.34. + +Figure 17.34 + +Since this is all (distribution and subscription) happening on the same server, a linked server is implied (a server is always available to itself as a linked server). Were we using a remote distributor, we could have either used a regular SQL Server login or again gone with a linked server (though, in the latter case, we would need to configure the linked server separately—a sketch of which follows).
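For the curious, that separate configuration is done with the sp_addlinkedserver system stored procedure. A minimal, hedged sketch—RemoteDist is a made-up server name purely for illustration: + +-- Alias a remote SQL Server so it can be referenced by name + +EXEC sp_addlinkedserver + +@server = N'RemoteDist', + +@srvproduct = N'SQL Server'; + +With @srvproduct set to N'SQL Server', no provider details are needed; login mappings for the link are then handled separately (via sp_addlinkedsrvlogin).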
+ +A linked server is another SQL Server or ODBC data source that has had an alias established for it on your server. When you refer to a linked server by name, you are essentially grabbing a reference to the connection information for that linked server. + +Figure 17.35 allows us to choose when to initialize our subscription (I've stayed with the default of immediately). The initialization involves pulling down the snapshot from the distributor and applying it. Subsequent synchronizations will be done using the snapshot as a baseline to apply changes to. + +Click Next to get the same finishing dialogs that we've seen in prior examples (when to run things and a summary page), and then click Finish. + +Figure 17.35 + +Using Our Replicated Database + +Once the replicated database is in place, the problem largely becomes one of administration. If things are running smoothly, there is very little to see. Users can access our AWSubscriber database and the Person.Person table within it. Since we configured for updating subscribers, changes made to the AWSubscriber version of the Person.Person table will be immediately reflected in the source AdventureWorks2008 database. Likewise, changes made to our AdventureWorks2008 database will be reflected in our subscriber database. + +Start with a quick look at the list of tables in the AWSubscriber database (using Management Studio, sp_help, or sys.tables)—you should find the Person.Person table that we replicated. Then go ahead and take a look in our AdventureWorks2008 database. You should find a table called Person.conflict_Employees_Person. This new table is for conflict tracking—it should receive data only in the event that changes we make in our subscriber run into a conflict with those on the publisher. + +In the event of a conflict, the default publishing agent chooses the publisher's data over the subscriber's. You can change this to resolve conflicts based on such things as which change is the most recent, along with other ready-made criteria. You can also write custom resolution algorithms to encompass any unusual rules you may have for resolving conflicts. + +Now let's test out our transaction-based replication by making a change to our data. We'll start by taking a look at the starting value of the row we're going to change: + +SELECT aw.FirstName AS PubFirst, + +aw.LastName AS PubLast, + +aws.FirstName AS SubFirst, + +aws.LastName AS SubLast + +FROM AdventureWorks2008.Person.Person aw + +JOIN AWSubscriber.Person.Person aws + +ON aw.BusinessEntityID = aws.BusinessEntityID + +WHERE aw.BusinessEntityID = 38; + +What I've done here is join across the databases so that we can see both the publisher and subscriber at the same time. This way, we can, in one query, compare the source and the destination. The first time we run this script (before we make any changes), we can see our starting values, and that they are indeed the same: + +PubFirst PubLast SubFirst SubLast + +---------- ---------- ---------- ----------- + +Kim Abercrombie Kim Abercrombie + +(1 row(s) affected) + +Okay, now let's make a change. We'll say that Kim has gotten married and decided to change her name to Abercrombie-Smith.
+ +USE AdventureWorks2008; + +UPDATE Person.Person + +SET LastName = 'Abercrombie-Smith' + +WHERE BusinessEntityID = 38; + +Now, we run our original SELECT statement again to check the results: + +PubFirst PubLast SubFirst SubLast + +--------------- --------------- --------------- --------------- + +Kim Abercrombie-Smith Kim Abercrombie-Smith + +(1 row(s) affected) + +As you can see, both the publisher and subscriber received the update. + +Now, let's change the script just slightly to run inside the subscriber database, and see what happens on the publisher's side. This time, we'll change Kim's name back (perhaps she changed her mind...): + +USE AWSubscriber; + +UPDATE Person.Person + +SET LastName = 'Abercrombie' + +WHERE BusinessEntityID = 38; + +And now we're ready to run our original SELECT statement one more time: + +PubFirst PubLast SubFirst SubLast + +-------------- ---------------- -------------- -------------------- + +Kim Abercrombie Kim Abercrombie + +(1 row(s) affected) + +Again, our change was seen in both databases. + +The change was seen in both directions and was replicated immediately because we had selected transactional replication with immediately updating subscribers. Other replication choices would have introduced latency in the change, or potentially not replicated the change at all without some form of manual intervention. Be sure to review all of the replication types (discussed earlier in the chapter) to understand the behavior of each. + +Replication Management Objects (RMO) + +Replication Management Objects, or RMO, is a .NET object model that was first seen in SQL Server 2005 and replaced the replication portion of the COM-based Distributed Management Objects (DMO) object model that was used in SQL Server 2000 and earlier. You can think of RMO as being something of a companion to SQL Management Objects (SMO), which we discuss extensively in Chapter 23. + +RMO gives you programmatic access to any portion of your replication creation and configuration using any .NET language. Examples of RMO use would be automating operations such as: + + * Creating and Configuring a Publication: You can make use of the ReplicationDatabase as well as the TransPublication or MergePublication objects to define publications. + * Adding and Removing Articles: The TransArticle object supports the addition and removal of articles within your publication. In addition, you can add column filters or add a FilterClause property to limit what rows are replicated. + * Republishing your snapshot. + +These are just a few everyday examples. RMO is, however, capable of creating, modifying, or deleting any part of the replication process. + +RMO can be utilized in Visual Studio by adding a reference to the Microsoft.SqlServer.Replication .NET Programming Interface library. You then point your include, imports, or using directives to Microsoft.SqlServer.RMO. As with any of the management libraries that support SQL Server, you will also need a reference to the Microsoft.SqlServer.ConnectionInfo library. + +An example application that utilizes RMO to create the same publication we created earlier in the chapter using the GUI can be downloaded from the Wrox Web site (wrox.com) or professionalsql.com. + +Summary + +As much as there was to absorb in this chapter, this really was something of an introduction to replication.
We covered a lot of the considerations for architects reasonably well, but the scope of replication is such that entire books are written on just that topic. Indeed, there is much to consider in order to build just the right model for complex scenarios. The good news is that, if you really grasped this chapter, then you are prepared for perhaps 90 percent of what you are likely to ever face. Time and the proverbial "school of hard knocks" will teach you the rest. + +If you've taken anything from this chapter, I hope that it's an understanding of some of the general problems that replication can solve and how replication works best when you plan ahead, both in terms of topology planning and in your application's general architecture (making sure it understands the special needs of replication). + +In our next chapter, we'll take a look at yet another "extension" area for SQL Server—full-text indexing. +18 + +Looking at Things in Full: Full-Text Search + +Full-Text Search is an area of significant architectural change in SQL Server 2008. While the core use and functionality haven't changed all that much, the full-text features are far more integrated into the core of SQL Server as of this release. If you feel you are already familiar with full-text and are ready to skip this chapter, I would encourage you to at least browse the architectural changes and consider their ramifications on things like backup and recovery as well as expanded query result support. + +Using plain old T-SQL (without full-text functionality), our options for querying text information are somewhat limited. Indeed, we have only a couple of options: + + * Use a LIKE clause. This is generally woefully inefficient and is not able to utilize any kind of index structure unless your search pattern starts with an explicit value. If the search starts with a wildcard (say "%" or "_"), then SQL Server wouldn't know which spot in the index to begin with—any indexes become worthless. + * Use some other form of pattern matching, such as PATINDEX or CHARINDEX. These are generally even more inefficient, but can allow us to do things that LIKE will not. + +With Full-Text Search, however, we gain the ability to index the contents of the text—essentially keeping a word list that lets us know what words we can find and in what rows. In addition, we are not limited to just pattern-matching algorithms. We can search for the inflected forms of words. For example, we might use the word university but have SQL Server still find the word universities, or, even better, SQL Server can find a word like drunk when the word we asked for was drink. It's up to us to decide how precise we want to be, but even if the word we are searching for is located deep in the middle of a large text block, SQL Server can quickly find the rows that contain the word in question. + +Full-Text Search, or FTS, supports any document type that has a filter registered on the system that supports the iFilter interface. This means that you can store things like Word, Excel, Acrobat, and other supported files in an image data type, but still perform full-text searches against that data! Indeed, you could even write your own extensions to support other document types if necessary. + +Personally, I find this latter point to be extremely cool.
Implementation of the iFilter interface allows you to separate what is text information versus what is formatting information, so you could, for example, write a custom iFilter that knows how to strip XML tags out of an XML file to allow full-text searching for a custom XML document type. + +In this chapter, we'll take a look at these Full-Text Search features and more. + +Among the sections we'll look at are: + + * Full-Text Search architecture + * Setting up full-text indexes and catalogs + * Full-text query syntax + * Full-text quirks + * Noise words + +In addition, we'll see how there are now two ways of completing most full-text-related operations. By the time we're done, you should be prepared for the hassles that FTS creates for you, but you should also be ready to utilize what can be some wonderful functionality in return. + +Full-Text Search Architecture + +The architecture of FTS got a major overhaul with this release. While some of the fundamental concepts (such as word-breakers, filters, catalogs, and indexes) still apply, the way these items are utilized has changed somewhat. A map of the new (and rather complex) architecture is shown in Figure 18.1. + +In prior versions of SQL Server, the core of Full-Text Search wasn't really part of SQL Server at all. It was a shared technology item that originally came from Microsoft Index Server. You would see the separate process installed with SQL Server under the service name of MSFTESQL. With SQL Server 2008, Full-Text is now a fundamental part of the main SQL Server process. The full-text engine is excellent at examining raw text data and aggregating word lists. It maintains an association between the individual words and phrases and the places where FTS has encountered them. + +Full-Text is now part of the core SQL Server process. Individual filters are, however, instantiated in their own process for security reasons. + +Figure 18.1 + +To perform full-text queries against any SQL Server table, you must build a full-text index for that table. The construction and maintenance of this full-text index—or the population of the index—is done by SQL Server instantiating an instance of a filter daemon, which is passed a text stream; the words in the stream are cataloged, and an association is made between each catalog entry and the row the word was sourced from. + +By default, tables have no full-text functionality at all. The fact that there is a table and that it has text data types is no guarantee that there is a full-text index on the table. If you want it, you need to create it. Even after you create the full-text index, the index will have nothing in it. To make the index fully functional, you need to populate the index. + +The population process looks over the columns specified by the index and builds the word list that is going to be used. Much like standard indexes in SQL Server, only the columns you specify to include in the index will become part of the index. Unlike normal indexes in SQL Server, however, you are allowed only one full-text index per table—so every column you want to have participate in full-text queries needs to be part of that one index. + +The differences don't stop there though. Actually, there are several. The major differences include: + + * Internal Structure: Typical SQL Server indexes are stored as a balanced tree structure. Full-text indexes, however, utilize a token-based structure that is inverted (essentially storing things backwards) and compressed.
+ * Method of Creation: SQL Server indexes are created using the CREATE INDEX command in T-SQL, SQL Management Objects (SMO), or Windows Management Instrumentation (you can use the Management Studio, but it just uses SMO). Full-text indexes are created either through the use of special system stored procedures or through the use of the CREATE FULLTEXT INDEX command. + * Method of Update: SQL Server indexes are automatically updated in the normal course of changes to the underlying SQL Server data. Full-text indexes can either be populated on demand or through a "change tracking" mechanism with an on-demand cleanup. + +So that's the quick lesson in Full-Text Architecture 101. As we move through the rest of the chapter, the impact of these differences from the more "normal" way things are implemented in SQL Server should become apparent. + +Setting Up Full-Text Indexes and Catalogs + +As we saw in the last section, each table in a SQL Server database can have zero or one full-text indexes. For SQL Server 2008, these full-text indexes are stored with the rest of the database (you can, if you wish, specify a particular filegroup if you want the full-text items to be on separate storage). A catalog can store multiple full-text indexes. The indexes must be from the same database; you may, however, want to store indexes from one database in multiple catalogs, so you can manage the population of those indexes on separate schedules or store them in separate filegroups. + +Enabling Full-Text for Your Database + +Prior to SQL Server 2008, there was the concept of Full-Text being "enabled" for a database. In SQL Server 2008, all databases are always full-text enabled. + +Creating, Altering, Dropping, and Manipulating a Full-Text Catalog + +The CREATE syntax for Full-Text looks much like other CREATE syntaxes, but with a few additional twists: + +CREATE FULLTEXT CATALOG <catalog name> + +[ON FILEGROUP <filegroup>] + +[IN PATH <'root path'>] + +[WITH ACCENT_SENSITIVITY = {ON|OFF}] + +[AS DEFAULT] + +[AUTHORIZATION <owner name>] + +Most of this should be fairly self-explanatory, but let's take a look anyway: + +ON FILEGROUP | This is here for backward compatibility with SQL Server 2005 only (the CREATE FULLTEXT CATALOG command didn't exist in SQL Server 2000). It has no effect under SQL Server 2008. +---|--- +IN PATH | Again, this is a backward-compatibility-only item. In prior releases, the actual full-text catalogs were created not inside the database but rather as a separate file on disk. This option told SQL Server what path you wanted that file created in. In SQL Server 2008, this option has no effect. +WITH ACCENT_SENSITIVITY | Pretty much what it sounds like. This determines whether searches will take accents into account or not (for example, is "e" the same as "é"). By default, the full-text catalog will use whatever accent sensitivity the database is set to. Keep in mind that, if you change this setting after the catalog is created, the entire catalog will need to be repopulated. +AS DEFAULT | Another one that is what it sounds like; this one sets the full-text catalog you're creating to be the default catalog for any new full-text indexes you create. +AUTHORIZATION | Mildly more complex. As you might imagine, this one is about security and rights. It changes the ownership of the full-text catalog to be the user or role specified instead of the default (which would be the user that actually creates the catalog). This one has gotten muddled quite a bit by SQL Server's change from ownership to schemas.
Ownership has largely morphed into schemas, but the nature of this particular setting more closely fits with the older ownership notion. The key thing to realize here is that a role can be the owner of a full-text catalog—not just a user. If you're changing the ownership to a specific role, then the user creating the full-text catalog must be a member of that role at the time that he or she creates the catalog. + +So, let's create a full-text catalog for AdventureWorks2008. We'll simply call it MainCatalog: + +USE AdventureWorks2008; + +CREATE FULLTEXT CATALOG MainCatalog; + +This is another one of those commands where you don't get much feedback. As long as you don't see an error, the catalog should be created just fine. + +And, just that quick, we have a full-text catalog available for AdventureWorks2008. I did not specify this full-text catalog as the default, so any full-text indexes that want to make use of this catalog will need to explicitly state this catalog as their destination. + +Altering Full-Text Catalogs + +Altering full-text catalogs works pretty much the same as creating them, save for the fact that you are really limited in what can be altered. The syntax is: + +ALTER FULLTEXT CATALOG <catalog name> + +{ REBUILD [WITH ACCENT_SENSITIVITY = {ON|OFF}] + +| REORGANIZE + +| AS DEFAULT + +} + +There are three top-level options you can set with this ALTER. Let's take a look at them. + +REBUILD + +Does what it says it does—completely rebuilds the full-text catalog in question. By default, it will be re-created with exactly the same settings the catalog had before (owner and whether it is the default or not). + +Keep in mind that your full-text catalog, and every index that catalog contains, will be offline while the rebuild is in progress. + +In addition to the simple rebuild that you would typically do just to compact the catalog (for deleted rows and such), you can also rebuild to change the accent sensitivity. If you want to reset the accent sensitivity, just specify whether you want it on or off as you issue the REBUILD command. + +Any rebuild implies that all indexes in the catalog will be repopulated. + +REORGANIZE + +This is similar to REBUILD, but with some pros and cons. + +REORGANIZE cleans up your catalog for you, but in an online fashion. The result is like most situations where you rearrange things instead of moving things all the way out and starting over. It looks pretty good, but perhaps not as good as if you had started from scratch. + +You can think of REORGANIZE as being like a defragmentation process. It merges what may well be several different index structures internal to the catalog. (For performance reasons at the time the full-text was analyzed, some items may have been kept in their own substructure in the index rather than merged into the master index for the catalog.) This command attempts to rectify that. Unlike REBUILD, REORGANIZE also reorganizes the internal structures for your full-text catalog (the ones that store metadata) without taking anything offline. + +AS DEFAULT + +This works just like it did under CREATE. It establishes this particular catalog as being the default full-text catalog for new full-text indexes you create for this database. + +Dropping Full-Text Catalogs + +I know you can see this one coming—after all, it's that same core DROP syntax we've been using all along: + +DROP FULLTEXT CATALOG <catalog name> + +And, of course, it's gone.
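Pulling the ALTER options together before we move on: a routine maintenance pass on the catalog we just created might look like the following (don't actually drop anything—we'll keep using MainCatalog): + +-- Online defragmentation of the catalog's internal structures + +ALTER FULLTEXT CATALOG MainCatalog REORGANIZE; + +And, if fragmentation has gotten bad enough to justify taking the catalog's indexes offline for a while, the heavier-handed version is: + +-- Full offline rebuild; all indexes in the catalog will be repopulated + +ALTER FULLTEXT CATALOG MainCatalog REBUILD;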
Creating, Altering, Dropping, and Manipulating Full-Text Indexes + +Okay, so what we had with a full-text catalog was largely just a container. A full-text catalog, by itself, is nothing at all—think of it like a gas can with no gas in it. What we need are the actual full-text indexes. Whereas a full-text catalog is the place to store full-text indexes, the indexes themselves are what provide the actual reference information that allows your full-text queries to operate quickly and efficiently. + +Creating Full-Text Indexes + +When you go to create a full-text index, the core items of the command are not all that different from regular indexes; however, much as regular indexes have properties such as whether they are clustered or non-clustered, full-text indexes also have their own properties. + +The syntax for creating a full-text index looks like this: + +CREATE FULLTEXT INDEX ON <table name>
+ +[( <column name> [TYPE COLUMN <type column name>] + +[LANGUAGE <language term>] [,...n] )] + +KEY INDEX <index name> + +[ON <full-text catalog name>] + +[WITH + +{ CHANGE_TRACKING [=] { MANUAL | AUTO | OFF } + +[, NO POPULATION] } + +| { STOPLIST [=] { OFF | SYSTEM | <stoplist name> } } + +] + +Note that what is optional is a bit atypical here. Most of the time, required items are listed first, but the quirks of this syntax give us an optional parameter (a column list) before a required parameter (the key index). Let's start with a quick example and then take a look at the parts: + +CREATE FULLTEXT INDEX ON Production.ProductModel + +( Name LANGUAGE English) + +KEY INDEX PK_ProductModel_ProductModelID + +ON MainCatalog + +WITH CHANGE_TRACKING OFF, NO POPULATION; + +So, what we've created here is a full-text index for the Production.ProductModel table. We've explicitly stated that the language used in that column is U.S. English. If we had wanted, we could have added a comma followed by another column name and potentially a TYPE COLUMN or another LANGUAGE identifier. After the language, we specifically stated what full-text catalog we wanted this index stored in, as well as that we wanted change tracking turned off and no initial population of the index. + +That's a lot to think about, so let's take a look at those parts a bit closer. + +Notice that I did not supply a name for my full-text index. There can only be one full-text index for any given table, so there is no need to name it. (It is essentially identified by the table it's built on.) Be sure what you define includes all the columns you want to perform full-text searches on. + +Column List + +This is probably the trickiest part of the whole thing. Even though it says "column name" in the preceding syntax, you're really working on a column list. The issue is that for each column you list you need to include everything about that column before you move on to the next column. That is, you need to include the TYPE COLUMN and LANGUAGE parameters (if you're going to) before you name the next column. + +So, for example, if we had also wanted to include the catalog description, we could have done that, too, by adding it at the end of the first column definition: + +CREATE FULLTEXT INDEX ON Production.ProductModel + +( Name LANGUAGE English, + +CatalogDescription) + +KEY INDEX PK_ProductModel_ProductModelID + +ON MainCatalog + +WITH CHANGE_TRACKING OFF, NO POPULATION; + +This example is purely for reference. It will not run since we already created a full-text index on the Production.ProductModel table. + +LANGUAGE + +This specifies what language the column we've just identified is in. This is important for the determination of noise words (words that occur frequently but add little to your search—we'll see more about these later in this chapter), as well as things like collation. Any language that SQL Server has localization support for (33 localizations as of this writing) is valid. To get a list of the aliases you would use, you can query the sys.syslanguages metadata view in the master database: + +SELECT name, alias FROM master.sys.syslanguages; + +TYPE COLUMN + +This option is for use when you want to do full-text indexing against documents stored in an image or a varbinary column. AdventureWorks2008 has a full-text index established that makes use of this. (It is on the Production.Document table.) We'll check it out a bit later in the chapter. For now though, imagine that you're doing document management using SQL Server (not at all an uncommon use for SQL Server).
If you are storing documents written in a mix of one or more applications, such as Microsoft Word (.DOC), Acrobat (.PDF), Excel (.XLS), or a text editor (.TXT), then Full-Text Search will need to know what kind of document is stored for each row it analyzes, so it knows what analysis plug-in to use. + +In this case, you need to add another column to your table (in addition to the image or varbinary column) that contains the extension (.DOC, .PDF, and so on) of the document stored in the binary column. This column becomes the parameter value for the TYPE COLUMN property in the CREATE FULLTEXT INDEX command. + +KEY INDEX + +Unlike all the other options in the CREATE FULLTEXT INDEX command, this one is required. + +Any table that Full-Text is indexing must have a column that uniquely identifies each row. This can be a primary key or a unique constraint. The thing to remember on this point is that you are supplying the name of the index associated with the unique identifier, not the column or constraint name. Since this is used repeatedly to associate data in the full-text index, I would suggest you use the smallest primary key or unique index available. + +ON + +This is simply the name of the full-text catalog you want this index stored in. This is optional if your database has a default full-text catalog, and required if no default catalog has been established. + +WITH + +This supplies instructions regarding how your index is populated with data and how it copes with changes to the table that the index is built over. + +CHANGE_TRACKING + +Change tracking is all about how your full-text index deals with changes to the underlying table. + +The dilemma here is how you want to balance the accuracy of your full-text searches against the amount of overhead you incur by keeping a higher-overhead system (as compared to maintaining standard B-Tree indexes) up to date. + +Change tracking gives us three levels of support for changes: + +OFF | The full-text index is updated only when you perform a full population of the index. Essentially, you need to rebuild from scratch each time you populate. This means there is no ongoing maintenance overhead, but it also means that there may be rows in the table that will not be returned in your full-text queries or, perhaps worse, that rows may come back as containing the word you are interested in when, due to changes, they no longer do. This option is great when your data is slow moving (doesn't change often) and/or you don't require perfect accuracy in your results. In return for giving up that accuracy, it means you have no ongoing overhead and that your indexes are always as compact as they can be because they have no issues with fragmentation. It does mean, however, that when you do repopulate, you have a period of downtime and the overall process takes longer. +---|--- +AUTO | Under this model, SQL Server is constantly updating the index for things happening in the table. While there still may be a lag between when the change is made and when it is reflected in full text, that lag is minimal and you are getting something approximating real-time updates. This is the way to go when you have fast-moving data or your need for accuracy is very high. You are enduring a high degree of overhead since SQL Server will use smaller, intermediate structures to keep track of the changes. These can become inefficient over time and may hurt search performance but are not that big of a deal in the short run.
If you use this option, consider still performing a reorganization or full repopulation regularly. +MANUAL | This is something of a middle ground. It does tracking to be able to identify changes but does not update the full-text index until explicitly told to do so. You can then manually perform updates that apply the changes to the existing index without a full repopulation. + +NO POPULATION + +This applies only if you have chosen OFF for change tracking. + +By default, when you create a full-text index, SQL Server starts a background process to populate that index. If you turn off change tracking and specify NO POPULATION, then you are limiting yourself solely to defining the full-text index but not actually putting any data in it to start. You can then schedule your own index population job to run later (presumably in low-demand hours of the day). + +STOPLIST + +A stoplist replaces what was known in previous versions as a noise word list. Noise words are now called stop words. They are words that are explicitly exempt from being included in the index. In general, these equate to words that are so common that they rarely add any real value to the content (in English, these might include "the," "and," "or," and other words that occur at abnormally high frequencies). While noise words were kept in a separate file in previous releases, SQL Server 2008 stores stop words in a stoplist. For each language you can define for full-text indexing, there is an associated system stoplist, but you can also create your own custom stoplist. You can also turn off stoplist utilization if you want all words included regardless. + +Altering Full-Text Indexes + +Okay, so now you have an index, and you want to make changes to it. As you might expect, the new full-text syntax supports the notion of an ALTER statement. It is in the form of: + +ALTER FULLTEXT INDEX ON <table name>
+ +{ ENABLE + +| DISABLE + +| SET CHANGE_TRACKING { MANUAL | AUTO | OFF } + +| ADD ( <column name> + +[TYPE COLUMN <type column name>] + +[LANGUAGE <language term>] [,...n] ) + +| DROP ( <column name> [,...n] ) + +| START { FULL | INCREMENTAL | UPDATE } POPULATION + +| { STOP | PAUSE | RESUME } POPULATION + +| SET STOPLIST { OFF | SYSTEM | <stoplist name> } + +[WITH NO POPULATION] + +} + +This ALTER has some substantial differences from previous ALTER statements we've dealt with! See how verbs like START and STOP are in there? This ALTER not only changes the definition of our full-text index but also can be used to manage the index somewhat. Keep this difference in mind, as it is not very intuitive when you compare it to the other ALTER statements we use in SQL Server. + +Several elements of these work exactly as they did for the CREATE statement. We are merely changing a chosen option from one thing to another. However, some of this is totally new. Let's start with the more traditional ALTER statement items and then move on to the portions of this statement that are more management-oriented. + +ENABLE/DISABLE + +These do what they say. If you disable a full-text index, the index is kept in place and all data remains intact. What changes is that the index is not available for full-text queries, and the index data is not updated (any updates that were in process when the DISABLE was issued will be stopped immediately). + +When you ENABLE, it picks up where the index left off. (It likely has catching up to do, but any data already there is kept intact, and you do not need to do a full repopulation.) + +ADD + +This works just like the initial definition of columns. For example, if we wanted to add the Instructions column to our full-text index on Production.ProductModel, it would look like: + +ALTER FULLTEXT INDEX ON Production.ProductModel + +ADD ( Instructions ) + +The LANGUAGE and TYPE COLUMN properties also work just as they did in our earlier CREATE. + +DROP + +Again, this works much as you would expect. If we were dropping the Instructions column we just added, that would look like: + +ALTER FULLTEXT INDEX ON Production.ProductModel + +DROP ( Instructions ) + +START... POPULATION + +START gives us three options as to what kind of population we want to use. + +FULL + +The nice simple one—think of this as the command to "start over!" Every row will be reexamined, and the index will be rebuilt from scratch. + +INCREMENTAL + +This one is valid only if you have a timestamp column in your table (otherwise it will default back to a FULL population) and will start a population based on the rows changed since the last time a population was performed for the table. Think of this one as the "catch up on your work please!" version of populating. Incremental population does not require that change tracking be turned on. + +UPDATE + +This one addresses the scenario where you have turned AUTO population off for the index, but want all tracked inserts, updates, and deletes applied to the index. It does require that change tracking be turned on. + +STOP, PAUSE, RESUME + +These perform the specific action on any population that is currently running against this full-text index. The STOP option does not stop automatic change tracking—only full or incremental updates. PAUSE and RESUME operate exactly as one would expect. + +Dropping Full-Text Indexes + +I'm sure by this point that you could figure this one out for yourself, but for the sake of completeness, here we go: + +DROP FULLTEXT INDEX ON <table name>
+ +So, were we to run the command (don't actually run this, as we'll be using this index in our next example!): + +DROP FULLTEXT INDEX ON Production.ProductModel + +the full-text index would be gone! + +A Note Regarding the Older Syntax + +Prior to SQL Server 2005, we used a special system stored procedure called sp_fulltext_catalog. We likewise used other system stored procs to address other full-text functionality. + +These have now been deprecated for two releases and are significantly out of touch with the new full-text architecture. I will not cover them in depth here, but I do want you to be aware of them in case you bump into them in production settings. If you do, I recommend migrating them to the new syntax as fast as reasonably possible (basically, as long as SQL Server 2000 support is no longer required). + +More on Index Population + +Unlike "normal" SQL Server indexes, which are naturally kept up to date by the very nature of SQL Server and the way it stores data, full-text indexes operate with a different storage structure and require substantially more overhead to populate. As such, they require a certain degree of intervention before the index will be up to date with the actual data it is supposed to represent. + +Population comes in three—well, more like two and a half—flavors. Let's look at each: + + * Full: Is what it sounds like. With this kind of population, SQL Server basically forgets anything that it knew about the data previously and starts over. Every row is rescanned, and the index is rebuilt from scratch. + * Incremental: Under this option, SQL Server utilizes a column of type timestamp in order to keep track of what rows have changed since the last population. In this scenario, SQL Server only needs to record the changes for those rows that have changed in some manner. This option requires that the table in question have a timestamp column. Any updates that do not cause a change in the timestamp (nonlogged operations—usually BLOB activity) will not be detected unless something else in the same row changed. + * Change Tracking: Tracks the actual changes since the last population. This option can help you keep your full-text indexes up to date in near real time; however, keep in mind that full-text population is very CPU and memory intensive, and can bog down your server. Weigh the notion of immediate updates against the notion that you may be able to hold your updates to off-peak hours for your server. + +Unless you're using change tracking, population of your full-text indexes will occur only when you specifically start the process or according to a population schedule that you establish. + +Obviously, whenever you first create a full-text index or change the list of columns participating in the index, you need to completely repopulate the index (an incremental change of a previously empty index would mean that every row would have to be scanned in—right?). SQL Server will now do this automatically unless you explicitly tell it not to. We can manually perform this repopulation at either the catalog or the table level. Typically, you'll perform repopulation at the table level for newly added or changed indexes, and repopulate at the catalog level when you are performing routine maintenance. + +So, with this in mind, we should be ready to populate the full-text index we have created on our Production.ProductModel table.
Had we not specifically stated NO POPULATION, then SQL Server would have populated the index automatically; however, since we did tell it not to populate, we have to order up our population. Since this is the first population, we probably want a full population (frankly, an incremental would have the same result, so it doesn't really matter, but it reads more logically this way). Using the new syntax, this would look like: + +ALTER FULLTEXT INDEX ON Production.ProductModel + +START FULL POPULATION; + +Full-text population runs as a background process. As such, your command will return a "completed successfully" message as soon as the population job is started. Do not take this message to mean that your index is done populating—a population against a large table could potentially take hours to complete. + +If you need to know the status of your full-text population process, right-click the name of your full-text index under the Storage⇒Full Text Catalogs node of your database, and then check the property called "Population Status." + +Since this table is relatively small, you shouldn't have to wait terribly long before you can run a query against it and get results: + +SELECT ProductModelID, Name + +FROM Production.ProductModel + +WHERE CONTAINS(Name, 'Frame'); + +This should get back something on the order of 10 rows: + +ProductModelID Name + +-------------- -------------------------------------------------- + +5 HL Mountain Frame + +6 HL Road Frame + +7 HL Touring Frame + +8 LL Mountain Frame + +9 LL Road Frame + +10 LL Touring Frame + +14 ML Mountain Frame + +15 ML Mountain Frame-W + +16 ML Road Frame + +17 ML Road Frame-W + +(10 row(s) affected) + +We have a full-text index, and it works! Time to move on to what that query we just ran is supposed to do and what other options we have available. + +Full-Text Query Syntax + +Full-Text Search has its own brand of query syntax. It adds special commands to extend T-SQL and to clearly indicate that we want the full-text engine to support our query rather than the regular SQL Server engine. + +Fortunately, the basics of full-text queries are just that—basic. There are only four base statements to work with the full-text engine. They actually fall into two overlapping categories of two statements each: + +| Exact Term | Inflectional Term (Meaning) +---|---|--- +Conditional | CONTAINS | FREETEXT +Ranked Table | CONTAINSTABLE | FREETEXTTABLE + +The conditional predicates both work an awful lot like an EXISTS operator. Essentially, for each row, they provide a simple yes or no as to whether the row qualifies against the search condition provided. You use both of these in the WHERE clause of your queries. On the other hand, the two ranked queries do not provide conditions at all. Instead, they return a tabular result set (which you can join to) that includes the key value of all the rows that found matches (that's what you join to) as well as a ranking to indicate the strength of the match. + +Let's look more closely at each of the four keywords. + +CONTAINS + +This term looks for a match based on a particular word or phrase. By default, it's looking for an exact match (that is, swim must be swim—not swam), but it can also use modifiers to look for what are called inflectional matches (words that have the same root—such as swim and swam). CONTAINS also recognizes certain special keywords within the search condition. + +For now, we're going to stick with the simple form of CONTAINS.
We will look at the advanced features after we have the basics of our four statements down (since they share certain modifiers, we'll look at those all at once). + +The basic syntax, then, looks like this: + +CONTAINS({<column> | *}, '<search condition>') + +You can name a specific column to check, or use *, in which case the condition will be compared for matches against any of the indexed columns. In its simplest form, the search condition should contain only a word or phrase. + +There are two things worth pointing out here. First, remember that you will only get back results against columns that were included in the full-text index. In the final index we created on the ProductModel table, that means the search includes only the Name column. Columns like Instructions are not included in the search because they are not part of the index. (You may recall that we added and then dropped that column in a test of our ALTER syntax.) Second, the search condition can be far more complex than the simple condition that we've shown here, but we'll get to that after you have the basic operations down. + +For an example, let's go back to the query we used to prove that our population exercise had worked: + +SELECT ProductModelID, Name + +FROM Production.ProductModel + +WHERE CONTAINS(Name, 'Frame'); + +What we've said we want here is the ProductModelID and Name columns for all the rows where the Name column in the index includes the word Frame. + +If you check out the Name column for the results, you'll see that every row has an exact match. + +Let's quickly look at another example. This time, we're going to run pretty much the same query, but we're going to look for the word Sport: + +SELECT ProductModelID, Name + +FROM Production.ProductModel + +WHERE CONTAINS(Name, 'Sport'); + +This time we get back just one row: + +ProductModelID Name + +-------------- -------------------------------------------------- + +33 Sport-100 + +(1 row(s) affected) + +Again, we got back all the rows where the Name column had an exact match with the word Sport. Were you to look through the other rows in the table, however, you would find that there were other variations of the word Sport (a plural in this case), but they were not returned. + +Again—the default behavior of CONTAINS is exact match behavior. + +FREETEXT + +FREETEXT is an incredibly close cousin to CONTAINS. Indeed, their syntax is nearly identical: + +FREETEXT({<column> | *}, '<search condition>') + +So, the only real difference is in the results you get back. You see, FREETEXT is a lot more forgiving in just how exact a match it looks for. It is more interested in the meaning of the word than in the exact letter-for-letter spelling. + +To illustrate my point rather quickly here, let's look at our Sport query from the previous section, but modify it to use FREETEXT instead of CONTAINS: + +SELECT ProductModelID, Name + +FROM Production.ProductModel + +WHERE FREETEXT(Name, 'Sport'); + +When we execute this, we get back slightly different results than we did with CONTAINS: + +ProductModelID Name + +-------------- -------------------------------------------------- + +13 Men's Sports Shorts + +33 Sport-100 + +(2 row(s) affected) + +The difference in this case comes in the interpretation of plurals—our FREETEXT query has picked up the row that contains the word Sports, not just those with the word Sport. FREETEXT can also handle things like swim versus swam and other word variations.
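If you like FREETEXT's inflectional behavior but want the more explicit control of CONTAINS, CONTAINS supports a FORMSOF modifier inside the search condition. We'll cover the full condition syntax shortly, but as a quick, hedged preview (it should pick up the same plural-inclusive matches our FREETEXT query found): + +SELECT ProductModelID, Name + +FROM Production.ProductModel + +WHERE CONTAINS(Name, 'FORMSOF(INFLECTIONAL, Sport)');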
CONTAINSTABLE

CONTAINSTABLE, in terms of figuring out which rows would be a match, works identically to CONTAINS. The difference is how the results are dealt with.

The syntax is similar, but with the twist of identifying which table the CONTAINSTABLE is going to operate against plus an optional limitation to just a top set of matches:

CONTAINSTABLE (<table_name>, {<column_name> | *}, '<search_condition>' [, <top_n_by_rank>])

Where CONTAINS returns a simple Boolean response suitable for use in a WHERE clause, CONTAINSTABLE returns a table—complete with rankings of how well the search phrase matched the row being returned.

Let's see what I mean here by running our original query, but with a CONTAINSTABLE this time:

SELECT *
FROM CONTAINSTABLE(Production.ProductModel, Name, 'Sport');

This gets us back one row—just like with CONTAINS—but the information provided by the returned values is somewhat different:

KEY RANK
----------- -----------
33 128

(1 row(s) affected)

We are provided with two columns:

* KEY: Remember when we said that our full-text index had to be able to relate to a single column key in the indexed table? Well, the KEY returned by CONTAINSTABLE relates exactly to that key column. That is, the value output in the column called KEY matches with a single unique row, as identified by the key, in the indexed table.
* RANK: A value from 0 to 1000 that indicates just how well the search result matched the row being returned—the higher the value, the better the match.

To make use of CONTAINSTABLE, we simply join our original table back to the CONTAINSTABLE result. For example:

SELECT Rank, ProductModelID, Name
FROM Production.ProductModel p
JOIN CONTAINSTABLE(Production.ProductModel, Name, 'Sport') ct
ON p.ProductModelID = ct.[KEY];

Notice the use of brackets around the KEY column name. The reason is that KEY is also a keyword. Remember from our rules of naming that, if you use a keyword for a column or table name (which you shouldn't do), you need to enclose it in square brackets.

This gets us back our original row, but this time we have the extra information from the underlying table:

Rank ProductModelID Name
----------- -------------- --------------------------------------------------
128 33 Sport-100

(1 row(s) affected)

In this case, the values in the Rank column are the same, but, given more diverse values, we could have done things like:

* Filter based on some arbitrary Rank value. For example, we could want to return only the best matches based on score.
* Order by the rank (sort the rankings—most likely highest to lowest).
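Both of those ideas tie in to the optional fourth argument shown in the syntax above. A minimal sketch of the top-n-by-rank limiter, assuming we only care about the five strongest matches:

-- A sketch only: the fourth argument caps the result at the n best-ranked
-- matches before the join, which can save work against large tables.
SELECT Rank, ProductModelID, Name
FROM Production.ProductModel p
JOIN CONTAINSTABLE(Production.ProductModel, Name, 'Frame', 5) ct
ON p.ProductModelID = ct.[KEY]
ORDER BY Rank DESC;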
FREETEXTTABLE

Much as FREETEXT was the close cousin to CONTAINS, so too is FREETEXTTABLE the close cousin to CONTAINSTABLE. FREETEXTTABLE simply combines the more inexact word matching of FREETEXT with the tabular presentation found in CONTAINSTABLE.

We can then combine some of our previous examples to see how FREETEXTTABLE changes things:

SELECT Rank, ProductModelID, Name
FROM Production.ProductModel p
JOIN FREETEXTTABLE(Production.ProductModel, Name, 'Sport') ct
ON p.ProductModelID = ct.[KEY];

This gets us the same two rows we had with our original FREETEXT query, but with the kind of rankings we had with our CONTAINSTABLE:

Rank ProductModelID Name
----------- -------------- --------------------------------------------------
102 13 Men's Sports Shorts
102 33 Sport-100

(2 row(s) affected)

Experiment with this some in your full-text efforts, and you'll see how rankings can give you a lot to work with.

Dealing with Phrases

All of our various full-text keywords can deal with the concept of phrases. How the phrases are parsed and handled, however, is somewhat different.

Let's start off with the simplest of examples—a simple two-word phrase. This time we'll say that the phrase we want to look for is damaged seats. To add a twist to things, we want it no matter what column it is in (as long as the column is part of our full-text index).

SELECT DocumentNode, DocumentSummary, Document
FROM Production.Document
WHERE CONTAINS(*, '"damaged seats"');

Notice that the phrase was included in double quotation marks. We need to do this any time we want a set of words to be considered as a single unit. This does, however, get us back one row. The result is a little large (due to the size of the Document and DocumentSummary columns) to put in this text, but the relevant section is:

DocumentNode DocumentSummary Document
------------ -------------------- --------------------------------
0x7C20 Worn or damaged se... 0xD0CF11E0A1B11AE100000000000...

(1 row(s) affected)

Our CONTAINS will check for rows that exactly match the phrase, as long as we enclose that phrase in double quotation marks (within the single quotes we always need around our search condition). FREETEXT works in the same way.

Booleans

SQL Server also supports the use of Booleans in your searches. The following Boolean keywords apply:

* AND
* OR
* AND NOT

There really isn't a whole lot of rocket science to these, so I'll launch right into a simple example and point out one caveat. Let's go with a variation on an example we used earlier:

SELECT DocumentNode, DocumentSummary, Document
FROM Production.Document
WHERE CONTAINS(*, '"damaged" OR "seats"');

What we've done here is change from searching for the exact phrase damaged seats to a search that looks for either word, without worrying about whether the words are used together or not. Execute this, and you'll see we get back two rows instead of just one.

The caveat that I mentioned earlier is that NOT cannot be used on its own. NOT is relevant only to full-text searches when used in conjunction with AND.

Proximity

Full-Text Search also allows us to make use of proximity terms. Currently, the list of supported proximity terms is a whopping one term long—NEAR. NEAR works a lot like it sounds. It says that the terms on either side of the NEAR keyword must be close to each other. Microsoft hasn't told us how close the words have to be to be considered NEAR, but figure around eight to ten words for most situations.

Technically, there is one more "word" on the proximity keyword list, but it isn't a "word" at all—rather a symbol. You can, if you choose, use a tilde (~) instead of the NEAR keyword. It works just the same. Personally, I recommend against this for readability reasons. Not too many readers of your code are going to recognize what ~ means, but most of them will at least make a guess at NEAR.

For examples on how NEAR works, we're going to stick with CONTAINSTABLE. NEAR works much the same in the other full-text query operators, so we're just going to focus on what happens to the rankings in a NEAR query as well as what does and doesn't get included in the query.

For this example, we'll look at the words repair and instructions:

SELECT Rank, DocumentNode, DocumentSummary
FROM Production.Document pd
JOIN CONTAINSTABLE(Production.Document, *, 'repair NEAR instructions') ct
ON pd.DocumentNode = ct.[KEY];

I include only the first two columns here for brevity, but notice that we have different rankings on the two rows returned.
Rank DocumentNode
----------- --------------------------
3 0x5B40
2 0x7B40

(2 row(s) affected)

If you look carefully at the DocumentSummary column in your results (again, for brevity's sake, I haven't included all of the column here), you'll see that both rows do indeed have both words, but that the word repair occurs twice in the DocumentNode 0x5B40 row; thus it receives a higher ranking.

Don't be surprised to see situations where a record that has your search criteria closer together gets ranked lower than one where the search criteria are not as close. Remember that, even when you use the NEAR keyword, nearness is only one of several criteria that SQL Server uses to rank the rows. Other considerations such as percentage of words that match, case values, and more can play with the numbers on you.

Weighting

So, these rankings are all cool and whatnot, but what would we do if one of the words in our search criteria was more important than another?

To deal with situations where you need to give precedence to one or more words, Full-Text provides us with the ISABOUT() function and WEIGHT keyword. The syntax looks like this:

ISABOUT(<term> WEIGHT (<weight_value>) [, <term> WEIGHT (<weight_value>)] [, ...n])

Let's say that you want to allow customers to select among several kinds of bikes, but to further allow for selecting "preferred" options. For our example, let's say our customer is most interested in mountain bikes but is also interested in touring and road bikes—in that order. You could get a ranked listing using the following:

SELECT Rank, ProductModelID, Name
FROM Production.ProductModel pm
JOIN CONTAINSTABLE(
Production.ProductModel,
Name,
'ISABOUT (Road WEIGHT (.2), Touring WEIGHT (.4), Mountain WEIGHT (.8) )'
) ct
ON pm.ProductModelID = ct.[KEY]
ORDER BY Rank DESC;

Now take a look at the results:

Rank ProductModelID Name
----------- -------------- --------------------------------------------------
31 5 HL Mountain Frame
31 7 HL Touring Frame
31 8 LL Mountain Frame
...
31 123 LL Mountain Rear Wheel
31 124 ML Mountain Rear Wheel
31 125 HL Mountain Rear Wheel
7 126 LL Road Rear Wheel
7 113 Road Bottle Cage
7 93 Road Tire Tube
...
7 16 ML Road Frame
7 17 ML Road Frame-W
7 9 LL Road Frame
7 6 HL Road Frame

(89 row(s) affected)

Note that not everything is perfect in our world—some touring entries come before our more heavily weighted mountain options, but if you look the list over, you will see we have indeed created a very heavy bias toward mountain bikes in our rankings.

Inflectional

This one doesn't really apply to FREETEXT, as FREETEXT is inherently inflectional. What is INFLECTIONAL, you ask? Well, it's basically telling SQL Server that different forms of the word have the same general meaning. The syntax looks like this:

FORMSOF(INFLECTIONAL, <term> [, <term>] [, ...n])

An inflectional form of a word is one that has the same general meaning. For example, swam is just the past tense of swim. The underlying meaning is the same.
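To see FORMSOF in action, we can revisit our Sport query. A minimal sketch: this should give CONTAINS the same plural-matching behavior we saw from FREETEXT, while keeping CONTAINS-style control over the rest of the search condition:

-- A sketch only: FORMSOF(INFLECTIONAL, ...) matches word forms that share
-- a root, so this picks up both Sport and Sports.
SELECT ProductModelID, Name
FROM Production.ProductModel
WHERE CONTAINS(Name, 'FORMSOF(INFLECTIONAL, Sport)');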
Stop Words

As we discussed earlier, there are tons and tons of words in use in different languages (Full-Text supports more than just U.S. English!). Most languages have certain words that appear over and over again with little intrinsic meaning to them. In the English language, for example, pronouns (you, she, he, and so on), articles (the, a, an), and conjunctions (and, but, or) are just a few examples of words that appear in many, many sentences but are not integral to the meaning of that sentence. If SQL Server paid attention to those words, and we did searches based on them, then we would drown in the results that SQL Server gave us in our queries. Quite often, every single row in the table would be returned! The solution comes in the form of what is called a stoplist (called a noise word list in previous releases). This is a list of words (individual words are referred to as stop words) that SQL Server ignores when considering matches.

SQL Server includes a default stoplist for each language it supports. You can either use this system-supplied stoplist (usually referred to as SYSTEM if you need to explicitly reference it in a command), or you can create your own using the CREATE FULLTEXT STOPLIST command. The full syntax looks like this:

CREATE FULLTEXT STOPLIST <stoplist_name>
[FROM { [<database_name>.]<source_stoplist_name> | SYSTEM STOPLIST }]
[AUTHORIZATION <owner_name>]
[;]

In general, you'll want a well-populated stoplist, and thus will want to prepopulate your list from some existing stoplist. So, for example, I could create a stoplist for AdventureWorks2008 that starts with the same stop words in the SYSTEM stoplist:

CREATE FULLTEXT STOPLIST ADStopList
FROM SYSTEM STOPLIST;

Stoplists you create are not automatically associated with any full-text index—you need to manually attach the new stoplist to the full-text index via the ALTER FULLTEXT INDEX command.

You can add and delete words from this list as suits the particular needs of your application. For example, if you are in the business of selling tractor-trailer rigs, then you might want to add words like hauling to your noise word list. More than likely, a huge percentage of your customers have that word in their name, so it is relatively unhelpful in searches. To make additions or subtractions from a stoplist, you use the ALTER FULLTEXT STOPLIST command. The full syntax looks like this:

ALTER FULLTEXT STOPLIST <stoplist_name>
{
ADD '<stopword>' LANGUAGE <language_term>
| DROP
{
'<stopword>' LANGUAGE <language_term>
| ALL LANGUAGE <language_term>
| ALL
}
}
[;]

Let's try this out by adding a stop word to the ADStopList we just created:

ALTER FULLTEXT STOPLIST ADStopList
ADD 'bicycle' LANGUAGE 1033;

Were we to repopulate our full-text index, the word bicycle (which may be a worthless search term in a business where every document is going to discuss bicycles) would be ignored.

Adding and removing words from a stoplist is something of a double-edged sword. When you add a word to the list, it means that searches involving that word are no longer going to return the results that users are more than likely going to expect. By the same token, it also, depending on the frequency with which the word is used, can dramatically shrink the processing time and size of your catalogs.

Summary

Full-Text is now core to the SQL Server engine (it was a separate service in prior releases), but a separate process is spawned by the Full-Text daemon manager each time a search is issued.

When you implement Full-Text, also consider the load the population process is going to place on your server, and balance that against how quickly you need changes reflected in search results.
If possible, delay repopulation of full-text indexes until the non-peak hours on your system.

Full-Text Search is a powerful and fast way of referencing the contents of most any character-based columns. It is substantially more efficient and powerful than a LIKE clause but comes with additional overhead in terms of both space and processing time.

19

Feeling Secure

There are probably as many ideas on security as there are programmers. It's one of those things where there isn't necessarily a right way to do it, but there are definitely plenty of wrong ones.

The first thing to understand about security is that there is no such thing as a totally secure application. No matter how secure you make it, rest assured that someone, somewhere, can defeat your efforts and "hack" into the system. Even with this knowledge, the goal still needs to be to keep unwanted intruders out of your system. The good news about security is that, for most instances, you can fairly easily make it such a hassle that 99.999 percent of people out there won't want to bother with it. For the other .001 percent, I can only encourage you to make sure that all your employees have a life so they fall into the 99.999 percent. The .001 percent will hopefully find someplace else to go.

SQL Server 2005 marked the start of a very concerted effort by Microsoft to raise the level of security in SQL Server. For those who have been around long enough, you may remember the hubbub surrounding the "slammer" virus that happened during the SQL Server 2000 lifespan. Microsoft radically altered the security profile of SQL Server in a service pack that followed the slammer scare, but SQL Server 2005 marked the first full release after the advent of the slammer virus, and it was just the beginning of a series of features focused not so much on deterring hackers as on a more far-reaching protection of the safety and privacy of data in SQL Server. A ton of new features were added in SQL Server 2005, more were added in SQL Server 2008, and there are more to come in the next version of SQL Server. Needless to say, all this leaves us with a lot to cover in the security realm.

In this chapter, we're going to cover:

* Security basics
* SQL Server security options
* Database and server roles
* Application roles
* Credentials
* Certificates
* Schema management
* XML integration security issues
* More advanced security

What we'll discover is that there are a lot of different ways to approach the security problem. Security goes way beyond giving someone a user ID and a password—we'll see many of the things that you need to think about.

Before beginning any of the examples in this chapter, you'll need to load and execute the script called NorthwindSecure.sql. This builds a special database we'll use throughout this chapter. You can download what you need for this at the book's Web site at www.wrox.com or at www.professionalsql.com.

Okay, so this is a chapter where I have to make you create a working database in order for the examples to work—my apologies for that. What we're going to utilize is the old Northwind database but with any changes to permissions removed. The NorthwindSecure database that we'll use throughout this chapter is a more typical database scenario—that is, it has absolutely no permissions added to it beyond what comes naturally with creating tables and objects (which means NONE). We'll learn how to deal with this and explicitly add what permissions we want as the chapter progresses.
Security Basics

I'm sure that a fair amount of what we're going to look into in this section is going to seem exceedingly stupid—I mean, won't everyone know this stuff? Judging by how often I see violations of even the simplest of these rules, I would say, "No, apparently they don't." All I can ask is that you bear with me, and don't skip ahead. As seemingly obvious as some of this stuff is, you'd be amazed how often it gets forgotten or just plain ignored.

Among the different basics that we'll look at here are:

* One person, one login ID, one password
* Password expirations
* Password length and makeup
* Number of attempts to log in
* Storage of user ID and password information

One Person, One Login, One Password

It never ceases to shock me how, everywhere I go, I almost always find that the establishment has at least one "global" user—some login into the network or particular applications that is usually known by nearly everyone in the department or even the whole company. Often, this "global" user has carte blanche (in other words, complete) access. For SQL Server, it used to be common that installations hadn't even bothered to set the sa password to something other than a blank password. This is a very bad scenario indeed.

Prior to SQL Server 2000, the default password for the sa account was null—that is, it didn't have one. Thankfully, SQL Server 2000 changed this default. SQL Server will now, by default, not allow you to use a weak password (depending on your Windows policy settings), and, assuming your Windows policy settings allow a blank password, SQL Server will proactively tell you that you are effectively being an idiot if you insist on making it blank. The thing to watch out for is that, while you're developing, it's really common to still set it to something "easy." You still need to remember to change it before you go into production or to make it something hard from the beginning if your development server is going to be exposed directly to the Internet or some other non-trustworthy access.

Even now, when most installations do have something other than a null password, it is very common for lots of people to know what that password is.

The first basic, then, is that if everyone has access to a user ID that is essentially anonymous (if everyone knows it, it could be that anyone has used it) and has access to everything, then you've defeated your security model entirely. Likewise, if you give every user a login that has full access to everything, you've again severely damaged your security prospects. The only real benefit that's left is being able to tell who's who as far as who is connected at any point in time (assuming that they are really using their individual login rather than the global login).

Users that have carte blanche access should be limited to just one or two people. Ideally, if you need passwords for such carte blanche access, then you would want separate logins that each have the access, but only one person would know the password for each login.

You'll find that users will often share their passwords with someone else in order to let someone temporarily gain some level of access (usually because the owner of the login ID is either out of the office or doesn't have time to bother with doing it themselves at the time). You should make this nothing short of a hanging offense if possible.

The problem created by password sharing is multifold.
First, some users are getting access to something that you previously decided not to give them (otherwise, why don't they have the necessary rights for themselves?). If you didn't want them to have that access before, why do you want them to have it now? Second, a user that's not supposed to have access probably will now have that access semi-permanently. Since users almost never change their passwords (unless forced to), the person they gave the password to will probably be able to use that login ID indefinitely and, I assure you, they will! Third, you again lose auditing. You may have something that tracks which user did what based on the login ID. If more than one person has the password for that login ID, how can you be sure which person was logged in to that login ID at the time?

This means that if someone is going to be out of the office for some time, perhaps because he is sick or on vacation, and someone else is temporarily going to be doing his job, a new login ID and password should be created specifically for that replacement person (or the access rights of the replacement's existing login ID should be modified), and it should be deleted as soon as the original person has returned.

To summarize, stay away from global user accounts whenever possible. If you must have them, keep their use limited to as few people as possible. Usually this should be kept to just two (one to be a main user, and one person as a backup if the first person isn't available). If you really must have more than one person with significant access, then consider creating multiple accounts (one per user) that have the necessary level of access. By following these simple steps, you'll do a lot for both the security and auditability of the system.

Password Expiration

Using expiration of passwords tends to be either abused or ignored. That's because it's a good idea that often goes bad.

The principle behind password expiration is to set up your system to have passwords that automatically expire after a certain period of time. After that time, the user must change the password to continue to have access to the account. The concept has been around many years, and if you work in a larger corporation, there's a good chance that the auditors from your accounting firm are already insisting that you implement some form of password expiration (no, it's not just your IT department being controlling—they may well have been forced to a given policy by the same people who audit your financial statements).

With SQL Server 2005 and later, you can enforce your Windows password policies even for your SQL Server–specific logins. Alternatively, you can just use Windows-based security (more on that in the next section).

What Do You Get for Your Effort?

So, what does password expiration get you? Well, remember that, in the final part of a previous section, I said that once a password is shared, the user would have that access forever? Well, this is the exception. If you expire passwords, then you refresh the level of your security—at least temporarily. The password would have to be shared a second time in order for the user to regain access. While this is far from foolproof (often, the owner of the login ID will be more than happy to share it again), it does deal with the situation where the sharing of the password was really just intended for one-time use.
Often, users who share their passwords don't even realize that months later the other user still has the password and may be using it on occasion to gain access to something they would not otherwise have access to, based on their own security.

Now the Bad News

It is very possible to get too much of a good thing. I mentioned earlier how many audit firms will expect their clients to implement a model where a user's password regularly expires, say, every 30 days. This is a very bad idea indeed.

Every installation that I've seen that does this—without exception—has worse security after implementing a 30-day expiration policy. The problem is, as you might expect, multifold in nature.

* First, technical support calls go way up. When users change passwords that often, they simply can't memorize them all. They can't remember which month's password they are supposed to use, so they are constantly calling for support to reset the password because they forgot what it is.
* Second, and much more important, the users get tired of both thinking of new passwords and remembering them. Experience has shown me that more than 90 percent of the users I've worked with in installations that use a 30-day expiration change their passwords to incredibly predictable (and therefore hackable) words or word/number combinations. Indeed, this often gets to a level where perhaps 50 percent or more of your users will have the same password—they are all using things like MMMYY where MMM is the month and YY is the year. For example, for January 1996 they might have used JAN96 for their password. Pretty soon, everyone in the place is doing something like that.

I've seen some companies try and deal with this by implementing something of a password sniffer; it checks the password when you go to change it. The sniffing process looks for passwords that incorporate your name or start with a month prefix. These mechanisms are weak at best.

Users are far smarter than you often give them credit for. It took about a week for most users to circumvent the first one of these password sniffers I saw; they simply changed their passwords to have an "X" prefix on them, and otherwise stayed with the same MMMYY format they had been using before. In short, the sniffer wound up doing next to nothing. It doesn't stop there, though: users share their newfound algorithm with coworkers so they can get around the "problem" too.

The bottom line here is to not get carried away with your expiration policy. Make it short enough to get reasonable turnover and deal with shared or stolen passwords, but don't make it so frequent that users rebel and start using weak passwords. Personally, I suggest nothing more frequent than 90 days and nothing longer than 180 days.

Password Length and Makeup

Ah, an era of rejoicing for SQL Server in this area. In previous versions, you really didn't have much control over this if you were using SQL Server security. You can now have SQL Server enforce your Windows password policy (which you can adjust using utilities in Windows).

Password Length

Realize that, for each alphanumeric character the user includes in the password, they are increasing the number of possible passwords by a factor of at least 36 (really a few more given special characters, but even 36 is enough to make the point here). That means there are only 36 possible single-character passwords, but 1,296 possible two-character passwords. Go up to three characters, and you increase the possibilities to 46,656.
By the time you add a fourth character, you're well over a million possibilities. The permutations just keep going up as you require more and more characters. The downside, though, is that it becomes more and more difficult for your users to think up passwords and actually remember them. Indeed, I suspect that you'll find that requiring anything more than 5 or 6 characters will generate a full-scale revolt from your end users.

Password Makeup

All right, so I've pointed out that, if you make it a requirement to use at least four alphanumeric characters, you've created a situation where there are over a million possible password combinations. The problem comes when you realize that people aren't really going to use all those combinations; they are going to use words or names that they are familiar with. Considering that the average person only uses about 5,000 words on a regular basis, that doesn't leave you with very many words to try out if you're a hacker.

If you're implementing something other than the default Windows password policy, then consider requiring that at least one character be alphabetic in nature (no numbers, just letters) and that at least one character be numeric. This rules out simple numbers that are easy to guess (people really like to use their Social Security number, telephone number, or birthdays) and all words. The users can still create things that are easy to remember for them—say "77pizzas"—but the password can't be pulled out of a dictionary. Any hacker is forced to truly try each permutation in order to try and break in.

Number of Tries to Log In

Regardless of how you're physically storing the user and password information, your login screen should have logic to it that limits the number of tries that someone gets to log in. The response if they go over the limit can range in strength, but you want to make sure you throw in some sort of device that makes it difficult to set up a routine to try out all the passwords programmatically.

How many tries to allow isn't really that important as long as it's a reasonably small number. I usually use three times, but I've seen four and five in some places and that's fine too.

If you're utilizing the Windows password policy enforcement, then SQL Server will check the login attempts versus a bad password limit and enforce that policy.

Storage of User and Password Information

This obviously applies only if you are cooking your own security system rather than using the built-in Windows and/or SQL Server security systems (but many Web applications will do that), and, for the most part, there's no rocket science in how to store user profile and password information. There are, however, a few things to think about:

* Since you need to be able to get at the information initially, you will have to do one of the following three things:
  * Compile a password right into the client application or component (and then make sure that the proper login and password are created on any server that you install your application on).
  * Utilize SQL Server's encryption technologies to encrypt and decrypt the data in the database.
  * Require something of a double password situation—one to get the user as far as the regular password information, and one to get them to the real application. Forcing a user into two logins is generally unacceptable, which pushes you back to one of the other two options in most cases.
* If you go with a double password scenario, you'll want the access for the first login to be limited to just a stored procedure execution if possible. By doing this, you can allow the first login to obtain the validation that it needs while not revealing anything to anyone that tries to log in through Management Studio. Have your stored procedure (sproc) accept a user ID and password, and simply pass back either a Boolean (true/false that they can log in) or a recordset that lists what screens and functions the user can see at the client end. If you use a raw SELECT statement, then you won't be able to restrict what they can see.

One solution I've implemented close to this scenario was to have a view that mapped the current SQL Server login to other login information. In this case, an application role was used that gave the application complete access to everything. The application had to know what the user could and couldn't do. All the user's login had a right to do was execute a stored procedure to request a listing of their rights. The sproc looked something like this (this is just pseudo-code, so don't try and actually execute this):

CREATE PROC GetUserRights
AS
-- Figure out who is actually connected...
DECLARE @User varchar(128)
SELECT @User = USER_NAME()
-- ...and hand back only that user's rights.
SELECT * FROM UserPermissions WHERE LoginID = @User

* If you're going to store password information in the system—encrypt it!!! I can't say enough about the importance of this. Most users will use their passwords for more than one thing; it just makes life a lot easier when you have less to remember. By encrypting the data before you put it in the database, you ensure that no one is going to stumble across a user's password information—even accidentally. They may see it, but what they see is not usable unless they have the key to decrypt it.

What form of encryption to use is up to you. You can utilize the built-in encryption methods (we'll discuss some of these later in the chapter), or you can implement your own encryption at the application level. One way or the other, there is little excuse for not properly protecting password information.

Personally, I am a big believer in one-way encryption. That is, once it's encrypted, there really isn't any reasonable way to decrypt it. If a user loses their password, then they need to go through some form of reset mechanism and choose a new password. Why do I feel this way? Well, realize that most users will reuse the same password for many applications, so the password they use to get into your system may very well be the same password they use to get into their personal online banking system. Creating a one-way encryption system minimizes the risk that an administrator of your system is able to get at users' passwords for nefarious use.
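To make the one-way idea concrete, here is a minimal sketch using the built-in HASHBYTES function. Everything here other than HASHBYTES itself (the variable, the algorithm choice, how you would store the result) is an illustrative assumption rather than a production design; in particular, a real system would also mix a per-user salt into the value before hashing:

-- A sketch only: compute a one-way hash to store instead of the password.
-- SHA1 is what HASHBYTES supports through SQL Server 2008; later versions
-- add stronger algorithms such as SHA2_256.
DECLARE @Password nvarchar(128)
SET @Password = N'77pizzas'
SELECT HASHBYTES('SHA1', @Password) AS PasswordHash

At login time, you hash whatever the user typed and compare it to the stored hash; the plain-text password never needs to be stored or decrypted.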
Security Options

As far as built-in options go, you have two choices in how to set up security under SQL Server.

* Windows Integrated Security: The user logs in to Windows, not SQL Server. Authentication is done via Windows with trusted connections.
* Standard Security: The user logs in to SQL Server separately from logging in to Windows. Authentication is done using SQL Server.

Let's take a look at both.

SQL Server Security

We'll start with SQL Server's built-in login model. This was a security black hole for a very long time, but got substantially more robust in SQL Server 2005. The relatively simplistic model is still available, but there is now tons more you can do to add extra touches to just how secure your server and databases are.

With SQL Server security, you create a login ID that is completely separate from your network login information. Some of the pros for using SQL Server security include:

* The user doesn't necessarily have to be a domain user in order to gain access to the system.
* It's easier to gain programmatic control over the user information.

Some of the cons are:

* Your users may have to log in twice or more—once into whatever network access they have, and once into the SQL Server for each connection they create from a separate application.
* Two logins mean more maintenance for your DBA.
* If multiple passwords are required, they can easily get out of synch, and that leads to an awful lot of failed logins or forgotten passwords. (Does this sound familiar, "Let's see now, which one was it for this login?")

An example of logging in using SQL Server security would be the use of the sa account that you've probably been using for much of this book. It doesn't matter how you've logged in to your network, you log in to the SQL Server using a login ID of sa and a separate password (which you've hopefully set to something very secure).

On an ongoing basis, you really don't want to be doing things day-to-day logged in as sa. Why? Well, it will probably only take you a minute or two of thought to figure out many of the terrible things you can do by sheer accident when you're using the sa account (or any other account with system administrator access for that matter). Using sa means you have complete access to everything; that means the DROP TABLE statement you execute when you are in the wrong database will actually do what you told it—drop that table!!! About all you'll be left to say is "oops!" Your boss will probably be saying something completely different.

Even if you do want to always have carte blanche access, just use the sa account to make your regular user account a member of the sysadmin server role. That gives you the power of sa, but gains you the extra security of separate passwords and the audit trail (in Profiler or when looking at system activity) of who is currently logged in to the system.

Creating and Managing Logins

There are currently four major ways to create logins on a SQL Server:

* By using CREATE LOGIN
* By using the Management Studio
* SQL Management Objects (SMO)
* By using one of the several other options that remain solely for backward compatibility

CREATE LOGIN

CREATE LOGIN was added in SQL Server 2005 as part of a general effort by Microsoft to standardize the syntax used to create database and server objects. It deprecated the older sp_addlogin, which was the procedural way of adding logins in prior versions, and looks like the CREATE syntax that we've seen repeatedly in SQL but with some of the extra option requirements that we've seen with things like stored procedures.

The most basic syntax is straightforward, but how the options can be mixed can become something of a pain to understand. The overall syntax looks like this:

CREATE LOGIN <login_name>
[ { WITH
PASSWORD = '<password>' [ HASHED ] [ MUST_CHANGE ]
[, SID = <sid>
| DEFAULT_DATABASE = <database>
| DEFAULT_LANGUAGE = <language>
| CHECK_EXPIRATION = { ON | OFF }
| CHECK_POLICY = { ON | OFF }
| CREDENTIAL = <credential_name>
[, ...n ] ]
}
| { FROM
WINDOWS
[ WITH DEFAULT_DATABASE = <database>
| DEFAULT_LANGUAGE = <language> ]
| CERTIFICATE <certificate_name>
| ASYMMETRIC KEY <asym_key_name>
}
]

The key part that sets the tone for things is the choice of a FROM versus a WITH clause immediately following the login name, so let's look at those along with the options as they are relevant to either the FROM or WITH clause they belong to.

CREATE LOGIN... WITH

The WITH clause immediately puts you into defining options that go with SQL Server authentication–based logins as opposed to any other authentication method. It is only relevant if you have SQL Server security enabled (as opposed to just Windows authentication). The number of options here can seem daunting, so let's break them down.

Option | Description
---|---
PASSWORD | This is, of course, just what it sounds like. The tricky part of this is the question of whether the password is in clear text (in which case SQL Server will encrypt it as it adds it) or whether it is already hashed (in which case you need to supply the HASHED keyword that is covered next).
HASHED | This follows your password, and is used only if the password you supplied was already hashed (encrypted). In that case, SQL Server adds the password without re-encrypting it.
MUST_CHANGE | This is another one of those "is what it sounds like" things. In short, if you supply this option, then the users will be prompted to change their password the first time they log in.
SID | Allows you to manually specify what GUID SQL Server will use to identify this login. If you don't supply this (and doing so is something I would consider to be an extreme case), then SQL Server will generate one for you.
DEFAULT_DATABASE | This is the database that will be made current each time the user logs in.
DEFAULT_LANGUAGE | This is the language that things like errors and other system messages will be delivered in for the user.
CHECK_EXPIRATION | Sets whether SQL Server will enforce the password expiration policy. By default, the password will not expire. Setting this to ON will enforce policy.
CHECK_POLICY | Sets whether SQL Server will enforce the password policy (length, character requirements, and so on). By default, the password must meet the Windows password policy. Setting this to OFF will allow virtually any password to be used.
CREDENTIAL | This names a credential (and we'll cover what these are later) for this login to be mapped to. In short, this maps this login to a set of permissions that may allow them to perform actions outside of SQL Server (such as network access and such).

Any of these can be mixed together, and the order in which you provide them matters only in the case of HASHED and MUST_CHANGE (which must follow the PASSWORD option if you're going to utilize them at all).

CREATE LOGIN... FROM

The FROM clause implies that this login isn't SQL Server–specific. The FROM clause specifies the source of that login. The source falls into a few different categories:

* WINDOWS: In this case, we are mapping to an existing Windows login or group. This is basically saying "Take this existing Windows user or group, and give them rights to my SQL Server." Notice that I say "or group." You can map SQL Server to a Windows group, and that implies that any member of that group will be granted that level of access to your SQL Server. This is really handy for managing users in your network.
For example, if you want everyone in accounting to have a certain set of rights in SQL Server, you could create a Windows group called Accounting and map that to a SQL Server login. If you hire someone new, then as soon as you add them to the Accounting group they will have access not only to whatever Windows resources the Accounting group has, but also to all the SQL Server permissions that the Accounting group has.

If you use Windows as your FROM source, then you can also supply a WITH clause similar to a SQL Server–based login, but limited to just the default database and language.

* CERTIFICATE: This kind of login is based on an X.509 certificate that you've already associated with your server by using the CREATE CERTIFICATE command. Certificates can be used in several different ways, but in the end, they essentially serve as a recognized secure encryption key. SQL Server has its own "certificate authority" or can import those generated from other sources. Essentially, presentation of this certificate serves as authorization to log in to the SQL Server.
* ASYMMETRIC KEY: Asymmetric keys are a different flavor of the same general notion that certificates work under. Essentially, it is a key that is presented that SQL Server trusts, and therefore it grants access. Asymmetric keys are merely a different method of presenting a secure key.

To prepare for the examples we'll use throughout the rest of this chapter, you'll need to set up a user in Windows that we'll grant and remove access for over the course of the chapter. I've named my test user TestAccount, but you can substitute another name as you see fit (just make sure you remember to also substitute it in the chapter examples). Once you have an account to test with set up in Windows, try adding it to SQL Server (again, you'll need to change "HOBBES" to the name of your system):

CREATE LOGIN [HOBBES\TestAccount] FROM WINDOWS
WITH DEFAULT_DATABASE = NorthwindSecure;

And our test account now has login rights to the SQL Server. Note, however, that even though we've defaulted our TestAccount to the NorthwindSecure database, the account still does not have access to that database (we'll get to that shortly).

ALTER LOGIN

As with most CREATE statements we've seen in SQL, CREATE LOGIN has a complementing statement in the form of ALTER LOGIN. As with most ALTER statements, the syntax is primarily a subset of the options found in the related CREATE statement:

ALTER LOGIN <login_name>
[ { ENABLE | DISABLE } ]
[ { WITH
PASSWORD = '<password>'
[ { OLD_PASSWORD = '<old_password>'
| [ UNLOCK ] [ MUST_CHANGE ] } ]
| DEFAULT_DATABASE = <database>
| DEFAULT_LANGUAGE = <language>
| NAME = <new_login_name>
| CHECK_EXPIRATION = { ON | OFF }
| CHECK_POLICY = { ON | OFF }
| CREDENTIAL = <credential_name>
| NO CREDENTIAL
} ]

Most of these are exactly the same as they were with the CREATE statement, but let's look at the few differences.

Option | Description
---|---
ENABLE / DISABLE | Enables or disables the login. This is something of an indicator of whether or not the login is considered active in the system, and ENABLE should not be confused with UNLOCK (they are different things). Disabling a login leaves it in place but disallows use of the login. Enabling reactivates the login.
OLD_PASSWORD | This one applies only if a given login is utilizing ALTER LOGIN to change its own password. Security administrators with rights to change passwords are unlikely to know the old password, and they have the right to set a new password without knowing the old one.
UNLOCK | This allows a user to attempt to log in again after the login has been locked out due to exceeding the bad password count.
NAME | This allows you to change the login name, while otherwise retaining all of the old rights and other properties of the login.
NO CREDENTIAL | This disassociates the login with whatever credential it may have previously been mapped to.
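Since the examples so far have used a Windows login, here is a quick sketch of the SQL Server authentication side, tying several of the WITH options together. The login name and passwords are illustrative assumptions only (and this presumes SQL Server authentication is enabled on your server):

-- Illustrative only: a SQL Server authentication login with
-- Windows policy enforcement turned on.
CREATE LOGIN TestSQLAccount
WITH PASSWORD = 'Str0ng!Passw0rd',
DEFAULT_DATABASE = NorthwindSecure,
CHECK_EXPIRATION = ON,
CHECK_POLICY = ON;

-- Later, reset the password and force a change at the next login.
-- MUST_CHANGE requires CHECK_EXPIRATION (and CHECK_POLICY) to be ON.
ALTER LOGIN TestSQLAccount
WITH PASSWORD = 'An0ther!Passw0rd' MUST_CHANGE,
CHECK_EXPIRATION = ON,
CHECK_POLICY = ON;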
DROP LOGIN

This works just like any other DROP statement in SQL Server:

DROP LOGIN <login_name>

And it's gone.

Creating a Login Using the Management Studio

Creating a login using Management Studio is fairly straightforward and is much the same as it is for most other objects in SQL Server. Just navigate to the appropriate node in the Object Explorer (in this case, Security⇒Logins), right-click, and choose New Login. This gets us the typical CREATE dialog that we've seen repeatedly in this book, but adjusted for the properties that are appropriate for a login (all the same things we reviewed in the "CREATE LOGIN" section earlier in the chapter, plus a number of additional areas we have yet to take a look at), as shown in Figure 19.1.

Figure 19.1

Only this first set of properties (the General properties) maps to the CREATE LOGIN syntax. The additional tabs map to other objects we will be creating as we continue through the chapter.

We will be reviewing several other kinds of objects that get associated with logins in some fashion. For now, the thing to notice is how the user interface in Management Studio lets you do everything at once. As we'll see as we continue the chapter, when creating these objects using code, we have to do each step separately rather than all at once as Management Studio offers. (As you might imagine, it's really just collecting all the necessary information in advance and then issuing all those individual programmatic steps for us.)

SQL Management Objects

This is largely out of scope for this chapter (we cover SMO in its own chapter later on), but I did want to specifically point out that SMO can create logins for you using a straightforward object model as opposed to the CREATE statement approach. See Chapter 23 for more information.

Legacy Options

There are a couple of older options of significance when considering the way that logins have been created in past versions of SQL Server.

* sp_addlogin and Related Sprocs: This was a stored procedure that essentially maps to CREATE LOGIN except that several parts of the CREATE LOGIN statement implement things that were not supported prior to SQL Server 2005. The basics (creating the typical login as opposed to the certificate or asymmetric key approach) are all there though. We'll take a more detailed look at sp_addlogin shortly.
* WMI: Windows Management Instrumentation is an implementation of an industry-standard Web management protocol. When SQL Server 2000 first came out, the thinking was that a WMI-based model was going to take over as the primary way of automating SQL Server management. In the end, there was no WMI-based model implemented that came anywhere close to being up to the task of exposing all the things we need in SQL Server, and that effort would seem to have been largely junked. WMI is now outside the scope of this book, but realize that it's out there and remains an option if you need to manage older versions of SQL Server or are familiar with WMI for other purposes and want to add SQL Server scripting into your larger WMI plan.
A Quick Look at sp_addlogin

This sproc does exactly what it says, and it was the old way of implementing the things that CREATE LOGIN does for us today. While I highly recommend avoiding sp_addlogin for new development, it is still in wide use in legacy code. It requires only one parameter, but most of the time you'll use two or three. There are a couple of additional parameters, but you'll find that you use those far more rarely. The syntax looks like this:

EXEC sp_addlogin [@loginame =] <'login'>
[,[@passwd =] <'password'>]
[,[@defdb =] <'database'>]
[,[@deflanguage =] <'language'>]
[,[@sid =] 'sid']
[,[@encryptopt =] <'encryption_option'>]

Parameter | Description
---|---
@loginame | Just what it sounds like—this is the login ID that will be used.
@passwd | Even more what it sounds like—the password that is used to log in using the aforementioned login ID.
@defdb | The default database. This defines what is the first "current" database when the user logs in. Normally, this will be the main database your application uses. If left unspecified, the default will be the master database (you usually don't want that, so be sure to provide this parameter).
@deflanguage | The default language for this user. You can use this to override the system default if you are supporting localization.
@sid | A binary number that becomes the security identifier (SID) for your login ID. If you don't supply an SID, SQL Server generates one for you. Since SIDs must be unique, any SID you supply must not already exist in the system. Using a specific SID can be handy when you are restoring your database to a different server or are otherwise migrating login information.
@encryptopt | The user's login ID and password information is stored in the master database. The @encryptopt determines whether or not the password stored in the master database is encrypted. By default (or if you provide a NULL in this parameter), the password is indeed encrypted. The other options are skip_encryption, which does just what it says—the password is not encrypted, and skip_encryption_old, which is there only for backward compatibility, and should not be used.

As you can see, most of the items here map directly to CREATE LOGIN, and that is the way I recommend doing things unless you need to utilize sp_addlogin for backward-compatibility reasons.

sp_password

Since we've looked at sp_addlogin, we ought to look at sp_password. While ALTER LOGIN gives you the ability to address password maintenance on a login (and it is what you should be using), sp_addlogin had no such functionality—sp_password takes care of that. The syntax is pretty straightforward:

sp_password [[@old =] <'old password'>,]
[@new =] <'new password'>
[,[@loginame =] <'login'>]

The new and old password parameters work, of course, just exactly as you would expect. You need to accept those from the user and pass them into the sproc. Note, however, that the login is an optional parameter. If you don't supply it, then it will assume that you want to change the password on the login used for the current connection. Note that sp_password cannot be executed as part of a transaction.

You might be thinking something like, "Don't most systems require you to enter the new password twice?" Indeed they do. So the follow-up question is, "How come sp_password doesn't do that?" The answer is a simple one—because SQL Server leaves that up to you.
You would include the logic to check for a double entry of the new password in your client application before you ever got as far as using sp_password. This same issue exists for ALTER LOGIN.

sp_grantlogin

This simulates the CREATE LOGIN...FROM functionality as it relates to Windows logins (prior to SQL Server 2005, mapping from certificates and asymmetric keys did not exist as it does now). The syntax is straightforward:

sp_grantlogin [@loginame =] '<domain>\<user>'

Again, this is for backward compatibility only. Use the CREATE LOGIN...FROM syntax for 2005 and later installations (which should be the vast majority of new code at this point).

Windows Authentication

Windows authentication gives us the capability to map logins from trusted Windows domains into our SQL Server.

It is simply a model where you take existing Windows domain user accounts or groups and provide SQL Server rights to them directly rather than forcing users to keep separate passwords and make separate logins.

Windows authentication allows:

* Maintenance of a user's access from just one place
* Granting of SQL Server rights simply by adding a user to a Windows group (this means that you often don't have to even go into SQL Server in order to grant access to a user)
* Users to remember only one password and login

That being said, let's take a look at how to grant specific rights to specific users.

User Permissions

The simplest definition of what a user permission is would be something like, "what a user can and can't do." In this case, the simple definition is a pretty good one.

User permissions fall into three categories:

* Permission to log in
* Permission to access a specific database
* Permission to perform specific actions on particular objects within that database

Since we've already looked at creating logins, we'll focus here on the specific permissions that a login can have.

Granting Access to a Specific Database

The first thing that you need to do if you want a user to have access to a database is to grant the user permission to access that database. This can be done in Management Studio by adding the user to the Users node of the database in question (under the Databases node of your server). To add a user using T-SQL, you should use CREATE USER. Similar to sp_addlogin, there is also, for backward compatibility, the sp_grantdbaccess stored procedure.

Note that as you CREATE a user in the database, those permissions are actually stored in the database and mapped to the server's identifier for that user. As you restore a database, you may have to remap user rights to the server identifiers on the server where you restored the database.

CREATE USER

The CREATE USER command adds a new user to the database. That user can be sourced from an existing login, certificate, or asymmetric key, or can be local to just the current database. The syntax looks like this:

CREATE USER <user_name>
[ { { FOR | FROM }
{
LOGIN <login_name>
| CERTIFICATE <certificate_name>
| ASYMMETRIC KEY <asym_key_name>
}
| WITHOUT LOGIN } ]
[ WITH DEFAULT_SCHEMA = <schema_name> ]

Let's take a quick look at what some of these elements mean:

Option | Description
---|---
LOGIN | The name of the login you want to grant access to for the current database.
CERTIFICATE | Logical name of the certificate to be associated with this user. Note that the certificate must have already been created using the CREATE CERTIFICATE command.
ASYMMETRIC KEY | Logical name of the asymmetric key to be associated with this user. Note that the key must have already been created using the CREATE ASYMMETRIC KEY command.
WITHOUT LOGIN | Creates a user that is local to the current database. It can be used to set up a specific security context but cannot be mapped to a login outside of the current database, nor can it access any other database.
WITH DEFAULT_SCHEMA | Establishes a schema other than the default "dbo" as being the default schema for the current user.

So, to grant access to our NorthwindSecure database for our TestAccount, we would issue a command such as:

CREATE USER [HOBBES\TestAccount]
FOR LOGIN [HOBBES\TestAccount]
WITH DEFAULT_SCHEMA = dbo;

This grants our login access to the specified database (NorthwindSecure in this case) and sets that user's default schema to dbo.
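The WITHOUT LOGIN option from the table above is easy to overlook, so here's a minimal sketch of it as well. The user name is an illustrative assumption; the typical use is to give a piece of code a restricted security context to run under:

-- Illustrative only: a database user with no server login behind it.
USE NorthwindSecure;
CREATE USER ReportingContext WITHOUT LOGIN
WITH DEFAULT_SCHEMA = dbo;
-- Code can then impersonate it for a tightly scoped set of rights:
-- EXECUTE AS USER = 'ReportingContext'; ... REVERT;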
Granting Object Permissions within the Database

Okay, so the user has a login and access to the database you want him or her to have access to, so now everything's done—right? Ah, if only it were that simple! We are, of course, not done yet.

SQL Server gives us a pretty fine degree of control over what our users can access. Most of the time, you have some information that you want your users to be able to get to, but you also have other information in the database to which you don't want them to have access. For example, you might have a customer service person who has to be able to look at and maintain order information—but you probably don't want them messing around with the salary information. The opposite is also probably true—you need your human resource people to be able to edit employee records, but you probably don't want them giving somebody a major discount on a sale.

SQL Server allows you to assign a separate set of rights to some of the different objects within SQL Server. The objects you can assign rights to include tables, views, and stored procedures. Triggers are implied to have the rights of the person that created them.

User rights on objects fall into six different types:

User Right | Description
---|---
SELECT | Allows a user to "see" the data. If a user has this permission, the user has the right to run a SELECT statement against the table or view on which the permission is granted.
INSERT | Allows a user to create new data. Users with this permission can run an INSERT statement. Note that, unlike many systems, having INSERT capability does not necessarily mean that you have SELECT rights.
UPDATE | Allows a user to modify existing data. Users with this permission can run an UPDATE statement. Like the INSERT statement, having UPDATE capability does not necessarily mean that you have SELECT rights.
DELETE | Allows a user to delete data. Users with this permission can run a DELETE statement. Again, having DELETE capability does not necessarily mean that you have SELECT rights.
REFERENCES | Allows a user to insert rows into a table that has a foreign key constraint referencing another table to which that user doesn't have SELECT rights.
EXECUTE | Allows a user to EXECUTE a specified stored procedure.

You can mix and match these rights as needed on the particular table, view, or sproc to which you're assigning rights.

You can assign these rights in Management Studio simply by navigating to the Logins option of the Security node of your server. Just right-click the user and choose Properties. You'll be presented with a different dialog depending on whether you're in the database or security node, but, in either case, you'll have the option of setting permissions. Assigning rights using T-SQL uses three commands that are good to know even if you're only going to assign rights through Management Studio (the terminology is the same).

GRANT

GRANT gives the specified user or role the access specified for the object that is the subject of the GRANT statement.

The syntax for a GRANT statement looks like this:

GRANT
   ALL [PRIVILEGES] | <permission>[,...n]
ON
   <table or view>[(<column>[,...n])]
   | <stored procedure>
TO <login ID or role>[,...n]
[WITH GRANT OPTION]
[AS <role>]
The ALL keyword indicates that you want to grant all the rights that are applicable for that object type (EXECUTE never applies to a table). If you don't use the ALL keyword, then you need to supply one or more specific permissions that you want granted for that object.

PRIVILEGES is a keyword that has no real function other than to provide ANSI/ISO compatibility.

The ON keyword serves as a placeholder to say that what comes next is the object for which you want the permissions granted. Note that, if you are granting rights on a table, you can specify permissions down to the column level by supplying a column list to be affected—if you don't supply specific columns, then it's assumed to affect all columns.

Microsoft appears to have done something of an about-face in their opinion of column-level permissions. Being able to say that a user can do a SELECT on a particular table but only on certain columns seems like a cool idea, but it really convolutes the security process, both in its use and in the work it takes Microsoft to implement it. As such, literature on the subject over the last several years has sometimes said little and sometimes seemed to indicate that Microsoft wishes column-level security would go away. They have occasionally recommended against its use (and other times seemed to offer no opinion)—if you need to restrict a user to seeing particular columns, consider using a view instead.
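To make the alternatives concrete, here is a hedged sketch of both approaches (the view name and column choices are hypothetical; adjust them to your schema). The first statement grants SELECT on just two columns; the second pair hides the remaining columns behind a view instead:

-- Column-level grant (works, but convolutes the security picture)
GRANT SELECT ON Employees (EmployeeID, LastName) TO [HOBBES\TestAccount];

-- The view alternative: expose only the safe columns, then grant on the view
CREATE VIEW EmployeesPublic
AS
SELECT EmployeeID, LastName, FirstName, Title
FROM Employees;
GO
GRANT SELECT ON EmployeesPublic TO [HOBBES\TestAccount];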
The TO statement does what you would expect: it specifies those to whom you want this access granted. It can be a login ID or a role name.

WITH GRANT OPTION allows the user that you're granting access to, in turn, to also grant access to other users.

I recommend against the use of this option, since it can quickly become a pain to keep track of who has granted access to what. Sure, you can always go into Management Studio and look at the permissions for that object, but then you're in a reactive mode rather than a proactive one—you're looking for what's wrong with the current access levels rather than stopping unwanted access up front.

Last, but not least, is the AS keyword. This one deals with the issue of a login belonging to multiple roles.

Now, we can go ahead and move on to an example or two. We'll see later that the TestAccount that we created already has some access based on being a member of the Public role—something that every database user belongs to, and from which you can't remove them. There are, however, a large number of items to which TestAccount doesn't have access (because Public is the only role it belongs to, and Public doesn't have rights either).

Start by logging in with the TestAccount user. Then try a SELECT statement against the Region table:

SELECT * FROM Region;

You'll quickly get a message from SQL Server telling you that you are a scoundrel and are attempting to go places that you shouldn't be going:

Server: Msg 229, Level 14, State 5, Line 1
SELECT permission denied on object 'Region', database 'NorthwindSecure', owner 'dbo'.

Log in separately as sa—you can do this in the same instance of Management Studio if you like by choosing the File ⇒ Connect menu choice. Then select SQL Server authentication for the new connection and log in as sa with the appropriate password. Now execute a GRANT statement:

USE NorthwindSecure;
GRANT SELECT ON Region TO [HOBBES\TestAccount];

Note that you'll need to replace "HOBBES" with the name of your computer or domain as appropriate.

Now switch back to the TestAccount connection (remember, the information for which user you're connected as is in the title bar of the connection window), and try that SELECT statement again. This time, you get better results:

RegionID RegionDescription
-------- -----------------
1 Eastern
2 Western
3 Northern
4 Southern

(4 row(s) affected)

Let's go ahead and try another one. This time, let's run the same tests and commands against the EmployeeTerritories table:

SELECT * FROM EmployeeTerritories;

This one fails—again, you don't have rights to it, so let's grant the rights to this table:

USE NorthwindSecure;
GRANT SELECT ON EmployeeTerritories TO [HOBBES\TestAccount];

Now, if you re-run the SELECT statement, things work just fine:

EmployeeID TerritoryID
---------- -----------
1 06897
1 19713
...
...
...
9 48304
9 55113
9 55439

(49 row(s) affected)

To add an additional twist, however, let's try an INSERT into this table:

INSERT INTO EmployeeTerritories
VALUES
(1, '01581');

SQL Server wastes no time in telling us to get lost. We don't have the required permissions, so let's grant them (using the sa connection):

USE NorthwindSecure;
GRANT INSERT ON EmployeeTerritories TO [HOBBES\TestAccount];

Now try that INSERT statement again:

INSERT INTO EmployeeTerritories
VALUES
(1, '01581');

Everything works great.

DENY

DENY explicitly prevents the user from getting the access specified on the targeted object. The key to DENY is that it overrides any GRANT statements. Since a user can belong to multiple roles (discussed shortly), it's possible for a user to be part of a role that's granted access but also have a DENY in effect. If a DENY and a GRANT both exist in a user's mix of individual and role-based rights, then the DENY wins every time. In short, if the user or any role the user belongs to has a DENY for the right in question, then the user will not be able to make use of that access on that object.

The syntax looks an awful lot like the GRANT statement:

DENY
   [ALL] [PRIVILEGES] | <permission>[,...n]
ON
   <table or view>[(<column>[,...n])]
   | <stored procedure>
TO <login ID or role>[,...n]
[CASCADE]
Again, the ALL keyword indicates that you want to deny all the rights that are applicable for that object type (EXECUTE never applies to a table). If you don't use the ALL keyword, then you need to supply one or more specific permissions that you want to be denied for that object.

Note that the ALL keyword is now included solely for backward compatibility. It's also important to understand that ALL no longer truly affects "all" privileges. While it does affect most mainstream privileges (such as SELECT), there is, as ALL becomes more out of date, an ever-increasing list of privileges not affected by ALL.

PRIVILEGES is again a keyword with no real function other than to provide ISO compatibility.

The ON keyword serves as a placeholder to say that what comes next is the object on which you want the permissions denied.

Everything has worked pretty much the same as with a GRANT statement until now. The CASCADE keyword matches up with the WITH GRANT OPTION that was in the GRANT statement. CASCADE tells SQL Server that you want to also deny access to anyone that this user has granted access to under the rules of the WITH GRANT OPTION.

To run an example of DENY, let's try a simple SELECT statement using the TestAccount login:

USE NorthwindSecure;
SELECT * FROM Employees;

This should get you nine records or so. How did you get access when we haven't granted it to TestAccount? TestAccount belongs to Public, and Public has been granted access to Employees.

Let's say that we don't want TestAccount to have access. For whatever reason, TestAccount is the exception, and we don't want that user snooping in that data—we just issue our DENY statement (remember to issue the DENY using the sa login):

USE NorthwindSecure;
DENY ALL ON Employees TO [HOBBES\TestAccount];

When you run the SELECT statement again using TestAccount, you'll get an error. You no longer have access. Note also that, since we used the ALL keyword, the INSERT, DELETE, and UPDATE access that Public has is now also denied to TestAccount.

Again, note that ALL is deprecated, so you will receive a warning when running the previous example code. I have kept this example so you understand the breadth of the ALL keyword, which you may still find in your legacy code.

REVOKE

REVOKE eliminates the effects of a previously issued GRANT or DENY statement. Think of this one as a targeted "undo" statement.

The syntax is a mix of the GRANT and DENY statements:

REVOKE [GRANT OPTION FOR]
   [ALL] [PRIVILEGES] | <permission>[,...n]
ON
   <table or view>[(<column>[,...n])]
   | <stored procedure>
TO | FROM <login ID or role>[,...n]
[CASCADE]
[AS <role>]
The explanations here are virtually identical to those of the GRANT and DENY statements. I put them here again in case you're pulling the book back off the shelf for a quick lookup on REVOKE.

Once again, the ALL keyword indicates that you want to revoke all the rights that are applicable for that object type. If you don't use the ALL keyword, then you need to supply one or more specific permissions that you want to be revoked for that object.

PRIVILEGES still has no real function other than to provide ANSI/ISO compatibility.

The ON keyword serves as a placeholder to say that what comes next is the object on which you want the permissions revoked.

The CASCADE keyword matches up with the WITH GRANT OPTION that was in the GRANT statement. CASCADE tells SQL Server that you want to also revoke access from anyone that this user granted access to under the rules of the WITH GRANT OPTION.

The AS keyword again just specifies which role you want to issue this command based on.

Using the sa connection, let's undo the access that we granted to the Region table in NorthwindSecure:

REVOKE ALL ON Region FROM [HOBBES\TestAccount];

After executing this, our TestAccount can no longer run a SELECT statement against the Region table.

In order to remove a DENY, we also issue a REVOKE statement. This time, we'll regain access to the Employees table:

USE NorthwindSecure;
REVOKE ALL ON Employees TO [HOBBES\TestAccount];

Now that we've seen how all the commands to control access work for individual users, let's take a look at the way we can greatly simplify management of these rights by managing in groupings.

User Rights and Statement-Level Permissions

User permissions don't just stop with the objects in your database—they also extend to certain statements that aren't immediately tied to any particular object. SQL Server gives you control over permissions to run several different statements, including:

 * CREATE DATABASE
 * CREATE DEFAULT
 * CREATE PROCEDURE
 * CREATE RULE
 * CREATE TABLE
 * CREATE VIEW
 * BACKUP DATABASE
 * BACKUP LOG

At this point, we've already seen all of these commands at work except for the two backup commands—what those are about is pretty self-explanatory, so I'm not going to spend any time on them here (we'll look at them in Chapter 22)—just keep in mind that they are something you can control at the statement level.

Okay, so how do we assign these permissions? Actually, now that you've already seen GRANT, REVOKE, and DENY in action for objects, you're pretty much already schooled on statement-level permissions, too. Syntactically speaking, they work just the same as object-level permissions, except that they are even simpler (you don't have to fill in as much). The syntax looks like this:

GRANT {ALL | <statement>[,...n]} TO <login ID>[,...n]

Easy, hey? To do a quick test, let's start by verifying that our test user doesn't already have authority to CREATE. Make sure you are logged in as your TestAccount, and then run the following command. Don't forget to substitute your domain name for HOBBES in the following:

USE NorthwindSecure;

CREATE TABLE TestCreate
(
Col1 int PRIMARY KEY
);

This gets us nowhere fast:

Server: Msg 262, Level 14, State 1, Line 2
CREATE TABLE permission denied, database 'NorthwindSecure', owner 'dbo'.

Now log in to SQL Server using the sa account (or another account with dbo authority for NorthwindSecure).
Then run our command to grant permissions:

GRANT CREATE TABLE TO [HOBBES\TestAccount];

You should get confirmation that your command completed successfully. Then just try running the CREATE statement again. Remember to log back in using the TestAccount:

USE NorthwindSecure;

CREATE TABLE TestCreate
(
Col1 int PRIMARY KEY
);

This time everything works.

DENY and REVOKE also work the same way as they did for object-level permissions.

Server and Database Roles

A role is, in the most general sense, the same thing as a group in Windows; that is, it is a collection of access rights (or denials) that are automatically associated with a user when they are assigned that role.

A role is a collection of access rights that can be assigned to a user en masse simply by assigning a user to that role.

A user can belong to as few as one role or potentially to several roles at one time. This can be incredibly handy, since you can group access rights into smaller and more logical groups and then mix and match them into the formula that best fits a user.

Roles fall into two categories:

 * Server roles
 * Database roles

We'll soon see a third thing that's also called a role—though I wish that Microsoft had chosen another name—application roles. These are a special way to alias a user into a different set of permissions. An application role isn't something you assign a user to; it's a way of letting an application have a different set of rights from the user. For this reason, I don't usually think of application roles as a "role" in the true sense of the word.

Server roles are limited to those that are already built into SQL Server when it ships and are primarily there for the maintenance of the system as well as granting the capability to do non-database-specific things like creating login accounts and creating linked servers.

Much like server roles, there are a number of built-in (or "fixed") database roles, but you can also define your own database roles to meet your particular needs. Database roles are for setting up and grouping specific user rights within a single given database.

Let's look at both of these types of roles individually.

Server Roles

All server roles are "fixed" roles and are there right from the beginning; every server role you're ever going to have existed from the moment your SQL Server was installed.

Role | Nature
---|---
sysadmin | This role can perform any activity on your SQL Server. Anyone with this role is essentially the sa for that server. The creation of this server role provides Microsoft with the capability to one day eliminate the sa login—indeed, the Books Online refers to sa as being legacy in nature. It's worth noting that the Windows Administrators group on the SQL Server is automatically mapped into the sysadmin role. This means that anyone who is a member of your server's Administrators group also has sa-level access to your SQL data. You can, if you need to, remove the Windows Administrators group from the sysadmin role to tighten that security loophole.
serveradmin | This one can set server-wide configuration options or shut down the server. It's rather limited in scope, yet the functions controlled by members of this role can have a very significant impact on the performance of your server.
setupadmin | This one is limited to managing linked servers and startup procedures.
securityadmin | This one is very handy for logins that you create specifically to manage logins, read error logs, and grant CREATE DATABASE permissions. In many ways, this one is the classic system operator role—it can handle most of the day-to-day stuff but doesn't have the kind of global access that a true omnipotent superuser would have.
processadmin | Has the capability to manage processes running in SQL Server—this one can kill long-running processes if necessary.
dbcreator | Is limited to creating and altering databases.
diskadmin | Manages disk files (which filegroup things are assigned to, attaching and detaching databases, and so on).
bulkadmin | This one is something of an oddity. It is created explicitly to give rights to execute the BULK INSERT statement, which otherwise is executable only by someone with sysadmin rights. Frankly, I don't understand why this statement isn't granted with the GRANT command like everything else, but it isn't. Keep in mind that, even if a user has been added to the bulkadmin role, that just gives them access to the statement, not to the table they want to run it against. This means that, in addition to adding the user to the bulkadmin role, you need to GRANT them INSERT permissions on any table you want them to be able to perform the BULK INSERT against. In addition, you'll need to make sure they have proper SELECT access to any tables that they will be referencing in their BULK INSERT statement.

You can mix and match these roles to individual users that are responsible for administrative roles on your server. In general, I suspect that only the very largest of database shops will use more than the sysadmin and securityadmin roles, but they're still handy to have around.

Earlier in this chapter, I got into a lengthy soapbox diatribe on the evils of global users. It probably comes as no surprise to you to learn that I was positively ecstatic when the sysadmin role was added back in version 7.0. The existence of this role means that, on an ongoing basis, no one should need to use the sa login. Just let the users that need that level of access become members of the sysadmin role, and they shouldn't ever need to log in as sa. Be careful, though; having a user always have that level of access can lead to accidents (it won't, on the basis of security, stop you from dropping objects and the like). I've known many IT shops that give their administrators more than one login: one for full sysadmin access, and another "day to day" login that has the privileges they need to get most things done but limits privileges that have a high risk of being destructive. The admin can still do what they need to do, but they need to make the conscious effort to log in with the special high-access account to do the more risky activities (which means they are much more likely to be thinking about it as they do it).
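Adding a login to a server role is a one-liner. Here is a hedged sketch using the sp_addsrvrolemember system sproc (the login name is hypothetical; more recent versions also offer ALTER SERVER ROLE ... ADD MEMBER):

EXEC sp_addsrvrolemember 'HOBBES\DayToDayAdmin', 'securityadmin';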
Database Roles

Database roles are limited in scope to just one database—just because a user belongs to the db_datareader role in one database doesn't mean that it belongs to that role in another database. Database roles fall into two subcategories: fixed and user-defined.

Fixed Database Roles

Much as there are several fixed server roles, there are also a number of fixed database roles. Some of them have a special predefined purpose that cannot be duplicated using normal statements (that is, you cannot create a user-defined database role that has the same functionality). However, most exist to deal with the more common situations and make things easier for you.

Role | Nature
---|---
db_owner | This role performs as if it were a member of all the other database roles. Using this role, you can create a situation where multiple users can perform the same functions and tasks as if they were the database owner.
db_accessadmin | Performs a portion of the functions similar to the securityadmin server role, except this role is limited to the individual database where it is assigned and the creation of users (not individual rights). It cannot create new SQL Server logins, but members of this role can add Windows users and groups as well as existing SQL Server logins into the database.
db_datareader | Can issue a SELECT statement on all user tables in the database.
db_datawriter | Can issue INSERT, UPDATE, and DELETE statements on all user tables in the database.
db_ddladmin | Can add, modify, or drop objects in the database.
db_securityadmin | The other part of the database-level equivalent of the securityadmin server role. This database role cannot create new users in the database, but it does manage roles and members of database roles as well as manage statement and object permissions in the database.
db_backupoperator | Backs up the database (gee, bet you wouldn't have guessed that one!).
db_denydatareader | Provides the equivalent of a DENY SELECT on every table and view in the database.
db_denydatawriter | Similar to db_denydatareader, only affects INSERT, UPDATE, and DELETE statements.

Much as with the fixed server roles, you're probably not going to see all of these used in anything but the largest of database shops. Some of the roles are not replaceable with your own database roles, and others are just very handy for dealing with the quick-and-dirty situations that seem to come up frequently.

User-Defined Database Roles

The fixed roles that are available are really only meant to help you get started. The real mainstay of your security is going to be the creation and assignment of user-defined database roles. For these roles, you decide what permissions they include.

With user-defined roles, you can GRANT, DENY, and REVOKE in exactly the same way as we did for individual users. The nice thing about using roles is that users tend to fall into categories of access needs. By using roles, you can make a change in one place and have it propagate to all the similar users (at least the ones that you have assigned to that role).

We have two means of creating a user-defined role:

 * CREATE ROLE (the preferred choice)
 * sp_addrole (for backward compatibility)

Let's take a look at each.

Creating a User-Defined Role Using CREATE ROLE

To create our own role, the preferred option is to use the CREATE ROLE command. Much like many of the other commands we've looked at in this chapter, the functionality of this command has been migrated to a more ANSI/ISO-compliant syntax, but it was previously supported by a system stored procedure—in this case, the sp_addrole system sproc. As with the others, the syntax is pretty straightforward:

CREATE ROLE <role name> [AUTHORIZATION <owner>][;]

The role name is simply what you want to call that role. Examples of common naming schemes would include by department (Accounting, Sales, Marketing, and so on) or by specific job (CustomerService, Salesperson, President, and so on). Using roles like this can make it really easy to add new users to the system.
If your accounting department hires someone new, you can just add him or her to the Accounting role (or, if you're being more specific, it might even be the AccountsPayable role) and forget it—no researching "What should this person have for rights?"

The AUTHORIZATION parameter is optional and allows you to override which database user or role owns this new role. (By default, it will be owned by whoever ran the CREATE command, usually someone in the db_owner role.)

Let's go ahead and create ourselves a role:

USE NorthwindSecure;
CREATE ROLE OurTestRole;

When you execute this, you should get back a nice friendly message telling you that the new role has been added.

Now what we need is to add some value to this role in the form of it actually having some rights assigned to it. To do this, we just use our GRANT, DENY, or REVOKE statements just as we did for actual users earlier in the chapter:

USE NorthwindSecure;
GRANT SELECT ON Territories TO OurTestRole;

Anyone who belongs to our role now has SELECT access to the Territories table (unless they have a DENY somewhere else in their security information).

Using sp_addrole

As I mentioned earlier, there is an older, system stored procedure–based command that remains for backward compatibility.

The syntax is again pretty simple:

sp_addrole [@rolename =] <'role name'>
[,[@ownername =] <'owner'>]

The owner is the same thing as it is for all other objects in the system. The default is the database owner, and I strongly suggest leaving it that way (in other words, just ignore this optional parameter). If we were going to add our special test role using the older syntax, it would look something like:

USE NorthwindSecure;
EXEC sp_addrole 'OurTestRole';

Regardless of which syntax you use, you should, at this point, be ready to start adding users.

Adding Users to a Role

Having all these roles around is great, but they are of no use if they don't have anyone assigned to them. Surprisingly, there isn't, as yet anyway, a new command that addresses this. Instead, we go back to the older system stored procedure model, calling the sp_addrolemember system sproc and providing the role name and login ID:

sp_addrolemember [@rolename =] <'role name'>,
[@membername =] <'login ID'>[;]

Everything is pretty self-explanatory on the parameters for this one, so let's move right into an example.

Let's start off by verifying that our TestAccount doesn't have access to the Territories table:

SELECT * FROM Territories;

Sure enough, we are rejected (no access yet):

Server: Msg 229, Level 14, State 5, Line 1
SELECT permission denied on object 'Territories', database 'NorthwindSecure', owner 'dbo'.

Now we'll go ahead and add our TestAccount Windows user to our OurTestRole role:

USE NorthwindSecure;
EXEC sp_addrolemember OurTestRole, [HOBBES\TestAccount];

It's time to try to run the SELECT statement again—this time with much more success (you should get about 53 rows back).

Removing a User from a Role

What goes up must come down, and users that are added to a role will also inevitably be removed from roles.
Removing a user from a role works almost exactly as adding them does, except we use a different system sproc called sp_droprolemember in the form of:

sp_droprolemember [@rolename =] <'role name'>,
[@membername =] <'login ID'>[;]

So, let's go right back to our example and remove the TestAccount from the OurTestRole database role:

USE NorthwindSecure;
EXEC sp_droprolemember OurTestRole, [HOBBES\TestAccount];

You should receive another friendly confirmation that things have gone well. Now try our SELECT statement again:

SELECT * FROM Territories;

And, sure enough, we are again given the error that we don't have access.

You can add and drop users from any role this way. It doesn't matter whether the role is user-defined or fixed, or whether it's a server or database role—in any case, they work pretty much the same.

Note also that you can do all of this through Management Studio. To change the rights associated with a role, just click the Roles member of the Security node (under your specific database) and assign permissions by using the checkboxes. When you want to add a user to a role, go to the Users node (again, under the specific database) and right-click to select Properties. Then select either the server or database roles by putting a check mark next to all the roles you want that user to have.

Dropping Roles

Dropping a role is as easy as adding one. The syntax is simply:

EXEC sp_droprole <'role name'>[;]

And it's gone.

Application Roles

Application roles are something of a different animal than database and server roles. Indeed, the fact that the term role is used would make you think that they are closely related. They aren't.

Application roles are really much more like a security alias for the user. Application roles allow you to define an access list (made up of individual rights or groupings of databases). They are also similar to a user in that they have their own password. They are, however, different from a user login because they cannot "log in" as such. A user account must first log in; then he or she can activate the application role.

So what do we need application roles for? For applications—what else? Time and time again, you'll run into the situation where you would like a user to have a separate set of rights depending on the context in which he or she is accessing the database. With an application role, you can do things like grant users no more than read-only access to the database (SELECT statements only) but still allow them to modify data when they do so within the confines of your application.

The process works like this:

1. The user logs in (presumably using a login screen provided by your application).

2. The login is validated, and the user receives his or her access rights.

3. The application executes a system sproc called sp_setapprole and provides a role name and password.

4. The application role is validated, and the connection is switched to the context of that application role (all the rights the user had are gone—he or she now has the rights of the application role).

5. The user continues with access based on the application role rather than his or her personal login throughout the duration of the connection; the user cannot go back to his or her own access information.

You would only want to use application roles as part of a true application situation, and you would build the code to set the application role right into the application.
You would also compile the required password into the application or store the information in some local file to be accessed when it is needed.

Creating Application Roles

To create an application role, we use a variation on the CREATE ROLE theme—CREATE APPLICATION ROLE. This is another pretty easy one to use; its syntax looks like this:

CREATE APPLICATION ROLE <app role name>
WITH PASSWORD = <'password'> [, DEFAULT_SCHEMA = <schema name>][;]

Much like the other flavors of CREATE in this chapter, the parameters are pretty self-explanatory, so let's move right on to using it by creating ourselves an application role:

CREATE APPLICATION ROLE OurAppRole WITH PASSWORD = 'P@ssw0rd';

Just that quick, our application role is created. Like most of the security items thus far, there is a system stored procedure that used to serve this functionality and is still supported, but, again, only for backward compatibility. It is very similar to the CREATE syntax but looks like this:

sp_addapprole [@rolename =] <'role name'>,
[@password =] <'password'>[;]

So creating the previous example using the system stored procedure instead would look like:

EXEC sp_addapprole OurAppRole, 'P@ssw0rd';

Adding Permissions to the Application Role

Adding permissions to application roles works just like adding permissions to anything else. Just substitute the application role name anywhere that you would use a login ID or regular server or database role.

Again, we'll move to the quick example:

GRANT SELECT ON Region TO OurAppRole;

Our application role now has SELECT rights on the Region table—it doesn't, as yet, have access to anything else.

Using the Application Role

Using the application role is a matter of calling a system sproc (sp_setapprole) and providing both the application role name and the password for that application role. The syntax looks like this:

sp_setapprole [@rolename =] <'role name'>,
[@password =] {Encrypt N<'password'>} | <'password'>
[,[@encrypt =] {'none' | 'odbc'}]
[,[@fCreateCookie =] {true | false}]
[,[@cookie =] <cookie variable> OUTPUT][;]

The role name is simply the name of whatever application role you want to activate.

The password can be either supplied as is or encrypted using the ODBC encrypt function. If you're going to encrypt the password, then you need to enclose the password in quotes after the Encrypt keyword and precede the password with a capital N—this indicates to SQL Server that you're dealing with a Unicode string (which the password must be in if you're going to encrypt it), and it will be treated accordingly. If you don't want encryption, then just supply the password without using the Encrypt keyword.

It's worth noting that encryption is only an option with ODBC and OLE DB clients. Thus you cannot test it inside the Query window (which uses SqlClient). Furthermore, if you're not using encryption, realize that the password you supply is going to be plainly viewable to anyone sniffing packets on your network. In short, if you're not using ODBC encryption for sending your password, then you'll want to use SSL or IPSec (two secure transport methods) for the connection.

This takes us to the cookie side of things. Setting a cookie (and storing the value you get back in the @cookie output variable) provides a bookmark of sorts for the permission set that was active before you activated the application role. You can then use the sp_unsetapprole stored procedure to revert back to the previous security context (the one indicated by the cookie). The syntax for sp_unsetapprole looks like this:

sp_unsetapprole <cookie>

Execute this, and your security context should return to the previous state.
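Putting the pieces together, a minimal sketch of a cookie-based round trip might look like this (the variable name is arbitrary; the role and password are the ones we created above):

DECLARE @cookie varbinary(8000);
EXEC sp_setapprole 'OurAppRole', 'P@ssw0rd',
    @fCreateCookie = true, @cookie = @cookie OUTPUT;
-- ...do work under the application role's rights...
EXEC sp_unsetapprole @cookie;  -- revert to the original security context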
Moving right into a simple example, let's start by verifying a couple of things about the status of our TestAccount user. At this point in the chapter (assuming you've been following along with all the examples), your TestAccount user should not be able to access the Region table but should be able to access the EmployeeTerritories table. You can verify this to be the case by running a couple of SELECT statements:

SELECT * FROM Region;
SELECT * FROM EmployeeTerritories;

The first SELECT should give you an error, and the second should return around 50 rows or so.

Now let's activate the application role that we created a short time ago; type this in using the TestAccount user:

EXEC sp_setapprole OurAppRole, 'P@ssw0rd';

When you execute this, you should get back a confirmation that your application role is now "active."

Try it out by running our two SELECT statements. You'll find that what does and doesn't work has been exactly reversed. That is, TestAccount had access to EmployeeTerritories, but that was lost when we went to the application role. TestAccount did not have access to the Region table, but the application role now provides that access.

Since we didn't store a cookie (I'm deliberately making a point here...), there is no way to terminate the application role for the current connection. We're stuck with few options other than, perhaps, switching to yet another application role. We have no way of returning to our original security context without the cookie.

Go ahead and terminate your TestAccount connection. Then create a new connection with Windows authentication for your TestAccount. Try running those SELECT statements again, and you'll find that your original set of rights has been restored.

Getting Rid of Application Roles

When you no longer need the application role on your server, you can use the same DROP command that you should, by now, be very familiar with:

DROP APPLICATION ROLE <app role name>

There is, of course, also a system stored procedure version of this (again, backward compatibility only, please!) called sp_dropapprole. The syntax is as follows:

sp_dropapprole [@rolename =] <'role name'>

To eliminate our application role from the system using the DROP syntax, we would just issue the command (from sa):

DROP APPLICATION ROLE OurAppRole;

More Advanced Security

This section is really nothing more than an "extra things to think about" section. All of these fall outside the realm of the basic rules we defined at the beginning of the chapter, but they address ways around some problems and also how to close some common loopholes in your system.

What to Do About the Guest Account

The guest account provides a way of having default access. When you have the guest account active, several things happen:

 * Logins gain guest-level access to any database to which they are not explicitly given access.
 * Outside users can log in through the guest account to gain access. This requires that they know the password for guest, but they'll already know the user exists (although they probably also know that the sa account exists).

Personally, one of the first things I do with my SQL Server is to eliminate every ounce of access the guest account has (by default, it has zero, so there should be little to do). It's a loophole, and it winds up providing access in a way you don't intuitively think of.
(You probably think that when you assign rights to someone, those are all the rights they have. With guest active, that isn't necessarily so.)

There is, however, one use that I'm aware of where the guest account actually serves a fairly slick purpose—when it is used with application roles. In this scenario, you leave the guest account with access to a database but without any rights beyond simply logging in to that database—that is, the guest account only makes the logged-on database "current." You can then use sp_setapprole to activate an application role, and, boom, you now have a way for otherwise anonymous users to log in to your server with appropriate rights. They can, however, only perform any useful work if they are using your application.

This is definitely a scenario where you want to be protecting that application role password as if your job depended on it (it probably does). Use the ODBC encryption option, and I would not allow this kind of access via the Internet!

TCP/IP Port Settings

By default, when using TCP/IP, SQL Server uses port number 1433. A port can be thought of as something like a radio channel: it doesn't matter what channel you're broadcasting on, it won't do you any good if no one is listening to that channel.

Leaving things with the default value of 1433 can be very convenient. All of your clients will automatically use port 1433 unless you specify otherwise, so this means that you have one less thing to worry about being set right if you just leave well enough alone.

The problem, however, is that just about any potential SQL Server hacker also knows that port 1433 is the one to which 99 percent of all SQL Servers are listening. If your SQL Server has a direct connection to the Internet, I strongly recommend changing to a non-standard port number. Check with your network administrator for what he or she recommends as an available port. Just remember that, when you change what the server is "listening" to, you'll also need to change what all the IP-based clients are using. For example, if we were going to change to using port 1402, we would go into the Client Network Utility and set up a specific entry for our server with 1402 as the IP port to use.

We also have the option of telling the client to dynamically determine the port by checking the "Dynamically determine port" box.

Note that this isn't really that huge of a security gain. The reality is that a hacker is probably going to use a port scanner or other tool to determine every open port on your firewall and, based on the responses received, make a fairly accurate guess as to what kind of software is utilizing each port. That said, every little thing you do can make it just a little more difficult for the would-be hacker.

Don't Use the sa Account

Everyone who's studied SQL Server for more than about 10 minutes knows about the system administrator account. SQL Server has the sysadmin fixed server role to simulate the sa user's level of access, so I strongly suggest adding true logins to that role, then changing the sa password to something very long and very incomprehensible—something not worth spending the time to hack into. If you only need Windows authentication, then turn SQL Server security off, and that will deal with the sa account issue once and for all.

Keep xp_cmdshell under Wraps

Remember to be careful about who you grant access to use xp_cmdshell. It will run any Windows command prompt command. The amount of authority that it grants to your users depends on what account SQL Server is running under. If it is a system or administrator account (as the majority are), then the users of xp_cmdshell will have very significant access to your server. (They could, for example, copy files onto the server from elsewhere on the network, then execute those files.) Let's raise the stakes a bit, though—there are also a fair number of servers running out there under the context of a Windows domain administrator account—anyone using xp_cmdshell now has fairly open access to your entire network!

The short rendition here is not to give anyone access to xp_cmdshell that you wouldn't give administrative rights to for your server or possibly even your domain.
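As a hedged aside (assuming SQL Server 2005 or later, where xp_cmdshell ships disabled), the feature is toggled through sp_configure, so it's easy to verify that it is off:

EXEC sp_configure 'show advanced options', 1;
RECONFIGURE;
EXEC sp_configure 'xp_cmdshell', 0;  -- 0 = disabled (the default), 1 = enabled
RECONFIGURE;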
Don't Forget Views, Stored Procedures, and UDFs as Security Tools

Remember that views, sprocs, and UDFs all have a lot to offer in terms of hiding data. Views can usually take the place of column-level security. They can do wonders to make users think they have access to an entire table when, in reality, they have access to only a subset of the entire data (remember our example of filtering out sensitive employee information, such as salary?). Sprocs and UDFs can do much the same. You can grant execute rights to a sproc or UDF, but that doesn't mean users get all the data from a table (they only get what the sproc or UDF gives them)—the end user may not even know what underlying table is supplying the data. In addition, views, sprocs, and UDFs have their own implied authority—that is, just because a view or sproc uses a table, it doesn't mean that the user needs access rights to that table.

Certificates and Asymmetric Keys

We have, at a few different points in the book (including earlier in this chapter), mentioned the notion of encryption. Certificates and asymmetric keys are the primary mechanism for defining the encryption keys for the different levels of your server architecture. Both of these are different methods of doing the same basic thing, and they are largely interchangeable. Whether you use certificates or asymmetric keys, you need to keep in mind that these are much like the keys to your house—if you let everyone have them, then they quickly lose their value (now anyone can get in, so why bother locking anyone out?).

SQL Server supports the notion of keys at several different levels based on the notion that you may want to separate several different silos of control under different encryption keys. SQL Server maintains a Service Master Key that goes with each server installation; it is encrypted at the Windows level (by the machine key, via the Windows Data Protection API). Likewise, each database contains a Database Master Key, which can, if you choose, itself be encrypted based on the Service Master Key. Then, within each database, you can define certificates and/or asymmetric keys (both of which are a form of key). Overall, the hierarchy looks something like Figure 19.2.

Figure 19.2

Certificates

Since SQL Server 2000, SQL Server has included its own certificate authority, or CA. Third-party CAs are also supported. A CA issues a certificate, which includes an encryption key along with some basic information to go with the certificate, such as the date range the certificate is valid for (a starting and expiration date), the name of the holder, and information on the authority that issued the certificate. A certificate is added to a server using the CREATE CERTIFICATE command.
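By way of illustration, a minimal sketch (the certificate name, subject, and expiration date are hypothetical, and this assumes the database already has a master key with which to protect the certificate's private key):

USE NorthwindSecure;
CREATE CERTIFICATE NWSecureCert
   WITH SUBJECT = 'Sample certificate for NorthwindSecure',
   EXPIRY_DATE = '2030-12-31';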
Asymmetric Keys

An asymmetric key works much as a certificate does but is specified directly and is not validated by any issuing authority. Like a certificate, the encryption key is specified and then utilized to encrypt sensitive information. Asymmetric keys are added using the CREATE ASYMMETRIC KEY command.

Database Encryption

Most of the encryption functions that were added in SQL Server 2005 are oriented around the idea of encrypting a particular piece of data. They require you to utilize special functions (which specific functions depends on the type of encryption being used) to encrypt the data, and then another set of functions to decrypt the data.

Beginning with SQL Server 2008, we also have the option of encrypting the entire database. Note that the idea here is not to password-protect the data in the database but rather to protect against the wholesale theft of the entire database. Using database-level encryption, the database file and any backups made of it are effectively keyed to the server the database is on (unless you copy the server's certificate, so make sure you have a backup of that, or your backups of the database will become effectively useless in the event of total server failure).
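To make that idea concrete, here is a hedged sketch of enabling this feature (transparent data encryption) on our sample database; the object names and password are hypothetical, and this assumes SQL Server 2008 or later:

USE master;
CREATE MASTER KEY ENCRYPTION BY PASSWORD = '<a strong password>';
CREATE CERTIFICATE TDECert WITH SUBJECT = 'TDE certificate';
USE NorthwindSecure;
CREATE DATABASE ENCRYPTION KEY
   WITH ALGORITHM = AES_256
   ENCRYPTION BY SERVER CERTIFICATE TDECert;
ALTER DATABASE NorthwindSecure SET ENCRYPTION ON;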
Summary

Security is one of those areas that tend to be ignored by developers. Unfortunately, the security of your system is going to be determined by how your client application handles things, so there's only so much a DBA can do after you've shipped your application.

Treat security as if it is the lifeblood of the success or failure of your system at your customer site (which, if you're building internal projects, may be your site)—it probably is a critical factor.

20

A Grand Performance: Designing a Database That Performs Well

This, and the chapter that follows, are probably the toughest chapters in the book from my perspective as the author, but not for the normal reasons. Usually, the issue is how to relate complex information in a manner that's easy to understand. As we're getting near the end of the book, I hope that I've succeeded there—even if there is still more to come. At this point, you should, from prior experience and the topics covered in this book, have a solid foundation in everything we're going to discuss in this chapter. That means I'm relatively free to get to the nitty-gritty and not worry quite as much about confusion.

Why then would this be a tough chapter for me to write? Well, because deciding exactly what to put into this and the sibling chapter that follows is difficult. You see, this isn't a book on performance tuning—that can easily be a book unto itself. It is, however, a book about making you successful in your experience developing with SQL Server. Having a well-performing system is critical to that success. The problem lies in a line from Bob Seger: "What to leave in, what to leave out." What can we focus on here that's going to get you the most bang for your buck?

Perhaps the most important thing to understand about performance tuning is that you are never going to know everything there is to know about it. If you're the average SQL developer, you're going to be lucky if you know 20 percent of what there is to know. Fortunately, performance tuning is one of those areas where the old 80-20 rule (80 percent of the benefit comes from the right 20 percent of the work) definitely applies.

For this edition of the book, I've decided to expand this topic a bit, maintaining coverage of the structural decisions and adding additional content on how to figure out where performance opportunities exist. This chapter will largely cover topics that have been around for a while, including such things as:

 * Index choices
 * Client- vs. server-side processing
 * Strategic de-normalization
 * Organizing your sprocs
 * Uses for temporary tables
 * Small gains in repetitive processes vs. big gains in long-running processes

The focus of this chapter is really going to be the things you should be thinking about in the area of design—those that are somewhat structural in nature. In many cases, it will be a subject we've already covered, but with a particular eye on performance. In the next chapter, we'll take a look at what to do once the system is already in place (maintenance, locating problems, and planning future changes).

There is, however, a common theme that one should get out of both chapters: This is only the beginning. The biggest thing in performance is really just to stop and think about it. There is, for some strange reason, a tendency when working with SQL to use the first thing that comes to mind that will work. You need to give the same kind of thought to your queries, sprocs, database designs—whatever—that you would give to any other development work that you're doing. Also, keep in mind that your T-SQL code is only one part of the picture—hardware, client code, SQL Server configuration, and network issues are examples of things that are "outside the code" that can have a dramatic impact on your system.

Performance means a lot of different things to a lot of different people. For example, many will think in terms of simple response time (how fast does my query finish?). There is also the notion of perceived performance (many users will think in terms of how fast they receive enough to start working on, rather than how fast it actually finishes). Yet another perspective might focus on scalability (for example, how much load can I put on the system before my response time suffers or until users start colliding with each other?).

Many of the examples and suggestions in the two performance chapters are about raw speed—how fast do I return results—but we do touch on perceived performance and scalability issues where appropriate. Make sure that all facets of performance are considered in your designs—not just time to completion.

When to Tune

Okay, so this is probably going to seem a little obvious, but performance starts much earlier in the process than when you are writing your code. Indeed, it really should start in the requirements-gathering process and then never end.

What's the big deal about performance tuning in the requirements-gathering stage? Well, while you obviously can't do anything yet to physically tune your system, you can do a lot to logically tune your system. For example, is the concern of the customer more toward the side of perceived performance or actual completion of the job? For interactive processes, users will generally be more satisfied and think the system is faster if you do something to show them that something is happening (even if it's just a progress bar). In addition, sometimes it's worth having a process that completes a little more slowly as long as the "first response"—that is, when it starts outputting something—is faster.
Which of these is preferable is something you should know in the requirements-gathering stage. Finally, you should, in the requirements-gathering process, determine what your performance requirements are for the system.

Many is the time that I have seen the system that the developer thought was "fast enough" only to find out that the performance was unacceptable to the user. This can happen for a lot of reasons, though the most common is certainly the developer having his or her head buried in the sand.

Find out what's expected! Also, remember to test whether you've met expectations under a realistic load on something resembling the real live hardware—not a load based on one or two developers sitting at their development system.

Performance obviously also continues into design. If you design for performance, then you will generally greatly reduce the effort required to tune at completion. What's more, you'll find that you've greatly enhanced the "best" numbers you can achieve.

I'm starting to drone on here, but performance never stops—when you're actually coding, get it working, but then STOP! Stop and take a look at your code. Once an entire system is together, the actual code will almost never be looked at again unless:

 * Something breaks (there's a bug).
 * You need to upgrade that part of the system.
 * There is an overt performance problem (usually, a very bad one).

In the first two of these instances, you probably won't be looking at the performance issues, just at how to get things fixed or the additional functionality added. The point here is that an extra few minutes of looking at your code and asking yourself "Could I have done it better?" or "Hey, have I done anything stupid here?" can shave a little bit here and a little bit there and, occasionally, a whole lot in some other place.

Simply put: I make stupid mistakes, and so will you. It is, however, amazing how often you can step back from your code for a minute or two, then look at it again with a critical eye and say, "Geez, I can't believe I did that!" Hopefully, those moments will be rare, but, if you take the time to be critical of your own code, you'll find most of those critical gaffes that could really bog your system down. As for the ones you don't find, well, that's what the next chapter is for!

The next big testing milestone is the quality assurance process. At this juncture you should be establishing general system benchmarks and comparing those against the performance requirements established during the requirements phase.

Last, but not least—never stop. Ask end users where their pain is from a performance perspective. Is there something they say is slow? Don't wait for them to tell you (often, they think "that's just the way it is" and say nothing—except to your boss, of course); go ask.

Index Choices

Again, this is something that was covered in extreme depth previously, but the topic still deserves more than a mention here because of its sheer importance to query performance.

People tend to go to extremes with indexes—I'm encouraging you not to follow any one rule but instead to think about the full range of items that your index choices impact.

Any table that has a primary key (and, with very rare exception, all tables should have a primary key) has at least one index. This doesn't mean, however, that it is a very useful index from a performance perspective.
Indexes should be considered for any column that you're going to be using frequently as a target in a WHERE or JOIN clause and, to a lesser extent, an ORDER BY clause.

Remember, though, that the more indexes you have, the slower your inserts, updates, and deletes are going to be. When you modify a record, one or more entries may (depending on what's going on in the non-leaf levels of the B-Tree) have to be modified for each index (certainly true in the case of an insert or delete, and true for updates on any column participating in the index). More indexes means more for SQL Server to do on modification statements. In an Online Transaction Processing (OLTP) environment (where you tend to have a lot of inserts, updates, and deletes), this can be a killer. In an Online Analytical Processing (OLAP) environment, this is probably no big deal, since your OLAP data is usually relatively stable (few inserts), and what inserts are made are usually done through a highly repetitive batch process (which doesn't have quite the unpredictability that users have).

Technically speaking, the problem is smaller on updates and deletes. For updates, your indexes need to be updated only if the column that was changed is part of the key for that index. If you do indeed need to update the index, though, think about it as a delete and an insert—that means that you're exposed to page splits again.

So, what, then, about deletes? Well, again, when you delete a record you're going to need to delete all the entries from your indexes too, so you do add some additional overhead, but you don't have to worry about page splits and having to physically move data around.

The bottom line here is that, if you're doing a lot more querying than modifying, then more indexes are okay. However, if you're doing lots of modifications to your data, keep your indexes limited to high-use columns.
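As a quick, hedged illustration (the table and index names are hypothetical), a frequently joined foreign key column is a classic candidate:

-- Orders are constantly joined and filtered by customer; reads benefit,
-- but every INSERT/UPDATE/DELETE on Orders now maintains this index too
CREATE NONCLUSTERED INDEX IX_Orders_CustomerID
   ON Orders (CustomerID);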
Since the data isn't going to change, you want to package it up and send it all to the client in one pass—thus limiting round-trips and network impact. The obvious exception is if the cursor is generated for the sole purpose of modifying other records. In such a case, you should try to do the entire process server-side (most likely in the form of a stored procedure)—again eliminating round-trips.
+---|---
+Forward-only, read-only cursors | Client-side again. ODBC and other libraries can take special advantage of the FAST_FORWARD cursor type to gain maximum performance. Just let the server spew the records into the client cursor, and then move on with life.
+HOLDLOCK situations | Most transactioning works much better on the server than on the client.
+Processes that require working tables | This is another of those situations where you want to try to have the finished product created before you attempt to move records to the client. If you keep all of the data server-side until it is really ready to be used, you minimize round-trips to the server and speed up performance.
+Minimizing client installations | Okay, so this isn't "performance" as such, but it can be a significant cost factor. If you want to minimize the number of client installations you have to do, then keep as much of the business logic out of the client as possible. Either perform that logic in sprocs, or look at using component-based development with .NET. In an ideal world, you'll have what I like to call "data logic" (logic that exists only for the purpose of figuring out how to get the final data) in sprocs and "business logic" in components.
+Significant filtering and/or resorting | Use ADO.NET or LINQ. They have a great set of tools for receiving the data from the server just once (fewer round-trips!), then applying filters and sorts locally. If you wanted the data filtered or sorted differently by SQL Server, it would run an entirely new query using the new criteria. It doesn't take a rocket scientist to figure out that the overhead on that can get rather expensive. Both ADO.NET and LINQ also have some cool things built in to allow you to join different data sets (including heterogeneous data sets) right at the client. Note, however, that with very large result sets, your client computer may not have the wherewithal to deal with the filters and sorts effectively—you may be forced to go back to the server.
+
+These really just scratch the surface. The big thing to remember is that round-trips are a killer even in this age of gigabit Ethernet (keep in mind that connection overhead is often more of the issue than raw bandwidth). What you need to do is move the smallest amount of data back and forth—and only move it once. Usually, this means that you'll preprocess the data as much as possible on the server side, and then move the entire result to the client if possible.
+
+Keep in mind, though, that you need to be sure that your client is going to be able to handle what you give it. Servers are usually much better equipped to handle the resource demands of larger queries. By the same token, you also have to remember that the server is going to be doing this for multiple users—that means the server needs to have adequate resources to support all of the server-side activity for that number of users.
If you take a process that was too big for the client to handle and move it server-side for resource reasons, just remember that you may also run out of resources on the server if more than one client uses that process at one time. The best thing is to try to keep result sets and processes to the smallest size possible.
+
+Realize that the term "client" has more than one possible meaning. The client, from a data connection perspective, may not be where the end user sits. If it is a browser-based application, then the client that is truly handling the data is more likely the Web server. While a Web server is likely on some very solid hardware, it may be dealing with multiple such queries at the same time (multiple large data sets), so plan accordingly.
+
+Strategic De-Normalization
+
+This could also be called, "When following the rules can kill you." Normalized data tends to work for both data integrity and performance in an OLTP environment. The problem is that not everything that goes on in an OLTP database is necessarily transaction-processing related. Even OLTP systems have to do a little bit of reporting (a summary of transactions entered that day, for example).
+
+Often, adding just one extra column to a table can prevent a large join, or worse, a join involving several tables. I've seen situations where adding one column made the difference between a two-table join and a nine-table join. We're talking the difference between 100,000 records being involved and several million. That one change took a query from a runtime of several minutes down to just seconds.
+
+Like most things, however, this isn't something with which you should get carried away. Normalization is the way that most things are implemented for a reason. It adds a lot to data integrity and can make a big positive difference performance-wise in many situations. Don't de-normalize just for the sake of it. Know exactly what you're trying to accomplish, and test to make sure that it had the expected impact. If it didn't, then look at going back to the original way of doing things.
+
+Organizing Your Sprocs Well
+
+I'm not talking from the outside (naming conventions and such are important, but that's not what I'm getting at here) but rather from a "how they operate" standpoint. The next few sections discuss this.
+
+Keeping Transactions Short
+
+Long transactions can not only cause deadlock situations but also basic blocking (where someone else's process has to wait for yours because you haven't finished with the locks yet). Anytime you have a process that is blocked—even if it will eventually be able to continue after the blocking transaction is complete—you are delaying, and therefore hurting the performance of, that blocked procedure. Nothing has a more immediate effect on performance than a process that simply has to stop and wait.
+
+Using the Least Restrictive Transaction Isolation Level Possible
+
+The tighter you hold those locks, the more likely that you're going to wind up blocking another process. Be sure to take the locks you really need to ensure data integrity—but try not to take any more than that.
+
+If you need more information on isolation levels, check out transactions and locks in Chapter 11.
+
+Implementing Multiple Solutions if Necessary
+
+An example here is a search query that accepts multiple parameters but doesn't require all of them.
It's quite possible to write your sproc so that it just uses one query, regardless of how many parameters were actually supplied—a "one-size-fits-all" kind of approach. This can be a real timesaver from a development perspective, but it is really deadly from a performance point of view. More than likely, it means that you are joining several unnecessary tables for every run of the sproc!
+
+The thing to do here is to add a few IF...ELSE statements to check things out. This is more of a "look before you leap" kind of approach. It means that you will have to write multiple queries to deal with each possible mix of supplied parameters, but once you have the first one written, the others can often be cloned and then altered from the first one.
+
+This is a real problem area in lots of code out there. Developers are a fickle bunch. We generally only like doing things as long as they are interesting. If you take the preceding example, you can probably see that it would get very boring very quickly to be writing what amounts to a very similar query over and over to deal with the nuances of what parameters were supplied.
+
+All I can say about this is—well, not everything can be fun, or everyone would want to be a software developer! Sometimes you just have to grin and bear it for the sake of the finished product.
+
+Avoiding Cursors if Possible
+
+If you're a programmer who has come from an ISAM or VSAM environment (these were older database storage methods), doing things by cursor is probably going to be something toward which you'll naturally gravitate. After all, the cursor process works an awful lot more like what you're used to in those environments (such looping structures are also common in many non-database data handling constructs).
+
+Don't go there!
+
+Almost anything that at first seems like something you can only do with a cursor can actually be done as a set operation. Sometimes it takes some pretty careful thought, but it usually can be done.
+
+By way of illustration, I was asked several years ago for a way to take a multiline cursor-based operation and make it into a single statement if possible. The existing process ran something like 20 minutes. The runtime was definitely problematic, but the customer wasn't really looking to do this for performance reasons (they had accepted that the process was going to take that long). Instead, they were just trying to simplify the code.
+
+They had a large product database, and they were trying to set things up to automatically price their available products based on cost. If the markup had been a flat percentage (say 10 percent), then the UPDATE statement would have been easy—say something like:
+
+UPDATE Products
+SET UnitPrice = UnitCost * 1.1
+
+The problem was that it wasn't a straight markup—there was a logic pattern to it. The logic went something like this:
+
+ * If the pennies on the product after the markup are greater than or equal to .50, then price it at .95.
+ * If the pennies are below .50, then mark it at .49.
+
+The pseudocode to do this by cursor would look something like:
+
+Declare and open the cursor
+Fetch the first record
+Begin Loop Until the end of the result set
+    Multiply cost * 1.1
+    If result has cents of < .50
+        Change cents to .49
+    Else
+        Change cents to .95
+    Update the row
+    Fetch the next record
+Loop
+
+This is, of course, an extremely simplified version of things. There would actually be about 30–40 lines of code to get this done.
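+
+Fleshed out in T-SQL, the cursor version might look something like the following sketch (the Products table and its UnitCost and UnitPrice columns are the hypothetical ones from this example):
+
+DECLARE @ProductID int,
+        @NewPrice money;
+
+DECLARE PriceCursor CURSOR FOR
+    SELECT ProductID FROM Products;
+
+OPEN PriceCursor;
+FETCH NEXT FROM PriceCursor INTO @ProductID;
+
+WHILE @@FETCH_STATUS = 0
+BEGIN
+    -- Apply the flat markup first
+    SELECT @NewPrice = UnitCost * 1.1
+    FROM Products
+    WHERE ProductID = @ProductID;
+
+    -- Pennies of .50 or more become .95; anything less becomes .49
+    IF @NewPrice - FLOOR(@NewPrice) >= 0.50
+        SET @NewPrice = FLOOR(@NewPrice) + 0.95;
+    ELSE
+        SET @NewPrice = FLOOR(@NewPrice) + 0.49;
+
+    UPDATE Products
+    SET UnitPrice = @NewPrice
+    WHERE ProductID = @ProductID;
+
+    FETCH NEXT FROM PriceCursor INTO @ProductID;
+END;
+
+CLOSE PriceCursor;
+DEALLOCATE PriceCursor;
+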
Instead, we changed it around to work with one single correlated subquery (which had a CASE statement embedded in it). The runtime dropped down to something like 12 seconds.
+
+The point here, of course, is that, by eliminating cursors wherever reasonably possible, we get a real boost not only in reduced complexity (which was the original goal here) but also in performance.
+
+Uses for Temporary Tables
+
+The use of temporary tables can sometimes help performance—usually by allowing the elimination of cursors or by allowing working data to be indexed while it is needed.
+
+Using Temp Tables to Break Apart Complex Problems
+
+As we've seen before, cursors can be the very bane of our existence. Using temporary tables, we can sometimes eliminate the cursor by processing the operation as a series of two or more set operations. An initial query creates a working data set. Then another process comes along and operates on that working data.
+
+We can actually make use of the pricing example we laid out in the last section to illustrate the temporary table concept, too. This solution wouldn't be quite as good as the correlated subquery, but it is still quite workable and much faster than the cursor option. The steps would look something like:
+
+SELECT ProductID, FLOOR(UnitCost * 1.1) + .49 AS TempUnitPrice
+INTO #WorkingData
+FROM Products
+WHERE (UnitCost * 1.1) - FLOOR(UnitCost * 1.1) < .50
+
+INSERT INTO #WorkingData
+SELECT ProductID, FLOOR(UnitCost * 1.1) + .95 AS TempUnitPrice
+FROM Products
+WHERE (UnitCost * 1.1) - FLOOR(UnitCost * 1.1) >= .50
+
+UPDATE p
+SET p.UnitPrice = t.TempUnitPrice
+FROM Products p
+JOIN #WorkingData t
+    ON p.ProductID = t.ProductID
+
+With this, we wind up with three steps instead of thirty or forty. This won't operate quite as fast as the correlated subquery would, but it still positively screams in comparison to the cursor solution.
+
+Keep this little interim step using temporary tables in mind when you run into complex problems that you think are going to require cursors. Try to avoid the temptation of just automatically taking this route—look for the single-statement query before choosing this option—but if all else fails, this can really save you a lot of time versus the cursor option.
+
+Using Temp Tables to Allow Indexing on Working Data
+
+Often we will run into a process in which we are performing many different operations on what is fundamentally the same data. This is characterized by a situation in which you are running different kinds of updates (perhaps to totally different tables), but utilizing the same source data to figure out what to change or what values to change things to. I've seen many scenarios where the same fundamental data is reused—in the same procedure—hundreds or even thousands of times.
+
+Under such "reuse" situations, consider querying the data once and placing it into a temp table. Also consider applying indexes to this data as warranted by the queries you're going to be performing against it.
+
+Even for data you're only going to be hitting twice, I've seen a temp table solution make a huge difference if the original query for the source data was, for whatever reason, inefficient. Sometimes this is due to a lack of suitable indexing on the source data, but, more often, it is a scenario with a multi-table join against a large data set. Sucking it into a temp table often allows you to explicitly filter down a large data set early in the overall process.
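+
+In practice, the pattern looks something like this sketch (the tables here are from the AdventureWorks2008 sample used elsewhere in this chapter; what you actually run against the working table will, of course, be driven by your own process):
+
+SELECT soh.SalesOrderID, soh.CustomerID, soh.OrderDate,
+       sod.ProductID, sod.LineTotal
+INTO #WorkingSet
+FROM Sales.SalesOrderHeader soh
+JOIN Sales.SalesOrderDetail sod
+    ON sod.SalesOrderID = soh.SalesOrderID
+WHERE soh.OrderDate >= '20080101';  -- filter the big join down once, early
+
+-- Index the working data to suit the queries that will reuse it
+CREATE INDEX IX_WorkingSet_ProductID ON #WorkingSet (ProductID);
+
+-- ...run the various updates and aggregations against #WorkingSet here...
+
+DROP TABLE #WorkingSet;
+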
Again, try to avoid the temptation of automatically taking this approach, but keep it in mind as an option.
+
+Update Your Code In a Timely Fashion
+
+Are you still supporting SQL Server 2000? How about 7.0? OK, so you most definitely shouldn't be supporting 7.0 by now, and even 2000 support should be gone (or at least in the late stages of sunsetting). So, if you're no longer supporting those older versions, why does your system code and design look like you still are?
+
+OK, OK, I understand it isn't as simple as all that, but with each release of your application, make sure that you have time set aside (I recommend 10%–25%) that is oriented around improving existing performance and features. If you only need to support SQL Server 2008, look for special code you may have written to address situations that are now handled natively by SQL Server 2008, such as:
+
+ * Procedures or code streams that handle INSERT, UPDATE, and DELETE scenarios into a specific table; these can use the new MERGE command to make all three modifications, as indicated, in a single pass over the data. It also has the advantage of being a single statement, which means you may be able to avoid explicitly defining transactions around the three separate statements.
+ * Special hierarchy handling: SQL Server now has native constructs for something that is actually very common. The functionality includes not only hierarchy-specific functions (such as pruning or grafting), but both vertical and horizontal index functionality (very cool stuff!).
+ * Date and Time data type handling.
+
+Sometimes, It's the Little Things
+
+A common mistake in all "programming for performance" efforts is to ignore the small things. Whenever you're trying to squeeze performance, the natural line of thinking is that you want to work on the long-running stuff.
+
+It's true that the long-running processes are the ones for which you stand the biggest chance of getting big one-time performance gains. It's too bad that this often leads people to forget that it's the total time saved that they're interested in—that is, how much time is saved across every execution once the process is really live.
+
+While it's definitely true that a single change in a query can often turn a several-minute query into seconds (I've actually seen a few that took literally days trimmed to just seconds by index and query tuning), the biggest gains for your application often lie in getting just a little bit more out of what already seems like a fast query. These are usually tied to often-repeated functions or items that are executed within a loop.
+
+Think about this for a bit. Say you have a query that currently takes three seconds to run, and this query is used every time an order taker looks up a part for possible sale—say 5,000 items looked up a day. Now imagine that you are able to squeeze one second off the query time. That's 5,000 seconds, or over an hour and 20 minutes!
+
+Hardware Considerations
+
+Forgive me if I get too bland here—I'll try to keep it interesting, but if you're like the average developer, you'll probably already know enough about this to make it very boring, yet not enough about it to save yourself a degree of grief.
+
+Hardware prices have been falling like a rock over the years—unfortunately, so has what your manager or customer is probably budgeting for your hardware purchases. When deciding on a budget for your hardware, remember:
+
+ * Once you've deployed, the hardware is what's keeping your data safe—just how much is that data worth?
+ * Once you've deployed, you're likely to have many users—if you're creating a public website, it's possible that you'll have tens of thousands of users active on your system 24 hours per day. What is it going to cost you in terms of productivity loss, lost sales, loss of face, and just general credibility loss if that server is unavailable or—worse—you lose some of your data? + * Maintaining your system will quickly cost more than the system itself. Dollars spent early on a mainstream system that is going to have fewer quirks may save you a ton of money in the long run. + +There's a lot to think about when deciding from whom to purchase and what specific equipment to buy. Forgetting the budget for a moment, some of the questions to ask yourself include: + + * Will the box be used exclusively as a database server? + * Will the activity on the system be processor or I/O intensive? (For databases, it's almost always the latter, but there are exceptions.) + * Am I going to be running more than one production database? If so, is the other database of a different type (OLTP versus OLAP)? + * Will the server be on-site at my location, or do I have to travel to do maintenance on it? + * What are my risks if the system goes down? + * What are my risks if I lose data? + * Is performance "everything"? + * What kind of long-term driver support can I expect as my O/S and supporting systems are upgraded? + +Again, we're just scratching the surface of things—but we've got a good start. Let's look at what these issues mean to us. + +Exclusive Use of the Server + +I suppose it doesn't take a rocket scientist to figure out that, in most cases, having your SQL Server hardware dedicated to just SQL Server and having other applications reside on totally separate system(s) is the best way to go. Note, however, that this isn't always the case. + +If you're running a relatively small and simple application that works with other sub-systems (say IIS as a Web server, for example), then you may actually be better off, performance-wise, to stay with one box. Why? Well, if there are large amounts of data going back and forth between the two sub-systems (your database in SQL Server and your Web pages or whatever in a separate process), then memory space to memory space communications are going to be much faster than the bottleneck that the network can create—even in a relatively dedicated network backbone environment. + +Remember that this is the exception, though, not the rule. The instance where this works best usually meets the following criteria: + + * The systems have a very high level of interaction. + * The systems have little to do beyond their interactions (the activity that's causing all the interaction is the main thing that the systems do). + * Only one of the two processes is CPU intensive and only one is I/O intensive. + +If in doubt, go with conventional thinking on this and separate the processing into two or more systems. + +I/O vs. CPU Intensive + +I can just hear a bunch of you out there yelling "Both!" If that's the case, then I hope you have a very large budget—but we'll talk about that scenario, too. Assuming you haven't installed yet, it's guesswork. 
While almost anything you do in SQL Server is data-based and will, therefore, certainly require a degree of I/O, how much of a burden your CPU is under varies widely depending on the types of queries you're running:
+
+Low CPU Load | High CPU Load
+---|---
+Simple, single-table queries and updates | Large joins
+Joined queries over relatively small tables | Aggregations (SUM, AVG, etc.)
+ | Sorting of large result sets
+
+With this in mind, let's focus a little closer on each situation.
+
+I/O Intensive
+
+I/O-intensive tasks should cause you to focus your budget more on the drive array than on the CPU(s). Notice that I said the drive "array"—I'm not laying that out as an option. In my not-so-humble opinion on this matter, if you don't have some sort of redundancy arrangement on your database storage mechanism, then you have certainly lost your mind. Any data worth saving at all is worth protecting—we'll talk about the options there in just a moment.
+
+Before we get into talking about the options on I/O, let's look briefly into what I mean by I/O intensive. In short, I mean that a lot of data retrieval is going on, but the processes being run on the system are almost exclusively queries (not complex business processes), and those do not include updates that require wild calculations. Remember—your hard drives are, more than likely, the slowest thing in your system (short of a CD-ROM) in terms of moving data around.
+
+A Brief Look at RAID
+
+RAID—the name brings up images of barbarian tribes raining terror down on the masses. Actually, most of the RAID levels are there for creating something of a fail-safe mechanism against the attack of the barbarian called "lost data." If you're not a RAID aficionado, then it might surprise you to learn that not all RAID levels provide protection against lost data.
+
+RAID originally stood for Redundant Array of Inexpensive Disks. The notion was fairly simple—at the time, using a lot of little disks was cheaper than using one great big one. In addition, an array of disks meant that you had multiple drive heads at work and could also build in (if desired) redundancy.
+
+Since drive prices have come down so much (I'd be guessing, but I'd bet that drive prices are, dollar per meg, far less than 1 percent of what they were when the term RAID was coined), I've heard other renditions of what RAID stands for. The most common are Random Array of Independent Disks (this one seems like a contradiction in terms to me) and Random Array of Individual Disks (this one's not that bad). The thing to remember, no matter what you think it's an acronym for, is that you have two or more drives working together—usually for the goal of some balance between performance and safety.
+
+There are lots of places you can get information on RAID, but let's take a look at the handful of levels that are most commonly considered:
+
+RAID Level | Description
+---|---
+RAID 0 | a.k.a. Disk Striping without Parity. Out of the levels you are examining here, this is the one you are least likely to know. This requires at least three drives to work just as RAID 5 does. Unlike RAID 5, however, you get no safety net from lost data. (Parity is a special checksum value that allows reconstruction of lost data in some circumstances—as indicated by the name, RAID 0 doesn't have parity.) RAID 0's big claim to fame is giving you maximum performance without losing any drive space.
With RAID 0, the data you store is spread across all the drives in the array (at least 3). While this may seem odd, it has the advantage of meaning that you always have three or more disk drives reading or writing your data for you at once. Under mirroring, the data is all on one drive (with a copy stored on a separate drive). This means you'll just have to wait for that one head to do the work for you.
+RAID 1 | a.k.a. Mirroring. For each active drive in the system, there is a second drive that "mirrors" (keeps an exact copy of) the information. The two drives are usually identical in size and type, and store all the information to each drive at the same time. (Windows NT has software-based RAID that can mirror any two volumes as long as they are the same size.) Mirroring provides no performance increase when writing data (you still have to write to both drives) but can, depending on your controller arrangement, double your read performance since it will use both drives for the read. What's nice about mirroring is that as long as only one of the two mirrored drives fails, the other will go on running with no loss of data or performance (well, reads may be slower if you have a controller that does parallel reads). The biggest knock on mirroring is that you have to buy two drives for every one in order to have the disk space you need.
+RAID 5 | The most commonly used. Although, technically speaking, mirroring is a RAID (RAID 1), when people refer to using RAID, they usually mean RAID 5. RAID 5 works exactly as RAID 0 does with one very significant exception—parity information is kept for all the data in the array. Say, for example, that you have a five-drive array. For any given write, data is stored across all five of the drives, but a percentage of each drive (the sum of which adds up to the space of one drive) is set aside to store parity information. Contrary to popular belief, no one drive is the parity drive. Instead, some of the parity information is written to all the drives—it's just that the parity information for a given byte of data is not stored on the same drive as the actual data is. If any one drive is lost, then the parity information from the other drives can be used to reconstruct the data that was lost. The great thing about RAID 5 is that you get the multi-drive read performance. The downside is that you lose one drive's worth of space (if you have a three-drive array, you'll see the space of two; if it's a seven-drive array, you'll see the space of six). It's not as bad as mirroring in the price-per-megabyte category, and you still see great performance.
+RAID 6 | RAID 6 can be considered something of an extension of RAID 5 and is generally only used in very large arrays (where the overhead of the algorithm required to provide the extra redundancy can be spread out and therefore wastes less space on a per-disk basis). RAID 6 provides extra parity encoding versus RAID 5, and the extra information can be utilized to recover from multiple drive loss. RAID 5 is generally less expensive at lower array sizes, but RAID 6 maintains a level of redundancy even while rebuilding a single failed drive.
+RAID 10 (a.k.a. RAID 1 + 0) or RAID 0 + 1 | RAID 10 offers the best of both RAID 0 and RAID 1 in terms of performance and data protection. It is, however, far and away the most expensive of the options discussed here. RAID 10 is implemented as a coupling of both RAID 1 (mirroring) and RAID 0 (striping without parity). The end result is striped sets of mirrored data.
You will also hear of RAID 0 + 1. These are mirrored sets of striped data. The end result in total drive count is the same, but RAID 10 performs better in recovery scenarios and is therefore what is typically implemented.
+RAID 50 | This is implemented by striping data across two (or more) RAID 5 arrays. While it is arguably among the most redundant, it is still at risk of failure if two drives happen to fail in the same array. It is among the most expensive of the options provided here, and generally only implemented in the most extreme of environments.
+
+The long and the short of it is that RAID 5 is the de facto minimum for database installations. That being said, if you have a loose budget, then I'd actually suggest mixing things up a bit.
+
+RAID 10 has become the standard in larger installations. For the average shop, however, RAID 5 will likely continue to rule the day for a while yet—perhaps that will change as we get into the era where even server-level drives are measured in multi-tera-, peta-, and even exabytes. We certainly are getting there fast.
+
+What you'd like to have is at least a RAID 5 setup for your main databases but a completely separate mirrored set for your logs. People who manage to do both usually put both Windows and the logs on the mirror set and the physical databases on the RAID 5 array, but those with a little more cash to spend often put the O/S on a separate mirror set from the logs (with the data files still on their own RAID 5 array). Since I'm sure inquiring minds want to know why you would want to do this, let's make a brief digression into how log data is read and written.
+
+Unlike database information, which can be read in parallel (thus why RAID 5 or 10 works so well performance-wise), the transaction log is chronology dependent—that is, it needs to be written and read serially to be certain of integrity. I'm not necessarily saying that physically ordering the data in a constant stream is required; rather, I'm saying that everything needs to be logically done in a stream. As such, it actually works quite well if you can get the logs into their own drive situation where the head of the drive will only seldom have to move away from the stream it is currently reading and writing. The upshot of this is that you really want your logs to be on a different physical device from your data, so the reading and writing of data won't upset the reading and writing of the log.
+
+Note that this sequential read/write performance of the mirror set disappears if you are keeping logs for multiple databases on the same mirror set (it has to jump around between the separate logs!).
+
+Logs, however, don't usually take up nearly as much space as the data itself does. With mirroring, we can just buy two drives and have our redundancy. With RAID 5, we would have to buy three, but we don't see any real benefit from the parallel-read nature of RAID 5. When you look at these facts together, it doesn't make much sense to go with RAID 5 for the logs or O/S.
+
+You can have all the RAID arrays in the world, and they still wouldn't surpass a good backup in terms of long-term safety of your data. Backups are easy to take off-site, and are not subject to mechanical failure. RAID units, while redundant and very reliable, can also become worthless if two (instead of just one) drives fail. Another issue—what if there's a fire? Probably all the drives will burn up—again, without a backup, you're in serious trouble. We'll look into how to back up your databases in Chapter 22.
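+
+For what it's worth, once the arrays exist, getting SQL Server to use them the way just described is simply a matter of file placement. Here's a minimal sketch (the Accounting database name, paths, drive letters, and sizes are all hypothetical—say E: is the RAID 5/10 data array and L: is the log mirror set):
+
+CREATE DATABASE Accounting
+ON PRIMARY
+    (NAME = 'AccountingData',
+     FILENAME = 'E:\Data\Accounting.mdf',  -- data files on the RAID 5/10 array
+     SIZE = 500MB,
+     FILEGROWTH = 100MB)
+LOG ON
+    (NAME = 'AccountingLog',
+     FILENAME = 'L:\Logs\Accounting.ldf',  -- log on its own mirror set
+     SIZE = 100MB,
+     FILEGROWTH = 50MB);
+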
+ +CPU Intensive + +On a SQL Server box, you'll almost always want to make sure that you go multiprocessor (yes, even in these days of multi-core processors), even for a relatively low-utilization machine. This goes a long way to preventing little "pauses" in the system that will drive your users positively nuts, so consider this part of things to be a given—particularly in this day of dual core processors. Keep in mind that the Workgroup version of SQL Server supports only up to two processors—if you need to go higher than that, you'll need to go up to either Standard (four processors) or the Enterprise edition (which is limited only by your hardware and budget). + +Even if you're only running SQL Server Express—which supports only one processor—you'll want to stick with the dual-proc box if at all possible. Remember, there is more going on in your system than SQL Server, so having that other proc available to perform external operations cuts down on lag on your SQL Server. + +Perhaps the biggest issue of all, though, is memory. This is definitely one area that you don't want to short change. In addition, remember that if you are in a multiprocessor environment (and you should be), then you are going to have more things going on at once in memory. In these days of cheap memory, no SQL Server worth installing should ever be configured with less than 512MB of RAM—even in a development environment. Production servers should be equipped with no less than 2GB of RAM—quite likely more. + +Things to think about when deciding how much RAM to use include: + + * How many user connections will there be at one time (each one takes up space)? Each connection takes up about 24K of memory (it used to be even higher). This isn't really a killer since 1,000 users would only take up 24MB, but it's still something to think about. + * Will you be doing a lot of aggregations and/or sorts? These can be killers depending on the size of the data set you're working with in your query. + * How large is your largest database? If you have only one database, and it is only 1GB (and, actually, most databases are much smaller than people think), then having 4GB of RAM probably doesn't make much sense depending on how many queries you're running simultaneously and exactly what actions they are taking. + * The Workgroup edition of SQL Server 2008 only supports addressing of memory up to 3GB. If you need more than this, you'll need to go with at least the Standard edition. + +In addition, once you're in operation—or when you get a fully populated test system up and running—you may want to take a look at your cache-hit ratio in perfmon. We'll talk about how this number is calculated a little bit in Chapter 21. For now, it's sufficient to say that this can serve as something of a measurement for how often we are succeeding at getting things out of memory rather than off disk (memory is going to run much, much faster than disk). A low cache-hit ratio is usually a certain indication that more memory is needed. Keep in mind though, that a high ratio does not necessarily mean that you shouldn't add more memory. The read-ahead feature of SQL Server may create what is an artificially high cache-hit ratio and may disguise the need for additional memory. + +OLTP vs. OLAP + +The needs between these two systems are often at odds with each other. We discuss some of the design differences in Chapter 24, so I hope you will come to have a concept of just how different the design considerations can be. 
+ +In any case, I'm going to keep my "from a hardware perspective" recommendation short here: + +If you are running databases to support both of these kinds of needs, run them on different servers—it's just that simple. + +I can't stress enough the need to separate these two. A large data warehouse import, export, or even a large report run can cause significant turnover in your OLTP procedure and/or data caches and simply decimate the performance of your system for what can be many users (and, therefore, a whole lot of cost). + +On-Site vs. Off-Site + +It used to be that anything that would be SQL Server–based would be running on-site with those who were responsible for its care and upkeep. If the system went down, people were right there to worry about reloads and to troubleshoot. + +In the Internet era, many installations are co-located with an Internet service provider (ISP). The ISP is responsible for making sure that the entire system is backed up—they will even restore according to your directions—but they do not take responsibility for your code. This can be very problematic when you run into a catastrophic bug in your system. While you can always connect remotely to work on it, you're going to run into several configuration and performance issues, including: + + * Security—Remote access being open to you means that you're also making it somewhat more open to others who you may not be interested in having access. My two bits' worth on this is to make sure that you have very tight routing and port restrictions in place. For those of you not all that network savvy (which includes me), this means that you restrict what IP addresses are allowed to be routed to the remote server, what ports they have available, and even what protocols (SSL vs. non-SSL) are allowed through. + * Performance—You're probably going to be used to the 100 Mbps to 1 Gbps network speeds that you have around the home office. Now you're communicating via virtual private network (VPN) over the Internet or, worse, dialup, and you are starting to hate life (things are SLOW!). + * Responsiveness—It's a bit upsetting when you're running some e-commerce site or whatever and you can't get someone at your ISP to answer the phone, or they say that they will get on it right away and hours later you're still down. Make sure you investigate your remote hosting company very closely—don't assume that they'll still think you're important after the sale. + * Hardware Maintenance—Many co-hosting facilities will not do hardware work for you. If you have a failure that requires more than a reloading, you may have to travel to the site yourself or call yet another party to do the maintenance—that means that your application will be offline for hours or possibly days. + +If you're a small shop doing this with an Internet site, then off-site can actually be something of a saving grace. It's expensive, but you'll usually get lots of bandwidth plus someone to make sure that the backups actually get done—just make sure that you really check out your ISP. Many of them don't know anything about SQL Server, so make sure that expertise is there. + +One recent trend in major ISPs has been to locate major hosting facilities in far more remote locations than you might, at first, expect. This is usually done for accessibility to water (for cooling), cheap power, or both (near hydroelectric facilities seems to be popular). 
In many ways, this shouldn't matter, but think about it if you're using a third-party hardware support company—does that support company have appropriate staff located near the facility where you will be hosted? + +If you were thinking of your hosting company as being located in a major metropolitan area, then you would reasonably assume that your hosting company had a large number of support staff within 30–60 minutes' response time of your ISP location. If, however, your ISP is, let's say, "outside Portland, Oregon," you may want to make sure that "outside" doesn't mean 60 or 80 miles away. If it is, check with your support company about just how many people they keep on staff truly close to your ISP location. + +The Risks of Being Down + +How long and how often can I afford to be down? This may seem like a silly question. When I ask it, I often get this incredulous look. For some installations, the answer is obvious—they can't afford to be down, period. This number is not, however, as high as it might seem. You see, the only true life-and-death kinds of applications are the ones that are in acute medical applications or are immediately tied to safety operations. Other installations may lose money—they may even cause bankruptcy if they go down—but that's not life and death either. + +That being said, it's really not as black and white as all that. There is really something of a continuum in how critical downtime is. It ranges from the aforementioned medical applications at the high end to data-mining operations on old legacy systems at the low end (usually—for some companies, it may be all they have). The thing that pretty much everyone can agree on for every system is that downtime is highly undesirable. + +So, the question becomes one of just how undesirable is it? How do we quantify that? + +If you have a bunch of bean counters (I can get away with saying that since I was one) working for you, it shouldn't take you all that long to figure out that there are a lot of measurable costs to downtime. For example, if you have a bunch of employees sitting around saying that they can't do anything until the system comes back up, then the number of affected employees times their hourly cost (remember, the cost of an employee is more than just his or her wages) equals the cost of the system being down from a productivity standpoint. But wait, there's more. If you're running something that has online sales—how many sales did you lose because you couldn't be properly responsive to your customers? Oops—more cost. If you're running a plant with your system, then how many goods couldn't be produced because the system was down—or, even if you could still build them, did you lose quality assurance or other information that might cost you down the line? + +I think by now you should be able to both see and sell to your boss the notion that downtime is very expensive—how expensive depends on your specific situation. Now the thing to do is to determine just how much you're willing to spend to make sure that it doesn't happen. + +Lost Data + +There's probably no measuring this one. In some cases, you can quantify this by the amount of cost you're going to incur reconstructing the data. Sometimes you simply can't reconstruct it, in which case you'll probably never know for sure just how much it cost you. + +Again, how much you want to prevent this should affect your budget for redundant systems as well as things like backup tape drives and off-site archival services. + +Is Performance Everything? 
+
+More often than not, the answer is no. It's important, but just how important has something of diminishing returns to it. For example, if buying that extra 10 percent of CPU power is going to save you two seconds per transaction—that may be a big deal if you have 50 data entry clerks trying to enter as much as they can each day. Over the course of a day, seemingly small amounts of time saved can add up. If each of those 50 clerks is performing 500 transactions a day, then saving two seconds per transaction adds up to over 13 man-hours (that's over one person working all day!). Saving that time may allow you to delay a little longer in adding staff. The savings in wages will probably easily pay for the extra computing power.
+
+The company next door may look at the situation a little differently, though—they may only have one or two employees; furthermore, the process that they are working in might be one where they spend a lengthy period of time just filling out the form—the actual transaction that stores it isn't that big of a deal. In such a case, their extra dollars for the additional speed may not be worth it.
+
+Driver Support
+
+Let's start off by cutting to the chase—I don't at all recommend that you save a few dollars (or even a lot of dollars) when buying your server by purchasing it from some company like "Bob's Pretty Fine Computers." Remember all those risks? Now, try introducing a strange mix of hardware and driver sets. Now imagine what happens when you have a problem—you're quickly going to find all those companies pointing the finger at each other saying, "It's their fault!" Do you really want to be stuck in the middle?
+
+What you want is the tried and true—the tested—the known. Servers—particularly data servers—are an area to stick with well-known, trusted names. I'm not advocating anyone in particular (no ads in this book!), but I'm talking very mainstream people like Dell, IBM, HP, and so on. Note that, when I say well-known, trusted names, I mean names that are known in servers. Just because someone sells a billion desktops a year doesn't mean they know anything about servers—it's almost like apples and oranges. They are terribly different.
+
+Staying with well-known equipment not only makes sure that you have proper support when something fails, it also means that you're more likely to have that equipment survive upgrades well into the future. Each new version of the O/S only explicitly supports just so many pieces of equipment—you want to be sure that yours is one of them.
+
+The Ideal System
+
+Let me preface this by saying that there is no one ideal system. That being said, there is a general configuration (size excluded) that I and a very large number of other so-called "experts" seem to almost universally push as where you'd like to be if you had the budget for it. What we're talking about is drive arrangements here (the CPU and memory tend to be relative chicken feed budget- and setup-wise).
+
+What you'd like to have is a mix of mirroring and RAID 5 or 10. You place the O/S and the logs on the mirrored drives (ideally on separate mirror sets). You place the data on the RAID 5/10 array. That way, the O/S and logs—which both tend to do a lot of serial operations—have a drive setup all of their own without being interfered with by the reads and writes of the actual data. The data has a multi-head read/write arrangement for maximum performance, while maintaining a level of redundancy.
+
+Summary
+
+Performance could be, and probably should be, a book all by itself (indeed, there is a Wrox title on the very subject). There's simply too much to cover and get acquainted with to do it all in one or even several chapters. The way I've tried to address this is by pointing out performance issues throughout the book, so you could take them on a piece at a time. This chapter is all about the first of two different slants I'm taking on it—design (addressing performance before it is a problem). In our next chapter, we'll look at how we can identify and address performance issues when our system is already live. It's important to note that the techniques discussed there are ones you may want to also utilize while you're still in test so you can tweak your design accordingly.
+21
+
+What Comes After: Forensic Performance Tuning
+
+Well, wouldn't it be nice if we could just develop the software, get paid for it, and forget it...? Yeah, well.... You can stop dreaming now—it just doesn't work that way.
+
+At some point, any software we consider to be part of a successful development project is going to get rolled out in front of some user base. Even if it's just a prototype, we're going to be analyzing how the prototype matched our original goals. Part of assessing whether we met our goals is taking a look at performance and asking ourselves what we could be doing better.
+
+In the previous chapter, I suggested that the most important thing to understand about performance tuning is that you are never going to know everything there is to know about it. If I were to come up with a competing idea for "most important thing to understand," it would be that you are never really done with performance tuning. The content of your system will change, the state of your server will change, the use of your system will change. In short, the overall system will change, and that will affect performance. The trick is to understand what's working poorly, what's working well, and what's working "well enough."
+
+Just as we did in the previous chapter, we're going to be roaming around quite a bit in terms of the topics covered. Everything we talk about is going to be performance related in some fashion, but this time we'll be more focused on figuring out what is hurting performance. If you did your job in design and development, you should already have a great design in place, but the reality of software is that the design requirements rarely exactly match the reality of a live system. So, this chapter will be all about figuring out what's already occurring in our system and deciding what we can do better. Topics we'll cover in this chapter include:
+
+ * Routine maintenance
+ * Hardware configuration issues
+ * The SQL Server Profiler
+ * Data Collector
+
+When to Tune (Mark Two)
+
+So, I had a section in the previous chapter named this very thing—When to Tune. If you paid attention at all, you know the process should have started well before the "in test or production" mode that we're in with this chapter. That said, the new answer for this chapter is simply "regularly." Don't wait until users are screaming at you about something—instead, plan on a regular optimization process.
+
+Much of the post-release maintenance is thought of as being in the realm of the DBA, and I'm not necessarily going to dispute that, save for a few problems with that philosophy:
+
+ * You are producing a product that is used by many (are you going to expect every customer's DBA to individually deal with the problem you handed them?).
+ * What if there isn't a DBA? (Depending on your install, there may not be a DBA on staff, so what are your system and/or your recommendations doing to prevent trouble for your end users?)
+ * What if you are the DBA?
+
+This is all oversimplified, but the real key here is that you should be thinking about performance even after the product has been released and gone live. Whether it's how to build it better for the next release or simply trying to keep your paying customers happy, you should always be looking for problems (best if you know about them before your customer does) or simple ways of making your system a bit better.
+
+Routine Maintenance
+
+I hate it when good systems go bad. It happens on a regular basis though. It usually happens when people buy or build systems, put them into operation, and then forget about them.
+
+Maintenance is as much about performance as it is about system integrity. Query plans get out of date, index pages get full (so you have a lot of page splits), fragmentation happens, and the best indexes need to change as usage and the amount of data in various tables change.
+
+Watch the newsgroups. Talk to a few people who have older systems running. Visit some of the many SQL Server support sites on the Web. You'll hear the same story over and over again. "My system used to run great, but it just keeps getting slower and slower—I haven't changed anything, so what happened?" Well, systems will naturally become slower as the amount of data they have to search through increases; however, the change doesn't have to be all that remarkable, and usually it shouldn't be. Instead, the cause is usually that the performance enhancements you put in place when you first installed the system don't really apply anymore; as the way your users use the system and the amount of data have changed, so has the mix of things that will give you the best performance.
+
+We'll be looking at maintenance quite a bit in the next chapter; however, we've discussed it here for two reasons. First, it will help if you are checking out this chapter because you have a specific performance problem; second, and perhaps more importantly, because there is a tendency to just think about maintenance as being something you do to prevent the system from going down and to ensure backups are available should the worst happen. This simply isn't the case. Maintenance is also key from a performance perspective.
+
+Troubleshooting
+
+SQL Server offers a number of options to help with the prevention, detection, and measurement of long-running queries. The options range from a passive approach of measuring actual performance, so you know what's doing what, to a more active approach of employing a query "governor" to automatically kill queries that run over a length of time you choose. These tools are very often ignored or used only sparingly—which is something of a tragedy, as they can save hours of troubleshooting by often leading you right to the problem query and even to the specific portion of your query that is creating the performance issues.
+
+Tools to take a look at include:
+
+ * The Data Collector
+ * SHOWPLAN TEXT|ALL and Graphical showplan
+ * STATISTICS IO
+ * Database Console Commands (DBCC)
+ * The sys.sysprocesses system view
+ * The Activity Monitor
+ * The SQL Server Profiler
+ * PerfMon
+
+Many people get caught up in using just one of these, but the reality is that there is little to no (depending on which two you're comparing) overlap between them.
This means that developers and DBAs who try to rely on just one of them are actually missing out on a lot of potentially important information. + +Also, keep in mind that many of these are still useful in some form even if you are writing in a client-side language and sending the queries to the server (no sprocs). You can either watch the query come through to your server using the SQL Server Profiler, or you could even test the query in QA before moving it back to your client code. + +The Data Collector + +The Data Collector is new with SQL Server 2008 and provides a framework that pulls together the collection of data about your system's data and activity and performs analysis, troubleshooting (yes, SQL Server can use data to actually troubleshoot some of its own problems!), as well as persistence of the results for further analysis and diagnostics. + +Things included in the Data Collector include: + + * The actual data collection engine + * Active performance monitoring, troubleshooting, and tuning + * Reporting + +This is a quantum leap in diagnostic possibilities over what we had in previous releases. Data collection can be aggregated on an enterprise-wide basis and reporting and analysis can span multiple servers. + +Setup and configuration of the Data Collector requires significant thought and analysis in its own right, and is largely deemed beyond the scope of this book (very much an administrator sort of thing), but some of the key elements include: + + * Setting up logins to have appropriate rights to collect data and monitor collected data + * Creation of collection sets (groups of objects that collect data using one or more collection providers) + * Scheduling of data collection + +This is obviously far from comprehensive, but it gives a taste of the idea that setting up the data collection is non-trivial. Still, it can provide a wealth of information and is very worthwhile for test systems when doing scalability analysis and for larger production environments. + +The Data Collector and its associated framework of tools are domain aware, and can collect and warehouse data from multiple servers for comparison and overall enterprise analysis. Setup of enterprise-wide data collection is in the realm of the DBA and is considered outside the scope of this book (but it's a great thing to be aware is available!). + +The Various Showplans and STATISTICS + +SQL Server gives you a few different options for showing the specific plan being used by any given query. The information that they provide varies a bit depending on what option you choose, but this is one area where there is a fair amount of overlap between your options; however, each one definitely has its own unique thing that it brings to the picture. In addition, there are a number of options available to show query statistics. + +Let's take a look at the options and what they do. + +SHOWPLAN TEXT|ALL + +When either of these two SHOWPLAN options (they are mutually exclusive) is executed, SQL Server changes what results you get for your query. Indeed, the NOEXEC option (which says, "Figure out the query plan but don't actually perform the query") is put in place, and you receive no results other than those put out by the SHOWPLAN. + +The syntax for turning the SHOWPLAN on and off is pretty straightforward: + +SET SHOWPLAN TEXT|ALL ON|OFF + +When you use the TEXT option, you get back the query plan along with the estimated costs of running that plan. 
Since the NOEXEC option automatically goes with SHOWPLAN, you won't see any query results.
+
+When you use the ALL option, you receive everything you received with the TEXT option, plus a slew of additional statistical information, including such things as:
+
+ * The actual physical and logical operations planned
+ * Estimated row counts
+ * Estimated CPU usage
+ * Estimated I/O
+ * Average row size
+ * Whether or not the query will be run in parallel
+
+Let's run a very brief query utilizing (one at a time) both of these options:
+
+USE AdventureWorks2008;
+GO
+SET SHOWPLAN_TEXT ON;
+GO
+SELECT *
+FROM Sales.SalesOrderHeader;
+GO
+SET SHOWPLAN_TEXT OFF;
+GO
+SET SHOWPLAN_ALL ON;
+GO
+SELECT *
+FROM Sales.SalesOrderHeader;
+GO
+SET SHOWPLAN_ALL OFF;
+GO
+
+Notice that every statement is followed by a GO—thus making it part of its own batch. The batches that contain the actual query could have had an unlimited number of statements, but the batches setting the SHOWPLAN option have to be in a batch by themselves.
+
+The SHOWPLAN_TEXT portion of the results should look something like this:
+
+StmtText
+-------------------------------------------
+SELECT *
+FROM Sales.SalesOrderHeader
+
+(1 row(s) affected)
+
+StmtText
+------------------------------------------------------------------------------
+  |--Compute Scalar(DEFINE:([AdventureWorks2008]....
+       |--Compute Scalar(DEFINE:([AdventureWorks2008]...
+            |--Clustered Index Scan(OBJECT:([AdventureWorks2008]...
+
+(3 row(s) affected)
+
+Unfortunately, the results are far too wide to fit all of it gracefully in the pages of this book, but there are a couple of key things I want you to notice about what was produced:
+
+ * There are multiple steps displayed.
+ * At each step, you can see what object is being addressed and what kind of operation is being performed.
+
+If we had been running a larger query—say something with several joins—then even more sub-processes would have been listed, with indentations to indicate hierarchy.
+
+I'm not going to include the ALL results here since they simply will not fit in a book format (it's about 800 characters wide and won't fit in any readable form in a book—even if we flipped things sideways), but it includes a host of other information. Which one of these to use is essentially dependent on just how much information you want to be flooded with. If you just want to know the basic plan—such as whether it is using a merge or hash join—you probably just want to use the TEXT option. If you really want to know where the costs are and such, then you want the ALL option.
+
+Since the SHOWPLAN options imply NOEXEC, that means nothing in your query is actually being executed. Before you do anything else, you need to set the option back to off; that even includes switching from one showplan option to the other (for example, SET SHOWPLAN_ALL ON wouldn't have any effect if you had already run SET SHOWPLAN_TEXT ON and hadn't yet turned it off).
+
+I like to make sure that every script I run that has a SET SHOWPLAN statement in it has both the on and off within that same script. It goes a long way toward keeping me from forgetting that I have it turned on and being confused when things aren't working the way I expect.
+
+Graphical Showplan
+
+The graphical showplan tool combines bits and pieces of SHOWPLAN_ALL and wraps them up into a single graphical format. Graphical showplan is a Management Studio–only tool.
Graphical Showplan

The graphical showplan tool combines bits and pieces of the SHOWPLAN_ALL output and wraps them up into a single graphical format. It is selected through options in Management Studio rather than through T-SQL syntax, which means it is available only when you're using Management Studio.

The graphical showplan comes in two versions: estimated and actual. The estimated version is the one more like the SHOWPLAN in T-SQL; the query plan is developed, but the query is not actually executed. The actual version, by contrast, essentially waits until the query is done and shows you the way the query was actually performed in the end.

Why are these different? Well, SQL Server is smart enough to recognize when it starts down a given query plan based on an estimated cost and then finds the reality to be something other than what its estimates were based on. SQL Server uses statistics it keeps on tables and indexes to estimate cost. Those statistics can sometimes become skewed or downright out of date. The Query Optimizer will adjust on the fly if it starts down one path and finds something other than what it expected.

For most things we do, the estimated execution plan is just fine. We have three options for activating the graphical showplan (each of these just shows us the plan with the NOEXEC option active):

 * Select the Display Estimated Execution Plan option from the Query menu
 * Press Ctrl+L on your keyboard
 * Click the Display Estimated Execution Plan button on the toolbar

Personally, I like the option of having the graphical showplan (the Include Actual Execution Plan option) in addition to my normal query run. While it means that I have to put the actual hit of the query on my system, it also means that the numbers I get are no longer just estimates but are based on the actual cost numbers. Indeed, if you run the showplan both ways and wind up with wildly different results, then you may want to take a look at the last time your statistics were updated on the tables on which the query is based. If necessary, you can then update them manually and try the process again.

The hierarchy of the different subprocesses is then shown graphically. In order to see the costs and other specifics about any subprocess, just hover your mouse over that part of the graphical showplan and a tooltip will come up with the information.

This arrangement, as shown in Figure 21.1, can often make it much easier to sort out the different pieces of the plan. The downside is that you can't print it out for reporting the way that you can with the text versions.

Figure 21.1

STATISTICS

In addition to using the graphical showplan with actual execution of the query, you have a couple of other options for retrieving the "real" information on the statistics of your query: using SQL Server Profiler (discussed later in this chapter) and turning on STATISTICS PROFILE.

STATISTICS actually has a couple of options that can be very handy in troubleshooting query performance, including those discussed in the following sections.

SET STATISTICS IO ON|OFF

This one is a very commonly used tool to figure out where and how the query is performing. STATISTICS IO provides several key pieces of information regarding the actual work necessary to perform your query. Information provided includes:

 * Physical Reads: This represents the actual physical pages read from disk. It is never any more than, and is usually smaller than, the number for logical reads. This one can be very misleading in the sense that it will usually change (be less than the first run) the second time that you run your query.
Any page that is already in the buffer cache will not have a physical read done on it, so, the second time you run the query in reasonably short succession, the pages involved will, more than likely, still be in cache. In addition, this number will not be incremented if the page has already been read via the read-ahead mechanism that is part of SQL Server. This means that your query may be responsible for loading the page physically into cache, but it still may not show up as part of the physical reads.
 * Logical Reads: This is the number of times that the page was actually looked at—regardless of where it came from. That is, any page already in the memory cache still creates a logical read if the query makes use of it. Note that I said it is how many times the page was looked at. That means that you may have several logical reads for a single page if the page is needed several times (say, for a nested loop that affects a page that has several rows on it).
 * Read-Ahead Reads: This is the number of pages that SQL Server reads into the cache as a result of the read-ahead mechanism anticipating that the pages will be needed. The page may actually be used—or it may not. In either case, the read still counts as a read-ahead. Read-aheads are very similar to physical reads in the sense that they represent data being physically read from disk. The problem is that the number you get is based on the optimistic nature of the read-ahead mechanism and does not necessarily mean that all that work was actually put to use.
 * Scan Count: The scan count represents the number of times that a table was accessed. This is somewhat different from logical reads, which focus on page access. A nested loop is again a good example here. The outer table that forms the basis for the condition on the inner query may have a scan count of only 1, whereas the inner-loop table would have a scan count added for every time through the loop—that is, for every record in the outer table.

Some of the same information that forms the basis for STATISTICS IO feeds your cache-hit ratio if you look in PerfMon. The cache-hit ratio is the number of logical reads minus the physical reads, divided by the total number of logical reads.

The thing to look for with STATISTICS IO is any one table that seems disproportionately high in either physical or logical reads.

A very high physical read count could indicate that the data from the table is being pushed out of the buffer cache by other processes. If this is a table that you are going to be accessing with some regularity, then you may want to look at purchasing (or, if you're an ISV developing a SQL Server product, recommending) more memory for your system.

If the logical reads are very high, then the issue may be more one of proper indexing. I'll give an example here from a client I had some time back. A query was taking approximately 15 seconds to run on an otherwise unloaded system. Since the system was to be a true OLTP system, this was an unacceptable time for the user to have to wait for information. (The query was actually a fairly simple lookup that happened to require a four-table join.) In order to find the problem, I used what amounted to STATISTICS IO. It happened to be the old graphical version that came with 6.5, but the data was much the same. After running the query just once, I could see that the process was requiring fewer than 20 logical reads from three of the tables, but it was performing over 45,000 logical reads from the fourth table. This is what I liked about the old graphical version; it took about half a second to see that the bar on one table stretched all the way across the screen when the others were just a few pixels! From there, I knew right where to focus—in about two minutes, I had an index built to support a foreign key (remember, they aren't built by default), and the response time dropped to less than a second. The entire troubleshooting process on this one took literally minutes. Not every performance troubleshooting effort is that easy (indeed, most aren't), but using the right tools can often help a lot.
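Trying this yourself is just a matter of bracketing the query with the option. A quick sketch against AdventureWorks2008 (the read counts in the comments are purely illustrative—yours will differ):

SET STATISTICS IO ON;

SELECT *
FROM Sales.SalesOrderHeader;

SET STATISTICS IO OFF;

-- The Messages tab then shows something along these lines:
--   Table 'SalesOrderHeader'. Scan count 1, logical reads 689,
--   physical reads 3, read-ahead reads 685, ...
-- Run it a second time and the physical and read-ahead reads will
-- typically drop to 0 (the pages are now in the buffer cache), while
-- the logical reads stay the same.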
SET STATISTICS TIME ON|OFF

This one is amazingly little known. It shows the actual CPU time required to execute the query. Personally, I often use a simple SELECT GETDATE() before and after the query I'm testing—as we've done throughout most of the book—but this one can be handy because it separates out the time to parse and plan the query from the time required to actually execute it. It's also nice not to have to figure things out for yourself. (It calculates the time in milliseconds; using GETDATE(), you have to do that yourself.)

Include Client Statistics

You also have the ability to show statistical information about your connection as part of your query run. To make use of this, just select Include Client Statistics from the Query menu. As long as that option is set, every execution you make will produce a Client Statistics tab in the results pane of the Query window, as shown in Figure 21.2.

Figure 21.2

Database Console Commands (DBCC)

The Database Console Commands (DBCC) offer a number of different options to allow you to check the integrity and structural makeup of your database. This is far more the realm of the DBA than the developer, so I am, for the most part, considering DBCC to be out of scope for this book.

You may also hear DBCC referred to as the Database Consistency Checker. This is what DBCC used to stand for. To be honest, I have no idea when what DBCC stood for changed, but, if you hear the other term, now you know why.

Dynamic Management Views

Over the last edition or two of SQL Server, Microsoft has been adding an increasing number of what are called dynamic management views—or DMVs. Description and usage information for these is provided in Appendix B. They can provide a wide range of information on the current state of your server and/or database in a very code-readable fashion (they can be wonderful for automating administrative tasks). To get a quick example of how powerful these can be, however, let's take a look at one DMV that might be of interest.

I can't stress enough that what I'm showing you in this section is really just a very small taste of what is possible with the various metadata and dynamic management views now available in SQL Server. You can get a solid start on learning them by checking out Appendix B in this book, but if you're looking to build a robust support tool, you may want to look for a book that is highly focused on this growing toolset in SQL Server.

We will start by reviewing one that we first visited back in Chapter 13. We'll make a variation on a query we used in a cursor example:

SELECT SCHEMA_NAME(CAST(OBJECTPROPERTYEX(i.object_id, 'SchemaId') AS int))
       + '.'
       + OBJECT_NAME(i.object_id)
       + '.'
       + i.name AS Name,
       ps.avg_fragmentation_in_percent
FROM sys.dm_db_index_physical_stats (DB_ID(), NULL, NULL, NULL, NULL) AS ps
JOIN sys.indexes AS i
    ON ps.object_id = i.object_id
    AND ps.index_id = i.index_id
WHERE SCHEMA_NAME(CAST(OBJECTPROPERTYEX(i.object_id, 'SchemaId') AS int)) =
    'Purchasing'
AND avg_fragmentation_in_percent > 30;

This gives us all the indexes—regardless of what specific table they belong to—that are associated with a table in the Purchasing schema but, more importantly, have index fragmentation in excess of 30%.

What's powerful here is that we can easily script maintenance tasks based on the condition of our table. This is a major advance versus the older Database Console Commands options we previously used to view fragmentation.

This is, as I suggested earlier, a relatively simple example. As is the case with many SQL Server topics, I'm sure there will be entire books written purely around the dynamic management views now available in SQL Server. Again, check out Appendix B for more information.
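To make that claim about scripting maintenance tasks a little more concrete, here's a rough sketch that builds on the same DMV to generate (and run) ALTER INDEX statements for fragmented indexes. The 10% and 30% thresholds are common rules of thumb rather than requirements, and on anything resembling a production system you'd want to review the generated commands before executing them:

-- Build ALTER INDEX commands based on measured fragmentation
DECLARE @sql nvarchar(max) = N'';

SELECT @sql = @sql
    + N'ALTER INDEX ' + QUOTENAME(i.name)
    + N' ON ' + QUOTENAME(SCHEMA_NAME(o.schema_id))
    + N'.' + QUOTENAME(o.name)
    + CASE WHEN ps.avg_fragmentation_in_percent > 30
           THEN N' REBUILD;'      -- heavily fragmented: rebuild
           ELSE N' REORGANIZE;'   -- moderately fragmented: lighter touch
      END + NCHAR(13)
FROM sys.dm_db_index_physical_stats (DB_ID(), NULL, NULL, NULL, NULL) AS ps
JOIN sys.indexes AS i
    ON ps.object_id = i.object_id
    AND ps.index_id = i.index_id
JOIN sys.objects AS o
    ON i.object_id = o.object_id
WHERE SCHEMA_NAME(o.schema_id) = 'Purchasing'
    AND ps.avg_fragmentation_in_percent > 10
    AND i.name IS NOT NULL;  -- skip heaps

PRINT @sql;               -- review what we're about to do...
EXEC sp_executesql @sql;  -- ...and then do it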
The Activity Monitor

The Activity Monitor has received a major facelift and some extra muscle with SQL Server 2008. All the old process information is there, but there is now a host of other information collected from a variety of other sources, such as PerfMon (a Windows tool for monitoring your system) and the Data Collector.

The Activity Monitor can be found by right-clicking the server node in Management Studio. Open it up and you get five major subject areas:

 * Overview
 * Processes
 * Resource Waits
 * Data File I/O
 * Recent Expensive Queries

Let's take a quick look at each of these.

Overview

This section is the one that will most remind you of PerfMon. It provides a relatively straightforward graph (as shown in Figure 21.3) of system activity as sampled on an adjustable interval (the default is every 10 seconds). Note that the values presented here are information on what SQL Server is utilizing—not your entire system.

Figure 21.3

Processes

This largely maps, as shown in Figure 21.4, to the Activity Monitor as you would have seen it in SQL Server 2005. It provides information about what processes are running, the command each is currently executing, and metrics on the resources used and blocking incurred by each process.

Figure 21.4

Resource Waits

Much like the Overview, this should remind you of PerfMon, providing metrics on wait times for a number of different counters (as shown in Figure 21.5).

Figure 21.5

Data File I/O

Still providing largely PerfMon-based numbers, this one provides information on the physical files being utilized by SQL Server. Prior to this information being gathered in one place (as shown in Figure 21.6), you would have had to set each file up individually in PerfMon. SQL Server now pulls that kind of metric together for you automatically.

Figure 21.6

Recent Expensive Queries

This section, as shown in Figure 21.7, provides information we didn't really have prior to SQL Server 2008. We could map out some of this by using the SQL Server Profiler (discussed shortly), but it was tedious at best and very likely to be flooded with information we didn't really want or need (thus masking the information we were really after).

Figure 21.7

It's definitely worth noting that the expensive query information is among the information that can be logged to the Performance Data Warehouse, which means that you can use the warehouse to gather metrics not only for the last few minutes, but for days or even weeks depending on the retention rules you've set up for your warehouse.
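You aren't limited to the UI for this kind of information, either. Much of what the Recent Expensive Queries pane shows is essentially derived from the plan cache, which you can query directly yourself. A rough sketch (the offset arithmetic follows the usual Books Online pattern for pulling a single statement out of a batch):

-- Top five cached statements by total CPU consumed
SELECT TOP 5
    qs.total_worker_time / 1000 AS total_cpu_ms,  -- worker time is in microseconds
    qs.execution_count,
    SUBSTRING(st.text, (qs.statement_start_offset / 2) + 1,
        ((CASE qs.statement_end_offset
              WHEN -1 THEN DATALENGTH(st.text)   -- -1 means "to end of batch"
              ELSE qs.statement_end_offset
          END - qs.statement_start_offset) / 2) + 1) AS statement_text
FROM sys.dm_exec_query_stats AS qs
CROSS APPLY sys.dm_exec_sql_text(qs.sql_handle) AS st
ORDER BY qs.total_worker_time DESC;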
The SQL Server Profiler

The true lifesaver among the tools provided with SQL Server, this one is about letting you "sniff out" what's really going on with the server.

Profiler can be started from the Start menu in Windows. You can also run it from the Tools menu in Management Studio. When you first start it up, you can either load an existing profile template or create a new one.

Let's take a look at some of the key points of the main Profiler by walking through a brief example.

Start by choosing New⇒Trace from the File menu. Log in to the server you've been working with, and you should be presented with the dialog box in Figure 21.8.

Figure 21.8

The trace name is probably obvious enough, but the template information might not be. A template is a set of pre-established events, data columns, and filters that you want to see in a trace, and the templates provided with SQL Server are named for the kind of situation in which you might want to use them. Any templates that are stored in the default profiler template directory (which is under the tools subdirectory of wherever you installed SQL Server) are included in the Use the Template drop-down box.

Pay particular attention to what template you choose. It determines exactly how much is available to you on the next tab. If the template you choose is too restrictive, you can select Show All Events and Show All Columns to expose all possible choices.

Next up, you can choose whether to capture the trace to a file on disk or a table in the database. If you save to a file, then that file will be available only on the system that you store it on (or to anyone who has access to a network share, if that's where you save it). If you save it to a table, then everyone who can connect to the server and has appropriate permissions will be able to examine the trace.

Last, but not least, on this dialog is the stop time feature. This allows you to leave a trace running (for example, for a workload file or some other long-running trace need) and have it shut down automatically at a later time.

Things get somewhat more interesting on the tab that comes next (Events Selection), as shown in Figure 21.9.

Figure 21.9

I've chosen the "blank" template here, and have scrolled down to the Performance area and expanded it. This tab is all about what events you are going to track, and, as you can see, there's quite a range. If, for example, you chose the Tuning trace template, then the initial setup is one that tracks what's needed for the Database Engine Tuning Advisor plus a bit more. In addition, you use the table to select what information you want collected for each class of event.

The temptation here is just to select everything under the sun, so you'll be sure to have all the information. There are a couple of reasons not to do this. First, it means that a lot of additional data has to come back down the pipe from your server. Remember that SQL Server Profiler has to place some audits in the system, and this means that your system has an additional burden placed on it whenever the Profiler is running. The bigger the trace, the bigger the burden. Second, it often means lower productivity for you, since you have to wade through a huge morass of data—much of which you probably won't need.
I want to point out a couple of key fields here before we move on:

 * TextData: This is the actual text of the statement that the Profiler happens to have added to the trace at that moment in time.
 * Application Name: Another of those highly underutilized features. The application name is something you can set when you create the connection from the client. If you're using ADO.NET or some other data object model and underlying connection method, you can pass the application name as a parameter in your connection string. It can be quite handy for your DBAs when they are trying to troubleshoot problems in the system.
 * NT User Name: This one is what it sounds like. What's great about this is that it can provide a level of accountability.
 * Login Name: Same as NT User Name, only used when operating under SQL Server Security rather than Windows Security.
 * CPU: The actual CPU cycles used.
 * Duration: How long the query ran. This includes time spent waiting for locks and such—time during which the CPU may not have been doing anything for the query, so the CPU column doesn't reflect that load.
 * SPID (SQL Process ID): This one can be nice if your trace reveals something where you want to kill a process. This is the number you would use with your KILL statement.

Moving right along, let's take a look at what I consider to be one of the most important options—Column Filters.

This is the one that makes sure that, on a production or load-test server, you don't get buried in several thousand pages of garbage just by opening a trace up for a few minutes.

With Column Filters, you can select from a number of different options to use to filter out data and limit the size of your result set. By default, Profiler automatically sets up to exclude its own activity in order to try to reduce the Profiler's impact on the end numbers. For the example in Figure 21.10, I'm adding in a Duration value where I've set the minimum to 3,000 milliseconds with no maximum.

Odds are that, if you run this with a query against the Sales.SalesOrderHeader table, you're not going to see it appear in the trace. Why is that? Because that query will probably run very fast and not meet the criteria for being included in our trace—this is an example of how you might set up a trace to capture the query text and username of someone who has been running very long-running queries on the system. Now try running something a little longer—such as a query that joins many large tables. There's a good chance that you'll now exceed the duration threshold, and your query will show up in the Profiler (if not, then try adjusting down the duration expectation that you set in Profiler).

Figure 21.10

I can't say enough about how important this tool is in solving performance and other problems. There have been too many times to count in which I've thought that my sproc was running down one logic path only to find that a totally different branch was being executed. How did I originally find out? I watched it execute in Profiler.
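Incidentally, once a client does pass an application name in its connection string, it's visible from T-SQL as well as in Profiler's Application Name column. A quick sketch (the "InventoryService" name is, of course, hypothetical):

-- What the current connection reported as its application name
SELECT APP_NAME();

-- Find all sessions for a given application
SELECT session_id, login_name, status
FROM sys.dm_exec_sessions
WHERE program_name = 'InventoryService';  -- hypothetical app name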
The Performance Monitor (PerfMon)

When you install SQL Server on Windows, SQL Server adds several counters to the Reliability and Performance Monitor (sometimes called PerfMon because of the executable's filename—perfmon.msc). This can be an excellent tool for finding where problems are happening and even determining the nature of some problems.

Prior to Windows Vista and Windows Server 2008, the Reliability and Performance Monitor was known simply as Performance Monitor.

While many of the relevant counters are now in the Activity Monitor within Management Studio, the Reliability and Performance Monitor can be accessed through the Administrative Tools menu in Windows. SQL Server has a number of different Performance Objects, and, within each of these, you will find a series of counters related to that object. Historically, some of the important ones have included:

 * SQLServer Buffer Manager: Buffer Cache Hit Ratio: This is the proportion of pages that were read from the buffer cache rather than via a physical read from disk. The thing to watch out for here is that this number can be thrown off depending on how effective the read-ahead mechanism was—anything that the read-ahead mechanism got to and put in cache before the query actually needed it is counted as a buffer-cache hit, even though there really was a physical read related to the query. Still, this one is going to give you a decent idea of how efficient your memory usage is. You want to see really high numbers here (in the 90+ percent range) for maximum performance. Generally speaking, a low buffer cache hit ratio is indicative of needing more memory.
 * SQLServer General Statistics: User Connections: Pretty much as it sounds, this is the number of user connections currently active in the system.
 * SQLServer Memory Manager: Total Server Memory: The total amount of dynamic memory that the SQL Server is currently using. As you might expect, when this number is high relative to the amount of memory available in your system (remember to leave some for the OS!), you need to seriously consider adding more RAM.
 * SQLServer SQL Statistics: SQL Compilations/sec: This tells you how often SQL Server needs to compile things (sprocs, triggers). Keep in mind that this number will also include recompiles (due to changes in index statistics or because a recompile was explicitly requested). When your server is first getting started, this number may spike for a bit, but it should become stable after your server has been running for a while at a constant set and rate of activities.
 * SQLServer Buffer Manager: Page Reads/sec: The number of physical reads from disk for your server. You'd like to see a relatively low number here. Unfortunately, because the requirements and activities of each system are different, I can't give you a benchmark to work from here.
 * SQLServer Buffer Manager: Page Writes/sec: The number of physical writes performed to disk for your server. Again, you'd like a low number here.

If you want to add or change any of these, just click the plus (+) sign up on the toolbar. You'll be presented with a dialog, as shown in Figure 21.11, that lets you choose between all the different objects and counters available on your system (not just those related to SQL Server).

Figure 21.11

The big thing here is to realize that you can mix and match a wide variety of counters to be able to reach a better understanding of what's going on with your server and make the appropriate adjustments. Much of the time, this kind of task is going to have more to do with the DBA than the developer, but many of these stats can be helpful to you when you are doing load testing for your application.
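If you'd rather grab a counter from code than from the PerfMon UI, SQL Server also exposes its own counters through the sys.dm_os_performance_counters DMV. A quick sketch for the buffer cache hit ratio (ratio counters have to be computed against their matching "base" counter, so the raw cntr_value alone isn't meaningful):

-- Buffer cache hit ratio = ratio counter / its "base" counter
SELECT CAST(v.cntr_value AS float) / NULLIF(b.cntr_value, 0) * 100
           AS buffer_cache_hit_ratio
FROM sys.dm_os_performance_counters AS v
JOIN sys.dm_os_performance_counters AS b
    ON v.object_name = b.object_name
WHERE v.counter_name = 'Buffer cache hit ratio'
  AND b.counter_name = 'Buffer cache hit ratio base';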
Summary

Performance could be, and should be, in a book by itself. There's simply too much to cover and get acquainted with to do it all in one or even several chapters. The way I've tried to address this is by pointing out performance issues throughout the book, so you could take them on a piece at a time.

The biggest thing is to have a plan—a performance plan. Make performance an issue from the first stages of your project. Set benchmarks early on, and continually measure your system against those benchmarks to know where you are improving and what problems you might need to address.

In this chapter, we've reviewed a number of the performance considerations touched on throughout the book, plus added several new tools and ideas to consider.

In the next chapter, we'll be taking a look at administration issues. As you've seen through some portions of this chapter, proper administration can also be a key ingredient to performance.

22

Administration

So, at this point we've covered all of the core database topics and then some. We still have a chapter or two to clean up the edges around our development effort, but we've mostly covered everything—heh, NOT!!! For the developer, we like to think our job is done, but for the application we're building, it's just beginning. And so, it's time to talk a bit about maintenance and administration of the databases you develop.

If there is anything I hope to instill in you in your database development efforts, it's to avoid the "hey, I just build 'em—now it's your problem" attitude that is all too common in the world of database-driven applications. Far too many developers are guilty of building code that is relatively bug-free and calling it good. Well, just because it runs doesn't mean your end user is going to be successful with your software over the long haul. It is, therefore, important for you to look at how your system is going to be used and what will be necessary to keep it functioning properly.

In this chapter, we're going to take a look at some of the tasks that are necessary to make sure that your end users can not only recover from problems and disasters but also perform some basic maintenance that will help things keep running smoothly.

Among the things we'll touch on are:

 * Scheduling jobs
 * Backing up and recovering
 * Basic defragmenting and index rebuilding
 * Setting alerts
 * Archiving
 * Using PowerShell
 * Considering Policy-Based Management

While these are far from the only administration tasks available, they do represent something of "the minimum" you should expect to address in the deployment plans for your app. We'll also take a further look at monitoring (several items in that area were discussed as part of the performance tuning coverage in the preceding chapter) through the use of the Policy-Based Management framework that was added with SQL Server 2008.

This is one of those chapters where I feel that overlap with some of the coverage in my Beginning title is an unfortunate necessity. The reality is that most developers I know—even relative experts in SQL Server—know precious little about job scheduling, index fragmentation, or even backup and recovery. Be careful, however, about assuming that you've seen everything this chapter has to offer just because you may have read the Beginning title. I've added more advanced coverage of several of these topics, and I also include code-driven handling of many administrative tasks.

Scheduling Jobs

Many of the tasks that we'll go over in the remainder of the chapter can be scheduled.
Scheduling jobs allows you to run tasks that place a load on the system at off-peak hours. It also ensures that you don't forget to take care of things. From index rebuilds to backups, you'll hear horror stories over and over about shops that "forgot" to do that, or thought they had set up a scheduled job but never checked on it.

If your background is in Windows Server, and you have scheduled other jobs using the Windows Scheduler service, you could utilize that scheduling engine to support SQL Server. Doing things all in the Windows Scheduler allows you to have everything in one place, but SQL Server's own scheduler has some more robust branching options.

There are basically two terms to think about: jobs and tasks.

 * Tasks: These are single processes that are to be executed, or batches of commands that are to be run. Tasks are not independent—they exist only as members of jobs.
 * Jobs: These are a grouping of one or more tasks that should be run together. You can, however, set up dependencies and branching depending on the success or failure of individual tasks (for example, task A runs if the previous task succeeds, but task B runs if the previous task fails).

Jobs can be scheduled based on:

 * A daily, weekly, or monthly basis
 * A specific time of the day
 * A specific frequency (say, every 10 minutes, or every hour)
 * When the CPU becomes idle for a period of time
 * When the SQL Server Agent starts
 * In response to an alert

Tasks are run by virtue of being part of a job and based on the branching rules you define for your job. Just because a job runs doesn't mean that all the tasks that are part of that job will run. Some may be executed and others not, depending on the success or failure of previous tasks in the job and what branching rules you have established. SQL Server not only allows one task to automatically fire when another finishes, but it also allows for doing something entirely different (such as running some sort of recovery task) if the current task fails.

In addition to branching, you can, depending on what happens, also tell SQL Server to:

 * Provide notification of the success or failure of a job to an operator. You're allowed to send a separate notification—a network message (which pops up on the user's screen as long as they're logged in), a page, and an e-mail—to one operator each.
 * Write the information to the event log.
 * Automatically delete the job (to prevent executing it later and generally "clean up").

Let's take a quick look at how to create operators in Management Studio, and then we'll move on to creating the other objects needed to get jobs scheduled.

Creating an Operator

If you're going to make use of the notification features of the SQL Server Agent, then you must have an operator set up to define the specifics for who is notified. This side of things—the creation of operators—isn't typically done through any kind of automated process or as part of the developed code. These are usually created manually by the DBA. We'll go ahead and take a rather brief look at creating operators here just to understand how it works in relation to the scheduling of tasks.

Creating an Operator Using Management Studio

To create an operator using Management Studio, you need to navigate to the SQL Server Agent node of the server for which you're creating the operator. Expand the SQL Server Agent node, right-click the Operators member, and choose New Operator.
Be aware that, depending on your particular installation, the SQL Server Agent service may not be set to start automatically. If you run into any issues, or if you notice that the SQL Server Agent icon in Management Studio has a little red square in it, then the service is probably set to manual or even disabled—you will probably want to change the service to start automatically. Regardless, make sure that it is running for the examples found in this chapter. You can do this by right-clicking the Agent node and selecting Start.

You should be presented with the dialog box shown in Figure 22.1 (mine is partially filled in).

Figure 22.1

You can then fill out a schedule for the times this operator is to receive e-mail notifications for certain kinds of errors that we'll see on the Notifications tab.

Speaking of that Notifications tab, go ahead and click over to it. It should appear as in Figure 22.2.

Figure 22.2

Until you have more alerts in your system (we'll get to those later in this chapter), this page may not make a lot of sense. What it is about is setting up which notifications you want this operator to receive depending on what defined alerts get triggered. Again, it's hard to understand this concept before we've gotten to alerts, but suffice to say that alerts are triggered when certain things happen in your database, and this page defines which alerts this particular operator receives.

Creating an Operator Using T-SQL

If you do decide to create operators programmatically, you can make use of the sp_add_operator sproc found in msdb.

Note that sp_add_operator and most other SQL Server Agent–related stored procedures are managed through the msdb database rather than being true system stored procedures. As such, you need to either have msdb current when you call them or use three-part naming.

After seeing all the different things you need to choose in Management Studio, it probably won't surprise you to find out that this sproc has a ton of different parameters. Fortunately, a number of them are optional, so you need to supply them only if you're going to make use of them. The syntax looks like this:

sp_add_operator [@name =] '<operator name>'
    [, [@enabled =] <0 for no, 1 for yes>]
    [, [@email_address =] '<email address>']
    [, [@pager_address =] '<pager address>']
    [, [@weekday_pager_start_time =] <HHMMSS time>]
    [, [@weekday_pager_end_time =] <HHMMSS time>]
    [, [@saturday_pager_start_time =] <HHMMSS time>]
    [, [@saturday_pager_end_time =] <HHMMSS time>]
    [, [@sunday_pager_start_time =] <HHMMSS time>]
    [, [@sunday_pager_end_time =] <HHMMSS time>]
    [, [@pager_days =] <days bit mask>]
    [, [@netsend_address =] '<netsend address>']
    [, [@category_name =] '<category name>']

Most of the parameters in this sproc are self-explanatory, but there are a few we need to take a closer look at:

 * @enabled: This is a Boolean value and works just the way you would typically use a bit flag—0 means disable this operator and 1 means enable the operator.
 * @email_address: This one is just a little tricky. In order to use e-mail with your SQL Server, you need to configure Database Mail to be operational using a specific mail server. This parameter assumes that whatever value you supply is an alias on that mail server. If you are providing the more classic e-mail address type (somebody@SomeDomain.com), then you need to enclose it in square brackets—like [somebody@SomeDomain.com]. Note that the entire address—including the brackets—must still be enclosed in quotation marks.
 * @pager_days: This is a number that indicates the days that the operator is available for pages.
This is probably the toughest of all the parameters. It uses a single-byte bit-flag approach similar to what we saw with the @@OPTIONS global variable (described in the system functions appendix at the back of the book). You simply add together the values for all the days that you want to set as active days for this operator. The options are:

Day of Week | Value
---|---
Sunday | 1
Monday | 2
Tuesday | 4
Wednesday | 8
Thursday | 16
Friday | 32
Saturday | 64

Okay, so let's go ahead and create our operator using sp_add_operator. We'll keep our use of parameters down, since many of them are redundant:

USE msdb;

DECLARE @PageDays int;
SELECT @PageDays = 2 + 8 + 32; -- Monday, Wednesday, and Friday

EXEC sp_add_operator @name = 'TSQLOperator',
    @enabled = 1,
    @pager_address = 'YourEmail@YourDomain.com',
    @weekday_pager_start_time = 080000,
    @weekday_pager_end_time = 170000,
    @pager_days = @PageDays;

If you go back into Management Studio and refresh your Operators list, you should see your new operator there.

There are three other sprocs (plus one to retrieve information) that you need to make use of in order to have power over your operator from T-SQL (a quick sketch of them in action follows this list):

 * sp_help_operator: Provides information on the current settings for the operator.
 * sp_update_operator: Accepts all the same information as sp_add_operator; the new information completely replaces the old information.
 * sp_delete_operator: Removes the specified operator from the system.
 * sp_add_notification: Accepts an alert name, an operator name, and a method of notification (e-mail, pager, netsend). Adds a notification such that, if the alert is triggered, then the specified operator will be notified via the specified method.
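Here's a minimal sketch against the operator we just created. Given the complete-replacement behavior noted for sp_update_operator, we re-supply the settings we still want to keep:

USE msdb;

-- Check what we just created
EXEC sp_help_operator @operator_name = 'TSQLOperator';

-- Shift the weekday pager window to start an hour earlier
EXEC sp_update_operator @name = 'TSQLOperator',
    @enabled = 1,
    @pager_address = 'YourEmail@YourDomain.com',
    @weekday_pager_start_time = 070000,
    @weekday_pager_end_time = 170000;

-- And, when we're done with it:
-- EXEC sp_delete_operator @name = 'TSQLOperator';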
Now that you've seen how to create operators, let's take a look at creating actual jobs and tasks.

Creating Jobs and Tasks

As I mentioned earlier, jobs are a collection of one or more tasks. A task is a logical unit of work, such as backing up one database or running a T-SQL script to meet a specific need, such as rebuilding all your indexes.

Even though a job can contain several tasks, this is no guarantee that every task in a job will run. They will either run or not run depending on the success or failure of other tasks in the job and what you've defined as the response for each case of success or failure. For example, you might cancel the remainder of the job if one of the tasks fails.

Like operators, jobs can be created in Management Studio as well as programmatically.

Creating Jobs and Tasks Using Management Studio

The SQL Server Management Studio makes it very easy to create scheduled jobs. Just navigate to the SQL Server Agent node of your server. Then right-click the Jobs member and select New Job. You should get a multinode dialog box, shown in Figure 22.3, that will help you build the job one step at a time.

Figure 22.3

The name can be whatever you like as long as it adheres to the SQL Server rules for naming, as discussed early in this book.

Most of the rest of the information is, again, self-explanatory with the exception of Category—which is just one way of grouping jobs together. Many of the jobs that are specific to your application are going to be Uncategorized, although you will probably on occasion run into instances where you want to create Web Assistant, Database Maintenance, Full Text, or Replication jobs. Those each go into their own category for easy identification.

We can then move on to Steps, as shown in Figure 22.4. This is the place where we tell SQL Server to start creating our new tasks that will be part of this job.

Figure 22.4

To add a new step to our job, we just click the New button and fill in the new dialog box, shown in Figure 22.5. We'll use a T-SQL statement to raise a bogus error just so we can see that things are really happening when we schedule this job. Note, however, that there is an Open button to the left of the command box—you can use this to import SQL scripts that you have saved in files.

Figure 22.5

Let's go ahead and move on to the Advanced tab for this dialog, shown in Figure 22.6—it's here that we really start to see some of the cool functionality that our job scheduler offers.

Notice several things in this dialog:

 * You can automatically set the job to retry at a specific interval if the task fails.
 * You can choose what to do if the job succeeds or fails. For each result (success or failure), you can:
   * Quit the job reporting success
   * Quit the job reporting failure
   * Go to the next step
 * You can output results to a file. (This is very nice for auditing.)
 * You can impersonate another user (for rights purposes). Note that you have to have the rights for that user. Because we're logged in as a sysadmin, we can run the job as the dbo or just about anyone. The average user would probably only have, at most, the guest account available (unless they were the database owner)—but, hey, in most cases a general user shouldn't be scheduling his or her own jobs this way anyway (let your client application provide that functionality).

Figure 22.6

Okay, so there's little chance that our RAISERROR statement is going to fail, so we'll just take the default of "Quit the job reporting failure" on this one (we'll see other possibilities later in the chapter when we come to backups).

That moves us back to the main New Job dialog, and we're now ready to move on to the Schedules node, shown in Figure 22.7.

Figure 22.7

In this dialog, we can manage one or more scheduled times for this job to run. To actually create a new scheduled time for the job to run, we need to click the New button. That brings up yet another dialog, shown in Figure 22.8.

Figure 22.8

I've largely filled this one out already (lest you get buried in a sea of screenshots), but it is from this dialog that we create a new schedule for this job. Recurrence and frequency are set here.

The frequency side of things can be a bit confusing because of the funny way that they've worded things. If you want something to run multiple times every day, then you need to set the job to Occur Daily—every 1 day. This seems like it would run only once a day, but then you also have the option of setting whether it runs once or on an interval. In our case, we want to set our job to run every 5 minutes.

Now we're ready to move on to the next node of our job properties—Alerts, shown in Figure 22.9.

Figure 22.9

From here, we can select which alerts we want to make depending on what happens. Choose Add and we get yet another rich dialog, shown in Figure 22.10.

Figure 22.10

Our first node—General—is going to let us fill out some of the basics. We can, for example, limit this notification to one particular database. We also define just how severe the condition needs to be before the alert will fire (in terms of severity of the error).

From there, it is on to the Response node (see Figure 22.11).
Figure 22.11

Notice that I was able to choose either of the operators that we created earlier in the chapter. (I've just stuck with the one we created using Management Studio.) It is through the definitions of these operators that the SQL Server Agent knows what e-mail address or netsend address to send the notification to. Also notice that we have control, on the right-hand side, over how our operator is notified.

Last, but not least, we have the Options node (see Figure 22.12), to complete the creation of our new alert.

Figure 22.12

With the new alert created, we can go back to the Notifications node of the main New Job dialog (see Figure 22.13).

Figure 22.13

This window lets you bypass the older alerts model and define a response that is specific to this one job—we'll just stick with what we already have for now, but you could define specific additional notifications in this dialog.

At this point, you are ready to say OK and exit the dialog. You'll need to wait a few minutes before the task will fire, but you should start to see log entries appear every five minutes in the Windows event log. You can look at this by navigating to the Event Viewer in the Computer Management utility for your system (where to find this varies a bit depending on what version and edition of Windows you are running). You'll need to switch the view to use the Application log (under Windows Logs).

Don't forget that, if you're going to be running scheduled tasks like this one, you need to have the SQL Server Agent running in order for them to be executed. You can check the status of the SQL Server Agent by running the SQL Server Configuration Manager and selecting the SQL Server Agent service, or by navigating to the SQL Server Agent node of the Object Explorer in Management Studio.

Also, don't forget to disable this job (right-click the job in Management Studio after you've seen that it's working the way you expect). Otherwise, it will just continue to sit there and create entries in your Application log. Eventually, the Application log will fill up, and you can have problems with your system.

Creating Jobs and Tasks Using T-SQL

Before we get started, I want to point out that using T-SQL for this kind of stuff (creating scheduled jobs and tasks) is not usually the way things are done on a day-to-day basis. Most jobs wind up being scheduled by the DBA based on a specific need and a specific schedule that is required. If you're not in a situation where you need to script the installation of tasks, then you may want to just skip this section (it's a lot to learn if you aren't going to use it!). That being said, there can be times when your end users won't have a DBA handy (small shops, for example, often don't have anything even remotely resembling a DBA), so you'll want to script some jobs to help out unsophisticated users.

Automating the creation of certain jobs is very frequently overlooked in installation procedures—particularly for shrink-wrap software. If you're working in some form of consulting or private IS shop environment, then there's a good chance that you are going to need to take care of scheduling all the needed tasks when you do the install. With shrink-wrap software, however, you often aren't at all in control of the installation process—indeed, you may be hundreds or thousands of miles away from the install and may not even know that it's happening.

How then do you make sure that basic tasks (like backups, for example) get done?
You can make it part of your installation process.

Jobs can be added to SQL Server using T-SQL by using three different stored procedures:

 * sp_add_job: This creates the actual job.
 * sp_add_jobstep: This creates a task within the job.
 * sp_add_jobschedule: This determines when the job will run.

Each of these builds a piece of the overall execution of the scheduled task, much as the different tabs in Management Studio did. The next sections take a look at each individually.

All jobs and tasks are stored in the msdb database. As such, you'll need to make sure that msdb is the current database (utilizing the USE command) when calling any of these sprocs.

sp_add_job

This one creates the top level of the hierarchy and establishes who owns the job and how notifications should be handled. There are quite a few parameters, but most of them are fairly easy to figure out:

sp_add_job [@job_name =] '<job name>'
    [,[@enabled =] <0 for no, 1 for yes>]
    [,[@description =] '<description>']
    [,[@start_step_id =] <step id>]
    [,[@category_name =] '<category name>']
    [,[@category_id =] <category id>]
    [,[@owner_login_name =] '<login name>']
    [,[@notify_level_eventlog =] <notify level>]
    [,[@notify_level_email =] <notify level>]
    [,[@notify_level_netsend =] <notify level>]
    [,[@notify_level_page =] <notify level>]
    [,[@notify_email_operator_name =] '<operator name>']
    [,[@notify_netsend_operator_name =] '<operator name>']
    [,[@notify_page_operator_name =] '<operator name>']
    [,[@delete_level =] <delete level>]
    [,[@job_id =] <job id variable> OUTPUT]

Again, most of the parameters here are self-explanatory, but let's again touch on some of the stickier ones.

 * @start_step_id: This one is going to default to 1, and that's almost always going to be the place to leave it. We'll be adding steps shortly, but those steps will have identifiers to them, and this just lets the SQL Server Agent know where to begin the job.
 * @category_name: This one equates directly to the category we saw in Management Studio. It will often be none (in which case, see @category_id) but could be Database Maintenance (another common choice), Full Text, Web Assistant, Replication, or a category that you add yourself using sp_add_category.
 * @category_id: This is just a way of providing a category without being dependent on a particular language. If you don't want to assign any particular category, then I recommend using this option instead of the name and supplying a value of either 0 (Uncategorized, but runs local) or 1 (Uncategorized Multi-Server).
 * @notify_level_eventlog (and the other notify levels): For each type of notification, this determines under what condition the notification occurs. To use this sproc, though, we need to supply some constant values to indicate when we want the notification to happen. The constants are:

Constant Value | When the Notification Occurs
---|---
0 | Never
1 | When the task succeeds
2 | When the task fails (this is the default)
3 | Every time the task runs

 * @job_id: This is just a way of finding out what job ID was assigned to your newly created job. You'll need this value when you go to create job steps and the job schedule(s). The big things on this one are:
   * Remember to receive the value into a variable so you can reuse it.
   * The variable needs to be of type uniqueidentifier rather than the types you might be more familiar with at this point.

Note that all the non-level "notify" parameters are expecting an operator name. You should create your operators before running this sproc.

So, let's create a job to test this process out.
What we're going to do here is create a job that's nearly identical to the job we created in Management Studio.

First, we need to create our top-level job. All we're going to do for notifications is send a message to the Windows event log every time the job runs. If you have Database Mail set up, then feel free to add in notification parameters for your operator.

USE msdb;

DECLARE @JobID uniqueidentifier;

EXEC sp_add_job
    @job_name = 'TSQLCreatedTestJob',
    @enabled = 1,
    @notify_level_eventlog = 3,
    @job_id = @JobID OUTPUT;

SELECT 'JobID is ' + CONVERT(varchar(128), @JobID);

Now, execute this, and you should wind up with something like this:

---------------------------------------------------------------------
JobID is 83369994-6C5B-45FA-A702-3511214A2F8A

(1 row(s) affected)

Note that your particular GUID will be different from the one I got here. (Remember that GUIDs are effectively guaranteed to be unique across time and space.) You can either use this value or the job name to refer to the job later. (I happen to find the name a lot easier, but it can create problems when dealing with multiple servers.)

sp_add_jobserver

This is a quick-and-dirty one. We've now got ourselves a job, but we don't have anything assigned for it to run against. You see, you can create a job on one server but still run it against a completely different server if you choose.

In order to target a particular server, we'll use a sproc (in msdb still) called sp_add_jobserver. The syntax is the easiest by far of any we'll be looking at in this section, and looks like this:

sp_add_jobserver [@job_id =] <job id> | [@job_name =] '<job name>',
    [@server_name =] '<server name>'

Note that you supply either the job ID or the job name—not both.

So, to assign a target server for our job, we need to run a quick command:

USE msdb;

EXEC sp_add_jobserver
    @job_name = 'TSQLCreatedTestJob',
    @server_name = '(local)';

Note that this will just point at the local server, regardless of what that server is named. We could also have put in the name of another valid SQL Server to be targeted.

sp_add_jobstep

The second step in the process is to tell the job specifically what it is going to do. At the moment, all we have in our example is the shell. The job doesn't have any tasks to perform, and that makes it a very useless job indeed. There is a flip side to this, though—a step can't even be created without some job to assign it to.

The next step, then, is to run sp_add_jobstep. This is essentially adding a task to the job. If we had multiple steps we wanted the job to perform, then we would run this particular sproc several times.

The syntax looks like this:

sp_add_jobstep [@job_id =] <job id> | [@job_name =] '<job name>'
    [,[@step_id =] <step id>]
    [,[@step_name =] '<step name>']
    [,[@subsystem =] '<subsystem>']
    [,[@command =] '<command>']
    [,[@additional_parameters =] '<parameters>']
    [,[@cmdexec_success_code =] <code>]
    [,[@on_success_action =] <action>]
    [,[@on_success_step_id =] <step id>]
    [,[@on_fail_action =] <action>]
    [,[@on_fail_step_id =] <step id>]
    [,[@server =] '<server>']
    [,[@database_name =] '<database>']
    [,[@database_user_name =] '<user>']
    [,[@retry_attempts =] <attempts>]
    [,[@retry_interval =] <minutes>]
    [,[@os_run_priority =] <priority>]
    [,[@output_file_name =] '<file name>']
    [,[@flags =] <flags>]

Not as many of the parameters are self-explanatory here, so let's look at the more confusing ones in the list:

 * @job_id vs. @job_name: This is actually a rather odd sproc in the sense that it expects you to enter one of the first two parameters, but not both.
You can either attach this step to a job by its GUID (as you saved from the last sproc run) or by the job name.
 * @step_id: All the steps in any job have an ID. SQL Server assigns these IDs automatically as you insert the steps. So why, if it does this automatically, do we have a parameter for it? That's in case we want to insert a step in the middle of a job. If there are already steps 1–5 in the job, and we insert a new step with a step ID of 3, then our new step will be assigned to position number 3. The previous step 3 will be moved to position 4, with each succeeding step incremented by 1 to make room.
 * @step_name: Is what it says—the name of that particular task. Just be aware that there is no default here. You must provide a step name.
 * @subsystem: This ties in very closely to job categories and determines which subsystem within SQL Server (such as the replication engine, the command line—the command prompt—or Integration Services) is responsible for executing the script. The default is that you're running a set of T-SQL statements. The possible subsystems are:

SubSystem | Description
---|---
ACTIVESCRIPTING | The scripting engine (VB Script). Note that this one is considered deprecated, and Microsoft will remove it from the product at some point.
ANALYSISQUERY | Analysis Services query (MDX, DMX).
ANALYSISCOMMAND | Analysis Services command (XMLA).
CMDEXEC | Gives you the capability to execute compiled programs or batch files from a command (DOS) prompt.
DISTRIBUTION | The Replication Distribution Agent.
'Dts' | Integration Services package execution.
LOGREADER | Replication Log Reader Agent.
MERGE | The Replication Merge Agent.
'PowerShell' | PowerShell script.
'QueueReader' | Replication Queue Reader Agent job.
SNAPSHOT | The Replication Snapshot Agent.
TSQL | A T-SQL batch. This is the default.

 * @command: This is the actual command you're issuing to a specific subsystem. In our example, this is going to be the RAISERROR command, just as we issued when using Management Studio, but it could be almost any T-SQL command. What's cool here is that there are some system-supplied values you can use in your commands. You place these in the middle of your scripts as needed, and they are replaced at runtime (we'll make use of this in our example). The possible system-supplied values are:

Tag | Description
---|---
A-DBN | Substitutes in the database name.
A-SVR | Substitutes the server name in the place of the tag.
A-ERR | Error number.
A-SEV | Error severity.
A-MSG | The message text from the error.
DATE | Supplies the current date (in YYYYMMDD format).
INST | Provides the name of the current instance of SQL Server (it's blank if it is the default instance).
JOBID | Supplies the current job ID.
MACH | The current computer name.
MSSA | Master SQL Server Agent name.
OSCMD | The program that runs CmdExec steps.
SQLDIR | The directory in which SQL Server is installed (usually C:\Program Files\Microsoft SQL Server\MSSQL10.MSSQLSERVER\MSSQL).
STEPCT | A count of the number of times this step has executed (excluding retries). You could use this one to keep count of the number of executions and force the termination of a multistep loop.
STEPID | Step ID.
SVR | The name of the computer the job is running on, including the SQL Server instance name if applicable.
TIME | The current time in HHMMSS format.
STRTTM | The start time for the job in HHMMSS format.
STRTDT | The start date for the job in YYYYMMDD format.
Note that all of these tokens must be wrapped in parentheses. This is somewhat different from what was required through SQL Server 2005 RTM (which, like SQL Server 2000, required square brackets instead). Beginning with SQL Server 2005 SP1, parentheses replaced the earlier square-bracket requirement, and an escape sequence is also required (we'll look at that in a bit).

Beginning with SQL Server 2005 SP1, you must wrap any of the previous tokens used in the @command parameter in an escape clause. The escape functions include:

Escape Function | Description
---|---
$(ESCAPE_SQUOTE(token name)) | Replaces any single quotation mark with two single quotation marks in the token replacement string.
$(ESCAPE_DQUOTE(token name)) | Replaces any single instance of a double quotation mark with two double quotation marks in the token replacement string.
$(ESCAPE_RBRACKET(token name)) | Replaces any single instance of a right bracket in the token replacement string with two right brackets.
$(ESCAPE_NONE(token name)) | Provided solely for backward compatibility, this performs the token replacement without escaping any characters in the string.

 * @cmdexec_success_code: This is the value you expect to be returned by whatever command interpreter ran your job if the job ran successfully (applies only to the command prompt subsystem). The default is zero.
 * @on_success_action and @on_fail_action: This is where you say what to actually do at the success or failure of your step. Remember that, at the job level, we define what notifications we want to happen, but, at the step level, we can define how we want processing to continue (or end). For this parameter, you need to supply one of the following constant values:

Value | Description
---|---
1 | Quit with success. This is the default for successful task executions.
2 | Quit with failure. This is the default for failed tasks.
3 | Go to the next step.
4 | Go to a specific step as defined in on_success_step_id or on_fail_step_id.

 * @on_success_step_id and @on_fail_step_id: The step you want to run next if you've selected option 4 in the preceding table.
 * @server: The server the task is to be run against (you can run tasks on multiple target servers from a single master server).
 * @database_name: The database to be set as current when the task runs.
 * @retry_interval: This is set in minutes.
 * @os_run_priority: Ah, an undocumented feature. The default here is normal, but you can adjust how important Windows is going to think your CmdExec (command line) scheduled task is. The possible values are:

Value | Priority
---|---
−15 | Run at idle only
−1 thru −14 | Increasingly below normal
0 | Normal (this is the default)
1 thru 14 | Increasingly above normal
15 | Time critical

I just can't help but think of the old Lost in Space TV show here and the robot saying "DANGER Will Robinson—DANGER!" Don't take messing with these values lightly. If you're not familiar with the issues surrounding Windows thread priorities, I'd suggest staying as far away from this one as possible. Going with the higher values, in particular, can have a very detrimental impact on your system—including creating significant instabilities. When you say that this is the most important thing, remember that you are taking away some of the importance of things like operating system functions—not something that's smart to do. Stay clear of this unless you really know what you're doing.
* @flags: This one relates to the Output File parameter, and indicates whether to overwrite or append your output information to the existing file. The options are:

Value | Description
---|---
0 | No option specified (currently, this means your file will be overwritten every time).
2 | Append information to the existing file (if one exists).
4 | Explicitly overwrite the file.

Okay, now that we've looked at the parameters, let's add a step to the job we created a short time ago:

EXEC sp_add_jobstep
    @job_name = 'TSQLCreatedTestJob',
    @step_name = 'This Is The Step',
    @command = 'RAISERROR ("TSQL Task is Job ID $(ESCAPE_SQUOTE(JOBID)).",10,1) WITH LOG',
    @database_name = 'AdventureWorks2008',
    @retry_attempts = 3,
    @retry_interval = 5;

Note the requirement for the escape function. Without the escape function (in this case, any one of the four would have worked), the JOBID would not be treated as a substitution token, and would have been left as a literal string.

Technically speaking, our job should be runnable at this point. The reason I say "technically speaking" is that we haven't scheduled the job yet, so the only way to run it is to manually tell the job to run. Let's take care of the scheduling issue, and then we'll be done.

sp_add_jobschedule

This is the last piece of the puzzle. We need to tell our job when to run. To do this, we'll make use of sp_add_jobschedule, which, like all the other sprocs we've worked on in this section, can only be found in the msdb database. Note that we could call this sproc multiple times to create multiple schedules for our job. Keep in mind, though, that getting too many schedules in place can lead to a great deal of confusion, so schedule jobs wisely. (For example, don't schedule one job for every day of the week when you can schedule a single job to run daily.)

The syntax has some similarities to what we've already been working with, but adds some new pieces to the puzzle:

sp_add_jobschedule
    [@job_id =] <job id> | [@job_name =] '<job name>',
    [@name =] '<schedule name>'
    [,[@enabled =] <0 for no, 1 for yes>]
    [,[@freq_type =] <frequency type>]
    [,[@freq_interval =] <frequency interval>]
    [,[@freq_subday_type =] <frequency subday type>]
    [,[@freq_subday_interval =] <frequency subday interval>]
    [,[@freq_relative_interval =] <frequency relative interval>]
    [,[@freq_recurrence_factor =] <frequency recurrence factor>]
    [,[@active_start_date =] <date to begin running>]
    [,[@active_end_date =] <date to stop running>]
    [,[@active_start_time =] <time to begin running>]
    [,[@active_end_time =] <time to stop running>]

Again, let's look at some of these parameters:

* @freq_type: Defines the nature of the intervals that are set up in the following parameters. This is another of those parameters that uses bit flags (although you should only use one at a time). Some of the choices are clear, but some aren't until you get to @freq_interval (which is next). Your choices are:

Value | Frequency
---|---
1 | Once
4 | Daily
8 | Weekly
16 | Monthly (fixed day)
32 | Monthly (relative to @freq_interval)
64 | Run at start of SQL Server Agent
128 | Run when CPU is idle

* @freq_interval: Decides the exact days that the job is executed, but the nature of this value depends entirely on @freq_type (see the preceding point). This one can get kind of confusing; just keep in mind that it works with both @freq_type and @freq_relative_interval.
The interpretation works like this:

freq_type Value | Matching freq_interval Values
---|---
1 (once) | Not Used
4 (daily) | Runs every x days, where x is the value in the frequency interval
8 (weekly) | The frequency interval is one or more of the following (add the values together to combine days): 1 (Sunday), 2 (Monday), 4 (Tuesday), 8 (Wednesday), 16 (Thursday), 32 (Friday), 64 (Saturday)
16 (monthly - fixed) | Runs on the exact day of the month specified in the frequency interval
32 (monthly - relative) | Runs on exactly one of the following: 1 (Sunday), 2 (Monday), 3 (Tuesday), 4 (Wednesday), 5 (Thursday), 6 (Friday), 7 (Saturday), 8 (Specific Day), 9 (Every Weekday), 10 (Every Weekend Day)
64 (run at Agent startup) | Not Used
128 (run at CPU idle) | Not Used

* @freq_subday_type: Specifies the units for @freq_subday_interval. If you're running daily, then you can set a frequency to run within a given day. The possible values here are:

Value | Description
---|---
1 | At the specified time
4 | Every x minutes, where x is the value of the frequency sub-day interval
8 | Every x hours, where x is the value of the frequency sub-day interval

* @freq_subday_interval: This is the number of @freq_subday_type periods to occur between each execution of the job (x in the preceding table).
* @freq_relative_interval: This is used only if the frequency type is monthly (relative) (32). If this is the case, then this value determines in which week a specific day-of-week job is run, or flags things to be run on the last day of the month. The possible values are:

Value | Description
---|---
1 | First Week
2 | Second Week
4 | Third Week
8 | Fourth Week
16 | Last Week or Day

* @freq_recurrence_factor: How many weeks or months between executions. The exact treatment depends on the frequency type and is applicable only if the type was weekly or monthly (fixed or relative). This is an integer value; for example, if your frequency type is 8 (weekly) and the frequency recurrence factor is 3, then the job would run on the specified day of the week every third week.

The default for each of these parameters is 0.

Okay, so let's move on to getting that job scheduled to run every five minutes as we did when using Management Studio:

EXEC sp_add_jobschedule
    @job_name = 'TSQLCreatedTestJob',
    @name = 'Every 5 Minutes',
    @freq_type = 4,
    @freq_interval = 1,
    @freq_subday_type = 4,
    @freq_subday_interval = 5,
    @active_start_date = 20080731;

Now, if you go and take a look at the job in Management Studio, you'll find that you have a job that is (other than the name) identical to the job we created directly in Management Studio. Our job has been fully implemented using T-SQL this time.

Maintaining and Deleting Jobs and Tasks

Maintaining jobs in Management Studio is pretty simple. Just double-click the job and edit it just as if you were creating a new job. Deleting jobs and tasks in Management Studio is simpler still: Just highlight the job and press the Delete key. After one confirmation, your job is gone.

Checking out what you have, editing it, and deleting it are all slightly trickier in T-SQL. The good news, however, is that maintaining jobs, tasks, and schedules works pretty much as creating did, and deleting any of them is a snap.
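Incidentally, you don't have to wait for a schedule to fire to test a job. As I mentioned earlier, you can always tell a job to run manually, and the T-SQL way to do that is sp_start_job, yet another msdb sproc. A minimal sketch using the job we just built:

EXEC msdb.dbo.sp_start_job
    @job_name = 'TSQLCreatedTestJob';

Note that sp_start_job returns as soon as SQL Server Agent has been asked to start the job (it does not wait for the job to finish), so check the job's history if you want to verify the outcome.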
Editing and Deleting Jobs with T-SQL

To edit or delete each of the four sprocs' handiwork we just covered in T-SQL, you just use the corresponding update sproc (with one exception) or delete sproc. The parameters are the same as for each add sproc, and the information provided to an update sproc completely replaces that of the original add (or any prior updates):

If the Add Was | Then Update With | And Delete With
---|---|---
sp_add_job | sp_update_job | sp_delete_job
sp_add_jobserver | None (drop and add) | sp_delete_jobserver
sp_add_jobstep | sp_update_jobstep | sp_delete_jobstep
sp_add_jobschedule | sp_update_jobschedule | sp_delete_jobschedule

Backup and Recovery

No database-driven app should ever be deployed or sold to a customer without a mechanism for dealing with backup and recovery. As I've probably told people at least 1,000 times: You would truly be amazed at the percentage of database operations that I've gone into that do not have any kind of reliable backup. In a word: EEEeeeeeek!

There is one simple rule to follow regarding backups—do them early and often. The follow-up to this is to not just back up to a file on the same disk and forget it—you need to make sure that a copy moves to a completely separate place (ideally off-site) to be sure that it's safe. I've personally seen servers catch fire (the stench was terrible, as were all the freaked-out staff). You don't want to find out that your backups went up in the same smoke that your original data did.

For apps being done by the relative beginner, then, you're probably going to stick with referring the customer or on-site administrator to SQL Server's own backup and recovery tools, but, even if you do, you should be prepared to support them as they come up to speed in its use. In addition, there is no excuse for not understanding what it is the customer needs to do.

Creating a Backup—a.k.a. "A Dump"

Creating a backup file of a given database in Management Studio is actually pretty easy. Simply navigate in the Object Explorer to the database you're interested in, and right-click.

Now choose Tasks and Back Up, as shown in Figure 22.14.

Figure 22.14

And you'll get a dialog that lets you define pretty much all of the backup process, as in Figure 22.15.

Figure 22.15

The first setting here is pretty self-explanatory. Here you indicate which database you want to back up. From there, however, things get a bit trickier.

Getting into the items that may not yet make sense, first up is the Recovery Model. The Recovery Model field here is just notifying you of what the database you've selected for backup is set to; it is actually a database-level setting. We're going to defer discussion of what this is for a bit—we'll get to it in the next section when we talk about backing up transaction logs.

Now, those are the simple parts, but let's break down some of the rest of the options that are available.

Backup Type

First of the choices to be made is the Backup Type. Depending on the recovery model for your database (again, be patient with me, we'll get there on what this is!), you'll have either two or three types of backups available:

* Full: This is just what it sounds like—a full backup of your actual database file as it is as of the last transaction that was committed prior to you issuing the Backup command.
* Differential: This might be referred to as a "backup since" backup.
When you take a differential backup, it only writes out a copy of the extents (see Chapter 6 if you've forgotten!) that have changed since you did the last full backup. These typically run much faster than a Full backup and will take up less space. How much less? Well, that depends on how much your data actually changes. For very large databases where backups can take a very long time to run, it is very common to have a strategy where you take a full backup only once a week or even only once a month, and then take differential backups in between to save both space and time.
* Transaction Log: This is again just what it sounds like—a copy of the transaction log. This option will only show up if your database is set to the Full or Bulk-Logged recovery model (it is hidden if you are using the Simple recovery model). Again, a full discussion of what these are is coming up shortly.

A subtopic of the Backup Type is the Backup Component, which applies only to Full and Differential backups.

For purposes of this book, we should pretty much just be focused on backing up the whole database. That said, you'll notice another option titled "Files and Filegroups." Back in Chapter 1, we touched briefly on the idea of filegroups and individual files for data to be stored in. This option lets you select just one file or filegroup to participate in this backup. I highly recommend avoiding this option until you have graduated to the "expert" class of SQL Server user.

Again, I want to stress avoiding this particular option until you've got yourself something just short of a doctorate in SQL Server backups. These are special use—designed to help with very large database installations (figure terabytes) that are in high-availability scenarios. There are major consistency issues to be considered when taking and restoring from this style of backup, and they are not for the faint of heart.

Backup Set

A backup set is basically a single name used to refer to one or more destinations for your backup.

SQL Server allows for the idea that your backup may be particularly large or that you may otherwise have reason to back up across multiple devices—be it drives or tapes. When you do this, however, you need to have all of the devices you used as a destination available in order to recover from any of them—that is, they are a "set." The backup set essentially holds the definition of what destinations were involved in your particular backup. In addition, a backup set contains some property information for your backup. You can, for example, identify an expiration date for the backup. Creating a backup set is as easy as naming multiple file or tape destinations at the time you define your backup.

Destination

This is where your data is going to be backed up to. Here is where you define the potentially several destinations to be utilized for one backup set. For most installations this will be a file location, but it can also be any valid UNC path (which may wind up being something other than a disk—SQL Server doesn't care, as long as it's valid storage).

Options

In addition to those items we just covered from the General node of the dialog, you also have a node that lets you set other miscellaneous options. Most of these are fairly self-explanatory. Of particular note, however, is the Transaction Log area.

Schedule

With all this set up, wouldn't it be nice to set up a job to run this backup on a regular basis? Well, the Schedule button up at the top of the dialog is meant to facilitate your doing just that.
Click it, and it will bring up the Job Schedule dialog you saw earlier in the chapter. You can then define a regular schedule to run the backup you just defined.

Backing Up Using T-SQL

To back up the database or the log in T-SQL, we make use of the BACKUP command. The syntax for BACKUP works almost, but not quite, the same depending on whether you're backing up the database or the log. The syntax looks like this:

BACKUP DATABASE|LOG <database name>
    {WITH NO_LOG|TRUNCATE_ONLY}
    | TO {DISK|TAPE} = <'backup device'> [,...n]
    [MIRROR TO <'backup device'> [,...n]]
    [WITH
        [BLOCKSIZE = <block size>]
        [[,] CHECKSUM | NO_CHECKSUM]
        [[,] COMPRESSION | NO_COMPRESSION]
        [[,] STOP_ON_ERROR | CONTINUE_AFTER_ERROR]
        [[,] DESCRIPTION = <'text'>]
        [[,] DIFFERENTIAL]
        [[,] EXPIREDATE = <date> | RETAINDAYS = <days>]
        [[,] PASSWORD = <password>]
        [[,] FORMAT|NOFORMAT]
        [[,] INIT|NOINIT]
        [[,] MEDIADESCRIPTION = <'text'>]
        [[,] MEDIANAME = <media name>]
        [[,] MEDIAPASSWORD = <password>]
        [[,] NAME = <backup set name>]
        [[,] REWIND|NOREWIND]
        [[,] NOSKIP|SKIP]
        [[,] NOUNLOAD|UNLOAD]
        [[,] RESTART]
        [[,] STATS [= <percentage>]]
        [[,] COPY_ONLY]]

Let's look at some of the parameters:

* <backup device>: That's right; you can back up to more than one device. This creates what's called a media set. These can really speed up your backups if the media are spread over several disks, as it creates a parallel load situation, and you're not bound by the I/O limitations of any of the individual devices. However, beware—you must have the entire media set intact to restore from this kind of backup.

Also note that the TAPE option is only provided for backward compatibility—all backups should now appear to SQL Server as being to DISK (even if the actual device does happen to be a tape).

* BLOCKSIZE: This is automatically determined in a hard drive backup, but, for tape, you need to provide the correct block size. Contact your vendor for help on this one.
* COMPRESSION: This is what it sounds like: an indication of whether or not you want compression used in the backup. The default is no compression, but this can be changed at a server-wide level.
* DIFFERENTIAL: This is to perform a differential backup. A differential backup only backs up the data that has changed since your last full backup. Any log or other differential backup is ignored. Any row/column changed, added, or deleted since the last full backup is included in the new backup. Differential backups have the advantage of being much faster to create than a full backup and much faster to restore than applying each individual log when restoring.
* EXPIREDATE/RETAINDAYS: You can have your backup media expire after a certain time. Doing so lets SQL Server know when it can overwrite the older media.
* FORMAT/NOFORMAT: Determines whether or not the media header (required for tapes) should be rewritten. Be aware that formatting affects the entire device—this means that formatting for one backup on a device destroys all the other backups on that device as well.
* INIT/NOINIT: Overwrites the device data but leaves the header intact.
* MEDIADESCRIPTION and MEDIANAME: Just describes and names the media—maximum of 255 characters for a description and 128 for a name.
* SKIP/NOSKIP: Decides whether or not to pay attention to the expiration information from previous backups on the tape. If SKIP is active, then the expiration is ignored so the tape can be overwritten.
* UNLOAD/NOUNLOAD: Used for tape only. This determines whether to rewind and eject the tape (UNLOAD) or leave it in its current position (NOUNLOAD) after the backup is complete.
* RESTART: Picks up where a previously interrupted backup left off.
* STATS: Displays progress messages as the backup runs (by default, one for every 10 percent completed).
* COPY_ONLY: Creates a backup but does not affect any other backup sequence you have in any way. For example, logs and differential backups will continue as if the copy backup had never occurred.

Now let's try one out for a true backup (the TO DISK path should appear on one line, and you may need to change it to a different location depending on the specifics of your particular installation):

BACKUP DATABASE AdventureWorks2008
TO DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL10.MSSQLSERVER\MSSQL\Backup\TSQLDataBackup.bck'
WITH
    DESCRIPTION = 'My what a nice backup!',
    STATS;

We now have a backup of our AdventureWorks2008 database.

SQL Server is even nice enough to provide progress messages as it processes the backup:

10 percent processed.
20 percent processed.
30 percent processed.
40 percent processed.
50 percent processed.
60 percent processed.
70 percent processed.
80 percent processed.
90 percent processed.
Processed 25448 pages for database 'AdventureWorks2008', file 'AdventureWorks2008_Data' on file 1.
Processed 36 pages for database 'AdventureWorks2008', file 'FileStreamDocuments' on file 1.
Processed 1 pages for database 'AdventureWorks2008', file 'AdventureWorks2008_Log' on file 1.
100 percent processed.
BACKUP DATABASE successfully processed 25484 pages in 10.825 seconds (18.391 MB/sec).

It's that simple, so let's follow it up with a simple backup of the log (again, the TO DISK path belongs on one line):

BACKUP LOG AdventureWorks2008
TO DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL10.MSSQLSERVER\MSSQL\Backup\TSQLLogBackup.bck'
WITH
    DESCRIPTION = 'My what a nice backup of a log!',
    STATS;

It's worth noting that you can't back up a log while the database recovery model is set to Simple. To change to a different recovery model, right-click the AdventureWorks2008 database and select Properties and the Options tab—or, in T-SQL, use ALTER DATABASE with the SET RECOVERY option (ALTER DATABASE AdventureWorks2008 SET RECOVERY FULL;, for example). If you think about it, this restriction makes sense given that, under Simple, your log is always going to be essentially free of any committed transactions.

It's also worth noting that backups work just fine while there are users in your database. SQL Server is able to reconcile the changes that are being made by knowing the exact point in the log that the backup was begun, and using that as a reference point for the rest of the backup.
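While we're at it, the DIFFERENTIAL option we discussed a moment ago is just one more entry in the WITH list. A quick sketch building on the full backup we just took (the same path caveats apply):

BACKUP DATABASE AdventureWorks2008
TO DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL10.MSSQLSERVER\MSSQL\Backup\TSQLDataBackup.bck'
WITH
    DIFFERENTIAL,
    DESCRIPTION = 'Changes since the last full backup',
    STATS;

Since we took the default of NOINIT, this backup is appended to the same media set as our full backup rather than overwriting it.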
Recovery Models

Well, I spent most of the last section promising that we would discuss them, so it's time to ask: What is a recovery model?

Well, back in Chapter 11, we talked about the transaction log. In addition to keeping track of transactions to deal with transaction rollback and atomicity of data, transaction logs are also critical to being able to recover data right up to the point of system failure.

Imagine for a moment that you're running a bank. Let's say you've been taking deposits and withdrawals for the last six hours—the time since your last full backup was done. Now, if your system went down, I'm guessing you're not going to like the idea of going to last night's backup and losing all track of what money went out the door or came in during the interim. See where I'm going here? You really need every moment's worth of data.
Keeping the transaction log around gives us the ability to "roll forward" any transactions that happened since the last full or differential backup was done. Assuming both the data backup and the transaction logs are available, you should be able to recover right up to the point of failure.

The recovery model determines how long and what types of log records are kept. There are three options:

* Full: This is what it says. Everything is logged. Under this model, you should have no data loss in the event of system failure, assuming you had a backup of the data available and have all transaction logs since that backup. If you are missing a log or have one that is damaged, then you'll be able to recover all data up through the last intact log you have available. Keep in mind, however, that as "keeping everything" suggests, this can take up a fair amount of space in a system that receives a lot of changes or new data.
* Bulk-Logged: This is like "Full recovery light." Under this option, regular transactions are logged just as they are with the Full recovery method, but bulk operations are not. The result is that, in the event of system failure, a restored backup will contain any changes to data pages that did not participate in bulk operations (bulk import of data or index creation, for example), but any bulk operations must be redone. The good news on this one is that bulk operations perform much better. That performance comes with risk attached, so your mileage may vary....
* Simple: Under this model, the transaction log essentially exists to support transactions as they happen. The transaction log is regularly truncated, with any completed or rolled-back transactions essentially being removed from the log (not quite that simple, but that is the effect). This gives us a nice, tight log that is smaller and often performs a bit better, but the log is of zero use for recovery from system failure.

For most installations, Full recovery is going to be what you want to have for a production-level database—end of story.

Recovery

This is something of the reverse of the backup side of things. You've done your backups religiously, and now you want to restore one—either for recovery purposes or merely to make a copy of a database somewhere.

Once you have a backup of your database, it's fairly easy to restore it to the original location. Getting started works much as it did for backup: Navigate to the database you want to restore to, right-click, and select Tasks⇒Restore. Up comes your Restore dialog, as in Figure 22.16.

Figure 22.16

As long as what you're after is to take your old backup and slam it over the top of the database you made the backup of, this is pretty straightforward. Simply say OK, and it should restore for you without issue.

Restoring to a Different Location

Things get tricky when you want to change something about where you're restoring to. As part of the backup process, the backup knows the name of the database that was backed up, and, perhaps more important, it knows the path(s) to the physical files that it was supposed to be using.

Changing the destination database name is right there—no biggie—the problem is that changing the destination database name does nothing to change which physical files (the .MDF and .LDF files) the restore is going to try to write to. To deal with this, go to the Options node of the Restore dialog.

Again, most of the options here are self-explanatory, but, in particular, notice the "Restore As" column.
In this part of the dialog, you can replace every original file's destination, location, and name, which provides you with a way to deal with restoring multiple copies of a database to the same server (perhaps for test purposes) or installing your database on a new volume or even a new system.

Recovery Status

This one is merely about the state you want to have the database in when you are done with this restore. This has particular relevance when you are restoring a database and still have logs to apply to the database later.

If you go with the default option (which translates to using the WITH RECOVERY option if you were using T-SQL), then the database will immediately be in a full online status when the restore operation is complete. If, for example, you wanted to restore logs after your initial restore was done, you would want to select one of the two other options. Both of these prevent updates from happening to the database and leave it in a state where more recovery can be done. The difference is merely one of whether users are allowed to access the database in a "read-only" mode or whether the database should appear as still being offline.

The issue of availability is a larger one than you probably think it is. As big of a deal as I'm sure it already seems, it's really amazing how quickly users will find their way into your system when the restore operation suddenly marks the database as available. Quite often, even if you know that you will be "done" after the current restore is done, you'd like a chance to look over the database prior to actual users being in there. If this is the case, then be sure to use the NORECOVERY method of restoring. You can later run a restore that is purely there to apply the WITH RECOVERY option, and get the database fully back online once you're certain you have things just as you want them.

Restoring Data Using T-SQL

We use the RESTORE command to recover the data that we have in our backups. The basic syntax looks like this (there are a ton of variations on this, so, if you need every nuance, I'd suggest a book oriented toward administration, which will investigate backup and recovery as a chapter unto itself):

RESTORE DATABASE|LOG <database name>
    [FROM <backup device> [,...n]]
    [WITH
        [DBO_ONLY]
        [[,] FILE = <file number>]
        [[,] MEDIANAME = <media name>]
        [[,] MOVE '<logical file name>' TO '<operating system file name>'] [,...n]
        [[,] {NORECOVERY|RECOVERY|STANDBY = <undo file name>}]
        [[,] {NOUNLOAD|UNLOAD}]
        [[,] REPLACE]
        [[,] RESTART]
        [[,] STATS [= <percentage>]]
        [[,] {STOPAT = <date/time>
            | STOPATMARK = '<mark name>' [AFTER <date/time>]
            | STOPBEFOREMARK = '<mark name>' [AFTER <date/time>]}]]

Let's look at some of these options:

* DBO_ONLY: When the restore is done, the database will be set with the dbo use only database option turned on. This gives the dbo a chance to look around and test things out before allowing users back onto the system.

This is a biggie, and I very strongly recommend that you always use it. You would be amazed at how quickly users will be back on the system once it's back up for even a moment. When a system is down, you'll find users very impatient to get back to work. They'll constantly be trying to log in, and they won't bother to ask if it's okay or not. They'll assume that when it's up, it's okay to go into it.

* FILE: You can back up multiple times to the same media. This option lets you select a specific version to restore. If this one isn't supplied, SQL Server will assume that you want to restore from the most recent backup.
* MOVE: Allows you to restore the database to a different physical file than the one the database was using when it was originally backed up.
* NORECOVERY/RECOVERY/STANDBY: RECOVERY and NORECOVERY are mutually exclusive. STANDBY works in conjunction with NORECOVERY. They work as follows:

Option | Description
---|---
NORECOVERY | Restores the database but keeps it marked as offline. Uncommitted transactions are left intact. This allows you to continue with the recovery process—for example, if you still have additional logs to apply.
RECOVERY | As soon as the restore command is done successfully, the database is marked as active again. Data can again be changed. Any uncommitted transactions are rolled back. This is the default if none of the options are specified.
STANDBY | STANDBY allows you to create an undo file so that the effects of a recovery can be undone. STANDBY allows you to bring the database up for read-only access before you have issued a RECOVERY (which means at least part of your data's been restored, but you aren't considering the restoration process complete yet). This allows users to make use of the system in a read-only mode while you verify the restoration process.

* REPLACE: Overrides the safety feature that prevents you from restoring over the top of an existing database.
* RESTART: Tells SQL Server to continue a previously interrupted restoration process.

Let's go ahead and look at an example run of restoring the AdventureWorks2008 database. Do not run these statements unless you are absolutely certain that your backup was successful and is intact.

First, we drop the existing AdventureWorks2008 database:

USE master;

DROP DATABASE AdventureWorks2008;

Once that's done, we'll try to restore it using our RESTORE command (as with the backups, the FROM DISK path should appear on one line):

RESTORE DATABASE AdventureWorks2008
FROM DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL10.MSSQLSERVER\MSSQL\Backup\TSQLDataBackup.bck'
WITH
    DBO_ONLY,
    NORECOVERY,
    STATS;

We restored with NORECOVERY because we want to add another piece to the puzzle. Our log will contain any transactions that happened between when our database or log was last backed up and when this log was backed up. "Apply" this log, and that should bring the database as up to date as we can make it:

RESTORE LOG AdventureWorks2008
FROM DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL10.MSSQLSERVER\MSSQL\Backup\TSQLLogBackup.bck'
WITH
    DBO_ONLY,
    NORECOVERY,
    STATS;

Note that if we had several logs to apply from this one device, then we would have to name them as we wanted to apply them. They would also need to be applied in the order in which they were backed up.

Now, we could have turned everything on there, but we want to hold off for a bit before making the database active again. Even though we don't have any more logs to apply, we still need to re-run the RESTORE statement to make the database active again:

RESTORE LOG AdventureWorks2008 WITH RECOVERY;

We should now be able to test our database:

USE AdventureWorks2008;

SELECT * FROM Production.Location;

And, sure enough, we get the results we're looking for. Run a few SELECT statements to see that, indeed, our database was restored properly.

After you've checked things out, remember that we chose the DBO_ONLY option for all this.
If we run sp_dboption, we can see the option that's keeping everyone else out:

EXEC sp_dboption;

Look for dbo use only in the list:

Settable database options:
-----------------------------------
ANSI null default
ANSI nulls
ANSI padding
ANSI warnings
arithabort
auto create statistics
auto update statistics
autoclose
autoshrink
concat null yields null
cursor close on commit
db chaining
dbo use only
default to local cursor
merge publish
numeric roundabort
offline
published
quoted identifier
read only
recursive triggers
select into/bulkcopy
single user
subscribed
torn page detection
trunc. log on chkpt.

Remember to turn that option off or your users won't be able to get into the system:

EXEC sp_dboption AdventureWorks2008, 'dbo use only', 'false';

We now have a restored and active database.

Index Maintenance

Back in Chapter 6, we talked about how indexes can become fragmented. This can become a major impediment to the performance of your database over time, and it's something that you need to have a strategy in place to deal with. Fortunately, SQL Server has commands that will reorganize your data and indexes to clean things up. Couple that with the job scheduling that we've already learned about, and you can automate routine defragmentation.

ALTER INDEX is the workhorse of database maintenance. It is simultaneously much easier and slightly harder to use than the previous maintenance mainstay—DBCC—used to be. Let's take a look at this one real quick, and then at how to get it scheduled.

ALTER INDEX

The command ALTER INDEX is somewhat deceptive in what it does. Up until now, ALTER commands have always been about changing the definition of our object. We ALTER tables to add or disable constraints and columns, for example. ALTER INDEX is different; it is all about maintenance and zero about structure. If you need to change the makeup of your index, you still need to either DROP and CREATE it, or you need to CREATE it and use the DROP_EXISTING=ON option.

The ALTER INDEX syntax looks like this:

ALTER INDEX { <index name> | ALL }
    ON <table or view name>
    { REBUILD
        [ [ WITH ( <rebuild index option> [ ,...n ] ) ]
        | [ PARTITION = <partition number>
            [ WITH ( <single partition rebuild index option> [ ,...n ] ) ] ] ]
    | DISABLE
    | REORGANIZE
        [ PARTITION = <partition number> ]
        [ WITH ( LOB_COMPACTION = { ON | OFF } ) ]
    | SET ( <set index option> [ ,...n ] )
    }
[ ; ]
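Before we break the syntax down, it's fair to ask how you'd know an index needs attention in the first place. One way (a minimal sketch; the 10 and 30 percent figures below are common rules of thumb, not anything this chapter establishes as hard limits) is to query the sys.dm_db_index_physical_stats function:

USE AdventureWorks2008;

-- Report fragmentation for every index on Production.TransactionHistory.
-- Rough guidance: under about 10 percent, leave it alone; 10 to 30
-- percent, REORGANIZE; over about 30 percent, consider REBUILD.
SELECT i.name AS IndexName,
       ps.avg_fragmentation_in_percent
FROM sys.dm_db_index_physical_stats(
         DB_ID('AdventureWorks2008'),
         OBJECT_ID('Production.TransactionHistory'),
         NULL, NULL, 'LIMITED') AS ps
JOIN sys.indexes AS i
    ON i.object_id = ps.object_id
    AND i.index_id = ps.index_id;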
A decent amount of this is fairly detailed "realm of the advanced DBA" stuff—usually used on an ad hoc basis to deal with very specific problems. But there are some core elements here that should be part of our regular maintenance planning. We'll start by looking at a couple of the top parameters, and then look at the options that are part of our larger maintenance planning needs.

Index Name

You can name a specific index if you want to maintain one specific index, or use ALL to indicate that you want to perform this maintenance on every index associated with the named table.

Table or View Name

Pretty much just what it sounds like—the name of the specific object (table or view) that you want to perform the maintenance on. Note that it needs to be one specific table. (You can't feed it a list and say "do all of these please!")

REBUILD

This is the "industrial strength" approach to fixing an index. If you run ALTER INDEX with this option, the old index is completely thrown away and reconstructed from scratch. The result is a truly optimized index, where every page in both the leaf and non-leaf levels of the index has been reconstructed as you have defined it (either with the defaults, or using switches to change things like the fill factor).

Careful on this one. As soon as you kick off a REBUILD, the index you are working on is essentially gone until the rebuild is complete. Any queries that relied on that index may become exceptionally slow (potentially by orders of magnitude). This is the sort of thing you want to test on an offline system first to have an idea how long it's going to take, and then schedule to run in off hours (preferably with someone monitoring it to be sure it's back online when peak hours come along).

This one can have major side effects while it runs, and thus it falls squarely in the domain of the database administrator in my not-so-humble opinion.

DISABLE

This one does what it says, only in somewhat drastic fashion. It would be nice if all this command did was take your index offline until you decided further what you want to do, but instead it essentially marks the index as unusable. Once an index has been disabled, it must be rebuilt (not reorganized, but rebuilt) before it will be active again.

This is one you're very, very rarely going to do yourself. (You would more likely just drop the index.) It is far more likely to happen during a SQL Server upgrade or some other oddball situation.

Yet another BE CAREFUL!!! warning on this one: If you disable the clustered index for your table, it has the effect of disabling the table. The data will remain but will be inaccessible by all indexes (since they all depend on the clustered index) until you rebuild the clustered index.

REORGANIZE

BINGO!!! from the developer perspective. With REORGANIZE, we hit much more of a happy medium in life. When you reorganize your index, you get a slightly less complete optimization than you get with a full rebuild, but one that occurs online (users can still utilize the index).

This should, if you're paying attention, bring about the question "What exactly do you mean by 'slightly less complete'?" Well, REORGANIZE only works on the leaf level of your index—non-leaf levels of the index go untouched.
This means that we're not quite getting a full optimization, but, for the lion's share of indexes, that is not where your real cost of fragmentation is (though it can happen, and your mileage may vary).

Given its much lower impact on users, this is usually the tool you'll want to use as part of your regular maintenance plan. Let's take a look at running an index reorganization command.

To run this through its paces, we're going to do a reorg on a table in the AdventureWorks2008 database. The Production.TransactionHistory table is an excellent example of a table that is likely to have many rows inserted over time and then have rows purged back out of it as the transactions become old enough to delete. In this case, we'll reorganize all the indexes on the table in one simple command:

USE AdventureWorks2008;

ALTER INDEX ALL
    ON Production.TransactionHistory
    REORGANIZE;

The ALTER INDEX command sees that ALL was supplied instead of a specific index name, and looks up what indexes are available for our Production.TransactionHistory table (leaving out any that are disabled, since a reorganization will do nothing for them). It then enumerates each index behind the scenes and performs the reorganization on each—reorganizing just the leaf level of each index (including reorganizing the actual data, since the clustered index on this table will also be reorganized).

You should get back essentially nothing from the database—just a simple "Command(s) completed successfully."

Archiving of Data

Ooh—here's a tricky one. There are as many ways of archiving data as there are database engineers. If you're building an OLAP database—for example, to utilize with Analysis Services—then that will often address your archiving for long-term reporting needs. Regardless of how you're making sure the data you need long-term is available, there will likely come a day when you need to deal with the issue of your data becoming too voluminous for your system to perform well.

As I said, there are just too many ways to go about archiving, because every database is a little bit different. The key is to think about archiving needs at the time that you create your database. Realize that, as you start to delete records, you're going to be hitting referential integrity constraints and/or orphaning records—design in a logical path to delete or move records at archive time. Here are some things to think about as you write your archive scripts (a sketch of a simple archive pass follows this list):

* If you already have the data in an OLAP database, then you probably don't need to worry about saving it anywhere else. Talk to your boss and your attorney on that one.
* How often is the data really used? Is it worth keeping? Human beings are natural-born pack rats, just in a larger size. Simply put, we hate giving things up—and that includes our data. If you're only worried about legal requirements, think about just saving a copy of never- or rarely-used data to tape (I'd suggest multiple backups for archive data) and reducing the amount of data you have online—your users will love you for it when they see improved performance.
* Don't leave orphans. As you start deleting data, your referential integrity constraints should keep you from leaving that many orphans, but you'll wind up with some where referential integrity didn't apply. This situation can lead to serious system errors.
* Realize that your archive program will probably need a long time to run. The length of time it runs and the number of rows affected may create concurrency issues with the data your online users are trying to get at—plan on running it at a time when your system will not be used.
* TEST! TEST! TEST!
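To make those last points a bit more concrete, here is a purely illustrative sketch of a single archive pass. It assumes a hypothetical Production.TransactionArchive table with the same column list as Production.TransactionHistory but no IDENTITY property on TransactionID (your own archive design will almost certainly differ):

USE AdventureWorks2008;

-- Move everything older than four years in one atomic unit, so a failure
-- can't leave rows deleted but never archived.
BEGIN TRAN;

INSERT INTO Production.TransactionArchive  -- hypothetical archive table
SELECT *
FROM Production.TransactionHistory
WHERE TransactionDate < DATEADD(year, -4, GETDATE());

DELETE FROM Production.TransactionHistory
WHERE TransactionDate < DATEADD(year, -4, GETDATE());

COMMIT TRAN;

In real life, you'd likely run the DELETE in batches to keep the transaction (and the locks it holds) manageable, but the shape of the operation is the same.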
PowerShell

SQL Server now has support for a command environment known as PowerShell. For those who haven't heard of PowerShell before, it's worth a look well beyond what we'll go into here, so I recommend a good search on the Web.

What is PowerShell? At its most basic level, PowerShell is a classic command-line environment—and is not, on the surface, much different from a Windows Command window. PowerShell, however, is extensible through .NET integration and can be hosted within other applications (much as it is for SQL Server 2008). Examples of applications and operating systems that include special functionality for PowerShell include:

* SQL Server 2008 (why else would we be talking about it, eh?) and above
* Exchange 2007 and above
* Microsoft Office SharePoint Server (MOSS) 2007 and above
* Windows Vista, Windows XP, and Windows Server 2003 (through downloadable add-ons)
* Windows Server 2008 and later, which include it natively or as an option (depending on edition and version)

The extensibility of PowerShell is implemented via what are called cmdlets (pronounced commandlets). These are specialized .NET assemblies that implement functionality for a given application within the PowerShell environment. The real power here is that, through the mix of different cmdlets available to PowerShell, we can create powerful scripts utilizing a mix of operating system commands and functionality that is specific to one or more applications (for example, waiting for confirmation on a load script to the database before kicking off an application hosted in another environment).

PowerShell cmdlets have a standardized command structure based on a verb-noun combination, such as Get-Help or Get-ChildItem. PowerShell also includes a robust help mechanism that is updated regularly (via TechNet).

Trying Out PowerShell

To get a feel for how it works, we're going to take a fairly quick test drive of PowerShell. Start by opening a command prompt window (Start⇒Run and type cmd before hitting Enter). At the command line, type sqlps to start the SQL Server flavor of PowerShell:

C:\Users\Administrator.Kierkegaard>sqlps

There is relatively little indication that you've left the standard command prompt and entered the world of PowerShell. Indeed, the only significant indication (besides the PowerShell header) is the PS prefix on a line that otherwise looks just like your command prompt:

Microsoft SQL Server PowerShell
Version 10.0.1600.22
Microsoft Corp. All rights reserved.

PS SQLSERVER:\>

Let's go ahead and issue our first PowerShell command. We'll simply ask for the help page:

PS SQLSERVER:\> Get-Help

This spews forth a page or so worth of information:

TOPIC
Get-Help

SHORT DESCRIPTION
Displays help about PowerShell cmdlets and concepts.

LONG DESCRIPTION

SYNTAX
get-help {<topic> | <cmdlet name>}
help {<topic> | <cmdlet name>}
<cmdlet name> -?

"Get-help" and "-?" display help on one page.
"Help" displays help on multiple pages.

Examples:
get-help get-process : Displays help about the get-process cmdlet.
get-help about_signing : Displays help about the signing concept.
help where-object : Displays help about the where-object cmdlet.
help about_foreach : Displays help about foreach loops in PowerShell.
match-string -? : Displays help about the match-string cmdlet.
You can use wildcard characters in the help commands (not with -?). If multiple help topics match, PowerShell displays a list of matching topics. If only one help topic matches, PowerShell displays the topic.

Examples:
get-help * : Displays all help topics.
get-help get-* : Displays topics that begin with get-.
help *object* : Displays topics with "object" in the name.
get-help about* : Displays all conceptual topics.

For information about wildcards, type:
get-help about_wildcard

REMARKS
To learn about PowerShell, read the following help topics:
get-command : Displays a list of cmdlets.
about_object : Explains the use of objects in PowerShell.
get-member : Displays the properties of an object.

Conceptual help files are named "about_<topic>", such as:
about_regular_expression.

The help commands also display the aliases on the system.
For information about aliases, type:
get-help about_alias

PS SQLSERVER:\>

This is just basic information about getting help in PowerShell. Little if anything provided in this particular help window is SQL Server specific. We can, however, get help on a cmdlet that runs generic T-SQL commands:

PS SQLSERVER:\> Get-Help Invoke-Sqlcmd

This gets us helpful information about the SQL Server–specific cmdlet called Invoke-Sqlcmd:

NAME
Invoke-Sqlcmd

SYNOPSIS
Runs a script containing statements from the languages (Transact-SQL and XQuery) and commands supported by the SQL Server sqlcmd utility.

SYNTAX
Invoke-Sqlcmd [-ServerInstance <PSObject>] [-Database <String>] [-EncryptConnection] [-Username <String>] [-Password <String>] [[-Query] <String>] [-QueryTimeout <Int32>] [-ConnectionTimeout <Int32>] [-ErrorLevel <Int32>] [-SeverityLevel <Int32>] [-MaxCharLength <Int32>] [-MaxBinaryLength <Int32>] [-AbortOnError] [-DedicatedAdministratorConnection] [-DisableVariables] [-DisableCommands] [-HostName <String>] [-NewPassword <String>] [-Variable <String[]>] [-InputFile <String>] [-OutputSqlErrors] [-SuppressProviderContextWarning] [-IgnoreProviderContext] [<CommonParameters>]

DETAILED DESCRIPTION
Runs a script containing the languages and commands supported by the SQL Server sqlcmd utility. The languages supported are Transact-SQL and the XQuery syntax supported by the Database Engine. Invoke-Sqlcmd also accepts many of the commands supported by sqlcmd, such as GO and QUIT. Invoke-Sqlcmd accepts the sqlcmd scripting variables, such as SQLCMDUSER. Invoke-Sqlcmd does not set sqlcmd scripting variables by default.

Invoke-Sqlcmd does not support the sqlcmd commands primarily related to interactive script editing. The commands not supported include :!!, :connect, :error, :out, :ed, :list, :listvar, :reset, :perftrace, and :serverlist.

The first result set the script returns is displayed as a formatted table. Result sets after the first are not displayed if their column list is different from the column list of the first result set. If result sets after the first set have the same column list, their rows are appended to the formatted table that contains the rows that were returned by the first result set.

Invoke-Sqlcmd does not return message output, such as the output of PRINT statements, unless you use the PowerShell -Verbose parameter.

RELATED LINKS
SQL Server Books Online: Transact-SQL Reference
SQL Server Books Online: sqlcmd Utility
SQL Server Books Online: XQuery Reference

REMARKS
For more information, type: "get-help Invoke-Sqlcmd -detailed".
For technical information, type: "get-help Invoke-Sqlcmd -full".
PS SQLSERVER:\>

Let's take a quick look at this using a relatively simple system stored procedure (sp_helpdb):

PS SQLSERVER:\> Invoke-Sqlcmd -Query "EXEC sp_helpdb"

sp_helpdb provides a listing of all databases in the system. Normally we would see a column-oriented result set, but PowerShell has reoriented the output in a manner that is much more suitable to the limited number of characters a command window can display:

name : AdventureWorks2008
db_size : 245.81 MB
owner : sa
dbid : 7
created : Dec 6 2008
status : Status=ONLINE, Updateability=READ_WRITE, UserAccess=MULTI_USER, Recovery=SIMPLE, Version=655, Collation=SQL_Latin1_General_CP1_CI_AS, SQLSortOrder=52, IsAnsiNullsEnabled, IsAnsiPaddingEnabled, IsAnsiWarningsEnabled, IsArithmeticAbortEnabled, IsAutoCreateStatistics, IsAutoUpdateStatistics, IsFullTextEnabled, IsNullConcat, IsQuotedIdentifiersEnabled, IsPublished
compatibility_level : 100

name : AdventureWorksDW2008
db_size : 71.06 MB
owner : sa
dbid : 8
created : Dec 6 2008
status : Status=ONLINE, Updateability=READ_WRITE, UserAccess=MULTI_USER, Recovery=SIMPLE, Version=655, Collation=SQL_Latin1_General_CP1_CI_AS, SQLSortOrder=52, IsAnsiNullsEnabled, IsAnsiPaddingEnabled, IsAnsiWarningsEnabled, IsArithmeticAbortEnabled, IsAutoCreateStatistics, IsAutoUpdateStatistics, IsFullTextEnabled, IsNullConcat, IsQuotedIdentifiersEnabled
compatibility_level : 100

name : AdventureWorksLT2008
db_size : 7.13 MB
owner : sa
dbid : 9
created : Dec 6 2008
status : Status=ONLINE, Updateability=READ_WRITE, UserAccess=MULTI_USER, Recovery=SIMPLE, Version=655, Collation=SQL_Latin1_General_CP1_CI_AS, SQLSortOrder=52, IsAnsiNullsEnabled, IsAnsiPaddingEnabled, IsAnsiWarningsEnabled, IsArithmeticAbortEnabled, IsAutoCreateStatistics, IsAutoUpdateStatistics, IsFullTextEnabled, IsNullConcat, IsQuotedIdentifiersEnabled
compatibility_level : 100

name : tempdb
db_size : 8.75 MB
owner : sa
dbid : 2
created : Dec 31 2008
status : Status=ONLINE, Updateability=READ_WRITE, UserAccess=MULTI_USER, Recovery=SIMPLE, Version=655, Collation=SQL_Latin1_General_CP1_CI_AS, SQLSortOrder=52, IsAutoCreateStatistics, IsAutoUpdateStatistics
compatibility_level : 100

PS SQLSERVER:\>

I have, for the sake of brevity, snipped a few databases out of the middle of the result sets here, but you can see how we were able to execute virtually any command from within PowerShell. Many commands will, over time, have specific cmdlets supporting them—supporting stronger typing and parameterization. For now, most implemented cmdlets support four major object models:

* The Database Engine: This allows you to navigate a given server.
* Policy-Based Management: The rules-based management tool that is new with SQL Server 2008 (we will discuss this in brief in our next major section).
* Database Collection: This contains the meat of manipulating a given database or set of databases.
* Server Registration: This is all about identifying servers and registering them locally to make them somewhat easier to access.

Through the use of these object models, PowerShell can provide scripted access to almost any administrative task. Watch for more specific support and help to be added via download over the life cycle of SQL Server 2008.
Navigating in PowerShell

PowerShell also provides the ability to navigate in a more directory-like fashion than we have previously experienced with SQL Server. Indeed, you can think of the SQL Server world as one large hierarchy (much as a domain/directory structure is). You can navigate from a collection of registered servers to specific servers, and, from there, to roles and users on that server, or perhaps to databases and objects within the database.

Let's check this out real quick by issuing a simple dir command, much as you would in a command window for the operating system:

PS SQLSERVER:\> dir

This may surprise you by providing a listing of the four object model areas I mentioned at the end of the previous section:

Name            Root                        Description
----            ----                        -----------
SQL             SQLSERVER:\SQL              SQL Server Database Engine
SQLPolicy       SQLSERVER:\SQLPolicy        SQL Server Policy Management
SQLRegistration SQLSERVER:\SQLRegistration  SQL Server Registrations
DataCollection  SQLSERVER:\DataCollection   SQL Server Data Collection

PS SQLSERVER:\>

We can actually navigate these just as we would a directory structure in Windows—for example:

PS SQLSERVER:\> cd SQL

You should quickly notice that we are moving down a directory structure:

PS SQLSERVER:\SQL>

Let's jump forward a bit, and navigate much deeper into the tree. We'll need to navigate through our specific server (mine is KIERKEGAARD; you should replace it with the name of your SQL Server system) and instance (mine is the default, so I'll refer to it as DEFAULT), and on into the DATABASES node (we could also go after other server-level objects, such as LOGINS):

PS SQLSERVER:\SQL> cd KIERKEGAARD\DEFAULT\DATABASES

We cut straight down to the databases node of the hierarchy just as if we were navigating a directory structure:

PS SQLSERVER:\SQL\KIERKEGAARD\DEFAULT\DATABASES>

But it gets better. We can issue a directory listing (in the form of the dir command) and get a list of databases, much like the one we created using sp_helpdb earlier in the chapter (albeit not quite as verbose):

PS SQLSERVER:\SQL\KIERKEGAARD\DEFAULT\DATABASES> dir

This gets us:

WARNING: column "Owner" does not fit into the display and was removed.

Name                 Status  Recovery Model  CompatLvl  Collation
----                 ------  --------------  ---------  ---------
AdventureWorks2008   Normal  Simple          100        SQL_Latin1_General_CP1_CI_AS
AdventureWorksDW2008 Normal  Simple          100        SQL_Latin1_General_CP1_CI_AS
AdventureWorksLT2008 Normal  Simple          100        SQL_Latin1_General_CP1_CI_AS
AWSubscriber         Normal  Full            100        SQL_Latin1_General_CP1_CI_AS
OurInsteadOfTest     Normal  Full            100        SQL_Latin1_General_CP1_CI_AS
ReportServer         Normal  Full            100        Latin1_General_CI_AS_KS_WS
ReportServerTempDB   Normal  Simple          100        Latin1_General_CI_AS_KS_WS
Test                 Normal  Full            100        SQL_Latin1_General_CP1_CI_AS

PS SQLSERVER:\SQL\KIERKEGAARD\DEFAULT\DATABASES>

This is, of course, a pretty simplistic example, but it can be taken much further. For example, PowerShell will allow you to enumerate a list such as the directory listing we just created. You can then script different behaviors depending on the contents of the list.

A Final Word on PowerShell

As I write this, PowerShell is, from a SQL Server point of view, just getting started.
The documentation on the cmdlets available is still rather sparse, but new items are being published regularly, and the nature of the PowerShell model is such that Microsoft will be able to continue extending the functionality within PowerShell even before Kilimanjaro (the code name for the next release of SQL Server) is released.

I highly recommend watching the SQL Server community on the Internet (or just Google "SQL Server PowerShell" from time to time) to keep an eye on what's new and where this new scripting tool is going. I can say, for example, that it is quickly becoming my preferred installation and upgrade scripting environment!

Policy-Based Management

Policy-Based Management—known during much of the beta phase of SQL Server 2008 as the Distributed Management Framework—is a rules-based management infrastructure primarily aimed at the management of SQL Server farms in larger enterprises. The concept is pretty simple: There are too many SQL Servers out there managed by too many different people (often completely different IT departments or DBAs that don't even know anything about the other servers and DBAs out there), so why not allow all your SQL Servers to police themselves according to a set of "policies"? What is enforced by the policies can vary, ranging from things as simple as object-naming guidelines to blocking specific changes to server settings. The management engine can just note violations of policy (simply being able to report on them), or it can actually block or reverse the change.

The full effect of Policy-Based Management on the developer community is yet to be seen. I suspect that there are going to be some good scripting applications for it, but how exactly Policy-Based Management is going to be rolled out and just how enforcement policies are going to be implemented is something still being explored in many companies. For now, all I can say is that all of the Policy-Based Management features are exposed through SMO (in the Microsoft.SqlServer.Management.Dmf library) and through PowerShell. The documentation of the object model outside of Management Studio leaves much to be desired, but a number of individual functions are documented in Books Online, and I suspect updates to Books Online over the life of SQL Server 2008 will help fill in the picture of the Policy-Based Management object model.

Summary

Well, that gives you a few things to think about. It's really easy as a developer to think about many administrative tasks and establish what the inaccurately named Hitchhiker's Guide to the Galaxy trilogy called an "SEP" field. That's something that makes things like administration seem invisible because it's "somebody else's problem." Don't go there!

A project I'm familiar with from several years ago is a wonderful example of what can happen when no one takes responsibility for these things. A wonderful system was developed for a nonprofit group that operates in the northwestern United States. After about eight months of operation, an emergency call was placed to the company that developed the software (it was a custom job). After some discussion, it was determined that the database had somehow become corrupted, and it was recommended to the customer that the database be restored from a backup. The response? "Backup?"
The development company in question missed something very important—they knew they had an inexperienced customer who would have no administration staff, and who was going to tell the customer to do backups and help set them up if the development company didn't? I'm happy to say that the development company in question learned from that experience—and so should you.

Think about administration issues as you're doing your design and especially in your deployment plan. If you plan ahead to simplify the administration of your system, you'll find that your system is much more successful—and that usually translates into rewards for the developer (that is, you!).

23

SMO: SQL Management Objects

It's been a long road, and we're getting closer and closer to the end of our walk through SQL Server. It is, of course, no coincidence that the chapter about how to manage your SQL Server programmatically has been held until very close to the end. Among other things, we needed to have a solid idea as to what objects we were managing and what administrative needs we had before we were ready to understand the SMO object model and talk about some of the reasons we might want to use SMO.

So, what exactly is SMO? Well, as the title of this chapter implies, SMO is an object model for managing SQL Server. Whereas connectivity models like ADO and LINQ are all about accessing data, SMO is all about access to the structure and health of your system.

In this chapter, we'll look at:

 * The convoluted history of SQL Server management object models
 * The basics of the SQL SMO object model
 * A simple SMO example project

As with many of the SQL Server topics we cover in this book, SQL SMO is a book unto itself, so please do not expect to come out of this chapter as an expert. That said, hopefully, you will have the fundamentals down at least to the point where you know what's possible and how much work is likely to be involved. From there, you can look for sources of more information as necessary.

The History of SQL Server Management Object Models

This is, to me—even as someone who genuinely loves the product—not an area where SQL Server shines. This is not to say that SMO is a bad thing but rather that the history of SQL Server management object models is a rather sordid one indeed. The team has had a tough time picking a horse and sticking with it.

As I write this, I've been working with SQL Server for just under 15 years. In that time, the methods of managing SQL Server have changed several different times. "A new release? A new management method!" could be the motto for SQL Server.

The good news is that SMO, at least so far, seems to be here to stay. It's on its second version as the primary object model for managing SQL Server (I know it's sad that two releases' worth seems like an accomplishment). Still, there are other models that remain out there, so let's look at the highlights from the last couple of releases. These are some of the different models and technologies you may bump into as you work on legacy code.

SQL Distributed Management Objects

Distributed Management Objects, or DMO, is the relative "old dog" of the management models. When you think of the old Enterprise Manager from SQL Server 2000 and earlier, most of its underlying functionality ended up in a DMO call.
The DMO model supported COM, and could perform all the basic tasks you might want management-wise, such as:

 * Start a backup
 * Restore from backup
 * Create a database
 * Create jobs and other agent-related tasks
 * Reverse engineer tables into SQL code

The list goes on.

So, what went wrong with DMO? Well, the object model was often deemed "clunky" at best. Indeed, parts of DMO often did not work well together, and the scripting engine was buggy. In short, most developers I know only used DMO after going through an electroshock therapy program to desensitize them to the pain of it (okay, it wasn't that bad, but not far from it).

SQL Namespaces

SQL Namespaces (SQL NS) is actually largely about providing UI-level functionality. SQL NS encapsulates all of the functionality that you would find in the old Enterprise Manager—complete with the UI elements. You instantiate the UI objects, and those objects utilize SQL DMO underneath, removing that layer of programming from the equation. In short, if you needed to build a tool that already had the UI to do management tasks, then SQL NS was your tool. The problem? Well, put it this way—EM? They decided they needed to replace it. DMO? They decided they needed to replace it, too. As you can guess, apparently not even Microsoft was all that impressed.

Now, lest I sound like all I am is a Microsoft basher or that I think EM was a bad product, I'll put it this way: EM was a fairly good "first shot at it." None of the RDBMS systems out there had anything remotely as powerful and useful as Enterprise Manager was when it first came out—it was a huge part of why SQL Server has been perceived as so much more usable than, say, Oracle (although Oracle has certainly made inroads in the management area). That usability, coupled with what used to be a very cheap price tag, is a big part of Microsoft's success with SQL Server.

EM did, however, have a number of flaws that became more and more obvious as the Windows era taught us what a Windows application should look and act like.

Windows Management Instrumentation

Windows Management Instrumentation (WMI) is very different from the other management objects we've talked about thus far in the sense that it is not SQL Server specific, but, rather, an implementation of a management scripting model that was already taking hold to manage servers across Windows and beyond.

WMI is an implementation of the industry open standard Web-Based Enterprise Management (WBEM) initiative. WBEM goes well beyond Microsoft products, and the idea was that server administrators would be able to learn one core scripting model and manage all of their servers with it. Exchange, SQL Server, Windows O/S features, and more—it was all going to be managed using WMI (and, indeed, most of it can be).

Going into SQL Server 2000, the message was clear: WMI was the future. Many of the SQL Server stalwarts (like me) were told over and over again—DMO would be going away (well, that much turned out to be true), and we should do any new management in WMI (that much turned out to be not so true).

The reality is that WMI was never fully implemented for SQL Server, but what there is of it will also not go away any time soon. WMI is, as I've said, an industry standard, and many other Windows servers use WMI for configuration management. Having WMI available for the configuration fundamentals makes a lot of sense, and, for that space, it's likely here to stay (with no complaints from me).
It's worth noting that WMI is now implemented as a layer over SMO—go figure.

SMO

It's unclear to me exactly when Microsoft decided to make the move to SMO. What I can say is that they knew they had a problem: DMO was clearly at the end of its useful life, and a complete rewrite of Enterprise Manager was already planned for SQL Server 2005. At the same time, WMI was clearly not going to address everything that needed to be done. (WMI is configuration oriented, but SQL Server needs more administrative love than WMI was likely to give in any kind of usable way.)

So, as SQL Server 2000 was coming to market, .NET was already clearly on the horizon. What would become Visual Studio 2005 was already in heavy design. C# was already being sold as the programming language of the future. The decision was made to use Visual Studio plug-ins as the management center (indeed, you still see that very clearly for Reporting, Integration, and somewhat for Analysis Services).

In the end, what we have in SMO is a very useful set of .NET assemblies. Management Studio has gone back to being its own thing (being too tied in to Visual Studio apparently didn't work out so well, but I like the decision to keep them separate), but it is based on Visual Studio, and leverages several Visual Studio notions right down to the IntelliSense that became part of the product in SQL Server 2008. The services that require the notion of a designer use Business Intelligence Development Studio, which is still basically a set of projects, controls, and templates for Visual Studio (indeed, it says Visual Studio as you start it up).

My guess? Well, depending on how long it is before SQL Server goes to a new version again, I think it's safe to say you can count on SMO as being the object model for no less than another 1–2 releases. There is no replacement on the horizon, and SMO looks very viable (no reason to replace it in the foreseeable future). In short, you should be able to count on it for at least 5–10 years, which is about as much as anyone can hope for anything in the software business.

Even though it's safe to assume SMO will be around for at least a few more releases, it's worth noting that SMO is not 100% code compatible from release to release. For example, certain classes that were part of the core Microsoft.SqlServer.Smo.dll have been moved to a new file called Microsoft.SqlServer.SmoExtended.dll. If you don't have the new reference as part of your project, then things will break when you compile using the SQL Server 2008 libraries.

The SMO Object Model

Server Management Objects, or SMO, replaces DMO. That said, SMO goes well beyond anything DMO was conceived to do. Beyond basic configuration or even statement execution, SMO has some truly advanced features such as:

 * Event handling: SMO supports the notion of trapping events that are happening on the server and injecting code to handle the event situation.
 * The ability to address types of objects in your server as collections (making it easy to enumerate them and provide consistent and complete treatment for all objects of that type).
 * The ability to address all of the various server objects that are part of SQL Server in a relatively consistent manner.

Like all object models, SMO establishes something of a hierarchy among objects. Because SQL Server is such a complex product, there are many, many objects to consider. Figure 23.1 includes an example of the hierarchy of what I would consider to be "core" objects in SQL Server.
Note that this is not at all a comprehensive list! If you want a diagram with everything, check Books Online (they have one that isn't bad, though it's not great either—at least it's complete). This is my attempt at giving you something that is more readable and has all the core objects plus a few.

Figure 23.1

Walking through Some Examples

This may well be the messiest section in the entire book in terms of hearing me "talk" about things, as it includes a ton of Visual Studio stuff that goes well beyond what is built into the base SQL Server Business Intelligence Studio.

You must have some version of Visual Studio .NET in order to actually build these examples yourself. Not to fear, however, if you don't—I do show all the lines of code here, so you can at least look them over.

Also, the following examples are done in C#, but the basic object references and method calls are the same—conversion to VB or C++ should be simple for those more comfortable in those languages.

What we're going to be doing in this section is building up a little application that does a number of different "basics" that you might be interested in. The things that will happen at least once across these various actions include:

 * Creating a reference to a specific server, including a connection to a server using a trusted connection
 * Creating an entirely new database
 * Creating tables in a database
 * Creating primary key constraints for those tables
 * Creating a foreign key referencing from one table to another
 * Dropping a database
 * Backing up a database
 * Scripting a database object

Each of these is a hyper-simplified version of what is required. Keep in mind that each of the objects I reference here has many more possible properties and methods to be set. For example, in the scripting example, we could play around with scripting options to change which property-related commands do and do not appear in the script.

Getting Started

Start by creating a new Windows Application project in Visual Studio. I called mine SQLSMOSample. In order to make use of the SMO assemblies, you'll need to set references in your project to at least five assemblies:

 * Microsoft.SqlServer.ConnectionInfo
 * Microsoft.SqlServer.Management.Sdk.Sfc
 * Microsoft.SqlServer.Smo
 * Microsoft.SqlServer.SmoExtended
 * Microsoft.SqlServer.SqlEnum

Setting a reference is as easy as right-clicking References in the Solution Explorer (or in the Project menu) and choosing Add Reference. Select the five assemblies in the preceding list, and click OK.

For my example, all of my code is, for simplicity's sake, done in a Form called frmMain. In most cases, you would want to set up separate component files for your methods and just call them from a form as needed.

Declarations

We need to add using declarations for a few of the management libraries to make it simple to utilize those objects in our code:

using Microsoft.SqlServer.Management.Smo;
using Microsoft.SqlServer.Management.Common;
using Microsoft.SqlServer.Management.Smo.SqlEnum;

This will allow us to reference several objects within these libraries without having to fully qualify them.

Basic Connection and Server References

There is a block of code you will see me reuse in every one of the methods we'll create in this chapter. The purpose of the code is to establish a connection and a server reference—everything we do will need these.
In practice, we would likely establish one or more connections that would be global to the application rather than to a specific method, but, again, I am trying to keep the code blocks somewhat independent, so that you can look at them individually.

The connection and server reference code looks like this:

// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

Creating a Database

Creating a database is pretty straightforward. In the implementation that follows, I create a Database object and immediately initialize it with a reference to our svr Server object. Note, however, that all I am creating is a database definition object. The database itself is not actually created on the server until we call the Create() method of the database object. So, in short, we define the object, modify the various properties that define it, and then, and only then, do we call the Create() method to actually create the database on the server that is referenced in our Server object.

Drop a button onto the main form—I've called mine btnCreateDB—and you're ready to add some code. A simple method to create the database might look like this:

private void btnCreateDB_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

Database db = new Database();
db.Parent = svr;
db.Name = "SMODatabase";
db.Create();

txtResult.Text = "Database Created";
cn.Disconnect();
}

I've established a generic database object. I then associated it with a specific server, gave the logical name for the database, and then created it.

The result is really nothing different than if we had connected to our server and issued the command:

CREATE DATABASE SMODatabase

We wind up with an empty database that is created completely with defaults. We could, however, have set things like the physical file location (including creating it with multiple filegroups), default collation, growth and size properties—basically anything you normally think of as a property of the database. More importantly, however, we are operating in a native .NET environment, so any errors, success messages, or other notifications can be handled easily within our client language.

Creating Tables

In this example, I'm going to add a pair of tables to our empty SMODatabase. We'll add ParentTable and ChildTable. ChildTable will have a foreign key to ParentTable. Both will have primary keys.

First, we'll need to set a reference to the database we want to create our tables in:

private void btnCreateTables_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

// Get a reference to our test SMO Database
Database db = svr.Databases["SMODatabase"];

Notice that this time I did not create the Database object as "new." Instead, I associated it with an existing database object from our referenced Server object.

From there, I create a new table object. Much as when we created the Database object, all we are doing is creating an object definition in our application.
No table will be created in the database until after we've fully defined our Table object and called its Create() method.

// Create Table object, and begin defining said table
Table ParentTable = new Table(db, "ParentTable");

Now we're ready to start adding some meat to the definition of our table. Unlike a database, which has enough defaults that you really only need to specify a name to create one (the rest will just be copied from the model database), tables require a lot of specification—specifically, a table needs at least one column.

Let's add a column that will eventually serve as our primary key:

// Build up the table definition
Column ParentKey = new Column(ParentTable, "ParentKey");
ParentKey.DataType = DataType.Int;
ParentKey.Nullable = false;
ParentKey.Identity = true;

We've created a new column object. It has been templated from the ParentTable and named ParentKey. I've given it a data type of int, made it non-nullable, and defined it as an IDENTITY column.

Even though we've templated the column from the ParentTable, it is not yet associated directly with that table! The templating reference just helps establish what the initial property values are for the column (such as collation).

Now let's add another column called ParentDescription:

Column ParentDescription = new Column(ParentTable, "ParentDescription");
ParentDescription.DataType = DataType.NVarCharMax;
ParentDescription.Nullable = false;

Again, the column is created, but not directly associated with the Table object yet—let's take care of that now:

// Now actually add them to the table definition
ParentTable.Columns.Add(ParentKey);
ParentTable.Columns.Add(ParentDescription);

It is not until we add them to the Columns collection of the Table object that they become directly associated with that table.

So, we have a table object defined, and it has two columns associated with it. What we need now is a primary key.

// Add a Primary Key
Index PKParentKey = new Index(ParentTable, "PKParentKey");
PKParentKey.IndexKeyType = IndexKeyType.DriPrimaryKey;
PKParentKey.IndexedColumns.Add(new IndexedColumn(PKParentKey, "ParentKey"));
ParentTable.Indexes.Add(PKParentKey);

Notice that we're defining the primary key as an index rather than as anything explicitly called a constraint. Instead, we define the index, and then tell the index (via its IndexKeyType) that it is a primary key. When the index is created, the constraint definition will also be added.

Primary and Unique constraints are not added specifically as constraints. They are, instead, added as indexes with an IndexKeyType that implies that they are to be added as a constraint rather than a raw index.

Much like our columns, the primary key is not directly associated with the table until we explicitly add it to the Indexes collection of our table.

With all that done, we're ready to create our table:

ParentTable.Create();

It is at this point that the table is physically created in the database.
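As a quick aside, a UNIQUE constraint works exactly the same way, just with a different IndexKeyType. This is a sketch of my own rather than part of the example project (the ParentCode column is hypothetical, and the code would need to run before the ParentTable.Create() call):

// Hypothetical fixed-width column to hold an alternate key
// (nvarchar(max) columns can't serve as index key columns, so we add one that can)
Column ParentCode = new Column(ParentTable, "ParentCode");
ParentCode.DataType = DataType.NVarChar(20);
ParentCode.Nullable = false;
ParentTable.Columns.Add(ParentCode);

// DriUniqueKey yields a UNIQUE constraint rather than a raw index
Index UQParentCode = new Index(ParentTable, "UQParentCode");
UQParentCode.IndexKeyType = IndexKeyType.DriUniqueKey;
UQParentCode.IndexedColumns.Add(new IndexedColumn(UQParentCode, "ParentCode"));
ParentTable.Indexes.Add(UQParentCode);

Okay, with our parent table created, we're ready to add our child table.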
The code up through the creation of the primary key looks pretty much just as it did for the ParentTable object:

// Create Table object for child, and begin defining said table
Table ChildTable = new Table(db, "ChildTable");

// Build up the Child table definition
Column ChildParentKey = new Column(ChildTable, "ParentKey");
ChildParentKey.DataType = DataType.Int;
ChildParentKey.Nullable = false;

Column ChildKey = new Column(ChildTable, "ChildKey");
ChildKey.DataType = DataType.Int;
ChildKey.Nullable = false;

Column ChildDescription = new Column(ChildTable, "ChildDescription");
ChildDescription.DataType = DataType.NVarCharMax;
ChildDescription.Nullable = false;

// Now actually add them to the table definition
ChildTable.Columns.Add(ChildParentKey);
ChildTable.Columns.Add(ChildKey);
ChildTable.Columns.Add(ChildDescription);

// Add a Primary Key that is a composite key
Index PKChildKey = new Index(ChildTable, "PKChildKey");
PKChildKey.IndexKeyType = IndexKeyType.DriPrimaryKey;
PKChildKey.IndexedColumns.Add(new IndexedColumn(PKChildKey, "ParentKey"));
PKChildKey.IndexedColumns.Add(new IndexedColumn(PKChildKey, "ChildKey"));
ChildTable.Indexes.Add(PKChildKey);

But with ChildTable, we want to add a twist in the form of a foreign key. To do this, we create a ForeignKey object:

// Add a Foreign Key
ForeignKey FKParent = new ForeignKey(ChildTable, "FKParent");

And then create ForeignKeyColumn objects to add to the ForeignKey object.

// The first "ParentKey" in the definition below is the name in the current table
// The second is the name (of just the column) in the referenced table.
ForeignKeyColumn FKParentParentKey = new ForeignKeyColumn(FKParent,
"ParentKey", "ParentKey");
FKParent.Columns.Add(FKParentParentKey);

Next, set a reference to the table the foreign key refers to:

FKParent.ReferencedTable = "ParentTable";
// I could have also set a specific schema, but since the table was created
// using just a default schema, I'm leaving the table reference to the default
// also. Both would be resolved using whatever the user's default schema is.

/*
** Note that there are several other properties we could define here
** such as CASCADE actions. We're going to keep it simple for now.
*/

Then actually add the foreign key to the table and create it:

ChildTable.ForeignKeys.Add(FKParent);
ChildTable.Create();

cn.Disconnect();
txtResult.Text = "Tables Created";
}

I recognize that this probably seems convoluted compared to just connecting and issuing a CREATE TABLE statement, but there are several advantages:

 * If you are dynamically building a table, you can encapsulate the various parts of the table construction more easily than trying to do string manipulation.
 * Changes to the properties of the various objects involved are far less sensitive to specific order of execution than trying to build a string would be.
 * All the properties remain discrete, so they are easily addressed and edited without significant string manipulation.
 * It is the SMO way of doing things—if the other actions you're taking are already in SMO, then doing things consistently in SMO is probably going to yield less confusion than if you mix string-based commands with SMO commands.
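Before moving on, here is a quick sanity check of my own (a minimal sketch that reuses the db reference from above; run it before the Disconnect() call). It enumerates what we just built through the same object model:

// List the tables we just created, along with their columns
foreach (Table t in db.Tables)
{
    txtResult.Text += t.Name + "\r\n";
    foreach (Column c in t.Columns)
    {
        txtResult.Text += "    " + c.Name + " (" + c.DataType.SqlDataType + ")\r\n";
    }
}

Dropping a Database

As with most drop situations, this one is pretty straightforward.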
We start with our now-familiar server and connection info and then set a reference to the database we're interested in:

private void btnDropDB_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

Database db = svr.Databases["SMODatabase"];

Then just call the Drop() method and we're done:

db.Drop();
txtResult.Text = "Database Dropped";
cn.Disconnect();
}

Note that we do not have any error trapping added here (there really isn't anything different here from the error-trapping issues in your language of choice). You may run into some issues dropping the database if you still have connections open to that database elsewhere in this or other applications (such as Management Studio). I encourage you to experiment with this and what you might do in your error handler (remember, we have robust error handling in most .NET languages), such as identifying and killing all connections that have locks on the database we want to drop (the Server object's KillAllProcesses method is worth a look here).

Backing Up a Database

For this one, we're actually going to switch over and use the AdventureWorks database just to give us something meatier to back up.

As you might suspect from how many different objects we've seen so far, the Backup object is its own thing. It is considered a child of the Server object but has its own set of properties and methods.

To create a backup, you start with the same server connection code that we've seen several times now:

private void btnBackupDB_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

We're then ready to create a new Backup object. Note that, unlike the Database object, which we associated with a server early on, we don't need to reference a specific server for our Backup object until we actually go to execute the backup.

// Create and define backup object
Backup bkp = new Backup();
bkp.Action = BackupActionType.Database;
bkp.Database = "AdventureWorks2008";
bkp.Devices.AddDevice(@"c:\SMOSample.bak", DeviceType.File);

I've created the Backup object and told it what kind of a backup it should expect to do (a Database backup as opposed to, say, a Log backup). I've also set what database it's going to be backing up and defined a device for it to use.

Note that, while here I defined a file device and path on the fly, you could just as easily connect to the server and query what devices are already defined on the server and then select one of those for your backup. Similarly, the device could be of a different type—such as a tape.

Now we're ready to execute the backup. We have two different methods available for this:

 * SqlBackup: This is a synchronous backup—your code will not gain control again until the backup either completes or errors out.
 * SqlBackupAsync: This tells the server to start the backup and then returns control to your application as soon as the server accepts the backup request as being valid (the backup will then run in the background). It's important to note that you do have the ability to receive notifications as the backup reaches completion points (you can define the granularity of those completion points), as sketched below.

I've chosen the asynchronous backup method in my example.
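I haven't wired up those notifications in this example, but a rough sketch using the Backup object's PercentComplete and Complete events might look something like this (placed before the backup call itself):

// Hypothetical progress wiring: report every 10 percent
bkp.PercentCompleteNotification = 10;
bkp.PercentComplete += (s, args) =>
    Console.WriteLine("Backup " + args.Percent + "% complete");
bkp.Complete += (s, args) =>
    Console.WriteLine("Backup complete");

With or without that wiring in place, actually kicking off the backup looks like this: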
// Actually start the backup. Note that I've said to do this Asynchronously.
// I could easily have made it synchronous by choosing SqlBackup instead.
// Also note that I'm telling it to initialize (overwrite the old if it's
// there). Without the initialize, it would append onto the existing file if found.
bkp.Initialize = true;
bkp.SqlBackupAsync(svr);

cn.Disconnect();
}

After you've run this, go take a look for the SMOSample.bak file in the root of your C: drive and it should be there! Also try running the backup multiple times and notice that it is overwritten each time. If we removed the bkp.Initialize command, then each new backup would append to the existing file.

Scripting

Perhaps one of the most compelling abilities that SMO offers the true developer crowd is the ability to script out objects that are already in the database. Indeed, SMO can script out backups, reverse engineer tables, and even record the statements being sent to the server.

For our example, we're going to reverse engineer a script for the HumanResources.Employee table in the AdventureWorks database. We'll see just how easily even a relatively complex table definition can be scripted out for other use.

We start with the same server, connection, and database reference code we've used several times in this chapter:

private void btnScript_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

// Now define the database we want to reference the table from.
Database db = svr.Databases["AdventureWorks2008"];

Next, we set a reference to the table that we want to script out—we could just as easily be scripting out a different type of SQL Server object such as a stored procedure, a view, or even a database. Indeed, it can even be a server-level object such as a device or login.

// Get a reference to the table. Notice that schema is actually the *2nd*
// parameter, not the first.
Table Employee = db.Tables["Employee", "HumanResources"];

We're then ready to call the Script() method. The only real trick here is to realize that it returns not just a single string but rather a collection of strings. In order to receive this, we'll need to set up a variable of the proper StringCollection type, which is not defined in any of our using declarations; we will, therefore, need to fully qualify that variable declaration.

// Call the Script method. The issue with this is that it returns a string
// *collection* rather than a string. We'll enumerate it into a string
// shortly.
System.Collections.Specialized.StringCollection script = Employee.Script();

Okay, so we've received our script, but now we want to take a look. I'll define a holding variable and copy all of the separate strings into just one string to use in a MessageBox:

string MyScript = "";
foreach (string s in script)
{
MyScript = MyScript + s + "\r\n";
}

// Now show what we got out of it - very cool stuff.
MessageBox.Show(MyScript);

cn.Disconnect();
}

Execute this, and you get a very usable script returned, as shown in Figure 23.2.

Figure 23.2
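One more note on scripting: if you want to control what makes it into the script, Script() also accepts a ScriptingOptions object. The option properties below do exist in SMO, though treat the exact combination as an illustrative sketch of my own:

// Script the table along with its indexes and DRI (keys, checks, defaults)
ScriptingOptions options = new ScriptingOptions();
options.Indexes = true;
options.DriAll = true;
options.SchemaQualify = true;

System.Collections.Specialized.StringCollection fullScript = Employee.Script(options);

Pulling It All Together

Okay, we looked at the code in fragments, so I wanted to provide something of a reference section to show what all my code looked like when pulled together.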
How you choose to do your form is up to you, but mine looks like Figure 23.3. Which buttons are which should be self-evident from the button names you'll see in the code. The very bottom box is a text box that I called txtResult in the code.

Figure 23.3

Following is my entire form code:

using System;
using System.Text;
using System.Windows.Forms;
using Microsoft.SqlServer.Management.Smo;
using Microsoft.SqlServer.Management.Common;
using Microsoft.SqlServer.Management.Smo.SqlEnum;

namespace SQLSMOSample
{
public partial class frmMain : Form
{
public frmMain()
{
InitializeComponent();
}

private void btnBackupDB_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

// Create and define backup object
Backup bkp = new Backup();
bkp.Action = BackupActionType.Database;
bkp.Database = "AdventureWorks2008";
bkp.Devices.AddDevice(@"c:\SMOSample.bak", DeviceType.File);

// Actually start the backup. Note that I've said to do this Asynchronously.
// I could easily have made it synchronous by choosing SqlBackup instead.
// Also note that I'm telling it to initialize (overwrite the old if it's there).
// Without the initialize, it would append onto the existing file if found.
bkp.Initialize = true;
bkp.SqlBackupAsync(svr);

cn.Disconnect();
}

private void btnCreateDB_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

Database db = new Database();
db.Parent = svr;
db.Name = "SMODatabase";
db.Create();

txtResult.Text = "Database Created";
cn.Disconnect();
}

private void btnScript_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

// Now define the database we want to reference the table from.
Database db = svr.Databases["AdventureWorks2008"];

// Get a reference to the table. Notice that schema is actually the *2nd* parameter,
// not the first.
Table Employee = db.Tables["Employee", "HumanResources"];

// Call the Script method. The issue with this is that it returns a string
// *collection* rather than a string. We'll enumerate it into a string shortly.
System.Collections.Specialized.StringCollection script = Employee.Script();

string MyScript = "";
foreach (string s in script)
{
MyScript = MyScript + s + "\r\n";
}

// Now show what we got out of it - very cool stuff.
//MessageBox.Show(MyScript);
this.txtResult.Text = MyScript;

cn.Disconnect();
}

private void btnDropDB_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

Database db = svr.Databases["SMODatabase"];
db.Drop();

txtResult.Text = "Database Dropped";
cn.Disconnect();
}

private void btnCreateTables_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

// Get a reference to our test SMO Database
Database db = svr.Databases["SMODatabase"];

// Create Table object, and begin defining said table
Table ParentTable = new Table(db, "ParentTable");

// Build up the table definition
Column ParentKey = new Column(ParentTable, "ParentKey");
ParentKey.DataType = DataType.Int;
ParentKey.Nullable = false;
ParentKey.Identity = true;

Column ParentDescription = new Column(ParentTable, "ParentDescription");
ParentDescription.DataType = DataType.NVarCharMax;
ParentDescription.Nullable = false;

// Now actually add them to the table definition
ParentTable.Columns.Add(ParentKey);
ParentTable.Columns.Add(ParentDescription);

// Add a Primary Key
Index PKParentKey = new Index(ParentTable, "PKParentKey");
PKParentKey.IndexKeyType = IndexKeyType.DriPrimaryKey;
PKParentKey.IndexedColumns.Add(new IndexedColumn(PKParentKey, "ParentKey"));
ParentTable.Indexes.Add(PKParentKey);

ParentTable.Create();

// Create Table object for child, and begin defining said table
Table ChildTable = new Table(db, "ChildTable");

// Build up the Child table definition
Column ChildParentKey = new Column(ChildTable, "ParentKey");
ChildParentKey.DataType = DataType.Int;
ChildParentKey.Nullable = false;

Column ChildKey = new Column(ChildTable, "ChildKey");
ChildKey.DataType = DataType.Int;
ChildKey.Nullable = false;

Column ChildDescription = new Column(ChildTable, "ChildDescription");
ChildDescription.DataType = DataType.NVarCharMax;
ChildDescription.Nullable = false;

// Now actually add them to the table definition
ChildTable.Columns.Add(ChildParentKey);
ChildTable.Columns.Add(ChildKey);
ChildTable.Columns.Add(ChildDescription);

// Add a Primary Key that is a composite key
Index PKChildKey = new Index(ChildTable, "PKChildKey");
PKChildKey.IndexKeyType = IndexKeyType.DriPrimaryKey;
PKChildKey.IndexedColumns.Add(new IndexedColumn(PKChildKey, "ParentKey"));
PKChildKey.IndexedColumns.Add(new IndexedColumn(PKChildKey, "ChildKey"));
ChildTable.Indexes.Add(PKChildKey);

// Add a Foreign Key
ForeignKey FKParent = new ForeignKey(ChildTable, "FKParent");

// The first "ParentKey" in the definition below is the name in the current table
// The second is the name (of just the column) in the referenced table.
ForeignKeyColumn FKParentParentKey = new ForeignKeyColumn(FKParent,
"ParentKey", "ParentKey");
FKParent.Columns.Add(FKParentParentKey);

FKParent.ReferencedTable = "ParentTable";
// I could have also set a specific schema, but since the table was created
// using just a default schema, I'm leaving the table reference to the default
// also. Both would be resolved using whatever the user's default schema is.

/*
** Note that there are several other properties we could define here
** such as CASCADE actions. We're going to keep it simple for now.
*/

ChildTable.ForeignKeys.Add(FKParent);
ChildTable.Create();

cn.Disconnect();
txtResult.Text = "Tables Created";
}

private void frmMain_Load(object sender, EventArgs e)
{
}
}
}

Summary

Well, all I can say is "Wow!" Okay, so, in a way, this is nothing all that new—after all, DMO used to do a lot of this stuff (indeed, most everything we've looked at here with actual code). SMO has, however, made things simpler.
The "Wow!" is about thinking of the possibilities: + + * Imagine issuing commands asynchronously. + * Imagine still being able to monitor the progress of those commands by receiving events as progress continues. + * Imagine being able to generate script code to support most anything you might want to do. + * Imagine being able to register event handlers on your SQL Server and being notified when custom events occur on the server. + +The list goes on and on. + +Most of the concepts in this chapter are nothing new. We've already looked at ways to create tables, as well as create, back up, and drop databases. The power, then, is in how discretely you can manage those tasks using SMO. We have the prospect for very robust event and error handling. We can far more easily receive configuration information about objects already in the server in a form that yields separate properties as opposed to trying to parse those values out of system-stored procedures. + +This chapter truly just scratches the surface of what you can do. If I've piqued your interest at all, I encourage you to consider the use of SMO in your design work, and, of course, go get a book specific to SMO if you need one (you probably will!). +24 + +Data Warehousing + +Well, while it may seem that we've already roamed all over the realm of SQL Server, we have, up to this point, been working safely within the type of databases that are the most common, and that most database developers are the most comfortable with: The Online Transaction Processing—or OLTP—database. + +This chapter, however, will turn things somewhat upside down (in terms of the traditional "rules" that determine how we do things). When, for example, we talked about design earlier in this book or in my Beginning title, we were talking mostly in terms of a normalized database. In this chapter, we'll be largely tossing that out the window. Instead of the transaction-oriented databases we've focused on up to this point, we're going to focus on databases and models that are oriented around the notion of data analysis. We will, for now, focus primarily on data warehousing and the special needs relating to its storage ramifications and reporting in data warehousing situations. We'll explore a new sea of terms that you may not have heard before—the lingo of data warehousing and analytics—the language of Business Intelligence (often referred to simply as BI). We'll also explore the world of multidimensional modeling by taking a quick look at yet another service included with SQL Server—Analysis Services. + +In this chapter we will: + + * Discuss the differences between the needs of transaction processing versus analysis processing + * Discuss how these differences necessarily lead to substantially different solutions + * Explore the problems with the idea of using your OLTP solution as your OLAP solution + * Define the concept of a data cube, and indicate how they can help provide a solution to the special requirements of an analytics environment + * Look at some other aspects of Analysis Services that come as part of SQL Server 2008 + +Considering Differing Requirements + +As corporations build increasingly complex applications and store their daily data in the databases that support those applications, the databases grow in size. As the size of each database increases, there are typically negative impacts on the system performance of the applications that utilize it. 
Left unchecked, databases can grow to sizes that seriously impact response times, increase contention (conflict between users trying to get at the same data), or even cause the entire system to go offline.

End users may use data sources differently from one another. From a "how they use it" perspective, users fall into four significant categories:

 * Those who want to access the data sources on a daily basis, retrieving certain records, adding new records, updating, or deleting existing records
 * Those who want to make sense of the enormous amounts of data piling up in the database, generating reports that will help them come up with the right decisions for the corporation and give it the competitive edge that will make it succeed in the marketplace
 * Those who want to take the knowledge they gained from their analytical or transactional systems a step further by predicting business performance and analyzing trends for the future
 * Those who want to make use of highly focused "at a glance" information to obtain fast indications of where they should focus their time

The separate OLTP and OLAP systems help satisfy the different requirements of the first two categories of users. Data mining and cube analysis (through pivot tables and other "What if?" analysis) help satisfy the requirements of the third category. The final item listed tends to be served by targeted screens—or "Dashboards"—that are typically presented when someone first logs in to their system. The following sections present the characteristics of these systems and technologies, and how and when each of them can be used.

Online Transaction Processing (OLTP)

As previously mentioned, the OLTP systems we have focused on until now are designed to allow for high concurrency, making it possible for many users to access the same data source and conduct the processing they need. They also tend to be the "system of authority" for most data, so they place an exceptionally high value on data integrity. In addition, they tend to store data at the detail level, so they implement strategies that minimize the amount of space required to store the data.

As the "transaction processing" in the name implies, OLTP systems are oriented around the idea of transaction processing against the database. Transactions further imply controlled changes to the data in the tables, due to inserts, updates, and deletes during the operation of your business. Typically, an OLTP system will have numerous client applications accessing the database to address small pieces of information in a variety of ways (inserts, updates, deletes—virtually anything).

Examples of OLTP systems include data-entry programs such as banking, ticket reservation, online sales, and inventory management systems (such as AdventureWorks2008), but, no matter what the application is, OLTP systems are usually built with the following objectives in mind:

 * Process data generated by transactions
 * Maintain a high degree of accuracy by eliminating data redundancy
 * Ensure data and information integrity
 * Produce timely (generally "real time") documents and reports, such as receipts and invoices
 * Increase work efficiency

In focusing on these particular objectives, the design of the database is usually in the third normal form we discussed back in Chapter 5, eliminating redundancy and maximizing the power of relationships between tables.
Online Analytical Processing (OLAP)

Online Analytical Processing (or OLAP) systems fall under the broader scope of Decision Support Systems (DSS), or, as is becoming more popular these days, Business Intelligence (BI). The goal of BI systems is to analyze huge amounts of data, generating summaries and aggregations in many different ways, ranging from daily, weekly, quarterly, and annual reports to highly focused scorecards and dashboards typically aimed at very specific users who are prepared to act on that data to gain a competitive edge.

With OLAP and BI, we generally forget about keeping our data normalized. Instead, we deliberately de-normalize the database (or flatten it) to some extent, allowing some redundancy to avoid joins and focus performance specifically on data retrieval rather than modification. Why is this okay in a data warehouse? Well, once the data arrives in the data warehouse, it is rarely changed. The data is kept there for query purposes: to generate reports that help decision makers plan the future of their enterprise. Since it is usually viewed as history by the time it arrives in a data warehouse environment, we don't need to concern ourselves with inserts, updates, or deletes. Instead of a highly normalized, transactional database, we wind up with what is usually called a dimensional database that follows a specific structure or schema. Dimensional databases can be used to build data cubes, which are multidimensional representations of the data that facilitate online business analysis and query performance. The dimensions of a cube represent distinct categories for analyzing business data. The dimensions found in a typical cube will almost always include time, and will usually also include geography and something akin to a product line. From there, the possibilities are endless, depending on the specific characteristics of your organization.

Just because it is called a "cube," don't allow yourself to fall into the trap of considering it limited to three dimensions. Cubes allow for queries in n dimensions. The "cube" representation is merely meant to get across that we are beyond the typical tabular representation seen in OLTP systems.

A Brief Word on Data Mining

Traditional querying techniques, such as the queries we've largely focused on in this book and queries into a data warehouse, help you find information from your data that is based on relationships you likely already know. (Heck, they are probably declared in your transactional system.) For instance, you can use queries or even a cube to find the number of customers who bought a certain product in a certain period of time per state or city. The information you are seeking is already in your database, and the query to retrieve it is usually based on a question you know intuitively.

Data mining, on the other hand, shows its power by helping you discover hidden relationships in your data. You might use it for discovering new trends, speculating on causes for certain events, or even forecasting the performance or direction of certain aspects of your data. For example, data mining might help you find out why a certain product is selling more than another product in a certain region. Data mining makes use of algorithms that bring non-intuitive relationships to our attention. For example, data mining done many years ago discovered that people who bought beer were more inclined to also purchase cheese.
Retailers picked up on this, and, for a time, it wasn't uncommon to see cheese located very near the beer aisle to facilitate and encourage the sales of those products as a pair rather than just a single sale at a time.

SQL Server 2008 continues SQL Server's strong support for data mining. The complexities of data mining are, however, well beyond the scope of this book. I did, however, want to make sure you were aware of its availability should you get comfortable enough with analytics to explore data mining.

OLTP or OLAP?

Now that you have seen the general ideas behind the two systems of OLAP and OLTP, let's consider the banking business, for example. During the bank's working hours, bank tellers help customers perform transactions, like depositing funds into their accounts, transferring funds between accounts, and withdrawing funds from these accounts. The customers may also conduct their own transactions using an ATM (Automated Teller Machine), or a phone-based and/or computer-based banking service. In other words, such transactions are not limited to a particular part of the day but can take place around the clock. All of these operations lead to changes in the data stored in the database. These changes could be inserting new records, or updating or deleting existing records.

OLTP is built to allow these transactions to be made by a large number of users accessing the database concurrently. Databases serving OLTP systems are usually highly normalized relational databases, and their table indexes need to be selected carefully for the right fields. OLTP databases should be built to balance performance away from reporting and toward a high frequency of transactions. Queries executed in OLTP systems include a significant mix of inserts, updates, deletes, and selects.

Let's now look at a different scenario with the banking example. Suppose that the bank managers are conducting future planning. They need to look at both current and historical performance data of the bank. If they were to query the database that is used for the OLTP system, they would likely run into significant contention issues with employees who are conducting the day-to-day business of the bank. The kinds of reporting and analysis that bank management is likely to be looking for are often long running, and they can put a significant load on the transactional system, as many tables are joined to relate a wide range of information, which is then formatted in a way that is meant to summarize and aggregate the data. For example, they might want to know the total amount of transactions conducted by all customers in a certain region. Such a query would have to sift through large amounts of data that is fragmented and scattered over many joined tables. For example, an accounting general ledger transaction could be stored in a dozen different tables. The queries will have to pull fields from these joined tables to build the views needed by the management, grouping and performing aggregations as they do so. Now imagine this process being repeated over and over again as multiple managers all ask the same general questions and look at the same data.

To face these challenges, it is necessary to isolate the managers who use existing bank data to build their future outlook and planning, and have them use a different system based on OLAP principles. This means creating two different systems: an OLTP system for transaction processing by bank staff and customers, and an OLAP system to help with the decision making.
Now we have two different systems; should these systems use the same database with separate tables for each system, or should they use two completely different databases? The answer to this question depends on how much effect one of the systems will have on the performance of the other, and on how the management and administration plans of these systems work. It is very likely that the two systems will be used at the same time. This causes performance problems even if the tables are separate. This is because the two systems still share many resources on the database server, and these resources may be depleted quickly with the two systems in use. These two systems are usually optimized differently. If we optimize for OLAP, we may adversely affect the performance of the OLTP system, and vice versa. Also, the two systems may have to be administered differently, with different user accounts, backup and maintenance strategies, and so on. Therefore, even though it is theoretically possible to tap into the same database, it is a good idea to keep separate databases on separate database servers for the two systems. With this, each system will have its own resources, and optimizing one will not affect the other.

Dimensional Databases

The solution to the problems inherent in requesting complex queries from OLTP systems is to build a separate database that represents the business facts more concisely. The structure of this database will not be relational; instead, it will be dimensional.

The Fact Table

The central table of a dimensional database is called the fact table. Its rows are known as facts, and the central theme of a fact table is a set of measures for some kind of distinct instance of an activity or event.

For example, the AdventureWorksDW2008 data warehouse sample includes a table called FactInternetSales and several related tables (shown in Figure 24.1). It focuses on individual sales—more specifically, on the key metrics for the sale at a line-item level. It holds a set of measures (usually numeric)—in this case Order Quantity, Unit Price, and Extended Amount, among other measures—and relates them to a set of appropriate dimensions. (In this case, product information, relevant dates, customer information, and other dimensions on which we may want to base our analysis.)

Figure 24.1

The Dimension Tables

Dimensions help put the facts in context and represent such things as time, product, customer, and location. The dimensions describe the data in the fact table. Continuing with our AdventureWorksDW2008 example, it would make sense to have date, customer, and product dimensions, among other things.

The fact table, FactInternetSales, captures transactions on a daily level for all customers and all products. Since it has a row for every line of detail, this table will likely grow to be very large (or, at least, we hope so, since that means we made many sales!). Since storing every piece of customer data for every sale would take up a prohibitive amount of space, we go ahead and break out the items that don't change with every instance of a measure. These tables we link to the fact table are called dimension tables. They are used to create something of a group by which to determine the level of aggregations from the fact table. For instance, we could find the total monthly sales of all products in all sales territories if we were to query the FactInternetSales table grouping by month of the year.
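In T-SQL terms, such a monthly rollup might look something like the following sketch against AdventureWorksDW2008 (it assumes the standard DimDate column names, such as DateKey, CalendarYear, and EnglishMonthName):

SELECT d.CalendarYear,
       d.EnglishMonthName,
       SUM(f.SalesAmount) AS TotalSales
FROM FactInternetSales f
JOIN DimDate d
  ON f.OrderDateKey = d.DateKey
GROUP BY d.CalendarYear, d.EnglishMonthName;

Each row the query returns corresponds to one cell of aggregation: a month's worth of sales across all products, customers, and territories.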
Alternatively, we could find the total sales by sales territory at all times, for all customers, and for all products if we queried the FactInternetSales table grouping on sales territory. We can also have aggregations on a combination of the dimensions in FactInternetSales. For example, we could find the total sales for a particular product model by sales territory on a monthly basis for a specific type of customer by grouping on sales territory and month and adding the appropriate criteria in the WHERE clause for the customer and product.

The Star and Snowflake Schemas

The database schema in Figure 24.1, where there is a single fact table with a number of dimension tables linked directly to it, is an example of a star schema. In a star schema, all objects likely to be involved in a query are no more than one join away from the fact table. You may also hear of a snowflake schema. In a snowflake schema, multiple tables may relate to a dimension that, in turn, is the one that relates directly to the fact table. A snowflake schema can be considered an extension of the star schema, providing a bit more normalization, but also requiring additional tables be joined to relate all the data.

Data Cubes

Until now, we have seen that data is moved from the transactional system into a data warehouse—most likely in the form of a star or snowflake schema. In a dimensional model such as we've described here, the database is frequently used as the basis for constructing what are known as cubes. To understand what cubes are, think of the data in the dimensional database as the transformed raw data for your analysis. In other words, if you look at the example in the previous section, you notice that the fact table includes the transaction information and pointers (foreign keys) to the dimensions we wish to analyze. The reports we generate based on the schema in Figure 24.1 are usually something like total sales for customers in a particular territory over a particular period of time for a specific product or category of products. To obtain such a result, you have to aggregate the values in the fact table based on the dimensions you are using to construct the needed report. SQL Server's Analysis Services allows you to pre-calculate such results and store them in a cube. Hence, the cube is a structure that stores the data aggregations from the dimensional database by combining all possible dimension values with the Internet sales facts in the fact table. With this, retrieving the final reports becomes much more efficient, since no complex queries are evaluated at runtime.

To visualize what a cube looks like, look at Figure 24.2. The dimensions of the cube represent the dimensions of the fact table. Each cell in the cube represents a fact corresponding to a level of detail for the different dimensions of the cube. Although the graphical representation of the cube can only show three dimensions, a data cube can have many more dimensions when using Analysis Services. The figure shows a representation of a data cube for the FactInternetSales table, with the territory, product category, and time dimensions shown.

Figure 24.2

If you want to use this cube to find the total sales in the Michigan territory during 2002 for the bicycles category, you need to look at the shaded cell in the figure, which is the resulting cell from the intersection of those three dimensions.

Analysis Services allows you to build your cube from any source of data that has an OLE DB provider.
This source can be a relational database in any database management system that has an ODBC driver (such as Oracle, DB2, or even MySQL) or a native OLE DB provider (such as SQL Server, Oracle, or MS Access). The data source for the cube can also be a dimensional database, text files, or even a Lightweight Directory Access Protocol (LDAP) data source.

Data Warehouse Concepts

Now that we have seen what cubes and dimensional databases are, let's define the larger concept of what a data warehouse is and how it might be built in SQL Server 2008.

A data warehouse is a data store that holds the data collected during the company's conduct of business over a long period of time. The data warehouse may be made up of one or more data marts (smaller collections of summary or dimensional data that are generally focused on a subset of the data warehouse as a whole). The data warehouse typically uses the OLTP systems that collect the data from everyday activities and transactions as its source. The data warehouse concept also includes the processes that extract, scrub (see "Data Scrubbing" later in the chapter), and transform the data, making it ready for the data warehouse. Finally, it also includes the tools needed by the business analysts to present and use the data. These tools include BI tools (such as pivot tables in Excel, or PerformancePoint Server), as well as data mining and reporting tools. Figure 24.3 depicts the conceptual structure and components of a data warehouse solution.

Figure 24.3

Data Warehouse Characteristics

A data warehouse is usually built to support decision making and analytics because it is designed with the following unique characteristics:

 * Consolidated and Consistent Data: In a data warehouse, data is collected from different sources and consolidated and made consistent in many ways, including the use of naming conventions, measurements, physical attributes, and semantics. This is important because business analysts accessing the data warehouse and using its data for their decision-making processes have to use consistent standards. For example, date formats may all follow one standard, showing day, month, quarter, and year. Data should be stored in the data warehouse in a single, acceptable format. This allows for the referencing, consolidating, and cross-referencing of data from numerous heterogeneous sources, such as legacy data on mainframes, data in spreadsheets, or even data from the Internet, giving the analysts a better understanding of the business.

I can't stress enough the need to treat your data consistently, including the name you use to refer to it. Make sure that you don't use the same name to refer to different things in your database. If, for example, you have more than one type of sales you're going to refer to, then require *every* instance of sales to be name qualified—for example, "bicycle sales" versus "apparel sales," with a separate name for "aggregate sales." I strongly suggest keeping a data "dictionary" that defines the meaning of each name you use and the source of that data.

 * Subject-oriented Data: The data warehouse organizes key business information from OLTP sources so that it is available for business analysis. In the process, it weeds out irrelevant data that might exist in the source data store. The organization takes place based on the subject of the data, separating customer information from product information, which may have been intermingled in the source data store.
 * Historical Data: Unlike OLTP systems, the data warehouse represents historical data. In other words, when you query the data warehouse, you use data that was collected via the OLTP system in the past. The historical data can cover a long period of time, compared to the OLTP system, which contains current data that, for the most part, accurately describes the system right now.
 * Read-only Data: After data has been moved to the data warehouse, you may not be able to change it unless the data was incorrect in the first place. The data in the data warehouse cannot be updated because it represents historical data, which cannot be changed. Deletes, inserts, and updates (other than those involved in the data-loading process) are not applicable in a data warehouse. The only operations that occur in a data warehouse once it has been set up are loading of additional data and querying.

Data Marts

You may find out, after building your data warehouse, that many people in your organization access only certain portions of the data in the data warehouse. For instance, the sales managers may access only data relevant to their departments. Alternatively, they may access only data for the last year. In this case, it would be inefficient to have these people query the whole data warehouse to get their reports. Instead, it would be wise to partition the data warehouse into smaller units, called data marts, based on the users' business needs.

In addition, some people in your organization may want to be able to access the data in the data warehouse in remote areas far from the company buildings. For instance, a sales manager may want to access data about products and sales particular to his or her market area while on a sales venture. People such as this would benefit from a data mart, as they would be able to carry a section of the data warehouse on their laptop computers, allowing them to access the data they need at any time.

As often as not, this process actually works backwards, with a smaller data mart serving as the beginning of a larger data warehouse. Indeed, many enterprise data warehouses in use today were created through a process of unifying multiple disparate data marts under one data dictionary and consistent definition before providing additional data aggregation and rollup that takes data from all the various data marts.

Of course, with data marts, the data should be kept in synch with the data warehouse at all times. This can be done in a variety of ways, such as using SQL Server Integration Services, scripting (in T-SQL or other languages), or full-blown data management tools.

SQL Server Integration Services

We already looked at Integration Services extensively back in Chapter 16, but given its consistent use in association with data warehousing, it's worth mentioning again here.

Many organizations need to centralize data to improve decision making. The data being centralized is often stored in a large variety of formats and comes from a number of different sources. The raw data that exists in these sources has to be reconciled and transformed in many cases before it can be stored in the data warehouse. SSIS is a fabulous tool for performing this task, providing a means to move data from the source to the destination data warehouse while validating, cleaning up, consolidating, and transforming the data as needed.

Data Validation

Conducting data validation before the data is transferred to the destination data warehouse is extremely important.
If the data is not valid, the integrity of the business analysis conducted with it will be in question. For example, if one of the fields is a currency field, and the OLTP data sources exist in multiple countries around the globe, the data in this currency field must always be transferred in the currency of the destination data warehouse, and the values must always be properly adjusted for exchange rates as of the time the transaction took place (not just the rate that was current when the transfer took place).

Data Scrubbing

Often the degree or nature of "clean up" required is such that it can't be performed directly during the transformation process. You may, for example, need to reconcile data between multiple sources feeding the same data warehouse. The process of reconciling multiple data sources and applying other consistency rules to your data is referred to as data scrubbing. For example, if a bicycle is classified in one source as the mountain bike category, and in another source as the recreational category, aggregations in the data warehouse involving this category will yield inaccurate results unless the two data sources have been reconciled during the data transformation process.

Data scrubbing can be achieved in different ways. These methods are beyond the scope of this book, but are mentioned briefly here:

 * Using SSIS to modify data as it is copied from the source to the destination data store
 * Use of T-SQL scripts applied to a temporary "scrubbing" database or set of tables (see the sketch that follows this list)
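As a minimal, hypothetical sketch of the T-SQL approach (the staging table and category names here are invented purely for illustration), reconciling our bicycle example might look like this:

-- Hypothetical staging table holding data pulled from multiple sources
CREATE TABLE #StagingSales
(
    ProductName  nvarchar(50),
    CategoryName nvarchar(50)
);

INSERT #StagingSales VALUES ('Mountain-100', 'Recreational'); -- the second source's label

-- Reconcile the second source's label to the warehouse's standard category
UPDATE #StagingSales
SET    CategoryName = 'Mountain Bike'
WHERE  CategoryName = 'Recreational';

The real work, of course, is in deciding the mapping rules; the T-SQL itself is usually the easy part.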
Creating an Analysis Services Solution

In this section, we're going to take a quick look at what cubes are all about, and how to create them. Then we move on to a quick example of how to use them. This is going to be a simple walk-through meant to let you get a quick taste of what's possible. If, after you get done with this taste of Analysis Services, you want more, I would suggest picking up an Analysis Services book along with books on data warehousing, business intelligence, and dimensional modeling.

It really is important to realize that, just because you're a great database developer, you are not automatically a great developer of data warehouses or business intelligence systems. The way of thinking required to create a great decision support system is very different from that required to build a great transactional processing system. History is littered with dead projects created when a seemingly experienced database developer assured management that he or she knew all about data warehousing and analytics. Make sure you know what you're getting into before you make such a commitment.

The example shown in the remainder of this chapter requires the AdventureWorksDW2008 database.

Start by firing up the Business Intelligence Development Studio (a.k.a. BIDS). We discussed it in earlier chapters that used BIDS, but, again, it is just a special version of Visual Studio 2008 that is included with SQL Server. Go ahead and select New Project. What you see will vary somewhat depending on whether you have Visual Studio 2008 installed separately from SQL Server and, if so, what edition of Visual Studio you have.

In any case, you should wind up with a dialog that looks something like Figure 24.4. The exact set of project types may vary from mine somewhat (again, depending on what edition of Visual Studio you're working with). I have already selected Business Intelligence Projects and, more specifically, the Analysis Services template.

Figure 24.4

After you select a name for your project (I've chosen the oh-so-descriptive "AnalysisServicesProject"), click OK to create the project. Visual Studio will give you an empty Analysis Services project, but notice the various folders created for you. While we won't work with every one of these in this book, it does give you a feel for how broad the work on an Analysis Services project can be.

Let's move right along and create a new data source. To do this, simply right-click the Data Sources folder and select New Data Source, as shown in Figure 24.5.

Figure 24.5

This should give you a Welcome dialog (unless you've had it up before and selected the "Don't show this page again" option). Click Next to get to a dialog that allows you to choose a method for defining the data source. Stick with the default of "Create a data source based on an existing or new connection" and then click New to bring up the dialog shown in Figure 24.6.

Figure 24.6

I have already filled in several key fields to fit my particular need. (You may want to choose a remote server or to use Windows Authentication.) Click OK to create the data source and go back to the previous dialog. (The new data source should now show up in the Data Connections list.) Click Next to move on to the Impersonation Information dialog shown in Figure 24.7. We can utilize one of four security options here to determine what credentials Analysis Services will pass when it needs to connect to the data source we're defining. I've told it to use the service account, which equates to whatever Windows account Analysis Services is running under. (So, if you use this option, make sure that account has rights to your source data.)

Figure 24.7

Clicking Next should take you to the Completing the Wizard dialog, where you can name your data source and click Finish.

Next, right-click the Data Source Views folder and choose New Data Source View. This should bring up the dialog shown in Figure 24.8. As you can see, the data source we created a few moments ago is listed and chosen by default. (It also gives us a shortcut to create a new data source if we so choose.) Click Next to select the tables and views you want to work with. I've selected all the tables we saw in our star schema example earlier in the chapter, as shown in Figure 24.9.

Figure 24.8

Figure 24.9

Again click Next to get the Completing the Wizard dialog. Choose a name (I'm going with the default of Adventure Works 2008) and click Finish. This time we get a more dramatic result, as our main project window (shown in Figure 24.10) opens up with a view designer for our new data source view.

Figure 24.10

Notice that it has figured out that our tables are related, and has even mapped the visual into a decent representation of the "star" idea.

We're going to briefly skip down to the Dimensions folder. Again, right-click and select New Dimension. Click Next to go past the Welcome dialog and get to the Select Creation Method dialog shown in Figure 24.11.

Figure 24.11

Notice that there are utilities here for producing a time dimension table if we needed one (AdventureWorksDW2008 comes with one already). Keep the default and again click Next to see the Specify Source Information dialog shown in Figure 24.12. I've left it at the default table of DimCurrency (this was chosen alphabetically). It has chosen the correct column as the key, so we'll again click Next to get to the Dimension Attributes dialog shown in Figure 24.13.
Note that I've added Currency Name as an attribute.

Figure 24.12

Figure 24.13

Again click Next for the Completing the Wizard dialog. Change the name to Currency, and click Finish to complete the wizard and create the dimension.

Now repeat the New Dimension process for the rest of the dimension tables in our data source view (all of the tables that start with Dim), selecting all attributes for each dimension. You should wind up with a Dimensions node in the Solution Explorer that looks like Figure 24.14.

Figure 24.14

Okay, so our dimensions are created, but we're not quite ready to build a cube yet. The issue we need to take care of first is the construction of a time dimension. "But wait!" you say, "We already have a time dimension." If you said that, you would be correct. There is, however, a small problem: SQL Server doesn't know that it's a time dimension. To fix this, select the Date.dim entry (if you didn't rename it as you created it, it would still be called DimDate.dim) under Dimensions, then look at the Attributes list on the left as shown in Figure 24.15.

Figure 24.15

Right-click the Date node and select Properties. In the Properties pane, scroll down to the Basic section and notice the entry for Type. We need to change that to Time as shown in Figure 24.16.

Figure 24.16

With all this created, we're ready to build our cube. Simply right-click the project and select Deploy (you could also choose to limit things to a build), as shown in Figure 24.17.

Figure 24.17

This should get us a fully realized cube. From there, we just need to take a quick look at what exactly a cube gives us.

Accessing a Cube

So, given the example we just created, we're ready to actually make use of our cube. We can do this in several ways:

 * Microsoft Excel (if you connect, as we will momentarily, you'll automatically get a pivot table)
 * Direct connection and query using Multidimensional Expressions, or MDX (the Analysis Services equivalent of T-SQL)
 * Other tools that are analytics centric, such as PerformancePoint Server

As a quick example, we're going to connect to the cube we just built using a pivot table in Excel 2007. Excel has a rich set of functionality for asking "What if?" questions, and the pivot table and pivot chart features integrate fairly easily with Analysis Services cubes.

Let's check this out by firing up Excel 2007 and navigating to the Data ribbon as shown in Figure 24.18. Note that I've clicked the From Other Sources tab and selected the From Analysis Services option. (This is built in, and requires no special configuration!) This will bring up the Data Connection Wizard dialog, which is very similar to many other connection dialogs we've seen throughout this book. Enter the name of your server (or simply (local) if the cube is on the same server on which you are running Excel) and click Next to move on to the Select Database and Table dialog shown in Figure 24.19. Now go ahead and click Finish to bring up the Import Data dialog shown in Figure 24.20. This allows us to position where the data goes on our sheet and to confirm what we want to do with the data (in this case, create a pivot table). Go ahead and click OK here to accept the defaults.

Figure 24.18

Figure 24.19

Figure 24.20

If you're new to pivot tables, the sheet that appears (shown in Figure 24.21) may seem a bit anticlimactic. After all, there are no numbers and no real report. Looks, however, are deceiving.
The secret to the horsepower in Excel pivot tables is found in the panes along the right-hand side of the workbook, as we see in Figure 24.21.

Figure 24.21

At first it appears you have no report. However, the template makes it easy for you to manipulate the kind of information you want on your report and, more importantly, explore the results. To check this out, let's manipulate the data a bit. You can do this by dragging fields you are interested in from the PivotTable Field List into the areas listed below it (as shown in Figure 24.22). As you do this, notice the effect that dropping a field in each box has on the main PivotTable area.

Be careful as you click around in the main sheet area. If you click outside of the PivotTable, all the PivotTable fields will vanish. If this happens, just click in the area of the PivotTable and they should all reappear.

Figure 24.22

I'm going to leave full exploration of what PivotTables can do to a book on Excel, but hopefully you've gotten a taste of just how easily an Analysis Services cube can enable you to explore your data. Keep in mind that this was just an example of one easy way to connect to your data. You can also issue complex queries against the cube using MDX. Such queries can compare multiple dimensions and allow for special functions for such things as comparing year-over-year results. What's more, the data coming out of the cube is highly optimized for just this kind of comparison.

Summary

What we covered in this chapter was not really meant to make you an expert in data warehousing, Analysis Services, or business intelligence. Instead, the idea is to give you a concept of what is involved in creating cubes and perhaps a little taste of what they might do for you. I can't stress enough just how surface-level our coverage of the product was. Analysis Services is a full book unto itself. Hopefully the information covered here has given you enough of a feel for Analysis Services to know whether you want to pursue it further.

25

Being Well Connected

Having a SQL Server but not allowing programs to connect to it is almost the same as not having a SQL Server at all. Sure, we may log in to Management Studio and write queries, but the reality is that the vast majority of our users out there never actually see the database directly. They are just using input and reporting screens in some system we've written. (OK, in today's massively multiplayer online world, and with other large applications out there, they could be on some other highly scalable system too, but not too many of us are going to work on one of those.)

With this in mind, it probably makes sense to figure out how your application is actually going to talk to the database. There are tons of books out there that cover this topic directly (and, outside of a basic connection, it really is a huge topic unto itself), so I'm not even going to attempt to discuss every fine point of every access model in every language. Instead, we're going to explore basic concepts and some fundamental issues of performance, memory use, and general best practices. As I've done with some of the other broad topics we've spent time on in this book, the idea is to get you some fundamentals in a quick but useful way and give you something of a taste of what's involved and what kinds of questions you should be asking.

So, having tempted you with a teaser, it's time for what may seem like bad news (but it's not, and we'll get to why in a moment).
This particular chapter is a "Web release only" chapter, which is a long-winded way of saying, "You need to go download it off the Web." You can fetch it from either the p2p.wrox.com support site or my personal site at www.professionalsql.com. Why did we do that? Well, it's a multifold thing. Some of it was, I'll admit, time constraints on the book. There is, however, another reason—timeliness. Connectivity has been one of the most rapidly changing areas of database work over the last decade or so. As we'll discuss to some extent in the downloaded copy of this chapter, history is littered with various access models that have come and gone. (Heck, there are quite a few still in use.) As I write this, the .NET world is largely using ADO.NET and LINQ. Up and coming, however, is Microsoft's whole Entity Framework initiative—and who knows what else by the time Kilimanjaro (the code name for the next version of SQL Server) is out and we publish another book. Going to a Web release makes it far more realistic that we can update this chapter if there are enough changes to warrant it. (While we still need to go through all the editing, we don't have to typeset or deal with page numbers.)

Once downloaded, you'll find information such as:

 * Various data access object models past and present (a little history)
 * Some basic best practices for data access
 * Some brief examples of connecting to your database in .NET

A

System Functions

SQL Server includes a number of "System Functions" as well as more typical functions with the product. Some of these are used often and are fairly clear right from the beginning in terms of how to use them. Others, though, are both rarer in use and more cryptic in nature.

In this appendix, we'll try to clarify the use of most of these functions in a short, concise manner.

Just as an FYI, in prior releases, many system functions were often referred to as "Global Variables." This was a misnomer, and Microsoft has striven to fix it over the last few releases—changing the documentation to refer to them by the more proper "System Function" name. Just keep the old terminology in mind in case any old fogies (such as myself) find themselves referring to them as Globals.

The T-SQL functions available in SQL Server 2008 fall into 14 categories:

 * Legacy "system" functions
 * Aggregate functions
 * Configuration functions
 * Cryptographic functions
 * Cursor functions
 * Date and time functions
 * Mathematical functions
 * Metadata functions
 * Ranking functions
 * Rowset functions
 * Security functions
 * String functions
 * System functions
 * Text and image functions

In addition, we have the OVER operator, which largely works as a ranking tool and can be applied to other forms of T-SQL functions (most notably aggregates). While I only discuss it as part of the ranking functions, you may see it referenced in several other places in this appendix.

Legacy System Functions (a.k.a. Global Variables)

@@CONNECTIONS

Returns the number of connections attempted since the last time your SQL Server was started.

This one is the total of all connection attempts made since the last time your SQL Server was started. The key thing to remember here is that we are talking about attempts, not actual connections, and that we are talking about connections as opposed to users.

Every attempt made to create a connection increments this counter regardless of whether or not that connection was successful.
The only catch with this is that the connection attempt has to have made it as far as the server. If the connection failed because of NetLib differences or some other network issue, then your SQL Server wouldn't even know that it needed to increase the count; it only counts if the server saw the connection attempt. Whether the attempt succeeded or failed does not matter.

It's also important to understand that we're talking about connections rather than login attempts. Depending on your application, you may create several connections to your server, but you'll probably only ask the user for login information once. Indeed, even Query Analyzer does this. When you click for a new window, it automatically creates another connection based on the same login information.

This, like a number of other system functions, is often better served by a system stored procedure, sp_monitor. This procedure produces, in one command, information ranging from the number of connections and CPU busy time through to the total number of writes by SQL Server. So, if basic information is what you're after, sp_monitor may be better. If you need discrete data that you can manipulate, then @@CONNECTIONS provides a nice, neat, scalar piece of data.

@@CPU_BUSY

Returns the time in milliseconds that the CPU has been actively doing work since SQL Server was last started. This number is based on the resolution of the system timer, which can vary, and can therefore vary in accuracy.

This is another of the "since the server started" kind of functions. This means that you can't always count on the number going up as your application runs. It's possible, based on this number, to figure out a percentage of the CPU that your SQL Server is taking up. Realistically though, I'd rather tap right into the Performance Monitor for that if I had some dire need for it. The bottom line is that this is one of those really cool things from a "gee, isn't it swell to know that" point of view, but it doesn't have all that many practical uses in most applications.

@@IDLE

Returns the time in milliseconds (based on the resolution of the system timer) that SQL Server has been idle since it was last started.

You can think of this one as being something of the inverse of @@CPU_BUSY. Essentially, it tells you how much time your SQL Server has spent doing nothing. If anyone finds a programmatic use for this one, send me an e-mail (robv@professionalsql.com). I'd love to hear about it (I can't think of one).

@@IO_BUSY

Returns the time in milliseconds (based on the resolution of the system timer) that SQL Server has spent doing input and output operations since it was last started. This value is reset every time SQL Server is started.

This one doesn't really have any rocket science to it, and it is another one of those that I find falls into the "no real programmatic use" category.

@@PACK_RECEIVED and @@PACK_SENT

Respectively return the number of packets read from and written to the network by SQL Server since it was last started.

Primarily, these are network troubleshooting tools.

@@PACKET_ERRORS

Returns the number of network packet errors that have occurred on connections to your SQL Server since the last time the SQL Server was started.

Primarily a network troubleshooting tool.

@@TIMETICKS

Returns the number of microseconds per tick. This varies by machine and is another of those that falls under the category of "no real programmatic use."
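If you just want to eyeball several of these counters at once, a quick query such as the following does the trick (a minimal sketch; the column aliases are mine), and EXEC sp_monitor presents much of the same information in a preformatted report:

SELECT @@CONNECTIONS   AS ConnectionAttempts,
       @@CPU_BUSY      AS CpuBusy,
       @@IDLE          AS IdleTime,
       @@IO_BUSY       AS IoBusy,
       @@PACK_RECEIVED AS PacketsReceived,
       @@PACK_SENT     AS PacketsSent;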
@@TOTAL_ERRORS

Returns the number of disk read/write errors encountered by the SQL Server since it was last started.

Don't confuse this with runtime errors or as having any relation to @@ERROR. This is about problems with physical I/O. This one is another of those of the "no real programmatic use" variety. The primary use here would be more along the lines of system diagnostic scripts. Generally speaking, I would use the Windows Reliability and Performance Monitor for this instead.

@@TOTAL_READ and @@TOTAL_WRITE

Respectively return the total number of disk reads/writes by SQL Server since it was last started.

The names here are a little misleading, as these do not include any reads from cache. They are only physical I/O.

@@TRANCOUNT

Returns the number of active transactions—essentially the transaction nesting level—for the current connection.

This is a very big one when you are doing transactioning. I'm not normally a big fan of nested transactions, but there are times when they are difficult to avoid. As such, it can be important to know just where you are in the transaction-nesting side of things. (For example, you may have logic that only starts a transaction if you're not already in one.)

If you're not in a transaction, then @@TRANCOUNT is 0. From there, let's look at a brief example:

SELECT @@TRANCOUNT AS TransactionNestLevel  -- This will be zero at this point
BEGIN TRAN
SELECT @@TRANCOUNT AS TransactionNestLevel  -- This will be one at this point
BEGIN TRAN
SELECT @@TRANCOUNT AS TransactionNestLevel  -- This will be two at this point
COMMIT TRAN
SELECT @@TRANCOUNT AS TransactionNestLevel  -- This will be back to one at this point
ROLLBACK TRAN
SELECT @@TRANCOUNT AS TransactionNestLevel  -- This will be back to zero at this point

Note that, in this example, the @@TRANCOUNT at the end would also have reached zero if we had a COMMIT as our last statement.

Aggregate Functions

Aggregate functions are applied to sets of records rather than to a single record. The information in the multiple records is processed in a particular manner and then is displayed in a single record answer. Aggregate functions are often used in conjunction with the GROUP BY clause.

The aggregate functions are:

 * AVG
 * CHECKSUM
 * CHECKSUM_AGG
 * COUNT
 * COUNT_BIG
 * GROUPING
 * MAX
 * MIN
 * STDEV
 * STDEVP
 * SUM
 * VAR
 * VARP

In most aggregate functions, the ALL or DISTINCT keywords can be used. The ALL argument is the default and will apply the function to all the values in the expression, even if a value appears numerous times. The DISTINCT argument means that a value will only be included in the function once, even if it occurs several times.

Aggregate functions cannot be nested. The expression cannot be a subquery.

AVG

AVG returns the average of the values in expression. The syntax is as follows:

AVG([ALL | DISTINCT] <expression>)

The expression must contain numeric values. NULL values are ignored. This function supports the OVER operator described in the ranking functions section of this appendix.

CHECKSUM

This is a basic hash algorithm usually used to detect changes or consistency in data. This particular function accepts either an expression as an argument or a * (which implies that you want all columns in all the joined tables to be included).
The basic syntax is:

CHECKSUM(<expression> [, ...n] | *)

Note that the order of your expressions, or, in the case of a *, the join order, will affect the checksum value, so, for example:

CHECKSUM(SalesOrderID, OrderDate)

would not give the same result as:

CHECKSUM(OrderDate, SalesOrderID)

This function is not compatible with the OVER operator.

CHECKSUM_AGG

Like CHECKSUM, this is a basic hash algorithm usually used to detect changes or consistency in data. The primary difference is that CHECKSUM is oriented around rows, whereas CHECKSUM_AGG is oriented around columns. The basic syntax is:

CHECKSUM_AGG([ALL | DISTINCT] <expression>)

The expression value can be virtually anything, including, if you wish, concatenation of columns (just remember to cast as necessary); however, remember that expression order does matter, so if you're concatenating, Col1 + Col2 does not equal Col2 + Col1.

COUNT

COUNT returns the number of items in expression. The data type returned is of type int. The syntax is as follows:

COUNT([ALL | DISTINCT] <expression> | *)

The expression cannot be of the uniqueidentifier, text, image, or ntext data types. The * argument returns the number of rows in the table; it does not eliminate duplicate or NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

COUNT_BIG

COUNT_BIG returns the number of items in a group. This is very similar to the COUNT function, with the exception that the return value has a data type of bigint. The syntax is as follows:

COUNT_BIG([ALL | DISTINCT] <expression> | *)

Like COUNT, this function supports the OVER operator described in the ranking functions section of this appendix.

GROUPING

GROUPING adds an extra column to the output of a SELECT statement. The GROUPING function is used in conjunction with CUBE or ROLLUP to distinguish between normal NULL values and those added as a result of CUBE and ROLLUP operations. Its syntax is:

GROUPING(<column_name>)

GROUPING is used only in the SELECT list. Its argument is a column that is used in the GROUP BY clause and that is to be checked for NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

MAX

The MAX function returns the maximum value from expression. The syntax is as follows:

MAX([ALL | DISTINCT] <expression>)

MAX ignores any NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

MIN

The MIN function returns the smallest value from expression. The syntax is as follows:

MIN([ALL | DISTINCT] <expression>)

MIN ignores NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

STDEV

The STDEV function returns the standard deviation of all values in expression. The syntax is as follows:

STDEV(<expression>)

STDEV ignores NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

STDEVP

The STDEVP function returns the standard deviation for the population of all values in expression. The syntax is as follows:

STDEVP(<expression>)

STDEVP ignores NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

SUM

The SUM function will return the total of all values in expression. The syntax is as follows:

SUM([ALL | DISTINCT] <expression>)

SUM ignores NULL values.
This function supports the OVER operator described in the ranking functions section of this appendix.

VAR

The VAR function returns the variance of all values in expression. The syntax is as follows:

VAR(<expression>)

VAR ignores NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

VARP

The VARP function returns the variance for the population of all values in expression. The syntax is as follows:

VARP(<expression>)

VARP ignores NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

Configuration Functions

Well, I'm sure it will come as a complete surprise (OK, not really...), but configuration functions are those functions that tell us about options as they are set for the current server or database (as appropriate).

@@DATEFIRST

Returns the numeric value that corresponds to the day of the week that the system considers the first day of the week.

The default in the United States is 7, which equates to Sunday. The values convert as follows:

 * 1—Monday (the first day for most of the world)
 * 2—Tuesday
 * 3—Wednesday
 * 4—Thursday
 * 5—Friday
 * 6—Saturday
 * 7—Sunday

This can be really handy when dealing with localization issues, so you can properly lay out any calendar or other day-of-week-dependent information you have.

Use SET DATEFIRST to alter this setting.

@@DBTS

Returns the last used timestamp for the current database.

At first look, this one seems to act an awful lot like @@IDENTITY in that it gives you the chance to get back the last value set by the system (this time, it's the last timestamp instead of the last identity value). The things to watch out for on this one include:

 * The value changes based on any change in the database, not just the table you're working on.
 * Any timestamp change in the database is reflected, not just those for the current connection.

Because you can't count on this value truly being the last one that you used (someone else may have done something that would change it), I personally find very little practical use for this one.

@@LANGID and @@LANGUAGE

Respectively return the ID and the name of the language currently in use.

These can be handy for figuring out whether your product has been installed in a localized environment and, if so, what language is the default.

For a full listing of the languages currently supported by SQL Server, use the system stored procedure sp_helplanguage.

@@LOCK_TIMEOUT

Returns the current amount of time in milliseconds before the system will time out waiting for a blocked resource.

If a resource (a page, a row, a table, whatever) is blocked, your process will stop and wait for the block to clear. This determines just how long your process will wait before the statement is canceled.

The default is to wait indefinitely unless someone has changed it at the system level (using sp_configure). Regardless of how the system default is set, you will get a value of -1 from this function unless you have manually set the value for the current connection using SET LOCK_TIMEOUT.

@@MAX_CONNECTIONS

Returns the maximum number of simultaneous user connections allowed on your SQL Server.

Don't mistake this one to mean the same thing as you would see under the Maximum Connections property in the Management Console.
This one is based on licensing and will show a very high number if you have selected "per seat" licensing.

Note that the actual number of user connections allowed also depends on the version of SQL Server you are using and the limits of your application(s) and hardware.

@@MAX_PRECISION

Returns the level of precision currently set for decimal and numeric data types.

The default is 38 places, but the value can be changed by using the /p option when you start your SQL Server. The /p can be added by starting SQL Server from a command line or by adding it to the Startup parameters for the MSSQLServer service in the Windows Services applet.

@@NESTLEVEL

Returns the current nesting level for nested stored procedures.

The first stored procedure (sproc) to run has an @@NESTLEVEL of 0. If that sproc calls another, then the second sproc is said to be nested in the first sproc (and @@NESTLEVEL is incremented to a value of 1). Likewise, the second sproc may call a third, and so on, up to a maximum of 32 levels deep. If you go past 32 levels deep, not only will the transaction be terminated, but you should also revisit the design of your application.

@@OPTIONS

Returns information about options that have been applied using the SET command.

Since you get back only one value, but can have many options set, SQL Server uses binary flags to indicate what values are set. In order to test whether the option you are interested in is set, you must use the option value together with a bitwise operator. For example:

IF (@@OPTIONS & 2)

If this evaluates to True, then you would know that IMPLICIT_TRANSACTIONS had been turned on for the current connection. The values are:

Bit | SET Option | Description
---|---|---
1 | DISABLE_DEF_CNST_CHK | Interim vs. deferred constraint checking.
2 | IMPLICIT_TRANSACTIONS | A transaction is started implicitly when a statement is executed.
4 | CURSOR_CLOSE_ON_COMMIT | Controls behavior of cursors after a COMMIT operation has been performed.
8 | ANSI_WARNINGS | Warns of truncation and NULL in aggregates.
16 | ANSI_PADDING | Controls padding of fixed-length variables.
32 | ANSI_NULLS | Determines handling of nulls when using equality operators.
64 | ARITHABORT | Terminates a query when an overflow or divide-by-zero error occurs during query execution.
128 | ARITHIGNORE | Returns NULL when an overflow or divide-by-zero error occurs during a query.
256 | QUOTED_IDENTIFIER | Differentiates between single and double quotation marks when evaluating an expression.
512 | NOCOUNT | Turns off the row(s) affected message returned at the end of each statement.
1024 | ANSI_NULL_DFLT_ON | Alters the session's behavior to use ANSI compatibility for nullability. Columns created with new tables or added to old tables without explicit null option settings are defined to allow nulls. Mutually exclusive with ANSI_NULL_DFLT_OFF.
2048 | ANSI_NULL_DFLT_OFF | Alters the session's behavior not to use ANSI compatibility for nullability. New columns defined without explicit nullability are defined not to allow nulls. Mutually exclusive with ANSI_NULL_DFLT_ON.
4096 | CONCAT_NULL_YIELDS_NULL | Returns a NULL when concatenating a NULL with a string.
8192 | NUMERIC_ROUNDABORT | Generates an error when a loss of precision occurs in an expression.

@@REMSERVER

Returns the value of the server (as it appears in the login record) that called the stored procedure.

Used only in stored procedures.
This one is handy when you want the sproc to behave differently depending on what remote server (often a geographic location) the sproc was called from.

@@SERVERNAME

Returns the name of the local server that the script is running from.

If you have multiple instances of SQL Server installed (a good example would be a Web hosting service that uses a separate SQL Server installation for each client), then @@SERVERNAME returns the following local server name information if the local server name has not been changed since setup:

Instance | Server Information
---|---
Default instance | <servername>
Named instance | <servername>\<instancename>
Virtual server—default instance | <virtualservername>
Virtual server—named instance | <virtualservername>\<instancename>

@@SERVICENAME

Returns the name of the registry key under which SQL Server is running.

This only returns something under Windows 2000/2003/XP, and (under any of these) should always return MSSQLSERVER for a default instance (or the instance name for a named instance) unless you've been playing games in the registry.

@@SPID

Returns the server process ID (SPID) of the current user process.

This equates to the same process ID that you see if you run sp_who. What's nice is that you can tell the SPID for your current connection, which can be used by the DBA to monitor, and if necessary terminate, that task.

@@TEXTSIZE

Returns the current value of the TEXTSIZE option of the SET statement, which specifies the maximum length, in bytes, returned by a SELECT statement when dealing with text or image data.

The default is 4096 bytes (4KB). You can change this value by using the SET TEXTSIZE statement.

@@VERSION

Returns the current version of SQL Server as well as the processor type and OS architecture.

For example, a run on a SQL Server 2008 box might look like this:

SELECT @@VERSION

and gives:

----------------------------------------------------------------------------------

Microsoft SQL Server 2008 (RTM) - 10.0.1600.22 (X64)

Jul 9 2008 14:17:44

Copyright (c) 1988-2008 Microsoft Corporation

Developer Edition (64-bit) on Windows NT 6.0 (Build 6001: Service Pack 1)

(1 row(s) affected)

Unfortunately, this doesn't return the information in any kind of structured field arrangement, so you have to parse it if you want to use it to test for specific information.

Consider using the xp_msver system sproc instead. It returns information in such a way that you can more easily retrieve specific information from the results.

Cryptographic Functions

These are functions that help support encryption, decryption, digital signing, and digital signature validation. Some of these are new with SQL Server 2008, and some came with SQL Server 2005. Notice that there are duplicates of most functions from a general-use point of view, but that they are different in that one supports a symmetric key and the duplicate (usually with an "Asym" in the name) supports an asymmetric key.

Now, you may ask, "Why would I need these?" The answer is as varied as the possible applications for SQL Server. The quick answer, though, is this: anytime you're sending or accepting data that you want to protect during transport. For example, since SQL Server supports HTTP endpoints, and, from that, hosting of its own Web services, you may want to accept or return encrypted information with a client of your Web service. Perhaps a more basic example is simply that you've chosen to encrypt the data in your database, and now you need to get it back out in a useful manner.
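Before walking the individual functions, here is a quick, minimal sketch of the simplest of the bunch, the passphrase-based pair covered later in this section, just to show the general round trip (the passphrase and string are, of course, made up for illustration):

DECLARE @Secret varbinary(8000);

-- Encrypt a string using nothing but a passphrase (no keys or certificates required)
SET @Secret = EncryptByPassPhrase('MyPassPhrase', 'Some sensitive data');

-- Decrypt it again; the result comes back as varbinary, so cast it to a string
SELECT CAST(DecryptByPassPhrase('MyPassPhrase', @Secret) AS varchar(8000)) AS Decrypted;

The key- and certificate-based variants follow the same basic pattern; they just differ in how the key material is supplied.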
AsymKey_ID

Given the name of an asymmetric key, this function returns an int that corresponds to the related ID from the database. The syntax is simple:

AsymKey_ID('<asym_key_name>')

You must have permissions to the key in question to use this function.

Cert_ID

Similar to AsymKey_ID, this returns an ID that relates to a given certificate name. The syntax is simple:

Cert_ID('<certificate_name>')

You must have permissions to the certificate in question to use this function.

CertProperty

Allows you to fetch various properties of a given certificate (as identified by the certificate's ID). Valid properties include the start date, expiration date, certificate issuer's name, serial number, security ID (the "SID," which can also be returned as a string), and the subject of the certificate (who or what is being certified). The syntax looks like this:

CertProperty(<Cert_ID>, 'Expiry_Date' | 'Start_Date' | 'Issuer_Name' | 'Cert_Serial_Number' | 'Subject' | 'SID' | 'String_SID')

The data type returned will vary depending on the specific property you're looking for (datetime, nvarchar, or varbinary as appropriate).

DecryptByAsymKey

As you can imagine by the name, this one decrypts a chunk of data utilizing an asymmetric key. It requires the key (by ID), the encrypted data (either as a literal string or a string coercible variable), and the password used to encrypt the asymmetric key in the database. The syntax is straightforward enough:

DecryptByAsymKey(<Asym_Key_ID>, {'<ciphertext>' | <ciphertext_variable>} [, '<password>'])

DecryptByCert

This is basically the same as DecryptByAsymKey, except that it expects a certificate rather than an asymmetric key. Like DecryptByAsymKey, this one decrypts a chunk of data utilizing a key. It requires the certificate (by ID), the encrypted data (either as a literal string or a string coercible variable), and the password used to encrypt the private key of the certificate (if one was used). The syntax looks almost just like DecryptByAsymKey:

DecryptByCert(<Cert_ID>, {'<ciphertext>' | <ciphertext_variable>} [, '<password>'])

Again, any password utilized when encrypting the private key of the certificate will be needed to properly decrypt it.

DecryptByKey

Like its asymmetric and certificate-based brethren, this one decrypts a chunk of data utilizing a key. What's different is that this one not only expects a symmetric key (instead of the other types of key), but it also expects that key to already be "open" (using the OPEN SYMMETRIC KEY command). Other than that, it is fairly similar in use, with the encrypted data (either as a literal string or a string coercible variable) fed in as a parameter and, in this case, a hash value optionally accepted as an authenticator:

DecryptByKey({'<ciphertext>' | <ciphertext_variable>} [, <add_authenticator>, {'<authenticator>' | <authenticator_variable>}])

Note that if you provide an add authenticator value (in the form of an int), that value must match the value supplied when the string was encrypted, and you must also supply a hash value that matches the hash supplied at encryption time.

DecryptByPassPhrase

Like the name says, this one decrypts data that was encrypted not by a formal key, but by a passphrase. Other than accepting a passphrase parameter instead of assuming an open key, DecryptByPassPhrase works almost exactly like DecryptByKey:

DecryptByPassPhrase({'<passphrase>' | <passphrase_variable>}, {'<ciphertext>' | <ciphertext_variable>} [, <add_authenticator>, {'<authenticator>' | <authenticator_variable>}])

As with DecryptByKey, if you provide an add authenticator value (in the form of an int), that value must match the value supplied when the string was encrypted, and you must also supply a hash value that matches the hash supplied at encryption time.
EncryptByAsymKey

Encrypts a chunk of data utilizing an asymmetric key. It requires the key (by ID) and the data to be encrypted (either as a literal string or a string coercible variable). The syntax is straightforward enough:

EncryptByAsymKey(<Asym_Key_ID>, {'<plaintext>' | <plaintext_variable>})

EncryptByCert

This is basically the same as EncryptByAsymKey, except that it expects a certificate rather than an asymmetric key. Like EncryptByAsymKey, this one encrypts a chunk of data utilizing the provided key. It requires the certificate (by ID) and the data to be encrypted (either as a literal string or a string coercible variable). The syntax looks almost just like EncryptByAsymKey:

EncryptByCert(<Cert_ID>, {'<plaintext>' | <plaintext_variable>})

EncryptByKey

This one not only expects a symmetric key (instead of the other types of key), but it also expects that key to already be "open" (using the OPEN SYMMETRIC KEY command) and a GUID to be available to reference that key by. Other than that, it is fairly similar in use, with the data to be encrypted (either as a literal string or a string coercible variable) fed in as a parameter and, in this case, a hash value optionally accepted as an authenticator:

EncryptByKey(<key_GUID>, {'<plaintext>' | <plaintext_variable>} [, <add_authenticator>, {'<authenticator>' | <authenticator_variable>}])

Note that if you provide an add authenticator value (in the form of an int), that value must be supplied when the string is decrypted, and you must also supply a hash value (which again will be needed at decryption time).

EncryptByPassPhrase

This one encrypts data not by using a formal key, but by a passphrase. Other than accepting a passphrase parameter instead of assuming an open key, EncryptByPassPhrase works almost exactly like EncryptByKey:

EncryptByPassPhrase({'<passphrase>' | <passphrase_variable>}, {'<plaintext>' | <plaintext_variable>} [, <add_authenticator>, {'<authenticator>' | <authenticator_variable>}])

As with EncryptByKey, if you provide an add authenticator value (in the form of an int), that value must be supplied when the string is decrypted, and you must also supply a hash value.

Key_GUID

Fetches the GUID for a given symmetric key in the current database:

Key_GUID('<key_name>')

Key_ID

Fetches the ID for a given symmetric key in the current database:

Key_ID('<key_name>')

SignByAsymKey

Adds an asymmetric key signature to a given plain text value:

SignByAsymKey(<Asym_Key_ID>, <plaintext> [, '<password>'])

SignByCert

Returns a varbinary(8000) containing the resulting signature, provided a given certificate and plain text value:

SignByCert(<Cert_ID>, <plaintext> [, '<password>'])

VerifySignedByAsymKey

Returns an int (though, personally, I think this odd since it is functionally a bit) indicating successful or failed validation of a signature against a given asymmetric key and plain text value:

VerifySignedByAsymKey(<Asym_Key_ID>, <signed_data>, <signature>)

VerifySignedByCert

Returns an int (though, personally, I think this odd since it is functionally a bit) indicating successful or failed validation of a signature against a given certificate and plain text value:

VerifySignedByCert(<Cert_ID>, <signed_data>, <signature>)

Cursor Functions

These provide various pieces of information on the status or nature of a given cursor.

@@CURSOR_ROWS

Returns how many rows are currently in the last cursor set opened on the current connection. Note that this is for cursors, not temporary tables.

Keep in mind that this number is reset every time you open a new cursor. If you need to open more than one cursor at a time, and you need to know the number of rows in the first cursor, then you'll need to move this value into a holding variable before opening subsequent cursors.

It's possible to use this to set up a counter to control your WHILE loop when dealing with cursors, but I strongly recommend against this practice.
The value contained in @@CURSOR_ROWS can change depending on the cursor type and whether or not SQL Server is populating the cursor asynchronously. Using @@FETCH_STATUS is going to be far more reliable and at least as easy to use.

If the value returned is a negative number other than -1, then you must be working with an asynchronous cursor, and the absolute value of that number is the number of records so far created in the cursor. If, however, the value is -1, then the cursor is a dynamic cursor, in that the number of rows is constantly changing. A returned value of 0 informs you that either no cursor has been opened or the last cursor opened is no longer open. Finally, any positive number indicates the number of rows within the cursor.

To create an asynchronous cursor, set the cursor threshold option (via sp_configure) to a value greater than 0. Then, when the number of rows in the cursor exceeds this setting, the cursor is returned immediately, while the remaining records are placed into the cursor asynchronously.

@@FETCH_STATUS

Returns an indicator of the status of the last cursor FETCH operation.

If you're using cursors, you're going to be using @@FETCH_STATUS. This one is how you know the success or failure of your attempt to navigate to a record in your cursor. It will return a constant depending on whether or not SQL Server succeeded in your last FETCH operation, and, if the FETCH failed, why. The constants are:

 * 0—Success
 * -1—Failed. Usually because you are beyond either the beginning or end of the cursorset.
 * -2—Failed. The row you were fetching wasn't found, usually because it was deleted between the time when the cursorset was created and when you navigated to the current row. Should only occur in scrollable, non-dynamic cursors.

For purposes of readability, I often will set up some constants prior to using @@FETCH_STATUS. For example:

DECLARE @NOTFOUND int
DECLARE @BEGINEND int

SELECT @NOTFOUND = -2
SELECT @BEGINEND = -1

I can then use these in the conditional of the WHILE statement of my cursor loop instead of just the raw integer. This can make the code quite a bit more readable. (A complete loop example appears at the end of this section.)

CURSOR_STATUS

The CURSOR_STATUS function allows the caller of a stored procedure to determine if that procedure has returned a cursor and result set. The syntax is as follows:

CURSOR_STATUS(
    {'local', '<cursor_name>'}
  | {'global', '<cursor_name>'}
  | {'variable', '<cursor_variable>'}
)

local, global, and variable all specify constants that indicate the source of the cursor. local equates to a local cursor name, global to a global cursor name, and variable to a local variable.

If you are using the cursor name form, then there are four possible return values:

 * 1—The cursor is open. If the cursor is dynamic, its result set has zero or more rows. If the cursor is not dynamic, it has one or more rows.
 * 0—The result set of the cursor is empty.
 * -1—The cursor is closed.
 * -3—A cursor of cursor name does not exist.

If you are using the cursor variable form, there are five possible return values:

 * 1—The cursor is open. If the cursor is dynamic, its result set has zero or more rows. If the cursor is not dynamic, it has one or more rows.
 * 0—The result set is empty.
 * -1—The cursor is closed.
 * -2—There is no cursor assigned to the cursor variable.
 * -3—The variable with name cursor variable does not exist, or, if it does exist, has not had a cursor allocated to it yet.
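Tying the cursor functions together, here is a minimal sketch of a typical @@FETCH_STATUS-driven loop (the cursor simply walks the names in sys.objects, purely for illustration):

DECLARE @Name sysname;

DECLARE ObjCursor CURSOR FOR
    SELECT name FROM sys.objects;

OPEN ObjCursor;
FETCH NEXT FROM ObjCursor INTO @Name;

WHILE @@FETCH_STATUS = 0  -- 0 = the last FETCH succeeded
BEGIN
    PRINT @Name;
    FETCH NEXT FROM ObjCursor INTO @Name;
END

CLOSE ObjCursor;
DEALLOCATE ObjCursor;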
Date and Time Functions

This is an area with several new items in SQL Server 2008. In addition to working with timestamp data (which is actually more oriented toward versioning than anything to do with a clock or calendar), date and time functions perform operations on values that have any of the various date and time data types supported by SQL Server.

When working with many of these functions, SQL Server recognizes eleven "dateparts" and their abbreviations, as shown in the following table:

Datepart | Abbreviations
---|---
year | yy, yyyy
quarter | qq, q
month | mm, m
dayofyear | dy, y
day | dd, d
week | wk, ww
weekday | dw
hour | hh
minute | mi, n
second | ss, s
millisecond | ms

CURRENT_TIMESTAMP

The CURRENT_TIMESTAMP function simply returns the current date and time as a datetime type. It is equivalent to GETDATE(). The syntax is as follows:

CURRENT_TIMESTAMP

DATEADD

The DATEADD function adds an interval to a date and returns a new date. The syntax is as follows:

DATEADD(<datepart>, <number>, <date>)

The datepart argument specifies the time scale of the interval (day, week, month, and so on) and may be any of the dateparts recognized by SQL Server. The number argument is the number of dateparts that should be added to the date.

DATEDIFF

The DATEDIFF function returns the difference between two specified dates in a specified unit of time (for example: hours, days, weeks). The syntax is as follows:

DATEDIFF(<datepart>, <startdate>, <enddate>)

The datepart argument may be any of the dateparts recognized by SQL Server and specifies the unit of time to be used.

DATENAME

The DATENAME function returns a string representing the name of the specified datepart (for example: 1999, Thursday, July) of the specified date. The syntax is as follows:

DATENAME(<datepart>, <date>)

DATEPART

The DATEPART function returns an integer that represents the specified datepart of the specified date. The syntax is as follows:

DATEPART(<datepart>, <date>)

The DAY function is equivalent to DATEPART(dd, <date>); MONTH is equivalent to DATEPART(mm, <date>); YEAR is equivalent to DATEPART(yy, <date>).

DAY

The DAY function returns an integer representing the day part of the specified date. The syntax is as follows:

DAY(<date>)

The DAY function is equivalent to DATEPART(dd, <date>).

GETDATE

The GETDATE function returns the current system date and time. The syntax is as follows:

GETDATE()

GETUTCDATE

The GETUTCDATE function returns the current UTC (Coordinated Universal Time) time. In other words, this returns Greenwich Mean Time. The value is derived by taking the local time from the server, and the local time zone, and calculating GMT from this. Daylight saving time is taken into account. GETUTCDATE cannot be called from a user-defined function. The syntax is as follows:

GETUTCDATE()

ISDATE

The ISDATE function determines whether an input expression is a valid date. The syntax is as follows:

ISDATE(<expression>)

MONTH

The MONTH function returns an integer that represents the month part of the specified date. The syntax is as follows:

MONTH(<date>)

The MONTH function is equivalent to DATEPART(mm, <date>).

SYSDATETIME

Much like the more venerable GETDATE function, SYSDATETIME returns the current system date and time. The differences are twofold: First, SYSDATETIME returns a higher level of precision. Second, the newer function returns the newer datetime2 data type (to support the higher precision—a precision of 7 in this case). The syntax is as follows:

SYSDATETIME()

SYSDATETIMEOFFSET

Similar to SYSDATETIME, this returns the current system date and time.
SYSDATETIME

Much like the more venerable GETDATE function, SYSDATETIME returns the current system date and time. The differences are twofold: First, SYSDATETIME returns a higher level of precision. Second, the newer function returns the newer datetime2 data type (to support the higher precision—a precision of 7 in this case). The syntax is as follows:

SYSDATETIME()

SYSDATETIMEOFFSET

Similar to SYSDATETIME, this returns the current system date and time. Instead of the simple datetime2 data type, however, SYSDATETIMEOFFSET returns the time in the new datetimeoffset data type (with a precision of 7), thus providing offset information versus universal time. The syntax is as follows:

SYSDATETIMEOFFSET()

SYSUTCDATETIME

Much like the more venerable GETUTCDATE function, SYSUTCDATETIME returns the current UTC date and time. SYSUTCDATETIME, however, returns the newer datetime2 data type (to a precision of 7). The syntax is as follows:

SYSUTCDATETIME()

SWITCHOFFSET

This one accepts two arguments—an input value of type datetimeoffset and a new offset to represent the time as. The syntax looks like this:

SWITCHOFFSET(<datetimeoffset value>, <time zone offset>)

So, if we run a quick test:

CREATE TABLE TimeTest
(
MyTime datetimeoffset
);

INSERT TimeTest
VALUES ('2008-12-31 6:00:00 -5:00');

SELECT SWITCHOFFSET(MyTime, '-08:00') AS Pacific
FROM TimeTest;

DROP TABLE TimeTest;

we would get back:

(1 row(s) affected)

Pacific
----------------------------------
2008-12-31 03:00:00.0000000 -08:00

(1 row(s) affected)

TODATETIMEOFFSET

Accepts a given piece of date/time information and adds a provided time offset to produce a datetimeoffset data type. The syntax is:

TODATETIMEOFFSET(<date/time value>, <time zone offset>)
Metadata Functions

COL_LENGTH

The COL_LENGTH function returns the defined length of a column, in bytes. The syntax is as follows:

COL_LENGTH('<table>', '<column>')

The column parameter specifies the name of the column for which the length is to be determined. The table parameter specifies the name of the table that contains that column.

COL_NAME

The COL_NAME function takes a table ID number and a column ID number and returns the name of the database column. The syntax is as follows:

COL_NAME(<table_id>, <column_id>)

The column_id parameter specifies the ID number of the column. The table_id parameter specifies the ID number of the table that contains that column.

COLUMNPROPERTY

The COLUMNPROPERTY function returns data about a column or procedure parameter. The syntax is as follows:

COLUMNPROPERTY(<id>, <column>, <property>)

The id parameter specifies the ID of the table/procedure. The column parameter specifies the name of the column/parameter. The property parameter specifies the data that should be returned for the column or procedure parameter. The property parameter can be one of the following values:

 * AllowsNull—Allows NULL values.
 * IsComputed—The column is a computed column.
 * IsCursorType—The procedure parameter is of type CURSOR.
 * IsFullTextIndexed—The column has been full-text indexed.
 * IsIdentity—The column is an IDENTITY column.
 * IsIdNotForRepl—The IDENTITY column was defined with NOT FOR REPLICATION.
 * IsOutParam—The procedure parameter is an output parameter.
 * IsRowGuidCol—The column is a ROWGUIDCOL column.
 * Precision—The precision for the data type of the column or parameter.
 * Scale—The scale for the data type of the column or parameter.
 * UseAnsiTrim—The ANSI padding setting was ON when the table was created.

The return value from this function will be 1 for True, 0 for False, and NULL if the input was not valid—except for Precision (where the precision for the data type will be returned) and Scale (where the scale will be returned).
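For instance, checking whether a column is nullable takes one call (the Person table and LastName column here are hypothetical):

-- OBJECT_ID resolves the table name to the ID that COLUMNPROPERTY expects
SELECT COLUMNPROPERTY(OBJECT_ID('Person'), 'LastName', 'AllowsNull') AS AllowsNull

A result of 1 means NULLs are allowed, 0 means they are not, and NULL means the table or column name didn't resolve.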
DATABASEPROPERTY

The DATABASEPROPERTY function returns the setting for the specified database and property name. The syntax is as follows:

DATABASEPROPERTY('<database>', '<property>')

The database parameter specifies the name of the database for which data on the named property will be returned. The property parameter contains the name of a database property and can be one of the following values:

 * IsAnsiNullDefault—The database follows the ANSI-92 standard for NULL values.
 * IsAnsiNullsEnabled—All comparisons made against a NULL evaluate to unknown.
 * IsAnsiWarningsEnabled—Warning messages are issued when standard error conditions occur.
 * IsAutoClose—The database frees resources after the last user has exited.
 * IsAutoShrink—Database files can be shrunk automatically and periodically.
 * IsAutoUpdateStatistics—The autoupdate statistics option has been enabled.
 * IsBulkCopy—The database allows nonlogged operations (such as those performed with the Bulk Copy Program).
 * IsCloseCursorsOnCommitEnabled—Any cursors that are open when a transaction is committed will be closed.
 * IsDboOnly—The database is only accessible to the dbo.
 * IsDetached—The database was detached by a detach operation.
 * IsEmergencyMode—The database is in emergency mode.
 * IsFulltextEnabled—The database has been full-text enabled.
 * IsInLoad—The database is loading.
 * IsInRecovery—The database is recovering.
 * IsInStandby—The database is read-only and restore log is allowed.
 * IsLocalCursorsDefault—Cursor declarations default to LOCAL.
 * IsNotRecovered—The database failed to recover.
 * IsNullConcat—Concatenating to a NULL results in a NULL.
 * IsOffline—The database is offline.
 * IsQuotedIdentifiersEnabled—Identifiers can be delimited by double quotation marks.
 * IsReadOnly—The database is in a read-only mode.
 * IsRecursiveTriggersEnabled—The recursive firing of triggers is enabled.
 * IsShutDown—The database encountered a problem during startup.
 * IsSingleUser—The database is in single-user mode.
 * IsSuspect—The database is suspect.
 * IsTruncLog—The database truncates its log on checkpoints.
 * Version—The internal version number of the SQL Server code with which the database was created.

The return value from this function will be 1 for True, 0 for False, and NULL if the input was not valid, except for Version (where the function will return the version number if the database is open and NULL if the database is closed).

DATABASEPROPERTYEX

The DATABASEPROPERTYEX function is basically a superset of DATABASEPROPERTY, and also returns the setting for the specified database and property name. The syntax is pretty much the same as DATABASEPROPERTY and is as follows:

DATABASEPROPERTYEX('<database>', '<property>')

DATABASEPROPERTYEX just has a few more properties available, including:

 * Collation—Returns the default collation for the database (remember, collations can also be overridden at the column level).
 * ComparisonStyle—Indicates the Windows comparison style (for example, case sensitivity) of the particular collation.
 * IsAnsiPaddingEnabled—Whether strings are padded to the same length before comparison or insert.
 * IsArithmeticAbortEnabled—Whether queries are terminated when a major arithmetic error (such as a data overflow) occurs.

DB_ID

The DB_ID function returns the database ID number. The syntax is as follows:

DB_ID(['<database_name>'])

The optional database_name parameter specifies which database's ID number is required. If the database_name is not given, the current database will be used instead.

DB_NAME

The DB_NAME function returns the name of the database that has the specified ID number. The syntax is as follows:

DB_NAME([<database_id>])

The optional database_id parameter specifies which database's name is to be returned. If no database_id is given, the name of the current database will be returned.

FILE_ID

The FILE_ID function returns the file ID number for the specified file name in the current database. The syntax is as follows:

FILE_ID('<file_name>')

The file_name parameter specifies the name of the file for which the ID is required.

FILE_NAME

The FILE_NAME function returns the file name for the file with the specified file ID number. The syntax is as follows:

FILE_NAME(<file_id>)

The file_id parameter specifies the ID number of the file for which the name is required.

FILEGROUP_ID

The FILEGROUP_ID function returns the filegroup ID number for the specified filegroup name. The syntax is as follows:

FILEGROUP_ID('<filegroup_name>')

The filegroup_name parameter specifies the filegroup name of the required filegroup ID.

FILEGROUP_NAME

The FILEGROUP_NAME function returns the filegroup name for the specified filegroup ID number. The syntax is as follows:

FILEGROUP_NAME(<filegroup_id>)

The filegroup_id parameter specifies the filegroup ID of the required filegroup name.
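A quick round-trip through a few of these against whatever database you happen to be connected to:

SELECT DB_ID() AS CurrentDbId,
   DB_NAME() AS CurrentDbName,
   FILE_NAME(1) AS PrimaryDataFile -- file ID 1 is always the primary data file

Called with no argument, DB_ID and DB_NAME simply describe the current database.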
The syntax is as follows:

FILEGROUPPROPERTY(<filegroup_name>, <property>)

The filegroup_name parameter specifies the name of the filegroup that contains the property being queried. The property parameter specifies the property being queried and can be one of the following values:

 * IsReadOnly—The filegroup name is read-only.
 * IsUserDefinedFG—The filegroup name is a user-defined filegroup.
 * IsDefault—The filegroup name is the default filegroup.

The return value from this function will be 1 for True, 0 for False, and NULL if the input was not valid.

FILEPROPERTY

The FILEPROPERTY function returns the setting of a specified file name property, given the file name and property name. The syntax is as follows:

FILEPROPERTY(<file_name>, <property>)

The file_name parameter specifies the name of the file that contains the property being queried. The property parameter specifies the property being queried and can be one of the following values:

 * IsReadOnly—The file is read-only.
 * IsPrimaryFile—The file is the primary file.
 * IsLogFile—The file is a log file.
 * SpaceUsed—The amount of space used by the specified file.

The return value from this function will be 1 for True, 0 for False, and NULL if the input was not valid, except for SpaceUsed (which will return the number of pages allocated in the file).

FULLTEXTCATALOGPROPERTY

The FULLTEXTCATALOGPROPERTY function returns data about the full-text catalog properties. The syntax is as follows:

FULLTEXTCATALOGPROPERTY(<catalog_name>, <property>)

The catalog_name parameter specifies the name of the full-text catalog. The property parameter specifies the property that is being queried. The properties that can be queried are:

 * PopulateStatus—The possible return values are: 0 (idle), 1 (population in progress), 2 (paused), 3 (throttled), 4 (recovering), 5 (shutdown), 6 (incremental population in progress), 7 (updating index).
 * ItemCount—Returns the number of full-text indexed items currently in the full-text catalog.
 * IndexSize—Returns the size of the full-text index in megabytes.
 * UniqueKeyCount—Returns the number of unique words that make up the full-text index in this catalog.
 * LogSize—Returns the size (in bytes) of the combined set of error logs associated with a full-text catalog.
 * PopulateCompletionAge—Returns the difference (in seconds) between the completion of the last full-text index population and 01/01/1990 00:00:00.

FULLTEXTSERVICEPROPERTY

The FULLTEXTSERVICEPROPERTY function returns data about the full-text service-level properties. The syntax is as follows:

FULLTEXTSERVICEPROPERTY(<property>)

The property parameter specifies the name of the service-level property that is to be queried. The property parameter may be one of the following values:

 * ResourceUsage—Returns a value from 1 (background) to 5 (dedicated).
 * ConnectTimeOut—Returns the number of seconds that the Search Service will wait for all connections to SQL Server for full-text index population before timing out.
 * IsFulltextInstalled—Returns 1 if Full-Text Service is installed on the computer and a 0 otherwise.
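Checking whether full text is even available, for example, is a one-liner:

SELECT FULLTEXTSERVICEPROPERTY('IsFulltextInstalled') AS FullTextInstalled -- 1 = installed, 0 = not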
INDEX_COL

The INDEX_COL function returns the indexed column name. The syntax is as follows:

INDEX_COL('<table>', <index_id>, <key_id>)

The table parameter specifies the name of the table, index_id specifies the ID of the index, and key_id specifies the ID of the key.

INDEXKEY_PROPERTY

This function returns information about the index key. The syntax is as follows:

INDEXKEY_PROPERTY(<table_id>, <index_id>, <key_id>, <property>)

The table_id parameter is the numerical ID, of data type int, that identifies the table you wish to inspect; use OBJECT_ID to find the numerical table_id. index_id specifies the ID of the index and is also of data type int. key_id specifies the index column position of the key; for example, with a key of three columns, setting this value to 2 indicates that you wish to inspect the middle column. Finally, property is the character string identifier of one of two properties you wish to find the setting of. The two possible values are ColumnId, which returns the physical column ID, and IsDescending, which returns the order in which the column is sorted (1 for descending, 0 for ascending).

INDEXPROPERTY

The INDEXPROPERTY function returns the setting of a specified index property, given the table ID, index name, and property name. The syntax is as follows:

INDEXPROPERTY(<table_id>, <index>, <property>)

The property parameter specifies the property of the index that is to be queried. The property parameter can be one of these possible values:

 * IndexDepth—The depth of the index.
 * IsAutoStatistic—The index was created by the autocreate statistics option of sp_dboption.
 * IsClustered—The index is clustered.
 * IsStatistics—The index was created by the CREATE STATISTICS statement or by the autocreate statistics option of sp_dboption.
 * IsUnique—The index is unique.
 * IndexFillFactor—The index specifies its own fill factor.
 * IsPadIndex—The index specifies space to leave open on each interior node.
 * IsFulltextKey—The index is the full-text key for a table.
 * IsHypothetical—The index is hypothetical and cannot be used directly as a data access path.

The return value from this function will be 1 for True, 0 for False, and NULL if the input was not valid, except for IndexDepth (which will return the number of levels the index has) and IndexFillFactor (which will return the fill factor used when the index was created or last rebuilt).

OBJECT_ID

The OBJECT_ID function returns the specified database object's ID number. The syntax is as follows:

OBJECT_ID('<object>')

OBJECT_NAME

The OBJECT_NAME function returns the name of the specified database object. The syntax is as follows:

OBJECT_NAME(<object_id>)
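Putting a couple of these together: asking whether a particular index is clustered (the Customers table and PK_Customers index are hypothetical):

SELECT INDEXPROPERTY(OBJECT_ID('Customers'), 'PK_Customers', 'IsClustered') AS IsClustered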
OBJECTPROPERTY

The OBJECTPROPERTY function returns data about objects in the current database. The syntax is as follows:

OBJECTPROPERTY(<id>, <property>)

The id parameter specifies the ID of the object required. The property parameter specifies the information required on the object. The following property values are allowed:

CnstIsClustKey | ExecIsTriggerDisabled
---|---
CnstIsColumn | ExecIsTriggerNotForRepl
CnstIsDeleteCascade | ExecIsUpdateTrigger
CnstIsDisabled | HasAfterTrigger
CnstIsNonclustKey | HasDeleteTrigger
CnstIsNotRepl | HasInsertTrigger
CnstIsNotTrusted | HasInsteadOfTrigger
CnstIsUpdateCascade | HasUpdateTrigger
ExecIsAfterTrigger | IsAnsiNullsOn
ExecIsAnsiNullsOn | IsCheckCnst
ExecIsDeleteTrigger | IsConstraint
ExecIsFirstDeleteTrigger | IsDefault
ExecIsFirstInsertTrigger | IsDefaultCnst
ExecIsFirstUpdateTrigger | IsDeterministic
ExecIsInsertTrigger | IsExecuted
ExecIsInsteadOfTrigger | IsExtendedProc
ExecIsLastDeleteTrigger | IsForeignKey
ExecIsLastInsertTrigger | IsIndexed
ExecIsLastUpdateTrigger | IsIndexable
ExecIsQuotedIdentOn | IsInlineFunction
ExecIsStartup | IsMSShipped
IsPrimaryKey | TableFulltextPopulateStatus
IsProcedure | TableHasActiveFulltextIndex
IsQuotedIdentOn | TableHasCheckCnst
IsQueue | TableHasClustIndex
IsReplProc | TableHasDefaultCnst
IsRule | TableHasDeleteTrigger
IsScalarFunction | TableHasForeignKey
IsSchemaBound | TableHasForeignRef
IsSystemTable | TableHasIdentity
IsTable | TableHasIndex
IsTableFunction | TableHasInsertTrigger
IsTrigger | TableHasNonclustIndex
IsUniqueCnst | TableHasPrimaryKey
IsUserTable | TableHasRowGuidCol
IsView | TableHasTextImage
OwnerId | TableHasTimestamp
TableDeleteTrigger | TableHasUniqueCnst
TableDeleteTriggerCount | TableHasUpdateTrigger
TableFullTextBackgroundUpdateIndexOn | TableInsertTrigger
TableFulltextCatalogId | TableInsertTriggerCount
TableFullTextChangeTrackingOn | TableIsFake
TableFulltextDocsProcessed | TableIsLockedOnBulkLoad
TableFulltextFailCount | TableIsPinned
TableFulltextItemCount | TableTextInRowLimit
TableFulltextKeyColumn | TableUpdateTrigger
TableFulltextPendingChanges | TableUpdateTriggerCount

The return value from this function will be 1 for True, 0 for False, and NULL if the input was not valid, except for:

 * OwnerId—Returns the database user ID of the owner of that object—note that this is different from the SchemaID of the object and will likely not be that useful in SQL Server 2005 and beyond.
 * TableDeleteTrigger, TableInsertTrigger, TableUpdateTrigger—Return the ID of the first trigger with the specified type. Zero is returned if no trigger of that type exists.
 * TableDeleteTriggerCount, TableInsertTriggerCount, TableUpdateTriggerCount—Return the number of the specified type of trigger that exists for the table in question.
 * TableFulltextCatalogId—Returns the ID of the full-text catalog if there is one, and zero if no full-text catalog exists for that table.
 * TableFulltextKeyColumn—Returns the ColumnID of the column being utilized as the unique index for that full-text index.
 * TableFulltextPendingChanges—The number of entries that have changed since the last full-text analysis was run for this table. Change tracking must be enabled for this function to return useful results.
 * TableFulltextPopulateStatus—This one has multiple possible return values:
   * 0—Indicates that the full-text process is currently idle.
   * 1—A full population run is currently in progress.
   * 2—An incremental population is currently running.
   * 3—Changes are currently being analyzed and added to the full-text catalog.
   * 4—Some form of background update (such as that done by the automatic change tracking mechanism) is currently running.
   * 5—A full-text operation is in progress, but has either been throttled (to allow other system requests to perform as needed) or has been paused.

   You can use the feedback from this option to make decisions about what other full-text-related options are appropriate (for example, to check whether a population is in progress so you know whether other functions, such as TableFulltextDocsProcessed, are valid).

 * TableFulltextDocsProcessed—Valid only while full-text indexing is actually running, this returns the number of rows processed since the full-text index processing task started. A zero result indicates that full-text indexing is not currently running (a null result means full-text indexing is not configured for this table).
 * TableFulltextFailCount—Valid only while full-text indexing is actually running, this returns the number of rows that full-text indexing has, for some reason, skipped (with no indication of the reason). As with TableFulltextDocsProcessed, a zero result indicates the table is not currently being analyzed for full text, and a null indicates that full text is not configured for this table.
 * TableIsPinned—This is left in for backward compatibility only and will always return 0 in SQL Server 2005 and beyond.

OBJECTPROPERTYEX

OBJECTPROPERTYEX is an extended version of the OBJECTPROPERTY function. The syntax is as follows:

OBJECTPROPERTYEX(<id>, <property>)

Like OBJECTPROPERTY, the id parameter specifies the ID of the object required. The property parameter specifies the information required on the object. OBJECTPROPERTYEX supports all the same property values as OBJECTPROPERTY but adds the following property values as additional options:

 * BaseType—Returns the base data type of an object.
 * IsPrecise—Indicates that your object does not contain any imprecise computations. For example, an int or decimal is precise, but a float is not. Computations that utilize imprecise data types must be assumed to return imprecise results. Note that you can specifically mark any .NET assemblies you produce as being precise or not.
 * IsSystemVerified—Indicates whether the IsPrecise and IsDeterministic properties can be verified by SQL Server itself (as opposed to just having been set by the user).
 * SchemaId—Just what it sounds like. Returns the internal system ID for a given object. You can then use SCHEMA_NAME to put a more user-friendly name on the schema ID.
 * SystemDataAccess—Indicates whether the object in question relies on any system table data.
 * UserDataAccess—Indicates whether the object in question utilizes any of the user tables or system user data.

@@PROCID

Returns the stored procedure ID of the currently running procedure. This is primarily a troubleshooting tool when a process is running and using up a large amount of resources, and it is used mainly as a DBA function.

SCHEMA_ID

Given a schema name, returns the internal system ID for that schema. It utilizes the syntax:

SCHEMA_ID(<schema name>)

SCHEMA_NAME

Given an internal schema system ID, returns the user-friendly name for that schema. The syntax is:

SCHEMA_NAME(<schema id>)
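A round-trip between the two makes for an easy sanity check (dbo exists in every database):

SELECT SCHEMA_ID('dbo') AS DboSchemaId,
   SCHEMA_NAME(SCHEMA_ID('dbo')) AS RoundTripped -- comes back as 'dbo'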
SQL_VARIANT_PROPERTY

SQL_VARIANT_PROPERTY is a powerful function that returns information about a sql_variant value. This information can be the BaseType, Precision, Scale, TotalBytes, Collation, or MaxLength. The syntax is:

SQL_VARIANT_PROPERTY(<expression>, <property>)

Expression is an expression of type sql_variant. Property can be any one of the following values:

Value | Description | Base Type of sql_variant Returned
---|---|---
BaseType | Data types include: char, int, money, nchar, ntext, numeric, nvarchar, real, smalldatetime, smallint, smallmoney, text, timestamp, tinyint, uniqueidentifier, varbinary, varchar | sysname
Precision | The precision of the numeric base data type: datetime = 23; smalldatetime = 16; float = 53; real = 24; decimal (p,s) and numeric (p,s) = p; money = 19; smallmoney = 10; int = 10; smallint = 5; tinyint = 3; bit = 1; all other types = 0 | int
Scale | The number of digits to the right of the decimal point of the numeric base data type: decimal (p,s) and numeric (p,s) = s; money and smallmoney = 4; datetime = 3; all other types = 0 | int
TotalBytes | The number of bytes required to hold both the metadata and data of the value. If the value is greater than 900, index creation will fail. | int
Collation | The collation of the particular sql_variant value. | sysname
MaxLength | The maximum data type length, in bytes. | int

TYPEPROPERTY

The TYPEPROPERTY function returns information about a data type. The syntax is as follows:

TYPEPROPERTY(<type>, <property>)

The type parameter specifies the name of the data type. The property parameter specifies the property of the data type that is to be queried; it can be one of the following values:

 * Precision—Returns the number of digits/characters.
 * Scale—Returns the number of decimal places.
 * AllowsNull—Returns 1 for True and 0 for False.
 * UsesAnsiTrim—Returns 1 for True and 0 for False.
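A quick look at both, using a throwaway variable:

DECLARE @v sql_variant
SET @v = CAST(123.45 AS decimal(5,2))

SELECT SQL_VARIANT_PROPERTY(@v, 'BaseType') AS BaseType, -- decimal
   SQL_VARIANT_PROPERTY(@v, 'Precision') AS VariantPrecision, -- 5
   SQL_VARIANT_PROPERTY(@v, 'Scale') AS VariantScale, -- 2
   TYPEPROPERTY('decimal', 'Precision') AS MaxDecimalPrecision -- the type's maximum, 38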
Rowset Functions

The rowset functions return an object that can be used in place of a table reference in a T-SQL statement. The rowset functions are:

 * CHANGETABLE
 * CONTAINSTABLE
 * FREETEXTTABLE
 * OPENDATASOURCE
 * OPENQUERY
 * OPENROWSET
 * OPENXML
CHANGETABLE

The CHANGETABLE function returns change tracking information for a table. The syntax is as follows:

CHANGETABLE (
{ CHANGES <table>, <last sync version>
| VERSION <table>, <primary key values> } )
[AS] <table alias> [ ( <column alias> [ ,...n ] ) ]

In its CHANGES form, it returns all rows that have changed in the specified table since the point specified in the "last sync version" argument.
CONTAINSTABLE

The CONTAINSTABLE function is used in full-text queries. Please refer to Chapter 18 for an example of its usage. The syntax is as follows:

CONTAINSTABLE (<table>, {<column> | *}, '<search condition>')
FREETEXTTABLE

The FREETEXTTABLE function is used in full-text queries. Please refer to Chapter 18 for an example of its usage. The syntax is as follows:

FREETEXTTABLE (<table>, {<column> | *}, '<freetext string>')
OPENDATASOURCE

The OPENDATASOURCE function provides ad hoc connection information. The syntax is as follows:

OPENDATASOURCE (<provider_name>, <init_string>)

The provider_name is the name registered as the ProgID of the OLE DB provider used to access the data source. The init_string should be familiar to VB programmers, as this is the initialization string to the OLE DB provider. For example, the init_string could look like:

"User Id=wonderison;Password=JuniorBlues;DataSource=MyServerName"

OPENQUERY

The OPENQUERY function executes the specified pass-through query on the specified linked_server. The syntax is as follows:

OPENQUERY(<linked_server>, '<query>')

OPENROWSET

The OPENROWSET function accesses remote data from an OLE DB data source. The syntax is as follows:

OPENROWSET('<provider_name>',
{
'<datasource>';'<user_id>';'<password>'
| '<provider_string>'
},
{
[<catalog>.][<schema>.]<object>
| '<query>'
})

The provider_name parameter is a string representing the friendly name of the OLE DB provider as specified in the registry. The data_source parameter is a string corresponding to the required OLE DB data source. The user_id parameter is a relevant username to be passed to the OLE DB provider. The password parameter is the password associated with the user_id.

The provider_string parameter is a provider-specific connection string and is used in place of the datasource, user_id, and password combination.

The catalog parameter is the name of the catalog/database that contains the required object. The schema parameter is the name of the schema or object owner of the required object. The object parameter is the object name.

The query parameter is a string that is executed by the provider and is used instead of a combination of catalog, schema, and object.

OPENXML

By passing in an XML document as a parameter, or by retrieving an XML document and defining the document within a variable, OPENXML allows you to inspect the structure and return data as if the XML document were a table. The syntax is as follows:

OPENXML(<idoc> int [in], <rowpattern> nvarchar [in], [<flags> byte [in]])
[WITH (<SchemaDeclaration> | <TableName>)]

The idoc parameter is the document handle created using the sp_xml_preparedocument system sproc. Rowpattern is the node definition. The flags parameter specifies the mapping between the XML document and the rowset to return within the SELECT statement. SchemaDeclaration defines the XML schema for the XML document; if there is a table defined within the database that follows the XML schema, then TableName can be used instead.

Before being able to use the XML document, it must be prepared by using the sp_xml_preparedocument system procedure.
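An end-to-end sketch of that sequence (the XML document here is made up):

DECLARE @idoc int
DECLARE @doc nvarchar(1000)
SET @doc = N'<ROOT><Customer CustomerID="1" Name="Ann Jones"/></ROOT>'

-- Parse the document and get a handle to it
EXEC sp_xml_preparedocument @idoc OUTPUT, @doc

-- Read the parsed XML as if it were a table (flags = 1 means attribute-centric mapping)
SELECT CustomerID, Name
FROM OPENXML(@idoc, '/ROOT/Customer', 1)
WITH (CustomerID int, Name nvarchar(50))

-- Release the memory held by the parsed document
EXEC sp_xml_removedocument @idoc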
Security Functions

The security functions return information about users and roles. They are:

 * HAS_DBACCESS
 * IS_MEMBER
 * IS_SRVROLEMEMBER
 * SUSER_ID
 * SUSER_NAME
 * SUSER_SID
 * SUSER_SNAME
 * USER
 * USER_ID
 * USER_NAME

HAS_DBACCESS

The HAS_DBACCESS function is used to determine whether the user that is logged in has access to the database being used. A return value of 1 means the user does have access, and a return value of 0 means that he or she does not. A NULL return value means the database_name supplied was invalid. The syntax is as follows:

HAS_DBACCESS('<database_name>')

IS_MEMBER

The IS_MEMBER function returns whether the current user is a member of the specified Windows NT group/SQL Server role. The syntax is as follows:

IS_MEMBER({'<group>' | '<role>'})

The group parameter specifies the name of the NT group and must be in the form domain\group. The role parameter specifies the name of the SQL Server role. The role can be a database fixed role or a user-defined role but cannot be a server role.

This function will return a 1 if the current user is a member of the specified group or role, a 0 if the current user is not a member of the specified group or role, and NULL if the specified group or role is invalid.

IS_SRVROLEMEMBER

The IS_SRVROLEMEMBER function returns whether a user is a member of the specified server role. The syntax is as follows:

IS_SRVROLEMEMBER('<role>' [,'<login>'])

The optional login parameter is the name of the login account to check; the default is the current user. The role parameter specifies the server role and must be one of the following possible values:

 * sysadmin
 * dbcreator
 * diskadmin
 * processadmin
 * serveradmin
 * setupadmin
 * securityadmin

This function returns a 1 if the specified login account is a member of the specified role, a 0 if the login is not a member of the role, and a NULL if the role or login is invalid.

SUSER_ID

The SUSER_ID function returns the specified user's login ID number. The syntax is as follows:

SUSER_ID(['<login>'])

The login parameter is the specified user's login ID name. If no value for login is provided, the default of the current user will be used instead.

The SUSER_ID system function is included for backward compatibility only, so if possible you should use SUSER_SID, which is inherently more secure, instead.

SUSER_NAME

The SUSER_NAME function returns the specified user's login ID name. The syntax is as follows:

SUSER_NAME([<server_user_id>])

The server_user_id parameter is the specified user's login ID number. If no value for server_user_id is provided, the default of the current user will be used instead.

The SUSER_NAME system function is included for backward compatibility only, so if possible you should use SUSER_SNAME instead.

SUSER_SID

The SUSER_SID function returns the security identification number (SID) for the specified user. The syntax is as follows:

SUSER_SID(['<login>'])

The login parameter is the user's login name. If no value for login is provided, the current user will be used instead.

SUSER_SNAME

The SUSER_SNAME function returns the login ID name for the specified security identification number (SID). The syntax is as follows:

SUSER_SNAME([<server_user_sid>])

The server_user_sid parameter is the user's SID. If no value for the server_user_sid is provided, the current user's will be used instead.

USER

The USER function allows a system-supplied value for the current user's database username to be inserted into a table if no default has been supplied. The syntax is as follows:

USER

USER_ID

The USER_ID function returns the specified user's database ID number. The syntax is as follows:

USER_ID(['<user>'])

The user parameter is the username to be used. If no value for user is provided, the current user is used.

USER_NAME

The USER_NAME function is the functional reverse of USER_ID and returns the specified user's username in the database, given a database ID number. The syntax is as follows:

USER_NAME([<user_id>])

The user_id parameter is the ID of the user you want the name for. If no value for user_id is provided, the current user is assumed.
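Several of these together, run as whatever login you happen to be connected with:

SELECT SUSER_SNAME() AS LoginName,
   USER_NAME() AS DatabaseUserName,
   IS_SRVROLEMEMBER('sysadmin') AS IsSysAdmin, -- 1, 0, or NULL
   HAS_DBACCESS('master') AS CanReachMaster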
String Functions

The string functions perform actions on string values and return strings or numeric values. The string functions are:

 * ASCII
 * CHAR
 * CHARINDEX
 * DIFFERENCE
 * LEFT
 * LEN
 * LOWER
 * LTRIM
 * NCHAR
 * PATINDEX
 * QUOTENAME
 * REPLACE
 * REPLICATE
 * REVERSE
 * RIGHT
 * RTRIM
 * SOUNDEX
 * SPACE
 * STR
 * STUFF
 * SUBSTRING
 * UNICODE
 * UPPER

ASCII

The ASCII function returns the ASCII code value of the leftmost character in character_expression. The syntax is as follows:

ASCII(<character_expression>)

CHAR

The CHAR function converts an ASCII code (specified in expression) into a string. The syntax is as follows:

CHAR(<expression>)

The expression can be any integer between 0 and 255.

CHARINDEX

The CHARINDEX function returns the starting position of an expression in a character_string. The syntax is as follows:

CHARINDEX(<expression>, <character_string>[, <start_location>])

The expression parameter is the string to be found. The character_string is the string to be searched, usually a column. The start_location is the character position at which to begin the search; if it is anything other than a positive number, the search will begin at the start of character_string.

DIFFERENCE

The DIFFERENCE function returns the difference between the SOUNDEX values of two expressions as an integer. The syntax is as follows:

DIFFERENCE(<character_expression1>, <character_expression2>)

This function returns an integer value between 0 and 4. If the two expressions sound identical (for example, blue and blew), a value of 4 will be returned. If there is no similarity, a value of 0 is returned.

LEFT

The LEFT function returns the leftmost part of an expression, up to a specified number of characters. The syntax is as follows:

LEFT(<expression>, <integer>)

The expression parameter contains the character data from which the leftmost section will be extracted. The integer parameter specifies the number of characters to return from the left; it must be a positive integer.

LEN

The LEN function returns the number of characters in the specified expression. The syntax is as follows:

LEN(<expression>)

LOWER

The LOWER function converts any uppercase characters in the expression into lowercase characters. The syntax is as follows:

LOWER(<expression>)

LTRIM

The LTRIM function removes any leading blanks from a character_expression. The syntax is as follows:

LTRIM(<character_expression>)

NCHAR

The NCHAR function returns the Unicode character that has the specified integer_code. The syntax is as follows:

NCHAR(<integer_code>)

The integer_code parameter must be a positive whole number from 0 to 65,535.

PATINDEX

The PATINDEX function returns the starting position of the first occurrence of a pattern in a specified expression, or zero if the pattern was not found. The syntax is as follows:

PATINDEX('<%pattern%>', <expression>)

The pattern parameter is a string that will be searched for. Wildcard characters can be used, but the % characters must surround the pattern. The expression parameter is character data in which the pattern is being searched for—usually a column.

QUOTENAME

The QUOTENAME function returns a Unicode string with delimiters added to make the specified string a valid SQL Server delimited identifier. The syntax is as follows:

QUOTENAME('<character_string>'[, '<quote_character>'])

The character_string parameter is a Unicode string. The quote_character parameter is a one-character string that will be used as a delimiter. The quote_character parameter can be a single quotation mark ('), a left or right bracket ([ ]), or a double quotation mark ("). The default is for brackets to be used.
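A few of these side by side (the string is arbitrary):

SELECT CHARINDEX('Server', 'SQL Server 2008') AS StartsAt, -- 5
   LEFT('SQL Server 2008', 3) AS Prefix, -- SQL
   LEN('SQL Server 2008') AS TotalLength, -- 15
   QUOTENAME('My Table') AS Delimited -- [My Table]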
REPLACE

The REPLACE function replaces all instances of the second specified string in the first specified string with a third specified string. The syntax is as follows:

REPLACE('<string_expression1>', '<string_expression2>', '<string_expression3>')

The string_expression1 parameter is the expression in which to search. The string_expression2 parameter is the expression to search for in string_expression1. The string_expression3 parameter is the expression with which to replace all instances of string_expression2.

REPLICATE

The REPLICATE function repeats a character_expression a specified number of times. The syntax is as follows:

REPLICATE(<character_expression>, <integer_expression>)

REVERSE

The REVERSE function returns the reverse of the specified character_expression. The syntax is as follows:

REVERSE(<character_expression>)

RIGHT

The RIGHT function returns the rightmost part of the specified character_expression, up to the number of characters given by integer. The syntax is as follows:

RIGHT(<character_expression>, <integer>)

The integer parameter must be a positive whole number.

RTRIM

The RTRIM function removes all the trailing blanks from a specified character_expression. The syntax is as follows:

RTRIM(<character_expression>)

SOUNDEX

The SOUNDEX function returns a four-character (SOUNDEX) code, which can be used to evaluate the similarity of two strings. The syntax is as follows:

SOUNDEX(<character_expression>)

SPACE

The SPACE function returns a string of repeated spaces, the length of which is indicated by integer. The syntax is as follows:

SPACE(<integer>)

STR

The STR function converts numeric data into character data. The syntax is as follows:

STR(<numeric_expression>[, <length>[, <decimal>]])

The numeric_expression parameter is a numeric expression with a decimal point. The length parameter is the total length, including decimal point, digits, and spaces. The decimal parameter is the number of places to the right of the decimal point.

STUFF

The STUFF function deletes a specified length of characters and inserts another set of characters in their place. The syntax is as follows:

STUFF(<expression>, <start>, <length>, <characters>)

The expression parameter is the string of characters in which some will be deleted and new ones added. The start parameter specifies where to begin the deletion and insertion of characters. The length parameter specifies the number of characters to delete. The characters parameter specifies the new set of characters to be inserted into the expression.

SUBSTRING

The SUBSTRING function returns part of an expression. The syntax is as follows:

SUBSTRING(<expression>, <start>, <length>)

The expression parameter specifies the data from which the substring will be taken, and can be a character string, binary string, text, or an expression that includes a table column. The start parameter is an integer that specifies where to begin the substring. The length parameter specifies how long the substring is.

UNICODE

The UNICODE function returns the Unicode number that represents the first character in character_expression. The syntax is as follows:

UNICODE('<character_expression>')

UPPER

The UPPER function converts all the lowercase characters in character_expression into uppercase characters. The syntax is as follows:

UPPER(<character_expression>)
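REPLACE and STUFF are easy to mix up; this contrasts them:

SELECT REPLACE('2008-01-15', '-', '/') AS Replaced, -- 2008/01/15
   STUFF('SQL 2008', 5, 0, 'Server ') AS Stuffed -- SQL Server 2008 (inserts at position 5, deletes nothing)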
System Functions

The system functions can be used to return information about values, objects, and settings within SQL Server. The functions are as follows:

 * APP_NAME
 * CASE
 * CAST and CONVERT
 * COALESCE
 * COLLATIONPROPERTY
 * CURRENT_TIMESTAMP
 * CURRENT_USER
 * DATALENGTH
 * FORMATMESSAGE
 * GETANSINULL
 * HOST_ID
 * HOST_NAME
 * IDENT_CURRENT
 * IDENT_INCR
 * IDENT_SEED
 * IDENTITY
 * ISDATE
 * ISNULL
 * ISNUMERIC
 * NEWID
 * NULLIF
 * PARSENAME
 * PERMISSIONS
 * ROWCOUNT_BIG
 * SCOPE_IDENTITY
 * SERVERPROPERTY
 * SESSION_USER
 * SESSIONPROPERTY
 * STATS_DATE
 * SYSTEM_USER
 * USER_NAME

APP_NAME

The APP_NAME function returns, as an nvarchar type, the application name for the current session if one has been set by the application. It has the following syntax:

APP_NAME()

CASE

The CASE function evaluates a list of conditions and returns one of multiple possible results. It has two formats:

 * The simple CASE function compares an expression to a set of simple expressions to determine the result.
 * The searched CASE function evaluates a set of Boolean expressions to determine the result.

Both formats support an optional ELSE argument.

Simple CASE function:

CASE <input_expression>
WHEN <when_expression> THEN <result_expression>
[ELSE <else_result_expression>]
END

Searched CASE function:

CASE
WHEN <Boolean_expression> THEN <result_expression>
[ELSE <else_result_expression>]
END

CAST and CONVERT

These two functions provide similar functionality in that they both convert one data type into another type.

Using CAST:

CAST(<expression> AS <data_type>)

Using CONVERT:

CONVERT(<data_type>[(<length>)], <expression>[,